├── README.md
├── cycle.Rmd
├── dimred.Rmd
├── flowsom.Rmd
├── intro.Rmd
├── normalize.Rmd
├── parallel
    ├── slurm-aaron.tmpl
    └── slurm.R
├── pics
    └── make_pics.R
├── preprocess.Rmd
├── resources
    └── marker_genes.csv
└── variance.Rmd


/README.md:
--------------------------------------------------------------------------------
 1 | # Scripts for analyzing the 10X 1.3 million brain cell data
 2 | 
 3 | The scripts should be executed in the following order:
 4 | 
 5 | - `intro.Rmd`: An introduction, duh.
 6 | - `preprocess.Rmd`: Downloading the data and quality control
 7 | - `cycle.Rmd`: Cell cycle phase assignment
 8 | - `normalize.Rmd`: Calculation of cell-specific size factors
 9 | - `variance.Rmd`: Identification of highly variable genes
10 | - `dimred.Rmd`: Dimensionality reduction with randomized PCA 
11 | 
12 | Various output objects will be saved to `objects/`.
13 | A few of these objects are currently hosted at https://drive.google.com/open?id=1_0WbmJ2BriLKlyKEf1Bbb8K0_NwD9rw-.
14 | Note that `sce.rds` does not contain the actual counts or normalized expression values, and requires something like this:
15 | 
16 | ```r
17 | library(TENxBrainData)
18 | tenx <- TENxBrainData()
19 | sce <- readRDS("sce.rds")
20 | tenx <- tenx[,colnames(sce)] # drop 19,672 cells from the raw TENxBrainData
21 | counts(sce) <- counts(tenx) # overwrite inbuilt absolute path
22 | library(scater)
23 | sce <- normalize(sce) # generate normalized expression values
24 | ```
25 | 
26 | The `pics/make_pics.R` scripts will generate the figures used in the paper.
27 | 


--------------------------------------------------------------------------------
/cycle.Rmd:
--------------------------------------------------------------------------------
 1 | # Assigning cell cycle phase
 2 | 
 3 | ```{r, echo=FALSE, results="hide"}
 4 | knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
 5 | ```
 6 | 
 7 | ```{r, echo=FALSE, results="hide"}
 8 | library(BiocStyle)
 9 | library(HDF5Array)
10 | library(SingleCellExperiment)
11 | sce <- readRDS("objects/sce.rds")
12 | ```
13 | 
14 | Here, we use the `cyclone` method to assign cells to the cell cycle phase.
15 | We do so using the pre-trained mouse classifier and multiple cores to reduce computational time.
16 | 
17 | ```{r}
18 | library(scran)
19 | set.seed(100)
20 | mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran"))
21 | assigned <- cyclone(sce, mm.pairs, gene.names=rowData(sce)$Ensembl, BPPARAM=MulticoreParam(3))
22 | ```
23 | 
24 | Making a plot of the cell cycle scores.
25 | 
26 | ```{r cyclescores}
27 | smoothScatter(assigned$scores$G1, assigned$scores$G2M, 
28 |     xlab="G1 score", ylab="G2M score")
29 | ```
30 | 
31 | We save the phases to the `SummarizedExperiment` object in case it's useful.
32 | 
33 | ```{r}
34 | sce$Phase <- assigned$phase
35 | table(assigned$phase)
36 | ```
37 | 
38 | Saving all of the results to file for later use.
39 | 
40 | ```{r}
41 | saveRDS(assigned, file="objects/cycle_output.rds")
42 | ```
43 | 
44 | <!--
45 | Saving the updated object as well:
46 | 
47 | ```{r}
48 | saveRDS(file="objects/sce.rds", sce)
49 | ```
50 | -->
51 | 


--------------------------------------------------------------------------------
/dimred.Rmd:
--------------------------------------------------------------------------------
 1 | # Dimensionality reduction
 2 | 
 3 | ```{r, echo=FALSE, results="hide"}
 4 | knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
 5 | ```
 6 | 
 7 | ```{r, echo=FALSE, results="hide"}
 8 | library(BiocStyle)
 9 | library(SingleCellExperiment)
10 | sce <- readRDS("objects/sce.rds")
11 | dec <- read.table("objects/hvg_output.txt", stringsAsFactors=FALSE, header=TRUE)
12 | ```
13 | 
14 | Taking the HVGs to define the biological subspace of interest.
15 | 
16 | ```{r}
17 | sig <- dec$FDR <= 0.05 & !is.na(dec$FDR)
18 | summary(sig)
19 | ```
20 | 
21 | Extracting the HVGs and mean-centering them.
22 | 
23 | ```{r}
24 | exprs.mat <- logcounts(sce)
25 | exprs.mat <- exprs.mat[match(dec$Ensembl[sig], rowData(sce)$Ensembl), ]
26 | exprs.mat <- exprs.mat - dec$mean[sig]
27 | ```
28 | 
29 | Performing random PCA on the resulting expression matrix to retain the first 50 PCs.
30 | Here we use the `r Githubpkg("Bioconductor/BigDataAlgorithms")` package that contains a randomized SVD implementation for `DelayedArray` objects.
31 | 
32 | ```{r}
33 | library(BigDataAlgorithms)
34 | set.seed(1000)
35 | options(DelayedArray.block.size=200e6)
36 | svd.out <- rsvd(t(exprs.mat), k=50)
37 | pc.out <- sweep(svd.out$u, 2, svd.out$d, "*")
38 | ```
39 | 
40 | Having a look at the variance explained by the top 50 PCs.
41 | 
42 | ```{r screeplot}
43 | var.exp <- svd.out$d^2/(ncol(exprs.mat)-1)
44 | total.var <- sum(dec$total[sig])
45 | plot(var.exp/total.var*100, xlab="PC number", ylab="% variance explained")
46 | ```
47 | 
48 | Making a plot of the first three PCs.
49 | 
50 | ```{r pcaplot, fig.width=12, fig.height=6}
51 | par(mfrow=c(1,2))
52 | smoothScatter(pc.out[,1], pc.out[,2], xlab="PC1", ylab="PC2")
53 | smoothScatter(pc.out[,1], pc.out[,3], xlab="PC1", ylab="PC3")
54 | ```
55 | 
56 | Storing the PCs in the `SingleCellExperiment` object. 
57 | Also saving the SVD values for future use.
58 | 
59 | ```{r}
60 | attr(pc.out, "percentVar") <- var.exp/total.var
61 | reducedDim(sce, "rPCA") <- pc.out
62 | saveRDS(svd.out, file="objects/rsvd.rds")
63 | ```
64 | 
65 | <!--
66 | Saving the updated SCE object.
67 | 
68 | ```{r}
69 | saveRDS(sce, file="objects/sce.rds")
70 | ```
71 | -->
72 | 


--------------------------------------------------------------------------------
/flowsom.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Clustering 1.3M cells with FlowSOM"
  3 | author: ""
  4 | output: html_document
  5 | ---
  6 | 
  7 | ```{r}
  8 | suppressPackageStartupMessages(library(TENxBrainData))
  9 | suppressPackageStartupMessages(library(scater))
 10 | suppressPackageStartupMessages(library(flowCore))
 11 | suppressPackageStartupMessages(library(FlowSOM))
 12 | suppressPackageStartupMessages(library(scran))
 13 | suppressPackageStartupMessages(library(pheatmap))
 14 | suppressPackageStartupMessages(library(tibble))
 15 | suppressPackageStartupMessages(library(dplyr))
 16 | suppressPackageStartupMessages(library(reshape2))
 17 | suppressPackageStartupMessages(library(ggplot2))
 18 | suppressPackageStartupMessages(library(Rtsne))
 19 | suppressPackageStartupMessages(library(mclust))
 20 | ```
 21 | 
 22 | ## Read data
 23 | 
 24 | ```{r}
 25 | tenx <- TENxBrainData()
 26 | sce <- readRDS("objects/sce.rds")
 27 | tenx <- tenx[, colnames(sce)] # drop 19,672 cells from the raw TENxBrainData
 28 | counts(sce) <- counts(tenx) # overwrite inbuilt absolute path
 29 | system.time(sce <- scater::normalize(sce)) # generate normalized expression values
 30 | ```
 31 | 
 32 | ## Read rSVD results
 33 | 
 34 | ```{r}
 35 | svd <- readRDS("objects/rsvd.rds")
 36 | pca <- sweep(svd$u, 2, svd$d, "*")
 37 | colnames(pca) <- paste0("PC", seq_len(ncol(pca)))
 38 | dim(pca)
 39 | ```
 40 | 
 41 | ## Apply FlowSOM to cluster the cells based on the first 50 PCs
 42 | 
 43 | ```{r}
 44 | set.seed(123)
 45 | system.time(ff <- flowFrame(exprs = pca))
 46 | system.time(fSOM <- FlowSOM::ReadInput(ff, compensate = FALSE, transform = FALSE, 
 47 |                                        scale = FALSE, silent = TRUE))
 48 | system.time(fSOM <- FlowSOM::BuildSOM(fSOM, silent = TRUE, xdim = 13, ydim = 13))
 49 | system.time(metaClustering <- metaClustering_consensus(fSOM$map$codes, k = 16))
 50 | ```
 51 | 
 52 | ## Add cluster info to data object
 53 | 
 54 | ```{r}
 55 | colData(sce)$som100 <- fSOM$map$mapping[, 1]
 56 | colData(sce)$sommeta <- metaClustering[fSOM$map$mapping[, 1]]
 57 | ```
 58 | 
 59 | ## Add rownames
 60 | 
 61 | ```{r}
 62 | rownames(sce) <- paste0(rowData(sce)$Ensembl, ".", rowData(sce)$Symbol)
 63 | ```
 64 | 
 65 | 
 66 | ## Plot PCA representation
 67 | 
 68 | ```{r pca, fig.width = 8, fig.height = 8}
 69 | dim(sce)
 70 | dim(pca)
 71 | cols <- c("#DC050C", "#E8601C", "#7BAFDE", "#1965B0", "#B17BA6",
 72 |           "#882E72", "#F1932D", "#F6C141", "#F7EE55", "#4EB265",
 73 |           "#90C987", "#CAEDAB", "#777777", "black", "cyan", "pink")
 74 | names(cols) <- as.character(seq_len(length(cols)))
 75 | dfpca <- data.frame(pca, som100 = factor(colData(sce)$som100), 
 76 |                     sommeta = factor(colData(sce)$sommeta), 
 77 |                     library_id = factor(colData(sce)$Library),
 78 |                     mouse = factor(colData(sce)$Mouse),
 79 |                     stringsAsFactors = FALSE)
 80 | print(ggplot(dfpca, aes(x = PC1, y = PC2, color = sommeta)) + 
 81 |         geom_point(size = 0.5) + scale_color_manual(values = cols) + 
 82 |         theme_bw())
 83 | print(ggplot(dfpca, aes(x = PC3, y = PC4, color = sommeta)) + 
 84 |         geom_point(size = 0.5) + scale_color_manual(values = cols) + 
 85 |         theme_bw())
 86 | print(ggplot(dfpca, aes(x = PC5, y = PC6, color = sommeta)) + 
 87 |         geom_point(size = 0.5) + scale_color_manual(values = cols) + 
 88 |         theme_bw())
 89 | print(ggplot(dfpca, aes(x = PC7, y = PC8, color = sommeta)) + 
 90 |         geom_point(size = 0.5) + scale_color_manual(values = cols) + 
 91 |         theme_bw())
 92 | print(ggplot(dfpca, aes(x = PC9, y = PC10, color = sommeta)) + 
 93 |         geom_point(size = 0.5) + scale_color_manual(values = cols) + 
 94 |         theme_bw())
 95 | ```
 96 | 
 97 | Also plot PCA colored by library ID/mouse
 98 | 
 99 | ```{r pca2}
100 | print(ggplot(dfpca, aes(x = PC1, y = PC2, color = library_id)) + 
101 |         geom_point(size = 0.5) + theme_bw())
102 | print(ggplot(dfpca, aes(x = PC1, y = PC2, color = mouse)) + 
103 |         geom_point(size = 0.5) + theme_bw())
104 | ```
105 | 
106 | ## Find marker genes
107 | 
108 | ```{r scran_markers}
109 | system.time(scran_markers_all <- scran::findMarkers(sce, 
110 |                                                     clusters = colData(sce)$sommeta,
111 |                                                     block = NULL, 
112 |                                                     design = NULL,
113 |                                                     direction = "up",
114 |                                                     pval.type = "any",
115 |                                                     assay.type = "logcounts",
116 |                                                     get.spikes = FALSE, 
117 |                                                     log.p = TRUE, 
118 |                                                     lfc = 0.5))
119 | for (i in seq_len(length(scran_markers_all))) {
120 |   print(head(as.data.frame(scran_markers_all[[i]])))
121 | }
122 | scran_markers <- unique(unlist(lapply(scran_markers_all, function(w) {
123 |   rownames(subset(w, Top <= 1))
124 | })))
125 | 
126 | scescranmarker <- sce[which(rownames(sce) %in% scran_markers), ]
127 | logcounts_scranmarkers <- logcounts(scescranmarker)
128 | rownames(logcounts_scranmarkers) <- rowData(scescranmarker)$Symbol
129 | dfscran <- as.data.frame(logcounts_scranmarkers) %>% tibble::rownames_to_column("gene") %>%
130 |   reshape2::melt() %>% dplyr::left_join(as.data.frame(colData(scescranmarker)) %>% 
131 |                                           tibble::rownames_to_column("variable") %>%
132 |                                           dplyr::select(variable, som100, sommeta)) %>%
133 |   dplyr::mutate(sommeta = factor(sommeta)) %>%
134 |   dplyr::mutate(som100 = factor(som100))
135 | ```
136 | 
137 | Heatmap of inferred marker genes
138 | 
139 | ```{r scran_markers_heatmap, fig.height = 12, fig.width = 10}
140 | dfsumscran <- dfscran %>% dplyr::group_by(gene, som100) %>%
141 |   dplyr::summarize(value = quantile(value, probs = 0.75)) %>%
142 |   tidyr::spread(som100, value) %>% as.data.frame()
143 | rownames(dfsumscran) <- dfsumscran$gene
144 | dfsumscran$gene <- NULL
145 | dfsumscran <- dfsumscran[apply(dfsumscran, 1, sd) > 0, ]
146 | pheatmap::pheatmap(dfsumscran, scale = "row", cluster_rows = TRUE, cluster_cols = TRUE, 
147 |                    annotation_col = data.frame(metaClust = factor(metaClustering), row.names = as.character(seq_len(length(metaClustering)))), 
148 |                    show_colnames = FALSE, show_rownames = TRUE, fontsize_row = 6, 
149 |                    annotation_colors = list(metaClust = cols))
150 | 
151 | ```
152 | 
153 | 
154 | ## Look at known marker genes. 
155 | Marker genes for cell types in mouse brain were obtained from two recent publications:
156 | 
157 | - [http://science.sciencemag.org/content/347/6226/1138.full](http://science.sciencemag.org/content/347/6226/1138.full)
158 | - [https://www.nature.com/articles/nn.4216](https://www.nature.com/articles/nn.4216)
159 | 
160 | ```{r markers, fig.width = 24, fig.height = 24}
161 | marker_genes <- read.csv("resources/marker_genes.csv", header = TRUE, as.is = TRUE) %>%
162 |   dplyr::arrange(population) %>% dplyr::select(gene, population) %>%
163 |   dplyr::distinct()
164 | 
165 | keep <- which(rowData(sce)$Symbol %in% unique(marker_genes$gene))
166 | scemarker <- sce[keep, ]
167 | logcounts_markers <- assays(scemarker)[["logcounts"]]
168 | rownames(logcounts_markers) <- rowData(scemarker)$Symbol
169 | df <- as.data.frame(logcounts_markers) %>% tibble::rownames_to_column("gene") %>%
170 |   reshape2::melt() %>% dplyr::left_join(as.data.frame(colData(scemarker)) %>% 
171 |                                           tibble::rownames_to_column("variable") %>%
172 |                                           dplyr::select(variable, som100, sommeta)) %>%
173 |   dplyr::mutate(sommeta = factor(sommeta)) %>%
174 |   dplyr::mutate(som100 = factor(som100)) %>% 
175 |   dplyr::left_join(marker_genes %>% dplyr::select(gene, population)) %>%
176 |   dplyr::mutate(genepop = paste0(gene, " (", population, ")")) %>%
177 |   dplyr::arrange(population) %>% 
178 |   dplyr::mutate(gene = factor(gene, levels = unique(marker_genes$gene)))
179 | 
180 | print(ggplot(df, aes(x = sommeta, y = value, fill = sommeta)) + 
181 |         geom_boxplot(outlier.size = 0.5) + facet_wrap(~ genepop, scales = "free_y") + 
182 |         scale_fill_manual(values = cols) + theme_bw())
183 | ```
184 | 
185 | Check the FDRs of the marker genes in each of the clusters
186 | 
187 | ```{r known_markers_fdr, fig.height = 12}
188 | L <- do.call(rbind, lapply(structure(marker_genes$gene, names = marker_genes$gene),
189 |                            function(g) {
190 |   do.call(rbind, lapply(seq_along(scran_markers_all), function(i) {
191 |     j <- grep(paste0("\\.", g, "$"), rownames(scran_markers_all[[i]]))
192 |     if (length(j) == 1) data.frame(gene = g, cluster = i, 
193 |                                    log.FDR = scran_markers_all[[i]][j, "log.FDR"],
194 |                                    stringsAsFactors = FALSE)
195 |     else data.frame(gene = g, cluster = i, log.FDR = NA, stringsAsFactors = FALSE)
196 |   }))
197 | }))
198 | L <- tidyr::spread(L, cluster, log.FDR)
199 | rownames(L) <- L$gene
200 | L$gene <- NULL
201 | L <- L[rowSums(is.na(L)) == 0, ]
202 | pheatmap::pheatmap(exp(L), scale = "none", cluster_rows = TRUE, cluster_cols = TRUE)
203 | ```
204 | 
205 | 
206 | Plot the individual marker gene expression in each of the 100 original FlowSOM
207 | clusters, color by the final cluster assignment. 
208 | 
209 | ```{r markers-indiv, fig.width = 18}
210 | for (g in unique(df$genepop)) {
211 |   print(ggplot(df %>% dplyr::filter(genepop == g), 
212 |                aes(x = som100, y = value, fill = sommeta)) + 
213 |           geom_boxplot(outlier.size = 0.5) + ggtitle(g) + theme_bw() + 
214 |           theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
215 |           scale_fill_manual(values = cols))
216 | }
217 | ```
218 | 
219 | ## Heatmap of marker genes
220 | 
221 | We make a heatmap of the marker genes, summarized for the cells in each of the 
222 | original FlowSOM clusters. As a summarization function, we use the third
223 | quartile.
224 | 
225 | ```{r, fig.width = 10, fig.height = 12}
226 | dfsum <- df %>% dplyr::group_by(genepop, som100) %>%
227 |   dplyr::summarize(value = quantile(value, probs = 0.75)) %>%
228 |   tidyr::spread(som100, value) %>% as.data.frame()
229 | rownames(dfsum) <- dfsum$genepop
230 | dfsum$genepop <- NULL
231 | dfsum <- dfsum[apply(dfsum, 1, sd) > 0, ]
232 | pheatmap::pheatmap(dfsum, scale = "row", cluster_rows = TRUE, cluster_cols = TRUE, 
233 |                    annotation_col = data.frame(metaClust = factor(metaClustering), row.names = as.character(seq_len(length(metaClustering)))), 
234 |                    show_colnames = FALSE, show_rownames = TRUE, fontsize_row = 6, 
235 |                    annotation_colors = list(metaClust = cols))
236 | ```
237 | 
238 | ## t-SNE of subsampled data
239 | 
240 | Here we apply t-SNE to 10,000 randomly selected cells, using the 50 first PCs as
241 | the input. We color the resulting representation by the assigned cluster. 
242 | 
243 | ```{r tsnesub}
244 | set.seed(123)
245 | subs <- sample(seq_len(nrow(pca)), 10000, replace = FALSE)
246 | pcasub <- pca[subs, ]
247 | rtsne_out <- Rtsne(as.matrix(pcasub), pca = FALSE, verbose = TRUE, perplexity = 30)
248 | rtsne_out <- data.frame(rtsne_out$Y)
249 | colnames(rtsne_out) <- c("tSNE1", "tSNE2")
250 | rtsne_out$sommeta <- factor(colData(sce)$sommeta[subs])
251 | print(ggplot(rtsne_out, aes(x = tSNE1, y = tSNE2, color = sommeta)) + 
252 |         geom_point(size = 0.75) + scale_color_manual(values = cols) + 
253 |         theme_bw())
254 | ```
255 | 
256 | ## Session info
257 | 
258 | ```{r}
259 | date()
260 | sessionInfo()
261 | ```
262 | 
263 | 


--------------------------------------------------------------------------------
/intro.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Short analysis of the 10X million neuron data set
 3 | author: Aaron Lun
 4 | date: 29 June 2017
 5 | output: 
 6 |   BiocStyle::html_document:
 7 |     fig.caption: no 
 8 | ---
 9 | 
10 | ```{r style, echo = FALSE, results = 'asis'}
11 | BiocStyle::markdown()
12 | ```
13 | 
14 | # Introduction
15 | 
16 | Here we perform a brief analysis of the 1M neuron data set from 10X genomics (https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.3.0/1M_neurons).
17 | This uses a variety of low-level packages to represent the data on-disk, as well as high-level packages for scRNA-seq data analysis.
18 | The aim is to demonstrate the usefulness of the _beachmat_ API in facilitating high-level analyses from a `HDF5Matrix` object.
19 | 
20 | 


--------------------------------------------------------------------------------
/normalize.Rmd:
--------------------------------------------------------------------------------
 1 | # Normalization for cell-specific biases
 2 | 
 3 | ```{r, echo=FALSE, results="hide"}
 4 | knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
 5 | knitr::opts_chunk$set(dpi=300, dev="png", dev.args=list(pointsize=15))
 6 | ```
 7 | 
 8 | ```{r, echo=FALSE, results="hide"}
 9 | library(BiocStyle)
10 | library(HDF5Array)
11 | library(scran)
12 | sce <- readRDS("objects/sce.rds")
13 | ```
14 | 
15 | ## Performing a rough pre-clustering
16 | 
17 | Here, we use the deconvolution method to compute size factors for each cell.
18 | We start by doing some pre-clustering to split the cells into sensible clusters, to avoid violating the non-DE assumption and distorting the size factors.
19 | 
20 | ```{r}
21 | library(scran)
22 | system.time({
23 |     ids <- quickCluster(sce, min.mean=0.1, method="igraph",
24 |         block=sce$Library, block.BPPARAM=BPPARAM)
25 | })
26 | summary(tabulate(as.integer(ids)))
27 | ```
28 | 
29 | ## Calculating size factors
30 | 
31 | We now compute a size factor for each cell using the specified parameters.
32 | 
33 | ```{r}
34 | system.time({
35 |     sce <- computeSumFactors(sce, cluster=ids, min.mean=0.1, BPPARAM=BPPARAM)
36 | })
37 | summary(sizeFactors(sce))
38 | ```
39 | 
40 | We can have a look at them in more detail, compared to the library size for each cell.
41 | 
42 | ```{r sizefacplot}
43 | plot(sce$scater_qc$all$total_counts, sizeFactors(sce), 
44 |     log="xy", xlab="Library size", 
45 |     ylab="Size factors", cex=0.2, pch=16)
46 | ```
47 | 
48 | ## Normalizing the expression values 
49 | 
50 | We calculate normalized log-expression values, using delayed operations for speed instead of saving to a new HDF5 file.
51 | 
52 | ```{r}
53 | logcounts(sce) <- log2(t(t(counts(sce))/sizeFactors(sce)) + 1)
54 | ```
55 | 
56 | <!--
57 | ```{r echo=FALSE, results="hide"}
58 | # Saving the new object as well.
59 | saveRDS(file="objects/sce.rds", sce)
60 | ```
61 | -->
62 | 


--------------------------------------------------------------------------------
/parallel/slurm-aaron.tmpl:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | ## Job Resource Interface Definition
 4 | ##
 5 | ## ntasks [integer(1)]:       Number of required tasks,
 6 | ##                            Set larger than 1 if you want to further parallelize
 7 | ##                            with MPI within your job.
 8 | ## ncpus [integer(1)]:        Number of required cpus per task,
 9 | ##                            Set larger than 1 if you want to further parallelize
10 | ##                            with multicore/parallel within each task.
11 | ## walltime [integer(1)]:     Walltime for this job, in minutes.
12 | ##                            Must be at least 1 minute.
13 | ## memory   [integer(1)]:     Memory in megabytes for each cpu.
14 | ##                            Must be at least 100 (when I tried lower values my
15 | ##                            jobs did not start at all).
16 | ##
17 | ## Default resources can be set in your .batchtools.conf.R by defining the variable
18 | ## 'default.resources' as a named list.
19 | 
20 | <%
21 | # relative paths are not handled well by Slurm
22 | log.file = fs::path_expand(log.file)
23 | -%>
24 | 
25 | 
26 | #SBATCH --job-name=<%= job.name %>
27 | #SBATCH --output=<%= log.file %>
28 | #SBATCH --error=<%= log.file %>
29 | #SBATCH --time=<%= ceiling(resources$walltime / 60) %>
30 | #SBATCH --ntasks=1
31 | #SBATCH --cpus-per-task=<%= resources$ncpus %>
32 | #SBATCH --mem-per-cpu=<%= resources$memory %>
33 | <%= if (!is.null(resources$partition)) sprintf(paste0("#SBATCH --partition='", resources$partition, "'")) %>
34 | <%= if (array.jobs) sprintf("#SBATCH --array=1-%i", nrow(jobs)) else "" %>
35 | 
36 | ## Initialize work environment like
37 | ## source /etc/profile
38 | ## module add ...
39 | 
40 | ## Export value of DEBUGME environemnt var to slave
41 | export DEBUGME=<%= Sys.getenv("DEBUGME") %>
42 | 
43 | <%= sprintf("export OMP_NUM_THREADS=%i", resources$omp.threads) -%>
44 | <%= sprintf("export OPENBLAS_NUM_THREADS=%i", resources$blas.threads) -%>
45 | <%= sprintf("export MKL_NUM_THREADS=%i", resources$blas.threads) -%>
46 | 
47 | ## Run R:
48 | ## we merge R output with stdout from SLURM, which gets then logged via --output option
49 | echo 'batchtools::doJobCollection("<%= uri %>")' | Rdevel --no-save --slave
50 | 


--------------------------------------------------------------------------------
/parallel/slurm.R:
--------------------------------------------------------------------------------
1 | # Defines a BatchtoolsParam object for submission on a SLURM cluster.
2 | 
3 | BPPARAM <- BatchtoolsParam(10, 
4 |     cluster="slurm", template="parallel/slurm-aaron.tmpl", 
5 |     logdir="parallel", log=TRUE, 
6 |     RNGseed=10000L,
7 |     resources=list(walltime=20000, memory=8000, ncpus=1))
8 | 


--------------------------------------------------------------------------------
/pics/make_pics.R:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Making a cell cycle picture.
 3 | 
 4 | assignments <- readRDS("../objects/cycle_output.rds")
 5 | X <- assignments$scores$G1
 6 | Y <- assignments$scores$G2M
 7 | 
 8 | png("cycle.png", width=7, height=7, units="in", res=300, pointsize=12)
 9 | smoothScatter(X, Y, xlab="G1 score", ylab="G2M score", cex.axis=1.2, cex.lab=1.4)
10 | 
11 | segments(-1, 0.5, 0.5, 0.5, col="red", lty=2, lwd=2)
12 | segments(0.5, -1, 0.5, 0.5, col="red", lty=2, lwd=2)
13 | segments(0.5, 0.5, 2, 2, col="red", lty=2, lwd=2)
14 | 
15 | text(0, 0.6, col="red", sprintf("G2M phase:\n%i", sum(assignments$phase=="G2M")), cex=1.2, pos=4)
16 | text(0, 0.4, col="red", sprintf("S phase:\n%i", sum(assignments$phase=="S")), cex=1.2, pos=4)
17 | text(0.5, 0.4, col="red", sprintf("G1 phase:\n%i", sum(assignments$phase=="G1")), cex=1.2, pos=4)
18 | dev.off()
19 | 
20 | ################################################################################
21 | # Making a plot of the size factors.
22 | 
23 | library(SingleCellExperiment)
24 | sce <- readRDS("../objects/sce.rds")
25 | 
26 | ratio <- log(sizeFactors(sce)/sce$total_counts)
27 | nmads <- abs((ratio - median(ratio))/mad(ratio))
28 | 
29 | library(viridis)
30 | my.cols <- rev(viridis(11))
31 | coldex <- findInterval(nmads, 0:10/2)
32 | 
33 | options(bitmapType="cairo")
34 | png("sizefacs.png", width=7, height=7, units="in", res=300, pointsize=12)
35 | plot(sce$total_counts/1e3, sizeFactors(sce), log="xy", col=my.cols[coldex], 
36 |      xlab=expression("Library size ("*10^3*")"), ylab="Size factor", 
37 |      cex.axis=1.2, cex.lab=1.4, pch=16, cex=0.2)
38 | dev.off()
39 | 
40 | ################################################################################
41 | # Making a plot of the top HVGs.
42 | 
43 | hvg.out <- read.table("../objects/hvg_output.txt", header=TRUE)
44 | is.sig <- hvg.out$FDR <= 0.05
45 | 
46 | png("hvg.png", width=7, height=7, units="in", res=300, pointsize=12)
47 | par(mar=c(5.1, 5.1, 4.1, 2.1))
48 | plot(hvg.out$mean, hvg.out$total, pch=16, col="grey", 
49 |      xlab=expression("Mean log"[2]~"expression"),
50 |      ylab=expression("Variance of log"[2]~"expression"), 
51 |      cex.axis=1.2, cex.lab=1.4)
52 | points(hvg.out$mean[is.sig], hvg.out$total[is.sig], col="orange", pch=16)
53 | o <- order(hvg.out$mean)
54 | lines(hvg.out$mean[o], hvg.out$tech[o], col="red", lwd=2)
55 | legend("bottomright", sprintf("%i HVGs out of %i genes", sum(is.sig & !is.na(is.sig)), length(is.sig)), bty="n", cex=1.1)
56 | 
57 | # Adding the top set of genes.
58 | chosen <- hvg.out$Symbol[1:10]
59 | xoffset <- c(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2)
60 | yoffset <- c(0., -0.1, 0.1, 0.1, 0.1, 0., 0.1, 0.1, 0.1, 0.)
61 | 
62 | for (i in seq_along(chosen)) {
63 |     idex <- hvg.out$Symbol==chosen[i]
64 |     xpos <- hvg.out$mean[idex]
65 |     ypos <- hvg.out$total[idex]
66 |     xpos2 <- xpos + xoffset[i]
67 |     ypos2 <- ypos + yoffset[i]
68 |     text(xpos2, ypos2, chosen[i], pos=4, offset=0.1, cex=1.1)
69 |     segments(xpos, ypos, xpos2, ypos2)
70 | }
71 | dev.off()
72 | 
73 | ################################################################################
74 | # Making a PCA plot of the first two PCs.
75 | 
76 | #pc.data <- readRDS("../objects/sce.rds")
77 | pc.mat <- reducedDim(sce)
78 | pc.var <- attr(pc.mat, "percentVar")
79 | 
80 | options(bitmapType="cairo")
81 | png("pca.png", width=7, height=7, units="in", res=300, pointsize=12)
82 | smoothScatter(pc.mat[,1], pc.mat[,2], colramp=colorRampPalette(c("white", "black")),
83 |      xlab=sprintf("PC1 (%.2f%%)", pc.var[1]*100),
84 |      ylab=sprintf("PC2 (%.2f%%)", pc.var[2]*100),
85 |      cex.axis=1.2, cex.lab=1.4)
86 | dev.off()
87 | 
88 | 


--------------------------------------------------------------------------------
/preprocess.Rmd:
--------------------------------------------------------------------------------
  1 | # Preprocessing 
  2 | 
  3 | ```{r, echo=FALSE, results="hide"}
  4 | knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
  5 | knitr::opts_chunk$set(dpi=300, dev="png", dev.args=list(pointsize=15))
  6 | ```
  7 | 
  8 | ```{r, echo=FALSE, results="hide"}
  9 | library(BiocStyle)
 10 | ```
 11 | 
 12 | ## Loading in the data
 13 | 
 14 | The aim here is to convert the 10X data set into a `HDF5Matrix` object.
 15 | We first obtain the data using the `r Biocpkg("TENxBrainData")` package:
 16 | 
 17 | ```{r}
 18 | #library(TENxBrainData)
 19 | #sce <- TENxBrainData() # or we would, if it was working properly.
 20 | library(SingleCellExperiment)
 21 | library(HDF5Array)
 22 | sce <- SingleCellExperiment(
 23 |     list(counts=HDF5Array("rawdata/counts.h5", "counts")),
 24 |     rowData=readRDS("rawdata/rowdata.rds"),
 25 |     colData=readRDS("rawdata/coldata.rds")
 26 | )
 27 | sce
 28 | ```
 29 | 
 30 | We have a look at some of the cell-level metadata.
 31 | Data were obtained from multiple mice, which were captured and sequenced in multiple libraries. 
 32 | Note that the libraries and nested within the mice.
 33 | 
 34 | ```{r}
 35 | table(sce$Library, sce$Mouse)
 36 | ```
 37 | 
 38 | We also add some gene-level annotation.
 39 | There's already gene symbols, so we just add the chromosome location.
 40 | 
 41 | ```{r}
 42 | library(TxDb.Mmusculus.UCSC.mm10.ensGene)
 43 | chr.loc <- mapIds(TxDb.Mmusculus.UCSC.mm10.ensGene, keys=rowData(sce)$Ensembl, 
 44 |     keytype="GENEID", column="CDSCHROM")
 45 | rowData(sce)$Chr <- chr.loc
 46 | head(rowData(sce))
 47 | ```
 48 | 
 49 | ## Performing cell-based quality control
 50 | 
 51 | We use `r Biocpkg("scater")` to calculate quality control summaries for each cell and gene.
 52 | 
 53 | ```{r}
 54 | library(scater)
 55 | system.time({
 56 |     sce <- calculateQCMetrics(sce, 
 57 |         feature_controls=list(Mito=which(rowData(sce)$Chr=="chrM")),
 58 |         compact=TRUE, BPPARAM=BPPARAM)
 59 | })
 60 | head(colnames(rowData(sce)))
 61 | ```
 62 | 
 63 | We have a look at the three relevant metrics, plotted against the batch of origin for each cell.
 64 | 
 65 | ```{r qchist, fig.height=6, fig.width=12}
 66 | par(mfrow=c(1,3))
 67 | hist(sce$scater_qc$all$log10_total_counts, 
 68 |     xlab=expression(Log[10]~"library size"), col="grey80")
 69 | hist(sce$scater_qc$all$log10_total_features_by_counts, 
 70 |     xlab=expression(Log[10]~"number of genes expressed"), col="grey80")
 71 | hist(sce$scater_qc$feature_control_Mito$pct_counts, 
 72 |     xlab="Percentage of mitochondrial reads", col="grey80", breaks=50)
 73 | ```
 74 | 
 75 | We use some of these metrics for quality control on the cells.
 76 | This is done within each batch to avoid discarding cells, e.g., if one batch was sequenced at lower depth.
 77 | 
 78 | ```{r}
 79 | low.libsize <- isOutlier(sce$scater_qc$all$log10_total_counts, 
 80 |     nmad=3, batch=sce$Library, type="lower")
 81 | low.ngenes <- isOutlier(sce$scater_qc$all$log10_total_features_by_counts, 
 82 |     nmad=3, batch=sce$Library, type="lower")
 83 | discard <- low.libsize | low.ngenes 
 84 | data.frame(LowLib=sum(low.libsize), LowGenes=sum(low.ngenes), Lost=sum(discard))
 85 | ```
 86 | 
 87 | Low-quality cells are discarded from the object.
 88 | 
 89 | ```{r}
 90 | sce <- sce[,!discard]
 91 | ```
 92 | 
 93 | ## Looking at gene-based metrics
 94 | 
 95 | For each feature, we plot the average count against the percentage of cells in which expression was detected.
 96 | 
 97 | ```{r ave-per-gene}
 98 | ave.count <- rowData(sce)$scater_qc$all$mean_counts
 99 | pct.cells <- rowData(sce)$scater_qc$all$n_cells_by_counts/ncol(sce) * 100
100 | smoothScatter(log10(ave.count), pct.cells, ylab="Percentage of cells",
101 |     xlab=expression("Log"[10]~"average count"))
102 | ```
103 | 
104 | We also inspect the identities of the top most-highly-expressed genes.
105 | 
106 | ```{r}
107 | top.genes <- rowData(sce)[,c("Ensembl", "Symbol")]
108 | top.genes$AveCount <- ave.count
109 | top.genes$PctCells <- pct.cells
110 | top.genes <- top.genes[order(ave.count, decreasing=TRUE),]
111 | head(as.data.frame(top.genes), 20)
112 | ```
113 | 
114 | <!--
115 | Serializing the SCE object:
116 | 
117 | ```{r}
118 | dir.create("objects")
119 | saveRDS(sce, file="objects/sce.rds")  
120 | ```
121 | -->
122 | 


--------------------------------------------------------------------------------
/resources/marker_genes.csv:
--------------------------------------------------------------------------------
1 | gene,population,sourceDlx5,Interneurons,http://science.sciencemag.org/content/347/6226/1138.fullArx,Interneurons,http://science.sciencemag.org/content/347/6226/1138.fullDlx2,Interneurons,http://science.sciencemag.org/content/347/6226/1138.fullDlx1,Interneurons,http://science.sciencemag.org/content/347/6226/1138.fullElavl2,Interneurons,http://science.sciencemag.org/content/347/6226/1138.fullSp9,Interneurons,http://science.sciencemag.org/content/347/6226/1138.fullTbr1,S1Pyramidal_ExcitatoryNeurons,http://science.sciencemag.org/content/347/6226/1138.fullPou3f1,CA1Pyramidal,http://science.sciencemag.org/content/347/6226/1138.fullLmo1,CA1Pyramidal,http://science.sciencemag.org/content/347/6226/1138.fullHeyl,Vsmc_Peric,http://science.sciencemag.org/content/347/6226/1138.fullTbx2,Vsmc_Peric,http://science.sciencemag.org/content/347/6226/1138.fullFoxs1,Vsmc_Peric,http://science.sciencemag.org/content/347/6226/1138.fullTbx18,Vsmc_Peric,http://science.sciencemag.org/content/347/6226/1138.fullBcl6b,Vend,http://science.sciencemag.org/content/347/6226/1138.fullMeox1,Vend,http://science.sciencemag.org/content/347/6226/1138.fullNmi,Vend,http://science.sciencemag.org/content/347/6226/1138.fullSox7,Vend,http://science.sciencemag.org/content/347/6226/1138.fullFoxl2,Vend,http://science.sciencemag.org/content/347/6226/1138.fullIrf8,Microglia,http://science.sciencemag.org/content/347/6226/1138.fullNlrp3,Microglia,http://science.sciencemag.org/content/347/6226/1138.fullIrf5,Microglia,http://science.sciencemag.org/content/347/6226/1138.fullHcls1,Microglia,http://science.sciencemag.org/content/347/6226/1138.fullSpi1,Microglia,http://science.sciencemag.org/content/347/6226/1138.fullDbx2,Astrocytes,http://science.sciencemag.org/content/347/6226/1138.fullSall3,Astrocytes,http://science.sciencemag.org/content/347/6226/1138.fullSox21,Astrocytes,http://science.sciencemag.org/content/347/6226/1138.fullNkx6-2,Oligodendrocytes,http://science.sciencemag.org/content/347/6226/1138.fullSox10,Oligodendrocytes,http://science.sciencemag.org/content/347/6226/1138.fullSt18,Oligodendrocytes,http://science.sciencemag.org/content/347/6226/1138.fullOlig1,Oligodendrocytes,http://science.sciencemag.org/content/347/6226/1138.fullOlig2,Oligodendrocytes,http://science.sciencemag.org/content/347/6226/1138.fullSnap25,Neurons,https://www.nature.com/articles/nn.4216Gad1,GABAergic_InhibitoryNeurons,https://www.nature.com/articles/nn.4217Vip,VipGABAergicNeurons,https://www.nature.com/articles/nn.4218Sst,SstGABAergicNeurons,https://www.nature.com/articles/nn.4219Pvalb,PvalbGABAergicNeurons,https://www.nature.com/articles/nn.4220Slc17a7,GlutamatergicNeurons,https://www.nature.com/articles/nn.4221Rorb,GlutamatergicNeuronsL4L5a,https://www.nature.com/articles/nn.4222Foxp2,GlutamatergicNeuronsL6,https://www.nature.com/articles/nn.4223Aqp4,Astrocytes,https://www.nature.com/articles/nn.4224Pdgfra,OligodendrocytePrecursors,https://www.nature.com/articles/nn.4225Mog,Oligodendrocytes,https://www.nature.com/articles/nn.4226Itgam,Microglia,https://www.nature.com/articles/nn.4227Flt1,Endothelial,https://www.nature.com/articles/nn.4228Bgn,SMC,https://www.nature.com/articles/nn.4229Myl9,SMC,https://www.nature.com/articles/nn.4230Crip1,SMC,https://www.nature.com/articles/nn.4231Vtn,SMC,https://www.nature.com/articles/nn.4232Id1,Endothelial,https://www.nature.com/articles/nn.4233Xdh,Endothelial,https://www.nature.com/articles/nn.4234Tbc1d4,Endothelial,https://www.nature.com/articles/nn.4235Exosc7,Endothelial,https://www.nature.com/articles/nn.4236Ctss,Microglia,https://www.nature.com/articles/nn.4237Cx3cr1,Microglia,https://www.nature.com/articles/nn.4238C1qb,Microglia,https://www.nature.com/articles/nn.4239Gpr34,Microglia,https://www.nature.com/articles/nn.4240Opalin,Oligodendrocytes,https://www.nature.com/articles/nn.4241Mag,Oligodendrocytes,https://www.nature.com/articles/nn.4242Mbp,Oligodendrocytes,https://www.nature.com/articles/nn.4243Cspg4,OligodendrocytePrecursors,https://www.nature.com/articles/nn.4244Pcdh15,OligodendrocytePrecursors,https://www.nature.com/articles/nn.4245Gria3,OligodendrocytePrecursors,https://www.nature.com/articles/nn.4246Cxcl14,Astrocytes,https://www.nature.com/articles/nn.4247Gja1,Astrocytes,https://www.nature.com/articles/nn.4248F3,Astrocytes,https://www.nature.com/articles/nn.4249Sox9,Astrocytes,https://www.nature.com/articles/nn.4250Stmn2,Neurons,10xGenomicsChromiumWPHes1,Glial,10xGenomicsChromiumWPAldoc,Astrocytes,10xGenomicsChromiumWPOligo1,Oligodendrocytes,10xGenomicsChromiumWP


--------------------------------------------------------------------------------
/variance.Rmd:
--------------------------------------------------------------------------------
 1 | # Detecting highly variable genes
 2 | 
 3 | ```{r, echo=FALSE, results="hide"}
 4 | knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
 5 | ```
 6 | 
 7 | ```{r, echo=FALSE, results="hide"}
 8 | library(BiocStyle)
 9 | library(scran)
10 | sce <- readRDS("objects/sce.rds")
11 | 
12 | library(DelayedArray)
13 | setAutoBlockSize(100*100*8) # Ensure we load one chunk's worth of rows.
14 | ```
15 | 
16 | We compute the variance of the normalized log-expression values while blocking on the library of origin, and we fit a trend to it.
17 | Some fiddling with the trend parameters is necessary to obtain an appropriate fit at high abundances.
18 | 
19 | ```{r}
20 | system.time({
21 |     fit <- trendVar(sce, method="loess", parametric=TRUE, 
22 |         design=sce$Library, use.spikes=FALSE, BPPARAM=BPPARAM,
23 |         loess.args=list(span=0.05, control=loess.control(iterations=100)))
24 | })
25 | ```
26 | 
27 | For comparison, we fit a trend corresponding to pure Poisson noise.
28 | 
29 | ```{r}
30 | system.time({
31 |     poistrend <- makeTechTrend(
32 |         means=2^seq(0, max(fit$mean), length.out=100) - 1,
33 |         size.factors=sizeFactors(sce),
34 |         approx.npts=10000, BPPARAM=BPPARAM)
35 | })
36 | ```
37 | 
38 | We decompose the biological and technical component for each gene.
39 | Note that we use the Poisson trend here, as the trend fitted to the endogenous variances does not provide a good estimate of the technical noise for UMI data.
40 | 
41 | ```{r}
42 | fit0 <- fit
43 | fit0$trend <- poistrend
44 | dec <- decomposeVar(fit=fit0)
45 | dec <- cbind(rowData(sce)[,1:2], dec)
46 | dec <- dec[order(dec$p.value, -dec$bio),]
47 | head(dec)
48 | ```
49 | 
50 | We examine the mean-variance relationship.
51 | 
52 | ```{r hvgplot}
53 | plot(fit$mean, fit$var, pch=16, cex=0.5, xlab="Mean of log-expression", 
54 |     ylab="Variance of log-expression")
55 | curve(fit$trend(x), add=TRUE, col="red")
56 | curve(poistrend(x), add=TRUE, col="blue")
57 | ```
58 | 
59 | Finally we save the results to file.
60 | 
61 | ```{r}
62 | write.table(file="objects/hvg_output.txt", dec, sep="\t", quote=FALSE, row.names=FALSE)
63 | ```
64 | 
65 | <!--
66 | Also saving the trend.
67 | 
68 | ```{r}
69 | saveRDS(file="objects/trendfit.rds", fit)
70 | ```
71 | -->
72 | 


--------------------------------------------------------------------------------