├── Figure1.png
├── README.md
├── Step0.png
├── Step1.png
├── Step2.png
├── Step3.png
├── Step4.png
└── code
├── 01_Filter_Cells_v2.R
├── 02_Get_Peak_Set_hg19_v2.R
├── 03_Run_chromVAR_v2.R
├── 04_Run_Cicero_v2.R
├── 05_Cluster_Unique_Peaks_v2.R
├── 06_Analyze_UMAP_Trajectory.R
├── 07_ChromVAR_For_GWAS_w_CoAccessbility_v2.R
└── 08_Run_scCNV_v2.R
/Figure1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Figure1.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Massively parallel single-cell chromatin landscapes of human immune cell development and intratumoral T cell exhaustion. Nature Biotechnology (Satpathy*, Granja* et al. 2019)
2 |
3 | ## **Link** : https://www.nature.com/articles/s41587-019-0206-z
4 |
5 | ## Please cite: Satpathy*, Granja*, et al., Massively parallel single-cell chromatin landscapes of human immune cell development and intratumoral T cell exhaustion. Nature Biotechnology (2019)
6 |
7 | ## For current, up-to-date scATAC-seq analysis, see https://github.com/GreenleafLab/ArchR
8 |
9 | 
10 |
11 | # Downsampled test data for PBMCs is available (~500 MB)
12 |
13 | https://jeffgranja.s3.amazonaws.com/10x-scATAC-share/10x-scATAC-Downsampled-PBMC-hg19-data.zip
14 |
15 | # Links To Supplementary Data
16 |
17 | ## Notes
18 |
19 | **.rds** files are serialized R objects; read them into R with `readRDS(filename)`
20 |
21 | **SummarizedExperiment** is an R class; see:
https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html
22 |
23 | **deviations** (chromVAR TF deviations) is an R class; see:
https://bioconductor.org/packages/release/bioc/html/chromVAR.html
24 |
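For example, to load one of the objects below and take a quick look (a minimal sketch: the file names are the scATAC-seq Hematopoiesis and chromVAR objects linked under the next heading, assumed to have been downloaded to the working directory):

```r
library(SummarizedExperiment)

#scATAC SummarizedExperiment: peak x cell counts plus per-cell metadata
se <- readRDS("scATAC_Heme_All_SummarizedExperiment.final.rds")
assayNames(se)      #available assays
rowRanges(se)       #peaks as a GRanges
head(colData(se))   #per-cell metadata

#chromVAR SummarizedExperiment: use the chromVAR accessors for TF deviations
library(chromVAR)
dev <- readRDS("chromVAR_Heme_All_SummarizedExperiment.final.rds")
z <- deviationScores(dev)   #motif x cell bias-corrected deviation z-scores
```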
25 | ## scATAC-seq Hematopoiesis
26 |
27 | **scATAC Summarized Experiment** :
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/scATAC_Heme_All_SummarizedExperiment.final.rds
28 |
29 | **chromVAR Summarized Experiment** :
30 |
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/chromVAR_Heme_All_SummarizedExperiment.final.rds
31 |
32 | **Cicero Log2 Gene Activity Scores Summarized Experiment** :
33 |
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/Log2_Gene_Activity_Heme_All_SummarizedExperiment.final.rds
34 |
35 | ## scATAC-seq CD34 Hematopoiesis
36 |
37 | **scATAC Summarized Experiment** :
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/scATAC_CD34_BM_SummarizedExperiment.final.rds
38 |
39 | ## scATAC-seq BCC Tumor Microenvironment
40 |
41 | **scATAC Summarized Experiment** :
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/scATAC_TME_All_SummarizedExperiment.final.rds
42 |
43 | **chromVAR Summarized Experiment** :
44 |
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/chromVAR_TME_All_SummarizedExperiment.final.rds
45 |
46 | **Cicero Log2 Gene Activity Scores Summarized Experiment** :
47 |
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/Log2_Gene_Activity_TME_All_SummarizedExperiment.final.rds
48 |
49 | ## scATAC-seq BCC Tcells (Exhaustion)
50 |
51 | **scATAC Summarized Experiment** :
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/scATAC_TME_TCells_SummarizedExperiment.final.rds
52 |
53 | To get the T cell cluster assignments, use `colData(se)$T_Cell_Cluster`.
54 |
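For example (a minimal sketch, assuming the object above has been downloaded to the working directory):

```r
library(SummarizedExperiment)

se <- readRDS("scATAC_TME_TCells_SummarizedExperiment.final.rds")
table(colData(se)$T_Cell_Cluster)   #number of cells per T cell cluster
```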
55 | # Getting 10x scATAC-seq Bam Files
56 |
57 | ## 1. Go to the NIH GEO page: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE129785&holding=F1000&otool=stanford
58 |
59 | 
60 |
61 | ## 2. Click on a sample (I am showing SU001_Tumor_Immune_Post).
62 |
63 | 
64 |
65 | ## 3. Navigate to the bottom and click on the SRA link.
66 |
67 | 
68 |
69 | ## 4. Navigate down and find the run (SRR accession).
70 |
71 | 
72 |
73 | ## 5. Click on the "Data Access" tab and then navigate to "Original Format".
74 |
75 | 
76 |
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
/Step0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Step0.png
--------------------------------------------------------------------------------
/Step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Step1.png
--------------------------------------------------------------------------------
/Step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Step2.png
--------------------------------------------------------------------------------
/Step3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Step3.png
--------------------------------------------------------------------------------
/Step4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Step4.png
--------------------------------------------------------------------------------
/code/01_Filter_Cells_v2.R:
--------------------------------------------------------------------------------
1 | #Filtering Cells based on TSS enrichment and unique fragments
2 | #05/02/19
3 | #Cite Satpathy*, Granja*, et al.
4 | #Massively parallel single-cell chromatin landscapes of human immune
5 | #cell development and intratumoral T cell exhaustion (2019)
6 | #Created by Jeffrey Granja
7 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
8 | library(magrittr)
9 | library(ggplot2)
10 | library(Rcpp)
11 | library(viridis)
12 |
13 | #--------------------------------------------
14 | # Functions
15 | #--------------------------------------------
16 |
17 | sourceCpp(code='
18 | #include <Rcpp.h>
19 |
20 | using namespace Rcpp;
21 | using namespace std;
22 |
23 | // [[Rcpp::export]]
24 | IntegerMatrix tabulate2dCpp(IntegerVector x1, int xmin, int xmax, IntegerVector y1, int ymin, int ymax){
25 | if(x1.size() != y1.size()){
26 | stop("width must equal size!");
27 | }
28 | IntegerVector x = clone(x1);
29 | IntegerVector y = clone(y1);
30 | int n = x.size();
31 | IntegerVector rx = seq(xmin,xmax);
32 | IntegerVector ry = seq(ymin,ymax);
33 | IntegerMatrix mat( ry.size() , rx.size() );
34 | int xi,yi;
35 | for(int i = 0; i < n; i++){
36 | xi = (x[i] - xmin);
37 | yi = (y[i] - ymin);
38 | if(yi >= 0 && yi < ry.size()){
39 | if(xi >= 0 && xi < rx.size()){
40 | mat( yi , xi ) = mat( yi , xi ) + 1;
41 | }
42 | }
43 | }
44 | return mat;
45 | }'
46 | )
47 |
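#insertionProfileSingles computes, for every cell barcode (the "RG" column), the
#Tn5 insertion profile around a set of features (here TSSs): insertions are
#tabulated per base within +/- flank bp, normalized to the mean of the outermost
#`norm` positions of the window, smoothed with a rolling mean of width `smooth`,
#and summarized as a TSS enrichment score (maximum of the smoothed profile within
#+/- `range` bp of the feature center). Barcodes are processed in batches of
#`batchSize` to limit memory usage.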
48 | insertionProfileSingles <- function(feature, fragments, by = "RG", getInsertions = TRUE, fix = "center", flank = 2000, norm = 100, smooth = 51, range = 100, batchSize = 100){
49 |
50 | insertionProfileSingles_helper <- function(feature, fragments, by = "RG", getInsertions = TRUE, fix = "center", flank = 2000, norm = 100, smooth = 51, range = 100, batchSize = 100){
51 | #Convert To Insertion Sites
52 | if(getInsertions){
53 | insertions <- c(
54 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(start(fragments), start(fragments)), RG = mcols(fragments)[,by]),
55 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(end(fragments), end(fragments)), RG = mcols(fragments)[,by])
56 | )
57 | by <- "RG"
58 | }else{
59 | insertions <- fragments
60 | }
61 | remove(fragments)
62 | gc()
63 |
64 | #center the feature
65 | center <- unique(resize(feature, width = 1, fix = fix, ignore.strand = FALSE))
66 |
67 | #get overlaps between the feature and insertions only up to flank bp
68 | overlap <- DataFrame(findOverlaps(query = center, subject = insertions, maxgap = flank, ignore.strand = TRUE))
69 | overlap$strand <- strand(center)[overlap[,1]]
70 | overlap$name <- mcols(insertions)[overlap[,2],by]
71 | overlap <- transform(overlap, id=match(name, unique(name)))
72 | ids <- length(unique(overlap$name))
73 |
74 | #distance
75 | overlap$dist <- NA
76 | minus <- which(overlap$strand == "-")
77 | other <- which(overlap$strand != "-")
78 | overlap$dist[minus] <- start(center[overlap[minus,1]]) - start(insertions[overlap[minus,2]])
79 | overlap$dist[other] <- start(insertions[overlap[other,2]]) - start(center[overlap[other,1]])
80 |
81 | #Insertion Mat
82 | profile_mat <- tabulate2dCpp(x1 = overlap$id, y1 = overlap$dist, xmin = 1, xmax = ids, ymin = -flank, ymax = flank)
83 | colnames(profile_mat) <- unique(overlap$name)
84 | profile <- rowSums(profile_mat)
85 |
86 | #normalize
87 | profile_mat_norm <- apply(profile_mat, 2, function(x) x/max(mean(x[c(1:norm,(flank*2-norm+1):(flank*2+1))]), 0.5)) #Handles low depth cells
88 | profile_norm <- profile/mean(profile[c(1:norm,(flank*2-norm+1):(flank*2+1))])
89 |
90 | #smooth
91 | profile_mat_norm_smooth <- apply(profile_mat_norm, 2, function(x) zoo::rollmean(x, smooth, fill = 1))
92 | profile_norm_smooth <- zoo::rollmean(profile_norm, smooth, fill = 1)
93 |
94 | #enrichment
95 | max_finite <- function(x){
96 | suppressWarnings(max(x[is.finite(x)], na.rm=TRUE))
97 | }
98 | e_mat <- apply(profile_mat_norm_smooth, 2, function(x) max_finite(x[(flank-range):(flank+range)]))
99 | names(e_mat) <- colnames(profile_mat_norm_smooth)
100 | e <- max_finite(profile_norm_smooth[(flank-range):(flank+range)])
101 |
102 | #Summary
103 | df_mat <- data.frame(
104 | enrichment = e_mat,
105 | insertions = as.vector(table(mcols(insertions)[,by])[names(e_mat)]),
106 | insertionsWindow = as.vector(table(overlap$name)[names(e_mat)])
107 | )
108 | df_sum <- data.frame(bp = (-flank):flank, profile = profile, norm_profile = profile_norm, smooth_norm_profile = profile_norm_smooth, enrichment = e)
109 | rownames(df_sum) <- NULL
110 |
111 | return(list(df = df_sum, dfall = df_mat, profileMat = profile_mat_norm, profileMatSmooth = profile_mat_norm_smooth))
112 | }
113 |
114 | uniqueTags <- as.character(unique(mcols(fragments)[,by]))
115 | splitTags <- split(uniqueTags, ceiling(seq_along(uniqueTags)/batchSize))
116 |
117 | pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3)
118 | batchTSS <- lapply(seq_along(splitTags), function(x){
119 | setTxtProgressBar(pb, round(x * 100/length(splitTags), 0))
120 | profilex <- insertionProfileSingles_helper(
121 | feature=feature,
122 | fragments=fragments[which(mcols(fragments)[,by] %in% splitTags[[x]])],
123 | by = by,
124 | getInsertions = getInsertions,
125 | fix = fix,
126 | flank = flank,
127 | norm = norm,
128 | smooth = smooth,
129 | range = range
130 | )
131 |
132 | return(profilex)
133 | })
134 | df <- lapply(batchTSS, function(x) x$df) %>% Reduce("rbind",.)
135 | dfall <- lapply(batchTSS, function(x) x$dfall) %>% Reduce("rbind",.)
136 | profileMat <- lapply(batchTSS, function(x) x$profileMat) %>% Reduce("cbind",.)
137 | profileMatSmooth <- lapply(batchTSS, function(x) x$profileMatSmooth) %>% Reduce("cbind",.)
138 | return(list(df = df, dfall = dfall, profileMat = profileMat, profileMatSmooth = profileMatSmooth))
139 | }
140 |
141 | #--------------------------------------------
142 | # Input
143 | #--------------------------------------------
144 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
145 | minFrags <- 100 #minimum fragments per barcode to be included in the TSS profile
146 | filterFrags <- 1000 #minimum unique fragments to call a barcode a cell
147 | filterTSS <- 8 #minimum TSS enrichment score to call a barcode a cell
148 | file_fragments <- "data/PBMC_10x-Sub25M-fragments.tsv.gz"
149 | out_fragments <- "data/PBMC_10x-Sub25M-fragments.gr.rds"
150 | name <- "PBMC"
151 |
152 | #-----------------
153 | # Reading Fragment Files
154 | #-----------------
155 | message("Reading in fragment files...")
156 | fragments <- data.frame(readr::read_tsv(file_fragments, col_names=FALSE))
157 | #fragmentSub <- fragments[sample(seq_len(nrow(fragments)),25*10^6),]
158 | #write.table(fragmentSub, "data/PBMC_10x-Sub25M-fragments.tsv.gz", col.names=FALSE, row.names =FALSE, sep = "\t", quote = FALSE)
159 |
160 | fragments <- GRanges(
161 | seqnames = fragments[,1],
162 | IRanges(fragments[,2]+1, fragments[,3]),
163 | RG = fragments[,4],
164 | N = fragments[,5]
165 | )
166 |
167 | message("Filtering Lowly Represented Cells...")
168 | tabRG <- table(fragments$RG)
169 | keep <- names(tabRG)[which(tabRG >= minFrags)]
170 | fragments <- fragments[fragments$RG %in% keep,]
171 | fragments <- sort(sortSeqlevels(fragments))
172 |
173 | #-----------------
174 | # TSS Profile
175 | #-----------------
176 | feature <- txdb %>% transcripts(.) %>% resize(., width = 1, fix = "start") %>% unique
177 | tssProfile <- insertionProfileSingles(feature = feature, fragments = fragments,
178 | getInsertions = TRUE, batchSize = 1000)
179 | tssSingles <- tssProfile$dfall
180 | tssSingles$uniqueFrags <- 0
181 | tssSingles[names(tabRG),"uniqueFrags"] <- tabRG
182 | tssSingles$cellCall <- 0
183 | tssSingles$cellCall[tssSingles$uniqueFrags >= filterFrags & tssSingles$enrichment >= filterTSS] <- 1
184 |
185 | #-----------------
186 | # Plot Stats
187 | #-----------------
188 | tssSingles <- tssSingles[complete.cases(tssSingles),]
189 | nPass <- sum(tssSingles$cellCall==1)
190 | nTotal <- sum(tssSingles$uniqueFrags >= filterFrags)
191 |
192 | pdf("results/Filter-Cells.pdf")
193 | ggplot(tssSingles[tssSingles$uniqueFrags > 500,], aes(x = log10(uniqueFrags), y = enrichment)) +
194 | geom_hex(bins = 100) +
195 | theme_bw() + scale_fill_viridis() +
196 | xlab("log10 Unique Fragments") +
197 | ylab("TSS Enrichment") +
198 | geom_hline(yintercept = filterTSS, lty = "dashed") +
199 | geom_vline(xintercept = log10(filterFrags), lty = "dashed") +
200 | ggtitle(sprintf("Pass Rate : %s of %s (%s)", nPass, nTotal, round(100*nPass/nTotal,2)))
201 | dev.off()
202 |
203 | write.table(tssSingles, "results/Filter-Cells.txt")
204 |
205 | #Filter
206 | fragments <- fragments[mcols(fragments)$RG %in% rownames(tssSingles)[tssSingles$cellCall==1]]
207 | fragments$RG <- paste0(name,"#",fragments$RG)
208 |
209 | #Save
210 | saveRDS(fragments, out_fragments)
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
--------------------------------------------------------------------------------
/code/02_Get_Peak_Set_hg19_v2.R:
--------------------------------------------------------------------------------
1 | #Creating a peak set, summarized experiment and LSI clustering
2 | #05/02/19
3 | #Cite Satpathy*, Granja*, et al.
4 | #Massively parallel single-cell chromatin landscapes of human immune
5 | #cell development and intratumoral T cell exhaustion (2019)
6 | #Created by Jeffrey Granja
7 | library(Matrix)
8 | library(SummarizedExperiment)
9 | library(matrixStats)
10 | library(readr)
11 | library(GenomicRanges)
12 | library(magrittr)
13 | library(edgeR)
14 | library(Seurat)
15 | library(BSgenome.Hsapiens.UCSC.hg19)
16 | set.seed(1)
17 |
18 | countInsertions <- function(query, fragments, by = "RG"){
19 | #Count By Fragments Insertions
20 | inserts <- c(
21 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(start(fragments), start(fragments)), RG = mcols(fragments)[,by]),
22 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(end(fragments), end(fragments)), RG = mcols(fragments)[,by])
23 | )
24 | by <- "RG"
25 | overlapDF <- DataFrame(findOverlaps(query, inserts, ignore.strand = TRUE, maxgap=-1L, minoverlap=0L, type = "any"))
26 | overlapDF$name <- mcols(inserts)[overlapDF[, 2], by]
27 | overlapTDF <- transform(overlapDF, id = match(name, unique(name)))
28 | #Calculate Overlap Stats
29 | inPeaks <- table(overlapDF$name)
30 | total <- table(mcols(inserts)[, by])
31 | total <- total[names(inPeaks)]
32 | frip <- inPeaks / total
33 | #Summarize
34 | sparseM <- Matrix::sparseMatrix(
35 | i = overlapTDF[, 1],
36 | j = overlapTDF[, 4],
37 | x = rep(1, nrow(overlapTDF)),
38 | dims = c(length(query), length(unique(overlapDF$name))))
39 | colnames(sparseM) <- unique(overlapDF$name)
40 | total <- total[colnames(sparseM)]
41 | frip <- frip[colnames(sparseM)]
42 | out <- list(counts = sparseM, frip = frip, total = total)
43 | return(out)
44 | }
45 |
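#seuratLSI performs latent semantic indexing on the (optionally binarized)
#peak/window x cell matrix: term frequency (column-normalized counts) is scaled
#by the inverse document frequency log(1 + ncells/rowSums), the TF-IDF matrix is
#factorized by SVD with irlba, and the resulting cell embedding is stored in the
#"pca" slot of a Seurat (v2) object so that FindClusters/RunUMAP can be used
#downstream.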
46 | seuratLSI <- function(mat, nComponents = 50, binarize = TRUE, nFeatures = NULL){
47 | #TF IDF LSI adapted from flyATAC
48 | cs <- Matrix::colSums(mat)
49 | if(binarize){
50 | message(paste0("Binarizing matrix..."))
51 | mat@x[mat@x > 0] <- 1
52 | }
53 | if(!is.null(nFeatures)){
54 | message(paste0("Getting top ", nFeatures, " features..."))
55 | mat <- mat[head(order(Matrix::rowSums(mat),decreasing = TRUE),nFeatures),]
56 | }
57 | #Calc TF IDF
58 | message("Computing Term Frequency IDF...")
59 | freqs <- t(t(mat)/Matrix::colSums(mat))
60 | idf <- as(log(1 + ncol(mat) / Matrix::rowSums(mat)), "sparseVector")
61 | tfidf <- as(Matrix::Diagonal(x=as.vector(idf)), "sparseMatrix") %*% freqs
62 | #Calc SVD then LSI
63 | message("Computing SVD using irlba...")
64 | svd <- irlba::irlba(tfidf, nComponents, nComponents)
65 | svdDiag <- matrix(0, nrow=nComponents, ncol=nComponents)
66 | diag(svdDiag) <- svd$d
67 | matSVD <- t(svdDiag %*% t(svd$v))
68 | rownames(matSVD) <- colnames(mat)
69 | colnames(matSVD) <- paste0("PC",seq_len(ncol(matSVD)))
70 | #Make Seurat Object
71 | message("Making Seurat Object...")
72 | mat <- mat[1:100,] + 1 #small dummy matrix; the Seurat object only needs to carry the LSI embedding set below
73 | obj <- CreateSeuratObject(mat, project='scATAC', min.cells=0, min.genes=0)
74 | obj <- SetDimReduction(object = obj, reduction.type = "pca", slot = "cell.embeddings", new.data = matSVD)
75 | obj <- SetDimReduction(object = obj, reduction.type = "pca", slot = "key", new.data = "PC")
76 | return(obj)
77 | }
78 |
79 | addClusters <- function(obj, minGroupSize = 50, dims.use = seq_len(50), initialResolution = 0.8){
80 | #First Iteration of Find Clusters
81 | currentResolution <- initialResolution
82 | obj <- FindClusters(object = obj, reduction.type = "pca", dims.use = dims.use, resolution = currentResolution, print.output = FALSE)
83 | minSize <- min(table(obj@meta.data[[paste0("res.",currentResolution)]]))
84 | nClust <- length(unique(paste0(obj@meta.data[[paste0("res.",currentResolution)]])))
85 | message(sprintf("Current Resolution = %s, No of Clusters = %s, Minimum Cluster Size = %s", currentResolution, nClust, minSize))
86 | #If clusters are smaller than minimum group size
87 | while(minSize <= minGroupSize){
88 | obj@meta.data <- obj@meta.data[,-which(colnames(obj@meta.data)==paste0("res.",currentResolution))]
89 | currentResolution <- currentResolution*initialResolution
90 | obj <- FindClusters(object = obj, reduction.type = "pca", dims.use = dims.use, resolution = currentResolution, print.output = FALSE, force.recalc = TRUE)
91 | minSize <- min(table(obj@meta.data[[paste0("res.",currentResolution)]]))
92 | nClust <- length(unique(paste0(obj@meta.data[[paste0("res.",currentResolution)]])))
93 | message(sprintf("Current Resolution = %s, No of Clusters = %s, Minimum Cluster Size = %s", currentResolution, nClust, minSize))
94 | }
95 | return(obj)
96 | }
97 |
98 | extendedPeakSet <- function(df, BSgenome = NULL, extend = 250, blacklist = NULL, nSummits = 100000){
99 | #Helper Functions
100 | readSummits <- function(file){
101 | df <- suppressMessages(data.frame(readr::read_tsv(file, col_names = c("chr","start","end","name","score"))))
102 | df <- df[,c(1,2,3,5)] #do not keep name column it can make the size really large
103 | return(GenomicRanges::makeGRangesFromDataFrame(df=df,keep.extra.columns = TRUE,starts.in.df.are.0based = TRUE))
104 | }
105 | nonOverlappingGRanges <- function(gr, by = "score", decreasing = TRUE, verbose = FALSE){
106 | stopifnot(by %in% colnames(mcols(gr)))
107 | clusterGRanges <- function(gr, filter = TRUE, by = "score", decreasing = TRUE){
108 | gr <- sort(sortSeqlevels(gr))
109 | r <- GenomicRanges::reduce(gr, min.gapwidth=0L, ignore.strand=TRUE)
110 | o <- findOverlaps(gr,r)
111 | mcols(gr)$cluster <- subjectHits(o)
112 | gr <- gr[order(mcols(gr)[,by], decreasing = decreasing),]
113 | gr <- gr[!duplicated(mcols(gr)$cluster),]
114 | gr <- sort(sortSeqlevels(gr))
115 | mcols(gr)$cluster <- NULL
116 | return(gr)
117 | }
118 | if(verbose){
119 | message("Converging", appendLF = FALSE)
120 | }
121 | i <- 0
122 | gr_converge <- gr
123 | while(length(gr_converge) > 0){
124 | if(verbose){
125 | message(".", appendLF = FALSE)
126 | }
127 | i <- i + 1
128 | gr_selected <- clusterGRanges(gr = gr_converge, filter = TRUE, by = by, decreasing = decreasing)
129 | gr_converge <- subsetByOverlaps(gr_converge ,gr_selected, invert=TRUE) #blacklist selected gr
130 | if(i == 1){ #if i=1 then set gr_all to clustered
131 | gr_all <- gr_selected
132 | }else{
133 | gr_all <- c(gr_all, gr_selected)
134 | }
135 | }
136 | if(verbose){
137 | message("\nSelected ", length(gr_all), " from ", length(gr))
138 | }
139 | gr_all <- sort(sortSeqlevels(gr_all))
140 | return(gr_all)
141 | }
142 | #Check-------
143 | stopifnot(extend > 0)
144 | stopifnot("samples" %in% colnames(df))
145 | stopifnot("groups" %in% colnames(df))
146 | stopifnot("summits" %in% colnames(df))
147 | stopifnot(!is.null(BSgenome))
148 | stopifnot(all(apply(df,1,function(x){file.exists(paste0(x[3]))})))
149 | #------------
150 | #Deal with blacklist
151 | if(is.null(blacklist)){
152 | blacklist <- GRanges()
153 | }else if(is.character(blacklist)){
154 | blacklist <- rtracklayer::import.bed(blacklist)
155 | }
156 | stopifnot(inherits(blacklist,"GenomicRanges"))
157 | #------------
158 | #Time to do stuff
159 | chromSizes <- GRanges(names(seqlengths(BSgenome)), IRanges(1, seqlengths(BSgenome)))
160 | chromSizes <- GenomeInfoDb::keepStandardChromosomes(chromSizes, pruning.mode = "coarse")
161 | groups <- unique(df$groups)
162 | groupGRList <- GenomicRanges::GenomicRangesList(lapply(seq_along(groups), function(i){
163 | df_group = df[which(df$groups==groups[i]),]
164 | grList <- GenomicRanges::GenomicRangesList(lapply(paste0(df_group$summits), function(x){
165 | extended_summits <- readSummits(x) %>%
166 | resize(., width = 2 * extend + 1, fix = "center") %>%
167 | subsetByOverlaps(.,chromSizes,type="within") %>%
168 | subsetByOverlaps(.,blacklist,invert=TRUE) %>%
169 | nonOverlappingGRanges(., by="score", decreasing=TRUE)
170 | extended_summits <- extended_summits[order(extended_summits$score,decreasing=TRUE)]
171 | if(!is.null(nSummits)){
172 | extended_summits <- head(extended_summits, nSummits)
173 | }
174 | mcols(extended_summits)$scoreQuantile <- trunc(rank(mcols(extended_summits)$score))/length(mcols(extended_summits)$score)
175 | extended_summits
176 | }))
177 | #Non Overlapping
178 | grNonOverlapping <- nonOverlappingGRanges(unlist(grList), by = "scoreQuantile", decreasing = TRUE)
179 | #Free Up Memory
180 | remove(grList)
181 | gc()
182 | grNonOverlapping
183 | }))
184 | grFinal <- nonOverlappingGRanges(unlist(groupGRList), by = "scoreQuantile", decreasing = TRUE)
185 | grFinal <- sort(sortSeqlevels(grFinal))
186 | return(grFinal)
187 | }
188 |
189 | groupSums <- function(mat, groups = NULL, na.rm = TRUE, sparse = FALSE){
190 | stopifnot(!is.null(groups))
191 | stopifnot(length(groups) == ncol(mat))
192 | gm <- lapply(unique(groups), function(x) {
193 | if (sparse) {
194 | Matrix::rowSums(mat[, which(groups == x), drop = F], na.rm = na.rm)
195 | }else {
196 | rowSums(mat[, which(groups == x), drop = F], na.rm = na.rm)
197 | }
198 | }) %>% Reduce("cbind", .)
199 | colnames(gm) <- unique(groups)
200 | return(gm)
201 | }
202 |
203 | #-------------------------------------------------------------------------------------------------
204 | # Start
205 | #-------------------------------------------------------------------------------------------------
206 | fragmentFiles <- list.files("data", pattern = ".rds", full.names = TRUE)
207 |
208 | #-------------------------------------------------------------------------------------------------
209 | # Get Counts In Windows
210 | #-------------------------------------------------------------------------------------------------
211 | genome <- BSgenome.Hsapiens.UCSC.hg19
212 | chromSizes <- GRanges(names(seqlengths(genome)), IRanges(1, seqlengths(genome)))
213 | chromSizes <- GenomeInfoDb::keepStandardChromosomes(chromSizes, pruning.mode = "coarse")
214 | windows <- unlist(tile(chromSizes, width = 2500))
215 | countsList <- lapply(seq_along(fragmentFiles), function(i){
216 | message(sprintf("%s of %s", i, length(fragmentFiles)))
217 | counts <- countInsertions(windows, readRDS(fragmentFiles[i]), by = "RG")[[1]]
218 | counts
219 | })
220 | mat <- Reduce("cbind", countsList)
221 | remove(countsList)
222 | gc()
223 |
224 | #-------------------------------------------------------------------------------------------------
225 | # Run LSI Clustering with Seurat
226 | #-------------------------------------------------------------------------------------------------
227 | set.seed(1)
228 | message("Making Seurat LSI Object...")
229 | obj <- seuratLSI(mat, nComponents = 25, nFeatures = 20000)
230 | message("Adding Graph Clusters...")
231 | obj <- addClusters(obj, dims.use = 2:25, minGroupSize = 200, initialResolution = 0.8)
232 | saveRDS(obj, "results/Save-LSI-Windows-Seurat.rds")
233 | clusterResults <- split(rownames(obj@meta.data), paste0("Cluster",obj@meta.data[,ncol(obj@meta.data)]))
234 | remove(obj)
235 | gc()
236 |
237 | #-------------------------------------------------------------------------------------------------
238 | # Get Cluster Beds
239 | #-------------------------------------------------------------------------------------------------
240 | dirClusters <- "results/LSI-Cluster-Beds/"
241 | dir.create(dirClusters)
242 | for(i in seq_along(fragmentFiles)){
243 | fragments <- readRDS(fragmentFiles[i])
244 | for(j in seq_along(clusterResults)){
245 | message(sprintf("%s of %s", j, length(clusterResults)))
246 | fragmentsj <- fragments[fragments$RG %in% clusterResults[[j]]]
247 | if(length(fragmentsj) > 0){
248 | out <- data.frame(
249 | chr = c(seqnames(fragmentsj), seqnames(fragmentsj)),
250 | start = c(as.integer(start(fragmentsj) - 1), as.integer(end(fragmentsj) - 1)),
251 | end = c(as.integer(start(fragmentsj)), as.integer(end(fragmentsj)))
252 | ) %>% readr::write_tsv(
253 | x = .,
254 | append = TRUE,
255 | path = paste0(dirClusters, paste0(names(clusterResults)[j], ".bed")),
256 | col_names = FALSE)
257 | }
258 | }
259 | }
260 |
261 | #-------------------------------------------------------------------------------------------------
262 | # Run MACS2
263 | #-------------------------------------------------------------------------------------------------
264 | dirPeaks <- "results/LSI-Cluster-Peaks/"
265 | method <- "q"
266 | cutoff <- 0.05
267 | shift <- -75
268 | extsize <- 150
269 | genome_size <- 2.7e9
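#Call peaks on each cluster's pseudo-bulk BED of Tn5 insertion sites with MACS2.
#With --nomodel, --shift -75 and --extsize 150 center a 150 bp window on each
#insertion; --call-summits reports summits, which are extended into the
#fixed-width union peak set below.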
270 | for(j in seq_along(clusterResults)){
271 | message(sprintf("%s of %s", j, length(clusterResults)))
272 | clusterBedj <- paste0(dirClusters,names(clusterResults)[j],".bed")
273 | cmdPeaks <- sprintf(
274 | "macs2 callpeak -g %s --name %s --treatment %s --outdir %s --format BED --nomodel --call-summits --nolambda --keep-dup all",
275 | genome_size,
276 | names(clusterResults)[j],
277 | clusterBedj,
278 | dirPeaks
279 | )
280 | if (!is.null(shift) & !is.null(extsize)) {
281 | cmdPeaks <- sprintf("%s --shift %s --extsize %s", cmdPeaks, shift, extsize)
282 | }
283 | if (tolower(method) == "p") {
284 | cmdPeaks <- sprintf("%s -p %s", cmdPeaks, cutoff)
285 | }else {
286 | cmdPeaks <- sprintf("%s -q %s", cmdPeaks, cutoff)
287 | }
288 | message("Running Macs2...")
289 | message(cmdPeaks)
290 | system(cmdPeaks, intern = TRUE)
291 | }
292 |
293 | #-------------------------------------------------------------------------------------------------
294 | # Make Non-Overlapping Peak Set
295 | #-------------------------------------------------------------------------------------------------
296 | df <- data.frame(
297 | samples = gsub("\\_summits.bed","",list.files(dirPeaks, pattern = "\\_summits.bed", full.names = FALSE)),
298 | groups = "scATAC",
299 | summits = list.files(dirPeaks, pattern = "\\_summits.bed", full.names = TRUE)
300 | )
301 |
302 | unionPeaks <- extendedPeakSet(
303 | df = df,
304 | BSgenome = genome,
305 | extend = 250,
306 | blacklist = "data/hg19.blacklist.bed",
307 | nSummits = 200000
308 | )
309 | unionPeaks <- unionPeaks[seqnames(unionPeaks) %in% paste0("chr",c(1:22,"X"))]
310 | unionPeaks <- keepSeqlevels(unionPeaks, paste0("chr",c(1:22,"X")))
311 |
312 | #Create Counts list
313 | countsPeaksList <- lapply(seq_along(fragmentFiles), function(i){
314 | message(sprintf("%s of %s", i, length(fragmentFiles)))
315 | gc()
316 | countInsertions(unionPeaks, readRDS(fragmentFiles[i]), by = "RG")
317 | })
318 |
319 | #CountsMatrix
320 | mat <- lapply(countsPeaksList, function(x) x[[1]]) %>% Reduce("cbind",.)
321 | frip <- lapply(countsPeaksList, function(x) x[[2]]) %>% unlist
322 | total <- lapply(countsPeaksList, function(x) x[[3]]) %>% unlist
323 |
324 | se <- SummarizedExperiment(
325 | assays = SimpleList(counts = mat),
326 | rowRanges = unionPeaks
327 | )
328 | rownames(se) <- paste(seqnames(se),start(se),end(se),sep="_")
329 | colData(se)$FRIP <- frip
330 | colData(se)$uniqueFrags <- total / 2
331 |
332 | #----------------------------
333 | # Get Clusters in Peaks
334 | #----------------------------
335 | nTop <- 25000
336 | nPCs1 <- 1:50
337 | nPCs2 <- 1:50
338 |
339 | message("Making Seurat LSI Object...")
340 | obj <- seuratLSI(assay(se), nComponents = max(nPCs1), nFeatures = NULL)
341 | stopifnot(identical(rownames(obj@meta.data), colnames(se)))
342 | obj@meta.data <- as.data.frame(cbind(obj@meta.data, colData(se)))
343 |
344 | message("Adding Graph Clusters...")
345 | obj <- FindClusters(object = obj, reduction.type = "pca", dims.use = nPCs1, print.output = TRUE, n.start = 10)
346 |
347 | #Make Pseudo Bulk Library
348 | mat <- assay(se)
349 | mat@x[mat@x > 0] <- 1
350 | clusterSums <- groupSums(mat = mat, groups = paste0("C",obj@meta.data$res.0.8), sparse = TRUE)
351 | logMat <- edgeR::cpm(clusterSums, log = TRUE, prior.count = 3)
352 | varPeaks <- head(order(matrixStats::rowVars(logMat), decreasing = TRUE), nTop)
353 |
354 | #Re-run Seurat LSI
355 | message("Making Seurat LSI Object...")
356 | obj2 <- seuratLSI(assay(se)[varPeaks,], nComponents = max(nPCs2), nFeatures = NULL)
357 | stopifnot(identical(rownames(obj2@meta.data), colnames(se)))
358 | obj2@meta.data <- as.data.frame(cbind(obj2@meta.data, colData(se)))
359 |
360 | message("Adding Graph Clusters...")
361 | obj2 <- FindClusters(object = obj2, reduction.type = "pca", dims.use = nPCs2, print.output = TRUE, n.start = 10)
362 |
363 | #Plot uMAP
364 | message("Running UMAP")
365 | obj2 <- RunUMAP(object = obj2, reduction.use = "pca", dims.use = nPCs2)
366 | plotUMAP <- data.frame(GetCellEmbeddings(obj2,reduction.type="umap"), obj2@meta.data)
367 | colnames(plotUMAP) <- c("x","y",colnames(plotUMAP)[3:ncol(plotUMAP)])
368 | clustCol <- colnames(plotUMAP)[grep("res",colnames(plotUMAP))]
369 | colData(se)$Clusters <- paste0("Cluster",as.integer(plotUMAP[,clustCol]) + 1)
370 | colData(se)$UMAP1 <- plotUMAP$x
371 | colData(se)$UMAP2 <- plotUMAP$y
372 |
373 | pdf("results/LSI-Clustering-Peaks.pdf")
374 | ggplot(plotUMAP, aes(x=x,y=y,color=res.0.8)) + geom_point(size = 0.5) +
375 | theme_bw() + xlab("UMAP1") + ylab("UMAP2")
376 | dev.off()
377 |
378 | saveRDS(se, "results/scATAC-Summarized-Experiment.rds")
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
--------------------------------------------------------------------------------
/code/03_Run_chromVAR_v2.R:
--------------------------------------------------------------------------------
1 | #Running chromVAR on single-cell summarized experiment
2 | #05/02/19
3 | #Cite Satpathy*, Granja*, et al.
4 | #Massively parallel single-cell chromatin landscapes of human immune
5 | #cell development and intratumoral T cell exhaustion (2019)
6 | #Created by Jeffrey Granja
7 | library(chromVAR)
8 | library(SummarizedExperiment)
9 | library(chromVARmotifs)
10 | library(motifmatchr)
11 | library(BiocParallel)
12 | library(BSgenome.Hsapiens.UCSC.hg19)
13 | register(SerialParam())
14 | set.seed(1)
15 |
16 | #-----------------
17 | # Read Inputs
18 | #-----------------
19 | genome <- BSgenome.Hsapiens.UCSC.hg19
20 | se <- readRDS("results/scATAC-Summarized-Experiment.rds") #single-cell SummarizedExperiment with peaks as rowRanges (from 02_Get_Peak_Set_hg19_v2.R)
21 | se <- addGCBias(se, genome = genome)
22 | data("human_pwms_v1")
23 | matches <- matchMotifs(human_pwms_v1, rowRanges(se), genome = "BSgenome.Hsapiens.UCSC.hg19")
24 |
25 | #compute deviations
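#computeDeviations contrasts, per cell, the accessibility of peaks containing a
#given motif against background peaks matched for GC content and mean
#accessibility (using the GC bias added above), returning bias-corrected
#deviations and deviation z-scores per motif and cell.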
26 | dev <- computeDeviations(object = se, annotations = matches)
27 |
28 | #compute variability
29 | metadata(dev)$Variability <- computeVariability(dev)
30 |
31 | #add se
32 | metadata(dev)$SummarizedExperiment <- se
33 |
34 | #add matches
35 | metadata(dev)$motifMatches <- matches
36 |
37 | saveRDS(dev, "results/chromVAR-Summarized-Experiment.rds")
38 |
--------------------------------------------------------------------------------
/code/04_Run_Cicero_v2.R:
--------------------------------------------------------------------------------
1 | #Computing Gene Activity scores using Cicero and Co-Accessibility
2 | #05/02/19
3 | #Cite Satpathy*, Granja*, et al.
4 | #Massively parallel single-cell chromatin landscapes of human immune
5 | #cell development and intratumoral T cell exhaustion (2019)
6 | #Created by Jeffrey Granja
7 | library(cicero)
8 | library(data.table)
9 | library(Matrix)
10 | library(GenomicRanges)
11 | library(magrittr)
12 | library(SummarizedExperiment)
13 | library(optparse)
14 | library(yaml)
15 | library(Rcpp)
16 | set.seed(1)
17 |
18 | grToFeature <- function(gr){
19 | peakinfo <- data.frame(
20 | row.names = paste(seqnames(gr),start(gr),end(gr),sep="_"),
21 | site_name = paste(seqnames(gr),start(gr),end(gr),sep="_"),
22 | chr = gsub("chr","",as.character(seqnames(gr))),
23 | bp1 = start(gr),
24 | bp2 = end(gr)
25 | )
26 | return(peakinfo)
27 | }
28 |
29 | featureToGR <- function(feature){
30 | featureSplit <- stringr::str_split(paste0(feature), pattern = "_", n = 3, simplify = TRUE)
31 | gr <- GRanges(featureSplit[,1],IRanges(as.integer(featureSplit[,2]),as.integer(featureSplit[,3])))
32 | return(gr)
33 | }
34 |
35 | makeCDS <- function(se, binarize = TRUE){
36 | peakinfo <- grToFeature(se)
37 | mat <- assay(se)
38 | if(binarize){
39 | mat@x[which(mat@x > 0)] <- 1
40 | }
41 | cellinfo <- data.frame(colData(se))
42 | cellinfo$cells <- rownames(cellinfo)
43 | cds <- suppressWarnings(newCellDataSet(mat,
44 | phenoData = methods::new("AnnotatedDataFrame", data = cellinfo),
45 | featureData = methods::new("AnnotatedDataFrame", data = peakinfo),
46 | expressionFamily=negbinomial.size(),
47 | lowerDetectionLimit=0))
48 | fData(cds)$chr <- as.character(fData(cds)$chr)
49 | fData(cds)$bp1 <- as.numeric(as.character(fData(cds)$bp1))
50 | fData(cds)$bp2 <- as.numeric(as.character(fData(cds)$bp2))
51 | cds <- cds[order(fData(cds)$chr, fData(cds)$bp1),]
52 | return(cds)
53 | }
54 |
55 |
56 | sourceCpp(code='
57 | #include <Rcpp.h>
58 |
59 | using namespace Rcpp;
60 | using namespace std;
61 |
62 | // Adapted from https://github.com/AEBilgrau/correlateR/blob/master/src/auxiliary_functions.cpp
63 | // [[Rcpp::export]]
64 | Rcpp::NumericVector rowCorCpp(IntegerVector idxX, IntegerVector idxY, Rcpp::NumericMatrix X, Rcpp::NumericMatrix Y) {
65 |
66 | if(X.ncol() != Y.ncol()){
67 | stop("Columns of Matrix X and Y must be equal length!");
68 | }
69 |
70 | if(max(idxX) > X.nrow()){
71 | stop("Idx X greater than nrow of Matrix X");
72 | }
73 |
74 | if(max(idxY) > Y.nrow()){
75 | stop("Idx Y greater than nrow of Matrix Y");
76 | }
77 |
78 | // Transpose Matrices
79 | X = transpose(X);
80 | Y = transpose(Y);
81 |
82 | const int nx = X.ncol();
83 | const int ny = Y.ncol();
84 |
85 | // Centering the matrices
86 | for (int j = 0; j < nx; ++j) {
87 | X(Rcpp::_, j) = X(Rcpp::_, j) - Rcpp::mean(X(Rcpp::_, j));
88 | }
89 |
90 | for (int j = 0; j < ny; ++j) {
91 | Y(Rcpp::_, j) = Y(Rcpp::_, j) - Rcpp::mean(Y(Rcpp::_, j));
92 | }
93 |
94 | // Compute 1 over the sample standard deviation
95 | Rcpp::NumericVector inv_sqrt_ss_X(nx);
96 | for (int i = 0; i < nx; ++i) {
97 | inv_sqrt_ss_X(i) = 1/sqrt(Rcpp::sum( X(Rcpp::_, i) * X(Rcpp::_, i) ));
98 | }
99 |
100 | Rcpp::NumericVector inv_sqrt_ss_Y(ny);
101 | for (int i = 0; i < ny; ++i) {
102 | inv_sqrt_ss_Y(i) = 1/sqrt(Rcpp::sum( Y(Rcpp::_, i) * Y(Rcpp::_, i) ));
103 | }
104 |
105 | //Calculate Correlations
106 | const int n = idxX.size();
107 | Rcpp::NumericVector cor(n);
108 | for(int k = 0; k < n; k++){
109 | cor[k] = Rcpp::sum( X(Rcpp::_, idxX[k] - 1) * Y(Rcpp::_, idxY[k] - 1) ) * inv_sqrt_ss_X(idxX[k] - 1) * inv_sqrt_ss_Y(idxY[k] - 1);
110 | }
111 |
112 | return(cor);
113 |
114 | }'
115 | )
116 |
117 | getTxDbGenes <- function(txdb = NULL, orgdb = NULL, gr = NULL, ignore.strand = TRUE){
118 |
119 | if (is.null(genome)) { #note: "genome" is not a function argument; this checks the global "genome" defined later in this script
120 | if (is.null(txdb) | is.null(orgdb)) {
121 | stop("If no provided genome then you need txdb and orgdb!")
122 | }
123 | }
124 |
125 | if (is.null(gr)) {
126 | genes <- GenomicFeatures::genes(txdb)
127 | }else {
128 | genes <- suppressWarnings(subsetByOverlaps(GenomicFeatures::genes(txdb), gr, ignore.strand = ignore.strand))
129 | }
130 |
131 | if (length(genes) > 1) {
132 | mcols(genes)$symbol <- suppressMessages(mapIds(orgdb,
133 | keys = mcols(genes)$gene_id, column = "SYMBOL", keytype = "ENTREZID",
134 | multiVals = "first"))
135 | genes <- sort(sortSeqlevels(genes), ignore.strand = TRUE)
136 | names(genes) <- NULL
137 | out <- genes
138 | }else {
139 | out <- GRanges(seqnames(gr), ranges = IRanges(0, 0), gene_id = 0, symbol = "none")[-1]
140 | }
141 |
142 | return(out)
143 |
144 | }
145 |
146 | library(BSgenome.Hsapiens.UCSC.hg19)
147 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
148 | library(org.Hs.eg.db)
149 |
150 | #Read input
151 | obj <- readRDS("results/scATAC-Summarized-Experiment.rds")
152 | mdata <- colData(obj)
153 | tssWindow <- 2500
154 | flank <- 250*10^3
155 | corCutOff <- 0.35
156 | bsgenome <- BSgenome.Hsapiens.UCSC.hg19
157 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
158 | orgdb <- org.Hs.eg.db
159 |
160 | #Reduced Dimensions
161 | dimred <- data.frame(
162 | row.names = colnames(obj),
163 | colData(obj)$UMAP1,
164 | colData(obj)$UMAP2
165 | )
166 |
167 | #Get ChromSizes
168 | chromSizes <- seqlengths(bsgenome)[paste0("chr",c(1:22,"X"))]
169 | genome <- data.frame(names(chromSizes),chromSizes)
170 | rownames(genome) <- NULL
171 |
172 | #Get CDS
173 | obj <- makeCDS(obj, binarize = TRUE)
174 | obj <- detectGenes(obj)
175 | obj <- estimateSizeFactors(obj)
176 | ciceroObj <- make_cicero_cds(obj, k = 50, reduced_coordinates = dimred[colnames(obj),])
177 |
178 | #Compute Correlations
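#For every pair of peaks whose centers lie within `flank` (250 kb) of each other,
#compute the Pearson correlation of their accessibility across the aggregated
#(k = 50 nearest-neighbor) cell groups in the Cicero CDS; these correlations are
#used as the co-accessibility scores passed to build_gene_activity_matrix below.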
179 | message("Computing grouped correlations...")
180 | gr <- featureToGR(featureData(ciceroObj)[[1]])
181 | o <- suppressWarnings(as.matrix( findOverlaps(resize( resize(gr,1,"center"), 2*flank + 1, "center"), resize(gr,1,"center"), ignore.strand=TRUE) ))
182 | o <- data.table::as.data.table(data.frame(i = matrixStats::rowMins(o), j = matrixStats::rowMaxs(o)))
183 | o <- data.frame(o[!duplicated(o),])
184 | o <- o[o[,1]!=o[,2],]
185 | o$cor <- rowCorCpp(o[,1], o[,2], assayData(ciceroObj)$exprs, assayData(ciceroObj)$exprs)
186 | connections <- data.frame(
187 | Peak1 = featureData(ciceroObj)[[1]][o[,1]],
188 | Peak2 = featureData(ciceroObj)[[1]][o[,2]],
189 | coaccess = o[,3]
190 | )
191 |
192 | #Annotate CDS
193 | message("Annotating Cell Data Set...")
194 | genes <- getTxDbGenes(txdb=txdb,orgdb=orgdb)
195 | names(genes) <- genes$symbol
196 | genes <- resize(genes, 1, "start") %>% resize(tssWindow * 2 + 1, "center")
197 | geneDF <- data.frame(chromosome=seqnames(genes),start=start(genes),end=end(genes), gene=genes$symbol)
198 | obj <- annotate_cds_by_site(obj, geneDF)
199 |
200 | #Prepare for Co-Accessibility
201 | nSites <- Matrix::colSums(assayData(obj)$exprs)
202 | names(nSites) <- row.names(pData(obj))
203 |
204 | #Cicero with Correlations
205 | message("Calculating normalized gene activities...")
206 | ciceroGA <- normalize_gene_activities(build_gene_activity_matrix(obj, connections, coaccess_cutoff = corCutOff), nSites)
207 |
208 | seCicero <- SummarizedExperiment(
209 | assays = SimpleList(gA = ciceroGA),
210 | rowRanges = genes[rownames(ciceroGA),],
211 | colData = mdata
212 | )
213 |
214 | seCiceroLog <- SummarizedExperiment(
215 | assays = SimpleList(logGA = log2(10^6 * ciceroGA + 1)),
216 | rowRanges = genes[rownames(ciceroGA),],
217 | colData = mdata
218 | )
219 |
220 | #Save Output
221 | saveRDS(connections, "results/Peaks-Co-Accessibility.rds")
222 | saveRDS(seCicero, "results/Cicero-Gene-Activity.rds")
223 | saveRDS(seCiceroLog, "results/Cicero-Log2-Gene-Activity.rds")
224 |
225 |
226 |
--------------------------------------------------------------------------------
/code/05_Cluster_Unique_Peaks_v2.R:
--------------------------------------------------------------------------------
1 | #Identifying Cluster Specific ATAC-seq peaks
2 | #05/02/19
3 | #Cite Satpathy*, Granja*, et al.
4 | #Massively parallel single-cell chromatin landscapes of human immune
5 | #cell development and intratumoral T cell exhaustion (2019)
6 | #Created by Jeffrey Granja
7 | library(Matrix)
8 | library(SummarizedExperiment)
9 | library(matrixStats)
10 | library(readr)
11 | library(GenomicRanges)
12 | library(magrittr)
13 | library(data.table)
14 | library(edgeR)
15 | set.seed(1)
16 | "%ni%" <- Negate("%in%") #"not in" operator used throughout this script
17 | groupMeans <- function (mat, groups = NULL, na.rm = TRUE, sparse = FALSE){
18 | stopifnot(!is.null(groups))
19 | stopifnot(length(groups) == ncol(mat))
20 | gm <- lapply(unique(groups), function(x) {
21 | if (sparse) {
22 | Matrix::rowMeans(mat[, which(groups == x), drop = F], na.rm = na.rm)
23 | }
24 | else {
25 | rowMeans(mat[, which(groups == x), drop = F], na.rm = na.rm)
26 | }
27 | }) %>% Reduce("cbind", .)
28 | colnames(gm) <- unique(groups)
29 | return(gm)
30 | }
31 |
32 | groupSds <- function (mat, groups = NULL, na.rm = TRUE, sparse = FALSE) {
33 | stopifnot(!is.null(groups))
34 | stopifnot(length(groups) == ncol(mat))
35 | gs <- lapply(unique(groups), function(x) {
36 | if (sparse) {
37 | matrixStats::rowSds(as.matrix(mat[, which(groups == x), drop = F]), na.rm = na.rm)
38 | }
39 | else {
40 | matrixStats::rowSds(mat[, which(groups == x), drop = F], na.rm = na.rm)
41 | }
42 | }) %>% Reduce("cbind", .)
43 | colnames(gs) <- unique(groups)
44 | return(gs)
45 | }
46 |
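#createPseudoBulk builds pseudo-bulk replicates for each group (cluster):
#  - if at least `minReps` biological samples contribute >= `minCells` cells,
#    cells are simply summed per sample (capped at `maxCells`);
#  - otherwise, cells are split (or sampled with replacement when too few) into
#    simulated replicates; over `nSim` trials the split that maximizes the
#    between-replicate distance (`distMethod`) is kept, a conservative choice
#    that avoids underestimating biological variation.
#Returns the pseudo-bulk matrix and the per-group summed matrix.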
47 | createPseudoBulk <- function(mat, groups, labels, minCells = 100, maxCells = 500, minReps = 3, ceiling = 1, prior.count = 3, nSim = 1000, distMethod = "vars", seed = 1){
48 |
49 | calcDiff <- function(mat, method = "vars"){
50 | if(tolower(method)=="vars"){
51 | sum(matrixStats::rowVars(mat))/ncol(mat)
52 | }else if(tolower(method)=="euclidean"){
53 | sum(dist(t(mat))/ncol(mat))
54 | }else{
55 | stop("Error method not found!")
56 | }
57 | }
58 |
59 | sumCells <- function(mat, groups, maxCells = NULL, na.rm = TRUE, sparse = FALSE){
60 | stopifnot(!is.null(groups))
61 | stopifnot(length(groups) == ncol(mat))
62 | gm <- lapply(unique(groups), function(x) {
63 | idx <- which(groups == x)
64 | if(!is.null(maxCells)){
65 | idx <- sample(idx, size = min(maxCells, length(idx)), replace = FALSE)
66 | }
67 | if (sparse) {
68 | Matrix::rowSums(mat[, idx, drop = FALSE], na.rm = na.rm)
69 | }
70 | else{
71 | rowSums(mat[, idx, drop = FALSE], na.rm = na.rm)
72 | }
73 | }) %>% Reduce("cbind", .) %>% as.matrix
74 | colnames(gm) <- unique(groups)
75 | return(gm)
76 | }
77 |
78 | message(paste0("Setting Seed = ",seed))
79 | set.seed(seed)
80 |
81 | names(labels) <- colnames(mat)
82 | groupList <- split(labels,groups)
83 |
84 | if(minReps <= 1){
85 | stop("Minimum 2 replicates!")
86 | }
87 |
88 | if(is.numeric(ceiling)){
89 | message(paste0("Setting ceiling of input matrix to ", ceiling, "!"))
90 | mat@x[mat@x>ceiling]<-ceiling
91 | }
92 |
93 | #--------------------------------------------------------------------------
94 | # Constructing Bulk Pseudo ATAC Matrix v1.0
95 | #--------------------------------------------------------------------------
96 |
97 | pseudoAll <- lapply(seq_along(groupList), function(x){
98 |
99 | message(sprintf("####################################\n Groups %s of %s : %s\n####################################",x,length(groupList), names(groupList)[x]))
100 | start <- Sys.time()
101 | groupx <- groupList[[x]]
102 | matx <- mat[,names(groupx)]
103 | bioReps <- names(table(groupx))[which(table(groupx)>minCells)]
104 | nToSim <- minReps - length(bioReps)
105 |
106 | if(length(bioReps) >= minReps){
107 |
108 | #--------------------------------------------------------------------------
109 | # If there are enough biological samples passing the minimum cell threshold,
110 | # we are good to go! Just merge each into a true pseudo-bulk replicate.
111 | #--------------------------------------------------------------------------
112 |
113 | message(sprintf("Found %s Bio Reps which is more or equal than required (%s)", length(bioReps), minReps))
114 | nBio <- length(bioReps)
115 | groupBio <- groupx[which(groupx %in% bioReps)]
116 | pseudoBio <- sumCells(matx[,names(groupBio)],groups=groupBio,sparse=TRUE,na.rm=TRUE, maxCells = maxCells)
117 | nBio <- table(groupBio)[bioReps]
118 | if(!is.null(maxCells)){
119 | nBio[nBio > maxCells] <- maxCells
120 | }
121 |
122 | colnames(pseudoBio) <- lapply(seq_along(bioReps),function(k){
123 | paste0(names(groupList)[x],"._.BRep_",bioReps[k],".",nBio[which(names(nBio)==bioReps[k])],".FALSE")
124 | })
125 | pseudoMat <- pseudoBio
126 |
127 | }else if(length(bioReps) > 0 & ((length(groupx[groupx %ni% bioReps]) + 1) / min(nToSim, 2)) > minCells){
128 |
129 | #--------------------------------------------------------------------------
130 | # If there is at least 1 biological sample with the minimum cells, but not
131 | # as many as required, we make pseudo-replicates alongside the true replicate(s)
132 | # to attempt to capture real biological variation
133 | #--------------------------------------------------------------------------
134 |
135 | message("PSA : To ensure minimum replicates, simulation must be performed!")
136 | groupBio <- groupx[which(groupx %in% bioReps)]
137 |
138 | nBio <- table(groupBio)
139 | if(length(bioReps) == 1){
140 | pseudoBio <- Matrix::rowSums(matx[,names(groupBio)],na.rm=TRUE)
141 | }else{
142 | pseudoBio <- sumCells(matx[,names(groupBio)],groups=groupBio,sparse=TRUE,na.rm=TRUE, maxCells = maxCells)
143 | }
144 | pseudoBioLog <- edgeR::cpm(pseudoBio, log = TRUE, prior.count = prior.count)
145 |
146 | #Determine how sampling is performed: split the non-bio-rep cells evenly across the simulated replicates, ideally with at least minCells cells each
147 | nSplit <- floor(length(groupx[groupx %ni% bioReps])/(nToSim))
148 | if(!is.null(maxCells)){
149 | nSplit <- min(nSplit, maxCells)
150 | }
151 | if(nSplit < minCells){
152 | message("Splitting cluster into overlapping cells using sampling with replacement BE CAREFUL OF LOW VARIANCE!!!!")
153 | replacement <- TRUE
154 | nSplit <- minCells
155 | }else{
156 | replacement <- FALSE
157 | }
158 |
159 | for(i in seq_len(nSim)){
160 |
161 | #Figure out how to split Matrix!
162 | randOrder <- sample(seq_len(ncol(matx)), ncol(matx))
163 | if(replacement){
164 | splitList <- lapply(seq_len(nToSim), function(x) sample(seq_len(ncol(matx)), size = nSplit, replace = replacement))
165 | }else{
166 | splitList <- split(randOrder, ceiling(seq_along(randOrder)/nSplit))[seq_len(nToSim)]
167 | }
168 |
169 | if(i == 1){
170 |
171 | pseudoMin <- lapply(seq_len(nToSim), function(j){
172 | jj <- splitList[[j]]
173 | Matrix::rowSums(matx[,jj,drop=FALSE],na.rm=TRUE)
174 | }) %>% Reduce("cbind",.)
175 | pseudoMinLog <- edgeR::cpm(pseudoMin, log = TRUE, prior.count = prior.count)
176 | diffMax <- calcDiff(cbind(pseudoBioLog, pseudoMinLog), method = distMethod)
177 |
178 | }else{
179 |
180 | pseudoI <- lapply(seq_len(nToSim), function(j){
181 | jj <- splitList[[j]]
182 | Matrix::rowSums(matx[,jj,drop=FALSE],na.rm=TRUE)
183 | }) %>% Reduce("cbind",.)
184 |
185 | pseudoILog <- edgeR::cpm(pseudoI, log = TRUE, prior.count = prior.count)
186 | diffI <- calcDiff(cbind(pseudoBioLog, pseudoILog), method = distMethod)
187 | message(sprintf("Trial %s, Current Distance = %s , Max Distance = %s", i, diffI, diffMax))
188 |
189 | if(diffI > diffMax){
190 | message("Found new maxima pseudo to be conservative...")
191 | pseudoMin <- pseudoI
192 | pseudoMinLog <- pseudoILog
193 | diffMax <- diffI
194 | }
195 |
196 | }
197 |
198 | }
199 |
200 | pseudoBio <- as.matrix(pseudoBio)
201 | pseudoMin <- as.matrix(pseudoMin)
202 | colnames(pseudoBio) <- lapply(seq_along(bioReps),function(k){
203 | paste0(names(groupList)[x],"._.BRep_",bioReps[k],".",nBio[which(names(nBio)==bioReps[k])],".FALSE")
204 | })
205 | colnames(pseudoMin) <- paste0(names(groupList)[x],"._.Rep",seq_len(nToSim),".",nSplit,".",replacement)
206 | pseudoMat <- cbind(pseudoBio, pseudoMin)
207 |
208 | }else{
209 |
210 | #--------------------------------------------------------------------------
211 | # If there is not at least 1 sample with enough cells we will bootstrap replicates.
212 | # This is not preferred as we will have a large underestimate of true biological
213 | # variation.
214 | #--------------------------------------------------------------------------
215 |
216 | message("PSA : No representation by at least one separate rep, please be cautious with result!")
217 |
218 | #Determine how sampling is performed: split the group's cells evenly across the simulated replicates, ideally with at least minCells cells each
219 | nToSim <- minReps
220 | nSplit <- floor(length(groupx[groupx %ni% bioReps])/(nToSim))
221 | if(floor(1.5 * minCells) > length(groupx)){
222 | nToSim <- 2
223 | nSplit <- floor(2 / 3 * length(groupx))
224 | message(sprintf("Warning! Group size (%s) is smaller than the 3/2 * minimal number of cells (%s)",length(groupx),floor(1.5 * minCells)))
225 | message(sprintf("To deal with this, we will sample %s replicates at 2/3 the group size (%s of %s)", nToSim, nSplit, length(groupx)))
226 | }
227 | if(!is.null(maxCells)){
228 | nSplit <- min(nSplit, maxCells)
229 | }
230 | if(nSplit < minCells){
231 | message("Splitting cluster into overlapping cells using sampling with replacement BE CAREFUL OF LOW VARIANCE!!!!")
232 | replacement <- TRUE
233 | nSplit <- minCells
234 | }else{
235 | replacement <- FALSE
236 | }
237 |
238 | for(i in seq_len(nSim)){
239 |
240 | #Figure out how to split Matrix!
241 | randOrder <- sample(seq_len(ncol(matx)), ncol(matx))
242 | if(replacement){
243 | splitList <- lapply(seq_len(nToSim), function(x) sample(seq_len(ncol(matx)), size = minCells, replace = replacement))
244 | }else{
245 | splitList <- split(randOrder, ceiling(seq_along(randOrder)/nSplit))[seq_len(nToSim)]
246 | }
247 |
248 | if(i == 1){
249 |
250 | pseudoMin <- lapply(seq_len(nToSim), function(j){
251 | jj <- splitList[[j]]
252 | Matrix::rowSums(matx[,jj,drop=FALSE],na.rm=TRUE)
253 | }) %>% Reduce("cbind",.)
254 | pseudoMinLog <- edgeR::cpm(pseudoMin, log = TRUE, prior.count = prior.count)
255 | diffMax <- calcDiff(pseudoMinLog, method = distMethod)
256 |
257 | }else{
258 |
259 | pseudoI <- lapply(seq_len(nToSim), function(j){
260 | jj <- splitList[[j]]
261 | Matrix::rowSums(matx[,jj,drop=FALSE],na.rm=TRUE)
262 | }) %>% Reduce("cbind",.)
263 | pseudoILog <- edgeR::cpm(pseudoI, log = TRUE, prior.count = prior.count)
264 | diffI <- calcDiff(pseudoILog, method = distMethod)
265 |
266 | message(sprintf("Trial %s, Current Distance = %s , Max Distance = %s", i, diffI, diffMax))
267 | if(diffI > diffMax){
268 | message("Found new maxima pseudo to be conservative...")
269 | pseudoMin <- pseudoI
270 | pseudoMinLog <- pseudoILog
271 | diffMax <- diffI
272 | }
273 |
274 | }
275 | }
276 |
277 | pseudoMin <- as.matrix(pseudoMin)
278 | colnames(pseudoMin) <- paste0(names(groupList)[x],"._.Rep_",seq_len(nToSim),".",nSplit,".",replacement)
279 | pseudoMat <- pseudoMin
280 |
281 | }
282 |
283 | print(Sys.time() - start)
284 |
285 | pseudoMat
286 |
287 | }) %>% Reduce("cbind",.)
288 |
289 | out <- list(pseudoMat = pseudoAll, groupMat = sumCells(mat, groups=groups, sparse=TRUE, na.rm=TRUE))
290 |
291 | return(out)
292 |
293 | }
294 |
295 |
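#uniqueFeatures identifies group-specific features (peaks): group means (and SDs)
#are computed from the pseudo-bulk matrix, each feature is binarized by a
#break-point search over the ordered group means (requiring a log fold change of
#at least `minLFC`), the top group(s) are tested against the remaining groups
#with a t-test and FDR-adjusted, and features are further filtered by group-size
#limits and by how often their binary pattern occurs (`groupMin`).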
296 | uniqueFeatures <- function(
297 | mat, groups, padj = 0.01, minSdRatio = 0.001,
298 | minLFC = 0.25, zCutoff = 1, breakPt = "last",
299 | padjMethod = "fdr", clusterCols = TRUE,
300 | sparse = FALSE, twoWay = FALSE, groupMin = 10,
301 | minGroupSize = 1, maxGroupSize = NULL){
302 |
303 | #----------------------
304 | # Functions
305 | #----------------------
306 | binarizeMatrix <- function(matMean, matSd, cutoff, method, minSdRatio, minLFC){
307 |
308 | binarizeVector <- function(vMean, vSd, cutoff, method, minSdRatio, minLFC){
309 |
310 | #Method
311 | if(method == "mean"){
312 | vTest = vMean
313 | }else{
314 | vTest <- vMean - cutoff * vSd
315 | }
316 |
317 | #Order lowest to highest
318 | idx <- order(vTest)
319 | vTest <- vTest[idx]
320 | vMean <- vMean[idx]
321 | vSd <- vSd[idx]
322 |
323 | #Flag features whose SD is too low for evaluation, i.e., low SD probably due to zeros or constant offsets
324 | sdToLow <- which(vSd < minSdRatio*vMean | vSd < 10^-5)
325 | vBinarySd <- rep(1,length(vSd))
326 | if(length(sdToLow) > 0){
327 | vBinarySd[sdToLow] <- 0
328 | }
329 |
330 | #Initialize
331 | vMeanCutSd <- vMean + cutoff*vSd;
332 | maxValue <- vMeanCutSd[1]
333 | maxBSd <- vBinarySd[1]
334 | maxMean <- vMean[1]
335 |
336 | #Create out vector and initialize breakpoint
337 | n <- length(vMean)
338 | out <- rep(0, n)
339 | breakPoint <- 0
340 | out[1] <- breakPoint
341 |
342 | #Evaluate
343 | for(i in seq(2,n)){
344 |
345 | #Check if break point assuming log space
346 | if((vTest[i] - maxValue) > 0 & (vMean[i] - maxMean) >= minLFC & maxBSd != 0){
347 | breakPoint <- breakPoint + 1
348 | }
349 |
350 | #Set current value of break point
351 | out[i] <- breakPoint
352 |
353 | #Keep Max value observed
354 | if(vMeanCutSd[i] > maxValue){
355 | maxValue <- vMeanCutSd[i]
356 | maxMean <- vMean[i]
357 | maxBSd <- vBinarySd[i]
358 | }
359 | }
360 |
361 | out <- out[order(idx)]
362 | return(out)
363 |
364 | }
365 |
366 | #Create binary matrix
367 | bMat <- matrix(NA,nrow=nrow(matMean),ncol=ncol(matMean))
368 | for(i in seq_len(nrow(matMean))){
369 | if(i%%5000==0){message(sprintf("%s of %s (percent = %s)", i, nrow(bMat), round(100*i/nrow(bMat),2)))}
370 | bMat[i,] <- binarizeVector(matMean[i,],matSd[i,],cutoff, method, minSdRatio, minLFC)
371 | }
372 |
373 | #Add names
374 | colnames(bMat) <- colnames(matMean)
375 | rownames(bMat) <- rownames(matMean)
376 | return(bMat)
377 |
378 | }
379 |
380 | idxRow <- seq_len(nrow(mat))
381 | #-----------------------------------------------
382 | #Within Group Statistics
383 | #-----------------------------------------------
384 | message("Getting Within Group Stats...")
385 | intraMean <- groupMeans(mat, groups = groups, sparse = sparse, na.rm = TRUE)
386 | rownames(intraMean) <- rownames(mat)
387 |
388 | intraSd <- groupSds(mat, groups = groups, sparse = sparse, na.rm = TRUE)
389 | rownames(intraSd) <- rownames(mat)
390 |
391 | #-----------------------------------------------
392 | #Binarize Rows of Matrix
393 | #-----------------------------------------------
394 | message("Binarizing Features...")
395 | if (twoWay) {
396 | binarizedMat <- binarizeMatrix(intraMean, intraSd, zCutoff, "meanSd", minSdRatio, minLFC)
397 | }else {
398 | binarizedMat <- binarizeMatrix(intraMean, intraSd, zCutoff, "mean", minSdRatio, minLFC)
399 | }
400 | colnames(binarizedMat) <- colnames(intraMean)
401 | rownames(binarizedMat) <- rownames(intraMean)
402 | for(i in seq_len(nrow(binarizedMat))){
403 | bvi <- binarizedMat[i,]
404 | if(tolower(breakPt) == "last"){
405 | bvi[bvi!=max(bvi)] <- 0
406 | bvi[bvi>0] <- 1
407 | }else{
408 | bvi[bvi<1] <- 0
409 | bvi[bvi>0] <- 1
410 | }
411 | binarizedMat[i,] <- bvi
412 | }
413 | message(sprintf("Successful Binarization of %s Features...", sum(rowSums(binarizedMat) > 0)))
414 |
415 | #-----------------------------------------------
416 | #Get Test Statistics
417 | #-----------------------------------------------
418 | pb <- txtProgressBar(min=0,max=100,initial=0,style=3)
419 | groupSplit <- split(colnames(mat), groups)
420 | bpval <- unlist(lapply(seq_len(nrow(binarizedMat)), function(i){
421 | setTxtProgressBar(pb,round(i*100/nrow(binarizedMat),0))
422 | if(any(binarizedMat[i,]>0)){
423 | cu <- as.character(unlist(groupSplit[names(which(binarizedMat[i,]==max(binarizedMat[i,])))]))
424 | cd <- as.character(unlist(groupSplit[names(which(binarizedMat[i,]!=max(binarizedMat[i,])))]))
425 | mati <- mat[i,,drop=F]
426 | pval <- t.test(mati[,cu],mati[,cd])$p.value
427 | }else{
428 | pval <- 1
429 | }
430 | pval
431 | }))
432 | bpadj <- p.adjust(bpval, method=padjMethod) #sum(binarizedMat + noSdMat < 1))
433 |
434 | message(sprintf("\nFiltering by signficance %s of %s...", sum(bpadj < padj), sum(bpval < padj)))
435 | idxKeep <- which(bpadj < padj)
436 | bpadj <- bpadj[idxKeep]
437 | idxRow <- idxRow[idxKeep]
438 | binarizedMat <- binarizedMat[idxKeep, ]
439 | mat <- mat[idxKeep, ]
440 | intraMean <- intraMean[idxKeep, ]
441 | intraSd <- intraSd[idxKeep, ]
442 |
443 | #-----------------------------------------------
444 | #Keep only rows whose group membership count is between minGroupSize and maxGroupSize
445 | #-----------------------------------------------
446 | if (!is.null(maxGroupSize)) {
447 | idxKeep <- which(rowSums(binarizedMat) < (maxGroupSize + 1) & rowSums(binarizedMat) > (minGroupSize - 1))
448 | }else {
449 | idxKeep <- which(rowSums(binarizedMat) > (minGroupSize - 1))
450 | }
451 | message(sprintf("Filtering Features by Group Size: keeping %s of %s...", length(idxKeep), nrow(binarizedMat)))
452 | idxRow <- idxRow[idxKeep]
453 | bpadj <- bpadj[idxKeep]
454 | binarizedMat <- binarizedMat[idxKeep, ]
455 | mat <- mat[idxKeep, ]
456 | intraMean <- intraMean[idxKeep, ]
457 | intraSd <- intraSd[idxKeep, ]
458 |
459 | #-----------------------------------------------
460 | #Determine Pattern Occurrences using data.table
461 | #-----------------------------------------------
462 | binarizedTable <- data.table::as.data.table(binarizedMat)[,.N,by=c(colnames(binarizedMat))]
463 | binarizedTable <- data.frame(binarizedTable[which(binarizedTable$N > groupMin),])[,which(colnames(binarizedTable) %ni% "N")]
464 | idxKeep <- unlist(lapply(seq_len(nrow(binarizedTable)), function(x){
465 | idx1 <- which(binarizedTable[x,,drop=TRUE] > 0)
466 | rs1 <- which(rowSums(binarizedMat[,idx1,drop=FALSE]) == length(idx1))
467 | rs2 <- which(rowSums(binarizedMat[,-idx1,drop=FALSE]) == 0)
468 | idxBM <- intersect(rs1,rs2)
469 | idxBM
470 | }))
471 | message(sprintf("Filtering Features by Pattern Appearances: keeping %s of %s...", length(idxKeep), nrow(binarizedMat)))
472 | idxRow <- idxRow[idxKeep]
473 | bpadj <- bpadj[idxKeep]
474 | binarizedMat <- binarizedMat[idxKeep, ]
475 | mat <- mat[idxKeep, ]
476 | intraMean <- intraMean[idxKeep, ]
477 | intraSd <- intraSd[idxKeep, ]
478 |
479 | #-----------------------------------------------
480 | #Organize the output for easier interpretation
481 | #-----------------------------------------------
482 | message(sprintf("Found %s unique elements!", length(idxKeep)))
483 | message("Finalizing Output...")
484 | colClust <- hclust(as.dist(1 - cor(intraMean)))
485 | colOrder <- unique(groups)[colClust$order]
486 | binarizedMat <- binarizedMat[, colClust$order]
487 | idxOrdered <- do.call("order", c(as.data.frame(binarizedMat)[seq_len(ncol(binarizedMat))], list(decreasing = TRUE)))
488 | binarizedMat <- binarizedMat[idxOrdered, ]
489 | idxRow <- idxRow[idxOrdered]
490 | intraMean <- intraMean[idxOrdered, colClust$order]
491 | mat <- mat[idxOrdered, order(match(groups, colOrder))]
492 | bpadj <- bpadj[idxOrdered]
493 |
494 | #-----------------------------------------------
495 | #Time to label each row
496 | #-----------------------------------------------
497 | binarizedTable <- data.frame(data.table::as.data.table(binarizedMat)[,.N,by=c(colnames(binarizedMat))])
498 | rownames(binarizedTable) <- paste0("Feature_",seq_len(nrow(binarizedTable)))
499 | dfUnique <- lapply(seq_len(nrow(binarizedTable)), function(x){
500 | idx1 <- which(binarizedTable[x,-ncol(binarizedTable),drop=TRUE] > 0)
501 | rs1 <- which(rowSums(binarizedMat[,idx1,drop=FALSE]) == length(idx1))
502 | rs2 <- which(rowSums(binarizedMat[,-idx1,drop=FALSE])==0)
503 | idxBM <- intersect(rs1,rs2)
504 | data.frame(feature = rownames(binarizedTable)[x], rows = idxBM)
505 | }) %>% Reduce("rbind",.)
506 |
507 | #-----------------------------------------------
508 | #Return Output
509 | #-----------------------------------------------
510 | out <- list(mat = mat, binaryMat = binarizedMat, groupMat = intraMean, binarizedTable = binarizedTable, dfFeature = dfUnique, rowOrder = idxRow, padj = bpadj)
511 | return(out)
512 |
513 | }
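#Note on the returned object (summarizing the `out` list above): `mat` is the filtered, reordered
#log-CPM matrix, `binaryMat` the 0/1 cluster-membership matrix, `groupMat` the per-group means,
#`binarizedTable` the unique binary patterns, `dfFeature` the feature-to-pattern assignments,
#`rowOrder` the original row indices, and `padj` the adjusted p-values for the kept features.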
514 |
515 | '%ni%' <- Negate('%in%')
516 |
517 | #----------------------------
518 | # Get Inputs
519 | #----------------------------
520 | se <- readRDS("results/scATAC-Summarized-Experiment.rds")
521 |
522 | #----------------------------
523 | # PseudoBulk
524 | #----------------------------
525 | colData(se)$Group <- "PBMC" #Replicate info
526 | objPB <- createPseudoBulk(
527 | mat = assay(se),
528 | groups=paste0(colData(se)$Clusters),
529 | labels=colData(se)$Group,
530 | ceiling = 1,
531 | minCells = 100,
532 | maxCells = 500,
533 | minReps = 2,
534 | prior.count = 3,
535 | nSim = 250
536 | )
537 |
538 | cdPB <- DataFrame(
539 | row.names=colnames(objPB[[1]]),
540 | Group = stringr::str_split(colnames(objPB[[1]]), "._.", simplify = TRUE)[,1],
541 | Type = stringr::str_split(stringr::str_split(colnames(objPB[[1]]), "._\\.", simplify = TRUE)[,2], "\\.", simplify = TRUE)[,1],
542 | Cells = as.integer(stringr::str_split(stringr::str_split(colnames(objPB[[1]]), "._\\.", simplify = TRUE)[,2], "\\.", simplify = TRUE)[,2]),
543 | Replace = stringr::str_split(stringr::str_split(colnames(objPB[[1]]), "._\\.", simplify = TRUE)[,2], "\\.", simplify = TRUE)[,3]
544 | )
545 |
546 | sePB <- SummarizedExperiment(assays = SimpleList(counts = objPB[[1]]), rowRanges = rowRanges(se), colData = cdPB)
547 | metadata(sePB)$matCS <- objPB[[2]]
548 |
549 | saveRDS(sePB, "results/Cluster_PseudoBulk-Summarized-Experiment.rds")
550 |
551 | #----------------------------
552 | # Unique Features
553 | #----------------------------
554 | params <- list(
555 | padj = 0.01,
556 | minSdRatio = 0.001,
557 | minLFC = 0.25,
558 | zCutoff = 1,
559 | breakPt = "last",
560 | groupMin = 25,
561 | maxGroupSize = max(floor(length(unique(colData(se)$Group))/3), 2),
562 | date = Sys.Date()
563 | )
564 |
565 | #Unique Features
566 | uf <- uniqueFeatures(
567 | edgeR::cpm(assay(sePB),log=TRUE,prior.count=3),
568 | groups = colData(sePB)$Group,
569 | padj = params$padj,
570 | minSdRatio = params$minSdRatio,
571 | minLFC = params$minLFC,
572 | zCutoff = params$zCutoff,
573 | breakPt = params$breakPt,
574 | groupMin = params$groupMin,
575 | maxGroupSize = params$maxGroupSize
576 | )
577 |
578 | library(pheatmap)
579 | pdf("results/Unique_Peaks.pdf",width=4,height=8)
580 | m <- uf$mat
581 | z <- sweep(m - rowMeans(m), 1, matrixStats::rowSds(m), `/`)
582 | z[z > 2] <- 2
583 | z[z < -2] <- -2
584 | pheatmap(z, cluster_cols=FALSE, cluster_rows=FALSE, show_rownames = FALSE)
585 | dev.off()
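#Display note: rows of the heatmap are z-scored ((x - rowMean) / rowSd) and clipped to [-2, 2]
#purely for plotting; the `uf` object saved below retains the unclipped values.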
586 |
587 | saveRDS(uf, "results/Unique_Peaks.rds")
588 | saveRDS(params, "results/Unique_Peaks_Params.rds")
589 |
590 |
591 |
592 |
593 |
--------------------------------------------------------------------------------
/code/06_Analyze_UMAP_Trajectory.R:
--------------------------------------------------------------------------------
1 | #Constructing a Trajectory in UMAP projected sub-space
2 | #05/02/19
3 | #Cite Satpathy*, Granja*, et al.
4 | #Massively parallel single-cell chromatin landscapes of human immune
5 | #cell development and intratumoral T cell exhaustion (2019)
6 | #Created by Jeffrey Granja
7 | library(Matrix)
8 | library(SummarizedExperiment)
9 | library(matrixStats)
10 | library(magrittr)
11 | library(edgeR)
12 | set.seed(1)
13 |
14 | getQuantiles <- function(x){
15 | trunc(rank(x))/length(x)
16 | }
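#Illustrative example (not part of the original analysis): getQuantiles maps a numeric vector
#onto quantile ranks in (0, 1]; ties share the same (truncated) rank.
# getQuantiles(c(10, 2, 5))
# #> [1] 1.0000000 0.3333333 0.6666667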
17 |
18 | alignTrajectory <- function(df, trajectory, filter = 0.05, dof = 250, spar = 1){
19 | findClosest <- function(x, y, fitx, fity){
20 |     distxy <- sqrt((fitx - x)^2 + (fity - y)^2)
21 | idxMin <- which.min(distxy)
22 | if(idxMin==1){
23 | idxMin <- idxMin + 1
24 | }else if(idxMin==length(fitx)){
25 | idxMin <- idxMin - 1
26 | }
27 | if(distxy[idxMin + 1] < distxy[idxMin - 1]){
28 | diff <- 1
29 | }else{
30 | diff <- -1
31 | }
32 | data.frame(idx = idxMin, dist = distxy[idxMin], diff = diff)
33 | }
34 | dfAll <- data.frame()
35 | for(x in seq_along(trajectory)){
36 | #Subset
37 | dfx <- df[df$Group==trajectory[x],]
38 | #Mean Diff Filter
39 | xmean <- colMeans(dfx[,c(1,2)])
40 | diffx <- sqrt(colSums((t(dfx[,1:2]) - xmean)^2))
41 | dfx <- dfx[which(diffx <= quantile(diffx,1 - filter)),]
42 | #Get diff
43 | if(x!=length(trajectory)){
44 | xmean1 <- colMeans(df[df$Group==trajectory[x+1],c(1,2)])
45 | diffx1 <- sqrt(colSums((t(dfx[,1:2]) - xmean1)^2))
46 | dfx$time <- (1 - getQuantiles(diffx1)) + x
47 | }else{
48 | xmean1 <- colMeans(df[df$Group==trajectory[x-1],c(1,2)])
49 | diffx1 <- sqrt(colSums((t(dfx[,1:2]) - xmean1)^2))
50 | dfx$time <- getQuantiles(diffx1) + x
51 | }
52 | dfAll <- rbind(dfAll , dfx)
53 | }
54 | sx <- smooth.spline(dfAll$time, dfAll$x, df = dof, spar = spar)
55 | sy <- smooth.spline(dfAll$time, dfAll$y, df = dof, spar = spar)
56 | dfFit <- data.frame(x = sx[[2]], y = sy[[2]], t = seq_along(sy[[2]]))
57 | dfTrajectory <- df[df$Group %in% trajectory,]
58 | dfTime <- lapply(seq_len(nrow(dfTrajectory)), function(x){
59 | findClosest(dfTrajectory[x,1],dfTrajectory[x,2], dfFit[,1],dfFit[,2])
60 | }) %>% Reduce("rbind",.)
61 | dfTime$distQ <- getQuantiles(dfTime$dist)
62 | dfTrajectory$pseudotime <- 100*getQuantiles(dfTime$idx + matrixStats::rowProds(as.matrix(dfTime[,c("diff","distQ")])))
63 |
64 |   return(list(trajectory = dfTrajectory, fitTrajectory = dfFit))
65 | }
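#Usage sketch (illustrative; object names are hypothetical): `df` needs one row per cell with
#numeric columns x and y (the UMAP coordinates, columns 1:2) and a `Group` column whose labels
#match the entries of `trajectory`. The result is a list of (1) the per-cell data.frame with a
#`pseudotime` column scaled to 0-100 and (2) the smoothed spline fit used for alignment.
# toyDF <- data.frame(x = rnorm(300), y = rnorm(300), Group = sample(c("A","B","C"), 300, TRUE))
# toyAligned <- alignTrajectory(toyDF, trajectory = c("A", "B", "C"))
# head(toyAligned$trajectory$pseudotime)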
66 |
67 | trajectoryStats <- function(mat, trajectory, nSim = 1000, nFeatures = 10000){
68 | #--------------
69 | # Functions
70 | #--------------
71 | rankTrajectory <- function(mat, trajectory, n = 10000, method = "vec"){
72 | if(method=="vec"){
73 | vecRank <- c()
74 | for(i in seq_along(trajectory)[-length(trajectory)]){
75 | if(i == 1){
76 | rem <- c(trajectory[i])
77 | }else{
78 | rem <- c(trajectory[i], trajectory[seq(1,i-1)])
79 | }
80 | trajectoryI <- trajectory[i]
81 |       peaksI <- head(order(mat[,trajectoryI],decreasing=TRUE), n = n)
82 | distI <- rank(sqrt(colSums((mat[peaksI,] - mat[peaksI,trajectoryI])^2))[colnames(mat) %ni% rem])
83 | vecRank <- c(vecRank, distI[trajectory[i+1]])
84 | }
85 | vecRank
86 | }else{
87 | matRank <- matrix(ncol=length(trajectory)-1,nrow=ncol(mat))
88 | rownames(matRank) <- colnames(mat)
89 | colnames(matRank) <- paste0("T",seq_along(trajectory[-1]))
90 | for(i in seq_along(trajectory)[-length(trajectory)]){
91 | if(i == 1){
92 | rem <- c(trajectory[i])
93 | }else{
94 | rem <- c(trajectory[i], trajectory[seq(1,i-1)])
95 | }
96 | trajectoryI <- trajectory[i]
97 |       peaksI <- head(order(mat[,trajectoryI],decreasing=TRUE), n = n)
98 | distI <- sqrt(colSums((mat[peaksI,] - mat[peaksI,trajectoryI])^2))[colnames(mat) %ni% rem]
99 | matRank[names(distI),i] <- distI
100 | }
101 | matRank
102 | }
103 | }
104 |   nullTrajectory <- function(trajectory, n = 1000){
105 |     nullTrajectory <- list()
106 |     while(length(nullTrajectory) < n){
107 |       trajx <- sample(trajectory, length(trajectory))
108 |       if(!identical(trajx, trajectory)){
109 |         nullTrajectory[[length(nullTrajectory) + 1]] <- trajx
110 |       }
111 |
112 |     }
113 |     nullTrajectory
114 |   }
115 | #Reverse Trajectory
116 | trajectory <- rev(trajectory)
117 | # Compute Null
118 | message("Computing Null Trajectories...")
119 |   nullT <- nullTrajectory(trajectory, nSim)
120 | pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3)
121 | nullRanks <- Reduce("rbind",lapply(seq_along(nullT), function(x){
122 | setTxtProgressBar(pb, round(x * 100/length(nullT), 0))
123 | rankX <- rankTrajectory(mat, nullT[[x]], n = nFeatures)
124 | data.frame(mean = mean(rankX), median = median(rankX))
125 | }))
126 | # Compute Actual
127 | rankT <- rankTrajectory(mat, trajectory, n = nFeatures)
128 | rankTMat <- rankTrajectory(mat, trajectory, n = nFeatures, method = "mat")
129 | pvalMean <- sum(nullRanks$mean < mean(rankT)) / nrow(nullRanks)
130 | pvalMedian <- sum(nullRanks$median < median(rankT)) / nrow(nullRanks)
131 | out <- list(
132 | pvalMean = pvalMean,
133 | pvalMedian = pvalMedian,
134 | rankT = rankT,
135 | rankNull = nullRanks,
136 | rankTMat = rankTMat
137 | )
138 | return(out)
139 | }
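#In brief: rankTrajectory asks, at each step, how close (in peak-accessibility space) the next
#cluster in the trajectory is relative to all remaining clusters. The observed mean/median rank
#is then compared against `nSim` shuffled orderings, and the reported p-values are the fraction
#of shuffled orderings with a smaller mean (or median) rank than the observed trajectory, i.e.
#an empirical permutation test.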
140 |
141 | groupSums <- function (mat, groups = NULL, na.rm = TRUE, sparse = FALSE){
142 | stopifnot(!is.null(groups))
143 | stopifnot(length(groups) == ncol(mat))
144 | gm <- lapply(unique(groups), function(x) {
145 | if (sparse) {
146 | Matrix::rowSums(mat[, which(groups == x), drop = F], na.rm = na.rm)
147 | }
148 | else {
149 | rowSums(mat[, which(groups == x), drop = F], na.rm = na.rm)
150 | }
151 | }) %>% Reduce("cbind", .)
152 | colnames(gm) <- unique(groups)
153 | return(gm)
154 | }
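#Illustrative example (toy data, not part of the analysis): groupSums collapses the columns of
#a matrix by group label.
# m <- matrix(1:6, nrow = 2, dimnames = list(NULL, c("c1", "c2", "c3")))
# groupSums(m, groups = c("A", "A", "B"))
# #>      A B
# #> [1,] 4 5
# #> [2,] 6 6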
155 |
156 | '%ni%' <- Negate('%in%')
157 |
158 | #----------------------------
159 | # Get Inputs
160 | #----------------------------
161 | se <- readRDS("results/scATAC-Summarized-Experiment.rds")
162 | trajectory <- paste0("Cluster",c(2,3,4,1,6,5))
163 |
164 | #Align single cells to Trajectory
165 | df <- data.frame(row.names = colnames(se), x = colData(se)$UMAP1, y = colData(se)$UMAP2, Group = colData(se)$Clusters)
166 | trajAligned <- alignTrajectory(df, trajectory)
167 |
168 | df2 <- trajAligned[[1]]
169 | dfT <- trajAligned[[2]]
170 |
171 | pdf("results/UMAP-PseudoTime.pdf")
172 | library(ggplot2)
173 | ggplot(df2, aes(x,y,color=pseudotime)) + geom_point() +
174 | theme_bw() + viridis::scale_color_viridis() +
175 | geom_path(data=data.frame(dfT), aes(x,y,color=NULL), size= 1,
176 | arrow = arrow(type = "open", angle = 30, length = unit(0.1, "inches")))
177 | dev.off()
178 | saveRDS(trajAligned, "results/Aligned-Trajectory.rds")
179 |
180 | #Significance of Trajectory Order
181 | clustMeans <- edgeR::cpm(groupSums(assay(se), colData(se)$Clusters, sparse = TRUE),log=TRUE,prior.count=3)
182 | trajStats <- trajectoryStats(clustMeans, trajectory, nSim = 500, nFeatures = 10000)
183 | rankMat <- trajStats$rankTMat
184 | rankDF <- lapply(seq_len(ncol(rankMat)), function(x){
185 | data.frame(names=names(rankMat[,x]),ranks=rankMat[,x],t=x)
186 | }) %>% Reduce("rbind",.)
187 | rankV <- lapply(seq_len(ncol(rankMat)), function(x){
188 | data.frame(ranks=rankMat[rev(trajectory)[x+1],x],t=x)
189 | }) %>% Reduce("rbind",.)
190 |
191 | pdf("results/Trajectory_Plot_Distance.pdf", useDingbats = FALSE, width = 12, height = 12)
192 | ggplot(rankDF, aes(t,ranks,color=names)) +
193 | geom_point(size = 8) +
194 | geom_point(data = rankV, aes(color=NULL), color = "black", size = 2) +
195 | geom_line(data = rankV, aes(color=NULL), color = "black", size = 1,
196 | lty = "dashed", arrow = arrow(type = "open", length = unit(0.2, "inches"))) +
197 | theme_bw() +
198 |   ylab("Trajectory Distance (Dist logCPM)") + xlab(paste0("Trajectory ", nrow(trajStats$rankNull), " Sim Pval = ", trajStats$pvalMean)) +
199 | ggtitle(paste0(rev(trajectory),collapse=" -> "))
200 | dev.off()
201 |
202 | out <- list(trajStats = trajStats, rankDF = rankDF, rankV = rankV)
203 | saveRDS(out, "results/Trajectory-Stats.rds")
204 |
205 |
206 |
207 |
--------------------------------------------------------------------------------
/code/07_ChromVAR_For_GWAS_w_CoAccessbility_v2.R:
--------------------------------------------------------------------------------
1 | #Running GWAS chromVAR on a single-cell summarized experiment using co-accessibility
2 | #05/02/19
3 | #Cite Satpathy*, Granja*, et al.
4 | #Massively parallel single-cell chromatin landscapes of human immune
5 | #cell development and intratumoral T cell exhaustion (2019)
6 | #Created by Jeffrey Granja
7 | library(chromVAR)
8 | library(Matrix)
9 | library(matrixStats)
10 | library(SummarizedExperiment)
11 | library(magrittr)
12 | library(BiocParallel)
13 | library(BSgenome.Hsapiens.UCSC.hg19)
14 | register(SerialParam())
15 | set.seed(1)
16 |
17 | se <- readRDS("results/scATAC-Summarized-Experiment.rds")
18 | gr <- readRDS("data/PICS_GWAS_SNPS.gr.rds")
19 | conn <- readRDS("results/Peaks-Co-Accessibility.rds")
20 | conn <- conn[conn[,3] >= 0.35,]
21 | peaknames <- paste(seqnames(se),start(se),end(se),sep="_")
22 | conn[,4] <- match(paste0(conn[,1]), peaknames)
23 | conn[,5] <- match(paste0(conn[,2]), peaknames)
24 | connMat <- Matrix::sparseMatrix(i=conn[,4],j=conn[,5],x=rep(1,nrow(conn)),dims=c(nrow(se),nrow(se)))
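#connMat is a peaks x peaks 0/1 adjacency matrix of co-accessible pairs (score >= 0.35), with
#row/column indices matching the peak order in `se`. The loop further below checks both the row
#and the column of each SNP-containing peak, so the input pairs need not be symmetric.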
25 |
26 | #Add Bias
27 | genome <- BSgenome.Hsapiens.UCSC.hg19
28 | se <- addGCBias(se, genome = genome)
29 |
30 | #Overlap GWAS SNPs
31 | o <- lapply(split(gr, gr$disease), function(x){
32 | #extend snp +- 500 bp
33 | overlapsAny(se, resize(x,1001,"center"), ignore.strand = TRUE)
34 | }) %>% Reduce("cbind",.)
35 | ow <- which(o > 0, arr.ind=TRUE)
36 | matches <- Matrix::sparseMatrix(i=ow[,1],j=ow[,2],x=o[cbind(ow[,1],ow[,2])], dims = c(nrow(se),length(unique(gr$disease))))
37 | colnames(matches) <- names(split(gr, gr$disease))
38 |
39 | #Use connections mat!
40 | matches2 <- matches
41 | idx <- which(Matrix::rowSums(matches2)>0) #which peaks have a snp
42 | for(i in seq_along(idx)){
43 | if(i %% 100 == 0){message(sprintf("%s of %s",i,length(idx)))}
44 | #peaks co-accessible to peak with snp
45 | coi <- unique(c(which(connMat[,idx[i]]>0),which(connMat[idx[i],]>0)))
46 | if(length(coi) > 0){
47 | #create sub mat
48 | mati <- as(t(replicate(length(coi), matches[idx[i],])),"dgCMatrix")
49 | #add it to sub mat of connected peaks
50 | matches2[coi,,drop=FALSE] <- matches2[coi,,drop=FALSE] + mati
51 | }
52 | }
53 | diff <- Matrix::colSums(matches2) - Matrix::colSums(matches)
54 | print(diff)
55 |
56 | #Make Annotation SE
57 | anno_ix <- SummarizedExperiment(assays=SimpleList(motifMatches=matches2), rowRanges=rowRanges(se))
58 |
59 | #Compute Deviations
60 | dev <- computeDeviations(se, anno_ix)
61 |
62 | #compute variability
63 | metadata(dev)$Variability <- computeVariability(dev)
64 |
65 | #add matches
66 | metadata(dev)$gwasMatches <- anno_ix
67 |
68 | #save output
69 | saveRDS(dev, "results/GWAS-Co-Accessibility-chromVAR-Summarized-Experiment.rds")
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/code/08_Run_scCNV_v2.R:
--------------------------------------------------------------------------------
1 | #Estimating Copy Number Variation in scATAC-seq
2 | #05/02/19
3 | #Cite Satpathy*, Granja*, et al.
4 | #Massively parallel single-cell chromatin landscapes of human immune
5 | #cell development and intratumoral T cell exhaustion (2019)
6 | #Created by Jeffrey Granja
7 | library(Matrix)
8 | library(SummarizedExperiment)
9 | library(matrixStats)
10 | library(readr)
11 | library(GenomicRanges)
12 | library(magrittr)
13 | library(edgeR)
14 | library(Seurat)
15 | library(BSgenome.Hsapiens.UCSC.hg19)
16 | set.seed(1)
17 |
18 | "%ni%" <- Negate("%in%")
19 |
20 | countInsertions <- function(query, fragments, by = "RG"){
21 | #Count By Fragments Insertions
22 | inserts <- c(
23 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(start(fragments), start(fragments)), RG = mcols(fragments)[,by]),
24 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(end(fragments), end(fragments)), RG = mcols(fragments)[,by])
25 | )
26 | by <- "RG"
27 | overlapDF <- DataFrame(findOverlaps(query, inserts, ignore.strand = TRUE, maxgap=-1L, minoverlap=0L, type = "any"))
28 | overlapDF$name <- mcols(inserts)[overlapDF[, 2], by]
29 | overlapTDF <- transform(overlapDF, id = match(name, unique(name)))
30 | #Calculate Overlap Stats
31 | inPeaks <- table(overlapDF$name)
32 | total <- table(mcols(inserts)[, by])
33 | total <- total[names(inPeaks)]
34 | frip <- inPeaks / total
35 | #Summarize
36 | sparseM <- Matrix::sparseMatrix(
37 | i = overlapTDF[, 1],
38 | j = overlapTDF[, 4],
39 | x = rep(1, nrow(overlapTDF)),
40 | dims = c(length(query), length(unique(overlapDF$name))))
41 | colnames(sparseM) <- unique(overlapDF$name)
42 | total <- total[colnames(sparseM)]
43 | frip <- frip[colnames(sparseM)]
44 | out <- list(counts = sparseM, frip = frip, total = total)
45 | return(out)
46 | }
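#Usage sketch (illustrative; `windows10x` and `frags` are hypothetical objects): `query` is a
#GRanges of regions and `fragments` a GRanges of fragments whose metadata column named by `by`
#(here "RG") holds the cell barcode. Both fragment ends are counted as insertions, and the
#result is a list with a regions x cells sparse count matrix plus per-cell FRIP (fraction of
#insertions falling in `query`) and total insertion counts.
# res <- countInsertions(query = windows10x, fragments = frags, by = "RG")
# dim(res$counts)  # length(query) x number of cells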
47 |
48 | makeWindows <- function(genome, blacklist, windowSize = 10e6, slidingSize = 2e6){
49 | chromSizes <- GRanges(names(seqlengths(genome)), IRanges(1, seqlengths(genome)))
50 | chromSizes <- GenomeInfoDb::keepStandardChromosomes(chromSizes, pruning.mode = "coarse")
51 | windows <- slidingWindows(x = chromSizes, width = windowSize, step = slidingSize) %>% unlist %>% .[which(width(.)==windowSize),]
52 | mcols(windows)$wSeq <- as.character(seqnames(windows))
53 | mcols(windows)$wStart <- start(windows)
54 | mcols(windows)$wEnd <- end(windows)
55 | message("Subtracting Blacklist...")
56 | windowsBL <- lapply(seq_along(windows), function(x){
57 | if(x %% 100 == 0){
58 | message(sprintf("%s of %s", x, length(windows)))
59 | }
60 | gr <- GenomicRanges::setdiff(windows[x,], blacklist)
61 | mcols(gr) <- mcols(windows[x,])
62 | return(gr)
63 | })
64 | names(windowsBL) <- paste0("w",seq_along(windowsBL))
65 | windowsBL <- unlist(GenomicRangesList(windowsBL), use.names = TRUE)
66 | mcols(windowsBL)$name <- names(windowsBL)
67 | message("Adding Nucleotide Information...")
68 | windowSplit <- split(windowsBL, as.character(seqnames(windowsBL)))
69 | windowNuc <- lapply(seq_along(windowSplit), function(x){
70 | message(sprintf("%s of %s", x, length(windowSplit)))
71 | chrSeq <- Biostrings::getSeq(genome,chromSizes[which(seqnames(chromSizes)==names(windowSplit)[x])])
72 | grx <- windowSplit[[x]]
73 | aFreq <- alphabetFrequency(Biostrings::Views(chrSeq[[1]], ranges(grx)))
74 | mcols(grx)$GC <- rowSums(aFreq[, c("G","C")]) / rowSums(aFreq)
75 | mcols(grx)$AT <- rowSums(aFreq[, c("A","T")]) / rowSums(aFreq)
76 | return(grx)
77 | }) %>% GenomicRangesList %>% unlist %>% sortSeqlevels %>% sort
78 | windowNuc$N <- 1 - (windowNuc$GC + windowNuc$AT)
79 | windowNuc
80 | }
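#Illustrative arithmetic: with windowSize = 10e6 and slidingSize = 2e6, a 100 Mb chromosome
#yields (100e6 - 10e6) / 2e6 + 1 = 46 full-width windows; any shorter windows produced at
#chromosome ends are removed by the width(.) == windowSize filter above. Blacklisted bases are
#then subtracted, so each named window may be split into sub-ranges whose combined width (the
#effectiveLength used downstream) can be smaller than windowSize.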
81 |
82 | scCNA <- function(windows, fragments, neighbors = 100, LFC = 1.5, FDR = 0.1, force = FALSE, remove = c("chrM","chrX","chrY")){
83 |
84 | #Keep only regions in filtered chromosomes
85 | windows <- GenomeInfoDb::keepStandardChromosomes(windows, pruning.mode = "coarse")
86 | fragments <- GenomeInfoDb::keepStandardChromosomes(fragments, pruning.mode = "coarse")
87 | windows <- windows[seqnames(windows) %ni% remove]
88 | fragments <- fragments[seqnames(fragments) %ni% remove]
89 |
90 | #Count Insertions in windows
91 | message("Getting Counts...")
92 | counts <- countInsertions(windows, fragments, by = "RG")[[1]]
93 | message("Summarizing...")
94 | windowSummary <- GenomicRangesList()
95 | countSummary <- matrix(nrow=length(unique(windows$name)), ncol = ncol(counts))
96 | for(x in seq_along(unique(mcols(windows)$name))){
97 | if(x %% 100 == 0){
98 | message(sprintf("%s of %s", x, length(unique(mcols(windows)$name))))
99 | }
100 | idx <- which(mcols(windows)$name == unique(mcols(windows)$name)[x])
101 | wx <- windows[idx,]
102 | wo <- GRanges(mcols(wx)$wSeq , ranges = IRanges(mcols(wx)$wStart, mcols(wx)$wEnd))[1,]
103 | mcols(wo)$name <- mcols(wx)$name[1]
104 | mcols(wo)$effectiveLength <- sum(width(wx))
105 | mcols(wo)$percentEffectiveLength <- 100*sum(width(wx))/width(wo)
106 | mcols(wo)$GC <- sum(mcols(wx)$GC * width(wx))/width(wo)
107 | mcols(wo)$AT <- sum(mcols(wx)$AT * width(wx))/width(wo)
108 | mcols(wo)$N <- sum(mcols(wx)$N * width(wx))/width(wo)
109 | countSummary[x,] <- Matrix::colSums(counts[idx,,drop=FALSE])
110 | windowSummary[[x]] <- wo
111 | }
112 | windowSummary <- unlist(windowSummary)
113 |
114 | #Keep only regions with less than 0.1% N
115 | keep <- which(windowSummary$N < 0.001)
116 | windowSummary <- windowSummary[keep,]
117 | countSummary <- countSummary[keep,]
118 |
119 | #Now determine the nearest neighbors by GC content
120 | message("Computing Background...")
121 | bdgMean <- matrix(nrow=nrow(countSummary), ncol=ncol(countSummary))
122 | bdgSd <- matrix(nrow=nrow(countSummary), ncol=ncol(countSummary))
123 | log2FC <- matrix(nrow=nrow(countSummary), ncol=ncol(countSummary))
124 | z <- matrix(nrow=nrow(countSummary), ncol=ncol(countSummary))
125 | pval <- matrix(nrow=nrow(countSummary), ncol=ncol(countSummary))
126 |
127 | for(x in seq_len(nrow(countSummary))){
128 | if(x %% 100 == 0){
129 | message(sprintf("%s of %s", x, nrow(countSummary)))
130 | }
131 | #Get Nearest Indices
132 | idxNN <- head(order(abs(windowSummary$GC[x] - windowSummary$GC)), neighbors + 1)
133 | idxNN <- idxNN[idxNN %ni% x]
134 | #Background
135 | if(any(colMeans(countSummary[idxNN, ])==0)){
136 | if(force){
137 |         message("Warning! Background Mean = 0. Try a higher neighbor count or remove cells with 0 in colMins.")
138 | }else{
139 | stop("Background Mean = 0!")
140 | }
141 | }
142 | bdgMean[x, ] <- colMeans(countSummary[idxNN, ])
143 | bdgSd[x, ] <- matrixStats::colSds(countSummary[idxNN, ])
144 | log2FC[x, ] <- log2((countSummary[x, ]+1e-5) / (bdgMean[x, ]+1e-5))
145 | z[x, ] <- (countSummary[x,] - bdgMean[x, ]) / bdgSd[x, ]
146 | pval[x, ] <- 2*pnorm(-abs(z[x, ]))
147 | }
148 | padj <- apply(pval, 2, function(x) p.adjust(x, method = "fdr"))
149 | CNA <- matrix(0, nrow=nrow(countSummary), ncol=ncol(countSummary))
150 | CNA[which(log2FC >= LFC & padj <= FDR)] <- 1
151 |
152 | se <- SummarizedExperiment(
153 | assays = SimpleList(
154 | CNA = CNA,
155 | counts = countSummary,
156 | log2FC = log2FC,
157 | padj = padj,
158 | pval = pval,
159 | z = z,
160 | bdgMean = bdgMean,
161 | bdgSd = bdgSd
162 | ),
163 | rowRanges = windowSummary
164 | )
165 | colnames(se) <- colnames(counts)
166 |
167 | return(se)
168 | }
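#In brief: for each summarized window, the per-cell count is compared to the `neighbors` most
#GC-similar windows; z = (count - bdgMean) / bdgSd, p = 2 * pnorm(-abs(z)), p-values are
#FDR-adjusted per cell, and a copy-number gain is called (CNA = 1) where log2FC >= LFC and
#padj <= FDR.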
169 |
170 | #----------------------------
171 | # Get Inputs
172 | #----------------------------
173 | blacklist <- import.bed("data/hg19.blacklist.bed")
174 | windows <- makeWindows(genome = BSgenome.Hsapiens.UCSC.hg19, blacklist = blacklist)
175 | cnaObj <- scCNA(windows, readRDS("data/PBMC_10x-Sub25M-fragments.gr.rds"), neighbors = 100, LFC = 1.5, FDR = 0.1, force = FALSE, remove = c("chrM","chrX","chrY"))
176 | saveRDS(cnaObj, "results/PBMC-CNV_LFC_GC.rds")
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
--------------------------------------------------------------------------------