├── Figure1.png
├── README.md
├── Step0.png
├── Step1.png
├── Step2.png
├── Step3.png
├── Step4.png
└── code
    ├── 01_Filter_Cells_v2.R
    ├── 02_Get_Peak_Set_hg19_v2.R
    ├── 03_Run_chromVAR_v2.R
    ├── 04_Run_Cicero_v2.R
    ├── 05_Cluster_Unique_Peaks_v2.R
    ├── 06_Analyze_UMAP_Trajectory.R
    ├── 07_ChromVAR_For_GWAS_w_CoAccessbility_v2.R
    └── 08_Run_scCNV_v2.R
/Figure1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Figure1.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Massively parallel single-cell chromatin landscapes of human immune cell development and intratumoral T cell exhaustion. Nature Biotechnology (Satpathy*, Granja* et al. 2019)
2 |
3 | ## **Link** : https://www.nature.com/articles/s41587-019-0206-z
4 |
5 | ## Please cite : Satpathy*, Granja* et al., Massively parallel single-cell chromatin landscapes of human immune cell development and intratumoral T cell exhaustion. Nature Biotechnology (2019)
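To get oriented with the supplementary data, the downloaded **.rds** files can be loaded directly in R. This is a minimal sketch, assuming you have fetched one of the SummarizedExperiment downloads listed under "Links To Supplementary Data" below (the file name here is a placeholder for whichever file you downloaded):

```r
library(SummarizedExperiment)

# Placeholder path: any of the SummarizedExperiment .rds downloads listed below
se <- readRDS("scATAC_Heme_All_SummarizedExperiment.final.rds")

se                   # overview: assays, rowRanges (peaks), colData (per-cell metadata)
assay(se)[1:5, 1:5]  # sparse peak x cell counts
rowRanges(se)        # peak coordinates as a GRanges
head(colData(se))    # per-cell annotations (e.g. T_Cell_Cluster in the BCC T cell object)
```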
6 |
7 | ## For current, up-to-date scATAC-seq analysis, see https://github.com/GreenleafLab/ArchR
8 |
9 | ![](Figure1.png)
10 |
11 | # Downsampled test data for PBMCs is available (~500 MB)
12 |
13 | https://jeffgranja.s3.amazonaws.com/10x-scATAC-share/10x-scATAC-Downsampled-PBMC-hg19-data.zip
14 |
15 | # Links To Supplementary Data
16 |
17 | ## Notes
18 |
19 | **.rds** files are serialized R objects; read them into R with readRDS(filename)
20 |
21 | **SummarizedExperiment** is an R class; see:
https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html
22 |
23 | **deviations** (TF chromVAR) is an R class; see:
https://bioconductor.org/packages/release/bioc/html/chromVAR.html 24 | 25 | ## scATAC-seq Hematopoiesis 26 | 27 | **scATAC Summarized Experiment** :
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/scATAC_Heme_All_SummarizedExperiment.final.rds 28 | 29 | **chromVAR Summarized Experiment** : 30 |
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/chromVAR_Heme_All_SummarizedExperiment.final.rds
31 |
32 | **Cicero Log2 Gene Activity Scores Summarized Experiment** :
33 |
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/Log2_Gene_Activity_Heme_All_SummarizedExperiment.final.rds 34 | 35 | ## scATAC-seq CD34 Hematopoiesis 36 | 37 | **scATAC Summarized Experiment** :
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/scATAC_CD34_BM_SummarizedExperiment.final.rds 38 | 39 | ## scATAC-seq BCC Tumor Microenvironment 40 | 41 | **scATAC Summarized Experiment** :
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/scATAC_TME_All_SummarizedExperiment.final.rds 42 | 43 | **chromVAR Summarized Experiment** : 44 |
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/chromVAR_TME_All_SummarizedExperiment.final.rds
45 |
46 | **Cicero Log2 Gene Activity Scores Summarized Experiment** :
47 |
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/Log2_Gene_Activity_TME_All_SummarizedExperiment.final.rds
48 |
49 | ## scATAC-seq BCC T cells (Exhaustion)
50 |
51 | **scATAC Summarized Experiment** :
https://changseq.s3.amazonaws.com/Jeff/10x_ScATAC/scATAC_TME_TCells_SummarizedExperiment.final.rds
52 |
53 | For T cell cluster assignments, use colData(se)$T_Cell_Cluster
54 |
55 | # Getting 10x scATAC-seq BAM Files
56 |
57 | ## 1. Go to the NIH GEO page : https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE129785&holding=F1000&otool=stanford
58 |
59 | ![](Step0.png)
60 |
61 | ## 2. Click on a sample (I am showing SU001_Tumor_Immune_Post).
62 |
63 | ![](Step1.png)
64 |
65 | ## 3. Navigate to the bottom and click on the SRA link.
66 |
67 | ![](Step2.png)
68 |
69 | ## 4. Navigate down and find the run (SRR accession).
70 |
71 | ![](Step3.png)
72 |
73 | ## 5. Click on the "Data Access" tab and then navigate to "Original Format".
74 |
75 | ![](Step4.png)
76 |
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
/Step0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Step0.png
--------------------------------------------------------------------------------
/Step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Step1.png
--------------------------------------------------------------------------------
/Step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Step2.png
--------------------------------------------------------------------------------
/Step3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Step3.png
--------------------------------------------------------------------------------
/Step4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenleafLab/10x-scATAC-2019/ca70be6bc80bea957a161dd6ef773314909ecef5/Step4.png
--------------------------------------------------------------------------------
/code/01_Filter_Cells_v2.R:
--------------------------------------------------------------------------------
1 | #Filtering Cells based on TSS enrichment and unique fragments
2 | #05/02/19
3 | #Cite Satpathy*, Granja*, et al.
4 | #Massively parallel single-cell chromatin landscapes of human immune 5 | #cell development and intratumoral T cell exhaustion (2019) 6 | #Created by Jeffrey Granja 7 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 8 | library(magrittr) 9 | library(ggplot2) 10 | library(Rcpp) 11 | library(viridis) 12 | 13 | #-------------------------------------------- 14 | # Functions 15 | #-------------------------------------------- 16 | 17 | sourceCpp(code=' 18 | #include 19 | 20 | using namespace Rcpp; 21 | using namespace std; 22 | 23 | // [[Rcpp::export]] 24 | IntegerMatrix tabulate2dCpp(IntegerVector x1, int xmin, int xmax, IntegerVector y1, int ymin, int ymax){ 25 | if(x1.size() != y1.size()){ 26 | stop("width must equal size!"); 27 | } 28 | IntegerVector x = clone(x1); 29 | IntegerVector y = clone(y1); 30 | int n = x.size(); 31 | IntegerVector rx = seq(xmin,xmax); 32 | IntegerVector ry = seq(ymin,ymax); 33 | IntegerMatrix mat( ry.size() , rx.size() ); 34 | int xi,yi; 35 | for(int i = 0; i < n; i++){ 36 | xi = (x[i] - xmin); 37 | yi = (y[i] - ymin); 38 | if(yi >= 0 && yi < ry.size()){ 39 | if(xi >= 0 && xi < rx.size()){ 40 | mat( yi , xi ) = mat( yi , xi ) + 1; 41 | } 42 | } 43 | } 44 | return mat; 45 | }' 46 | ) 47 | 48 | insertionProfileSingles <- function(feature, fragments, by = "RG", getInsertions = TRUE, fix = "center", flank = 2000, norm = 100, smooth = 51, range = 100, batchSize = 100){ 49 | 50 | insertionProfileSingles_helper <- function(feature, fragments, by = "RG", getInsertions = TRUE, fix = "center", flank = 2000, norm = 100, smooth = 51, range = 100, batchSize = 100){ 51 | #Convert To Insertion Sites 52 | if(getInsertions){ 53 | insertions <- c( 54 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(start(fragments), start(fragments)), RG = mcols(fragments)[,by]), 55 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(end(fragments), end(fragments)), RG = mcols(fragments)[,by]) 56 | ) 57 | by <- "RG" 58 | }else{ 59 | insertions <- fragments 60 | } 61 | remove(fragments) 62 | gc() 63 | 64 | #center the feature 65 | center <- unique(resize(feature, width = 1, fix = fix, ignore.strand = FALSE)) 66 | 67 | #get overlaps between the feature and insertions only up to flank bp 68 | overlap <- DataFrame(findOverlaps(query = center, subject = insertions, maxgap = flank, ignore.strand = TRUE)) 69 | overlap$strand <- strand(center)[overlap[,1]] 70 | overlap$name <- mcols(insertions)[overlap[,2],by] 71 | overlap <- transform(overlap, id=match(name, unique(name))) 72 | ids <- length(unique(overlap$name)) 73 | 74 | #distance 75 | overlap$dist <- NA 76 | minus <- which(overlap$strand == "-") 77 | other <- which(overlap$strand != "-") 78 | overlap$dist[minus] <- start(center[overlap[minus,1]]) - start(insertions[overlap[minus,2]]) 79 | overlap$dist[other] <- start(insertions[overlap[other,2]]) - start(center[overlap[other,1]]) 80 | 81 | #Insertion Mat 82 | profile_mat <- tabulate2dCpp(x1 = overlap$id, y1 = overlap$dist, xmin = 1, xmax = ids, ymin = -flank, ymax = flank) 83 | colnames(profile_mat) <- unique(overlap$name) 84 | profile <- rowSums(profile_mat) 85 | 86 | #normalize 87 | profile_mat_norm <- apply(profile_mat, 2, function(x) x/max(mean(x[c(1:norm,(flank*2-norm+1):(flank*2+1))]), 0.5)) #Handles low depth cells 88 | profile_norm <- profile/mean(profile[c(1:norm,(flank*2-norm+1):(flank*2+1))]) 89 | 90 | #smooth 91 | profile_mat_norm_smooth <- apply(profile_mat_norm, 2, function(x) zoo::rollmean(x, smooth, fill = 1)) 92 | profile_norm_smooth <- zoo::rollmean(profile_norm, smooth, 
fill = 1) 93 | 94 | #enrichment 95 | max_finite <- function(x){ 96 | suppressWarnings(max(x[is.finite(x)], na.rm=TRUE)) 97 | } 98 | e_mat <- apply(profile_mat_norm_smooth, 2, function(x) max_finite(x[(flank-range):(flank+range)])) 99 | names(e_mat) <- colnames(profile_mat_norm_smooth) 100 | e <- max_finite(profile_norm_smooth[(flank-range):(flank+range)]) 101 | 102 | #Summary 103 | df_mat <- data.frame( 104 | enrichment = e_mat, 105 | insertions = as.vector(table(mcols(insertions)[,by])[names(e_mat)]), 106 | insertionsWindow = as.vector(table(overlap$name)[names(e_mat)]) 107 | ) 108 | df_sum <- data.frame(bp = (-flank):flank, profile = profile, norm_profile = profile_norm, smooth_norm_profile = profile_norm_smooth, enrichment = e) 109 | rownames(df_sum) <- NULL 110 | 111 | return(list(df = df_sum, dfall = df_mat, profileMat = profile_mat_norm, profileMatSmooth = profile_mat_norm_smooth)) 112 | } 113 | 114 | uniqueTags <- as.character(unique(mcols(fragments)[,by])) 115 | splitTags <- split(uniqueTags, ceiling(seq_along(uniqueTags)/batchSize)) 116 | 117 | pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3) 118 | batchTSS <- lapply(seq_along(splitTags), function(x){ 119 | setTxtProgressBar(pb, round(x * 100/length(splitTags), 0)) 120 | profilex <- insertionProfileSingles_helper( 121 | feature=feature, 122 | fragments=fragments[which(mcols(fragments)[,by] %in% splitTags[[x]])], 123 | by = by, 124 | getInsertions = getInsertions, 125 | fix = fix, 126 | flank = flank, 127 | norm = norm, 128 | smooth = smooth, 129 | range = range 130 | ) 131 | 132 | return(profilex) 133 | }) 134 | df <- lapply(batchTSS, function(x) x$df) %>% Reduce("rbind",.) 135 | dfall <- lapply(batchTSS, function(x) x$dfall) %>% Reduce("rbind",.) 136 | profileMat <- lapply(batchTSS, function(x) x$profileMat) %>% Reduce("cbind",.) 137 | profileMatSmooth <- lapply(batchTSS, function(x) x$profileMatSmooth) %>% Reduce("cbind",.) 138 | return(list(df = df, dfall = dfall, profileMat = profileMat, profileMatSmooth = profileMatSmooth)) 139 | } 140 | 141 | #-------------------------------------------- 142 | # Input 143 | #-------------------------------------------- 144 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 145 | minFrags <- 100 146 | filterFrags <- 1000 147 | filterTSS <- 8 148 | file_fragments <- "data/PBMC_10x-Sub25M-fragments.tsv.gz" 149 | out_fragments <- "data/PBMC_10x-Sub25M-fragments.gr.rds" 150 | name <- "PBMC" 151 | 152 | #----------------- 153 | # Reading Fragment Files 154 | #----------------- 155 | message("Reading in fragment files...") 156 | fragments <- data.frame(readr::read_tsv(file_fragments, col_names=FALSE)) 157 | #fragmentSub <- fragments[sample(seq_len(nrow(fragments)),25*10^6),] 158 | #write.table(fragmentSub, "data/PBMC_10x-Sub25M-fragments.tsv.gz", col.names=FALSE, row.names =FALSE, sep = "\t", quote = FALSE) 159 | 160 | fragments <- GRanges( 161 | seqnames = fragments[,1], 162 | IRanges(fragments[,2]+1, fragments[,3]), 163 | RG = fragments[,4], 164 | N = fragments[,5] 165 | ) 166 | 167 | message("Filtering Lowly Represented Cells...") 168 | tabRG <- table(fragments$RG) 169 | keep <- names(tabRG)[which(tabRG >= minFrags)] 170 | fragments <- fragments[fragments$RG %in% keep,] 171 | fragments <- sort(sortSeqlevels(fragments)) 172 | 173 | #----------------- 174 | # TSS Profile 175 | #----------------- 176 | feature <- txdb %>% transcripts(.) 
%>% resize(., width = 1, fix = "start") %>% unique 177 | tssProfile <- insertionProfileSingles(feature = feature, fragments = fragments, 178 | getInsertions = TRUE, batchSize = 1000) 179 | tssSingles <- tssProfile$dfall 180 | tssSingles$uniqueFrags <- 0 181 | tssSingles[names(tabRG),"uniqueFrags"] <- tabRG 182 | tssSingles$cellCall <- 0 183 | tssSingles$cellCall[tssSingles$uniqueFrags >= filterFrags & tssSingles$enrichment >= filterTSS] <- 1 184 | 185 | #----------------- 186 | # Plot Stats 187 | #----------------- 188 | tssSingles <- tssSingles[complete.cases(tssSingles),] 189 | nPass <- sum(tssSingles$cellCall==1) 190 | nTotal <- sum(tssSingles$uniqueFrags >= filterFrags) 191 | 192 | pdf("results/Filter-Cells.pdf") 193 | ggplot(tssSingles[tssSingles$uniqueFrags > 500,], aes(x = log10(uniqueFrags), y = enrichment)) + 194 | geom_hex(bins = 100) + 195 | theme_bw() + scale_fill_viridis() + 196 | xlab("log10 Unique Fragments") + 197 | ylab("TSS Enrichment") + 198 | geom_hline(yintercept = filterTSS, lty = "dashed") + 199 | geom_vline(xintercept = log10(filterFrags), lty = "dashed") + 200 | ggtitle(sprintf("Pass Rate : %s of %s (%s)", nPass, nTotal, round(100*nPass/nTotal,2))) 201 | dev.off() 202 | 203 | write.table(tssSingles, "results/Filter-Cells.txt") 204 | 205 | #Filter 206 | fragments <- fragments[mcols(fragments)$RG %in% rownames(tssSingles)[tssSingles$cellCall==1]] 207 | fragments$RG <- paste0(name,"#",fragments$RG) 208 | 209 | #Save 210 | saveRDS(fragments, out_fragments) 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | -------------------------------------------------------------------------------- /code/02_Get_Peak_Set_hg19_v2.R: -------------------------------------------------------------------------------- 1 | #Creating a peak set, summarized experiment and LSI clustering 2 | #05/02/19 3 | #Cite Satpathy*, Granja*, et al. 
4 | #Massively parallel single-cell chromatin landscapes of human immune 5 | #cell development and intratumoral T cell exhaustion (2019) 6 | #Created by Jeffrey Granja 7 | library(Matrix) 8 | library(SummarizedExperiment) 9 | library(matrixStats) 10 | library(readr) 11 | library(GenomicRanges) 12 | library(magrittr) 13 | library(edgeR) 14 | library(Seurat) 15 | library(BSgenome.Hsapiens.UCSC.hg19) 16 | set.seed(1) 17 | 18 | countInsertions <- function(query, fragments, by = "RG"){ 19 | #Count By Fragments Insertions 20 | inserts <- c( 21 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(start(fragments), start(fragments)), RG = mcols(fragments)[,by]), 22 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(end(fragments), end(fragments)), RG = mcols(fragments)[,by]) 23 | ) 24 | by <- "RG" 25 | overlapDF <- DataFrame(findOverlaps(query, inserts, ignore.strand = TRUE, maxgap=-1L, minoverlap=0L, type = "any")) 26 | overlapDF$name <- mcols(inserts)[overlapDF[, 2], by] 27 | overlapTDF <- transform(overlapDF, id = match(name, unique(name))) 28 | #Calculate Overlap Stats 29 | inPeaks <- table(overlapDF$name) 30 | total <- table(mcols(inserts)[, by]) 31 | total <- total[names(inPeaks)] 32 | frip <- inPeaks / total 33 | #Summarize 34 | sparseM <- Matrix::sparseMatrix( 35 | i = overlapTDF[, 1], 36 | j = overlapTDF[, 4], 37 | x = rep(1, nrow(overlapTDF)), 38 | dims = c(length(query), length(unique(overlapDF$name)))) 39 | colnames(sparseM) <- unique(overlapDF$name) 40 | total <- total[colnames(sparseM)] 41 | frip <- frip[colnames(sparseM)] 42 | out <- list(counts = sparseM, frip = frip, total = total) 43 | return(out) 44 | } 45 | 46 | seuratLSI <- function(mat, nComponents = 50, binarize = TRUE, nFeatures = NULL){ 47 | #TF IDF LSI adapted from flyATAC 48 | cs <- Matrix::colSums(mat) 49 | if(binarize){ 50 | message(paste0("Binarizing matrix...")) 51 | mat@x[mat@x > 0] <- 1 52 | } 53 | if(!is.null(nFeatures)){ 54 | message(paste0("Getting top ", nFeatures, " features...")) 55 | mat <- mat[head(order(Matrix::rowSums(mat),decreasing = TRUE),nFeatures),] 56 | } 57 | #Calc TF IDF 58 | message("Computing Term Frequency IDF...") 59 | freqs <- t(t(mat)/Matrix::colSums(mat)) 60 | idf <- as(log(1 + ncol(mat) / Matrix::rowSums(mat)), "sparseVector") 61 | tfidf <- as(Matrix::Diagonal(x=as.vector(idf)), "sparseMatrix") %*% freqs 62 | #Calc SVD then LSI 63 | message("Computing SVD using irlba...") 64 | svd <- irlba::irlba(tfidf, nComponents, nComponents) 65 | svdDiag <- matrix(0, nrow=nComponents, ncol=nComponents) 66 | diag(svdDiag) <- svd$d 67 | matSVD <- t(svdDiag %*% t(svd$v)) 68 | rownames(matSVD) <- colnames(mat) 69 | colnames(matSVD) <- paste0("PC",seq_len(ncol(matSVD))) 70 | #Make Seurat Object 71 | message("Making Seurat Object...") 72 | mat <- mat[1:100,] + 1 73 | obj <- CreateSeuratObject(mat, project='scATAC', min.cells=0, min.genes=0) 74 | obj <- SetDimReduction(object = obj, reduction.type = "pca", slot = "cell.embeddings", new.data = matSVD) 75 | obj <- SetDimReduction(object = obj, reduction.type = "pca", slot = "key", new.data = "PC") 76 | return(obj) 77 | } 78 | 79 | addClusters <- function(obj, minGroupSize = 50, dims.use = seq_len(50), initialResolution = 0.8){ 80 | #First Iteration of Find Clusters 81 | currentResolution <- initialResolution 82 | obj <- FindClusters(object = obj, reduction.type = "pca", dims.use = dims.use, resolution = currentResolution, print.output = FALSE) 83 | minSize <- min(table(obj@meta.data[[paste0("res.",currentResolution)]])) 84 | nClust <- 
length(unique(paste0(obj@meta.data[[paste0("res.",currentResolution)]]))) 85 | message(sprintf("Current Resolution = %s, No of Clusters = %s, Minimum Cluster Size = %s", currentResolution, nClust, minSize)) 86 | #If clusters are smaller than minimum group size 87 | while(minSize <= minGroupSize){ 88 | obj@meta.data <- obj@meta.data[,-which(colnames(obj@meta.data)==paste0("res.",currentResolution))] 89 | currentResolution <- currentResolution*initialResolution 90 | obj <- FindClusters(object = obj, reduction.type = "pca", dims.use = dims.use, resolution = currentResolution, print.output = FALSE, force.recalc = TRUE) 91 | minSize <- min(table(obj@meta.data[[paste0("res.",currentResolution)]])) 92 | nClust <- length(unique(paste0(obj@meta.data[[paste0("res.",currentResolution)]]))) 93 | message(sprintf("Current Resolution = %s, No of Clusters = %s, Minimum Cluster Size = %s", currentResolution, nClust, minSize)) 94 | } 95 | return(obj) 96 | } 97 | 98 | extendedPeakSet <- function(df, BSgenome = NULL, extend = 250, blacklist = NULL, nSummits = 100000){ 99 | #Helper Functions 100 | readSummits <- function(file){ 101 | df <- suppressMessages(data.frame(readr::read_tsv(file, col_names = c("chr","start","end","name","score")))) 102 | df <- df[,c(1,2,3,5)] #do not keep name column it can make the size really large 103 | return(GenomicRanges::makeGRangesFromDataFrame(df=df,keep.extra.columns = TRUE,starts.in.df.are.0based = TRUE)) 104 | } 105 | nonOverlappingGRanges <- function(gr, by = "score", decreasing = TRUE, verbose = FALSE){ 106 | stopifnot(by %in% colnames(mcols(gr))) 107 | clusterGRanges <- function(gr, filter = TRUE, by = "score", decreasing = TRUE){ 108 | gr <- sort(sortSeqlevels(gr)) 109 | r <- GenomicRanges::reduce(gr, min.gapwidth=0L, ignore.strand=TRUE) 110 | o <- findOverlaps(gr,r) 111 | mcols(gr)$cluster <- subjectHits(o) 112 | gr <- gr[order(mcols(gr)[,by], decreasing = decreasing),] 113 | gr <- gr[!duplicated(mcols(gr)$cluster),] 114 | gr <- sort(sortSeqlevels(gr)) 115 | mcols(gr)$cluster <- NULL 116 | return(gr) 117 | } 118 | if(verbose){ 119 | message("Converging", appendLF = FALSE) 120 | } 121 | i <- 0 122 | gr_converge <- gr 123 | while(length(gr_converge) > 0){ 124 | if(verbose){ 125 | message(".", appendLF = FALSE) 126 | } 127 | i <- i + 1 128 | gr_selected <- clusterGRanges(gr = gr_converge, filter = TRUE, by = by, decreasing = decreasing) 129 | gr_converge <- subsetByOverlaps(gr_converge ,gr_selected, invert=TRUE) #blacklist selected gr 130 | if(i == 1){ #if i=1 then set gr_all to clustered 131 | gr_all <- gr_selected 132 | }else{ 133 | gr_all <- c(gr_all, gr_selected) 134 | } 135 | } 136 | if(verbose){ 137 | message("\nSelected ", length(gr_all), " from ", length(gr)) 138 | } 139 | gr_all <- sort(sortSeqlevels(gr_all)) 140 | return(gr_all) 141 | } 142 | #Check------- 143 | stopifnot(extend > 0) 144 | stopifnot("samples" %in% colnames(df)) 145 | stopifnot("groups" %in% colnames(df)) 146 | stopifnot("summits" %in% colnames(df)) 147 | stopifnot(!is.null(BSgenome)) 148 | stopifnot(all(apply(df,1,function(x){file.exists(paste0(x[3]))}))) 149 | #------------ 150 | #Deal with blacklist 151 | if(is.null(blacklist)){ 152 | blacklist <- GRanges() 153 | }else if(is.character(blacklist)){ 154 | blacklist <- rtracklayer::import.bed(blacklist) 155 | } 156 | stopifnot(inherits(blacklist,"GenomicRanges")) 157 | #------------ 158 | #Time to do stuff 159 | chromSizes <- GRanges(names(seqlengths(BSgenome)), IRanges(1, seqlengths(BSgenome))) 160 | chromSizes <- 
GenomeInfoDb::keepStandardChromosomes(chromSizes, pruning.mode = "coarse") 161 | groups <- unique(df$groups) 162 | groupGRList <- GenomicRanges::GenomicRangesList(lapply(seq_along(groups), function(i){ 163 | df_group = df[which(df$groups==groups[i]),] 164 | grList <- GenomicRanges::GenomicRangesList(lapply(paste0(df_group$summits), function(x){ 165 | extended_summits <- readSummits(x) %>% 166 | resize(., width = 2 * extend + 1, fix = "center") %>% 167 | subsetByOverlaps(.,chromSizes,type="within") %>% 168 | subsetByOverlaps(.,blacklist,invert=TRUE) %>% 169 | nonOverlappingGRanges(., by="score", decreasing=TRUE) 170 | extended_summits <- extended_summits[order(extended_summits$score,decreasing=TRUE)] 171 | if(!is.null(nSummits)){ 172 | extended_summits <- head(extended_summits, nSummits) 173 | } 174 | mcols(extended_summits)$scoreQuantile <- trunc(rank(mcols(extended_summits)$score))/length(mcols(extended_summits)$score) 175 | extended_summits 176 | })) 177 | #Non Overlapping 178 | grNonOverlapping <- nonOverlappingGRanges(unlist(grList), by = "scoreQuantile", decreasing = TRUE) 179 | #Free Up Memory 180 | remove(grList) 181 | gc() 182 | grNonOverlapping 183 | })) 184 | grFinal <- nonOverlappingGRanges(unlist(groupGRList), by = "scoreQuantile", decreasing = TRUE) 185 | grFinal <- sort(sortSeqlevels(grFinal)) 186 | return(grFinal) 187 | } 188 | 189 | groupSums <- function(mat, groups = NULL, na.rm = TRUE, sparse = FALSE){ 190 | stopifnot(!is.null(groups)) 191 | stopifnot(length(groups) == ncol(mat)) 192 | gm <- lapply(unique(groups), function(x) { 193 | if (sparse) { 194 | Matrix::rowSums(mat[, which(groups == x), drop = F], na.rm = na.rm) 195 | }else { 196 | rowSums(mat[, which(groups == x), drop = F], na.rm = na.rm) 197 | } 198 | }) %>% Reduce("cbind", .) 199 | colnames(gm) <- unique(groups) 200 | return(gm) 201 | } 202 | 203 | #------------------------------------------------------------------------------------------------- 204 | # Start 205 | #------------------------------------------------------------------------------------------------- 206 | fragmentFiles <- list.files("data", pattern = ".rds", full.names = TRUE) 207 | 208 | #------------------------------------------------------------------------------------------------- 209 | # Get Counts In Windows 210 | #------------------------------------------------------------------------------------------------- 211 | genome <- BSgenome.Hsapiens.UCSC.hg19 212 | chromSizes <- GRanges(names(seqlengths(genome)), IRanges(1, seqlengths(genome))) 213 | chromSizes <- GenomeInfoDb::keepStandardChromosomes(chromSizes, pruning.mode = "coarse") 214 | windows <- unlist(tile(chromSizes, width = 2500)) 215 | countsList <- lapply(seq_along(fragmentFiles), function(i){ 216 | message(sprintf("%s of %s", i, length(fragmentFiles))) 217 | counts <- countInsertions(windows, readRDS(fragmentFiles[i]), by = "RG")[[1]] 218 | counts 219 | }) 220 | mat <- lapply(countsList, function(x) x) %>% Reduce("cbind",.) 
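#`mat` is now a sparse windows x cells insertion-counts matrix: one row per
#2.5-kb genome tile in `windows`, one column per cell barcode, pooled across
#all fragment files. seuratLSI() below binarizes it, keeps the top 20,000
#most frequently accessible windows, and computes TF-IDF followed by SVD (LSI).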
221 | remove(countsList) 222 | gc() 223 | 224 | #------------------------------------------------------------------------------------------------- 225 | # Run LSI Clustering with Seurat 226 | #------------------------------------------------------------------------------------------------- 227 | set.seed(1) 228 | message("Making Seurat LSI Object...") 229 | obj <- seuratLSI(mat, nComponents = 25, nFeatures = 20000) 230 | message("Adding Graph Clusters...") 231 | obj <- addClusters(obj, dims.use = 2:25, minGroupSize = 200, initialResolution = 0.8) 232 | saveRDS(obj, "results/Save-LSI-Windows-Seurat.rds") 233 | clusterResults <- split(rownames(obj@meta.data), paste0("Cluster",obj@meta.data[,ncol(obj@meta.data)])) 234 | remove(obj) 235 | gc() 236 | 237 | #------------------------------------------------------------------------------------------------- 238 | # Get Cluster Beds 239 | #------------------------------------------------------------------------------------------------- 240 | dirClusters <- "results/LSI-Cluster-Beds/" 241 | dir.create(dirClusters) 242 | for(i in seq_along(fragmentFiles)){ 243 | fragments <-readRDS(fragmentFiles[i]) 244 | for(j in seq_along(clusterResults)){ 245 | message(sprintf("%s of %s", j, length(clusterResults))) 246 | fragmentsj <- fragments[fragments$RG %in% clusterResults[[j]]] 247 | if(length(fragmentsj) > 0){ 248 | out <- data.frame( 249 | chr = c(seqnames(fragmentsj), seqnames(fragmentsj)), 250 | start = c(as.integer(start(fragmentsj) - 1), as.integer(end(fragmentsj) - 1)), 251 | end = c(as.integer(start(fragmentsj)), as.integer(end(fragmentsj))) 252 | ) %>% readr::write_tsv( 253 | x = ., 254 | append = TRUE, 255 | path = paste0(dirClusters, paste0(names(clusterResults)[j], ".bed")), 256 | col_names = FALSE) 257 | } 258 | } 259 | } 260 | 261 | #------------------------------------------------------------------------------------------------- 262 | # Run MACS2 263 | #------------------------------------------------------------------------------------------------- 264 | dirPeaks <- "results/LSI-Cluster-Peaks/" 265 | method <- "q" 266 | cutoff <- 0.05 267 | shift <- -75 268 | extsize <- 150 269 | genome_size <- 2.7e9 270 | for(j in seq_along(clusterResults)){ 271 | message(sprintf("%s of %s", j, length(clusterResults))) 272 | clusterBedj <- paste0(dirClusters,names(clusterResults)[j],".bed") 273 | cmdPeaks <- sprintf( 274 | "macs2 callpeak -g %s --name %s --treatment %s --outdir %s --format BED --nomodel --call-summits --nolambda --keep-dup all", 275 | genome_size, 276 | names(clusterResults)[j], 277 | clusterBedj, 278 | dirPeaks 279 | ) 280 | if (!is.null(shift) & !is.null(extsize)) { 281 | cmdPeaks <- sprintf("%s --shift %s --extsize %s", cmdPeaks, shift, extsize) 282 | } 283 | if (tolower(method) == "p") { 284 | cmdPeaks <- sprintf("%s -p %s", cmdPeaks, cutoff) 285 | }else { 286 | cmdPeaks <- sprintf("%s -q %s", cmdPeaks, cutoff) 287 | } 288 | message("Running Macs2...") 289 | message(cmdPeaks) 290 | system(cmdPeaks, intern = TRUE) 291 | } 292 | 293 | #------------------------------------------------------------------------------------------------- 294 | # Make Non-Overlapping Peak Set 295 | #------------------------------------------------------------------------------------------------- 296 | df <- data.frame( 297 | samples = gsub("\\_summits.bed","",list.files(dirPeaks, pattern = "\\_summits.bed", full.names = FALSE)), 298 | groups = "scATAC", 299 | summits = list.files(dirPeaks, pattern = "\\_summits.bed", full.names = TRUE) 300 | ) 301 | 302 | 
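#extendedPeakSet() (defined above) reads each cluster's MACS2 summits, extends
#them to fixed-width 501-bp peaks (2*extend + 1), removes peaks overlapping the
#blacklist or falling outside standard chromosomes, and iteratively retains the
#highest-scoring non-overlapping peaks (by score within each summit file, then
#by score quantile across files), keeping up to nSummits summits per file.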
unionPeaks <- extendedPeakSet( 303 | df = df, 304 | BSgenome = genome, 305 | extend = 250, 306 | blacklist = "data/hg19.blacklist.bed", 307 | nSummits = 200000 308 | ) 309 | unionPeaks <- unionPeaks[seqnames(unionPeaks) %in% paste0("chr",c(1:22,"X"))] 310 | unionPeaks <- keepSeqlevels(unionPeaks, paste0("chr",c(1:22,"X"))) 311 | 312 | #Create Counts list 313 | countsPeaksList <- lapply(seq_along(fragmentFiles), function(i){ 314 | message(sprintf("%s of %s", i, length(fragmentFiles))) 315 | gc() 316 | countInsertions(unionPeaks, readRDS(fragmentFiles[i]), by = "RG") 317 | }) 318 | 319 | #CountsMatrix 320 | mat <- lapply(countsPeaksList, function(x) x[[1]]) %>% Reduce("cbind",.) 321 | frip <- lapply(countsPeaksList, function(x) x[[2]]) %>% unlist 322 | total <- lapply(countsPeaksList, function(x) x[[3]]) %>% unlist 323 | 324 | se <- SummarizedExperiment( 325 | assays = SimpleList(counts = mat), 326 | rowRanges = unionPeaks 327 | ) 328 | rownames(se) <- paste(seqnames(se),start(se),end(se),sep="_") 329 | colData(se)$FRIP <- frip 330 | colData(se)$uniqueFrags <- total / 2 331 | 332 | #---------------------------- 333 | # Get Clusters in Peaks 334 | #---------------------------- 335 | nTop <- 25000 336 | nPCs1 <- 1:50 337 | nPCs2 <- 1:50 338 | 339 | message("Making Seurat LSI Object...") 340 | obj <- seuratLSI(assay(se), nComponents = max(nPCs1), nFeatures = NULL) 341 | stopifnot(identical(rownames(obj@meta.data), colnames(se))) 342 | obj@meta.data <- as.data.frame(cbind(obj@meta.data, colData(se))) 343 | 344 | message("Adding Graph Clusters...") 345 | obj <- FindClusters(object = obj, reduction.type = "pca", dims.use = nPCs1, print.output = TRUE, n.start = 10) 346 | 347 | #Make Pseudo Bulk Library 348 | mat <- assay(se) 349 | mat@x[mat@x > 0] <- 1 350 | clusterSums <- groupSums(mat = mat, groups = paste0("C",obj@meta.data$res.0.8), sparse = TRUE) 351 | logMat <- edgeR::cpm(clusterSums, log = TRUE, prior.count = 3) 352 | varPeaks <- head(order(matrixStats::rowVars(logMat), decreasing = TRUE), nTop) 353 | 354 | #Re-run Seurat LSI 355 | message("Making Seurat LSI Object...") 356 | obj2 <- seuratLSI(assay(se)[varPeaks,], nComponents = max(nPCs2), nFeatures = NULL) 357 | stopifnot(identical(rownames(obj2@meta.data), colnames(se))) 358 | obj2@meta.data <- as.data.frame(cbind(obj2@meta.data, colData(se))) 359 | 360 | message("Adding Graph Clusters...") 361 | obj2 <- FindClusters(object = obj2, reduction.type = "pca", dims.use = nPCs2, print.output = TRUE, n.start = 10) 362 | 363 | #Plot uMAP 364 | message("Running UMAP") 365 | obj2 <- RunUMAP(object = obj2, reduction.use = "pca", dims.use = nPCs2) 366 | plotUMAP <- data.frame(GetCellEmbeddings(obj2,reduction.type="umap"), obj2@meta.data) 367 | colnames(plotUMAP) <- c("x","y",colnames(plotUMAP)[3:ncol(plotUMAP)]) 368 | clustCol <- colnames(plotUMAP)[grep("res",colnames(plotUMAP))] 369 | colData(se)$Clusters <- paste0("Cluster",as.integer(plotUMAP[,clustCol]) + 1) 370 | colData(se)$UMAP1 <- plotUMAP$x 371 | colData(se)$UMAP2 <- plotUMAP$y 372 | 373 | pdf("results/LSI-Clustering-Peaks.pdf") 374 | ggplot(plotUMAP, aes(x=x,y=y,color=res.0.8)) + geom_point(size = 0.5) + 375 | theme_bw() + xlab("UMAP1") + ylab("UMAP2") 376 | dev.off() 377 | 378 | saveRDS(se, "results/scATAC-Summarized-Experiment.rds") 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | -------------------------------------------------------------------------------- /code/03_Run_chromVAR_v2.R: 
-------------------------------------------------------------------------------- 1 | #Running chromVAR on single-cell summarized experiment 2 | #05/02/19 3 | #Cite Satpathy*, Granja*, et al. 4 | #Massively parallel single-cell chromatin landscapes of human immune 5 | #cell development and intratumoral T cell exhaustion (2019) 6 | #Created by Jeffrey Granja 7 | library(chromVAR) 8 | library(SummarizedExperiment) 9 | library(chromVARmotifs) 10 | library(motifmatchr) 11 | library(BiocParallel) 12 | library(BSgenome.Hsapiens.UCSC.hg19) 13 | register(SerialParam()) 14 | set.seed(1) 15 | 16 | #----------------- 17 | # Read Inputs 18 | #----------------- 19 | genome <- BSgenome.Hsapiens.UCSC.hg19 20 | se <- readRDS("results/scATAC-Summarized-Experiment.rds") # single-cell summarized experiment rowRanges as peaks 21 | se <- addGCBias(se, genome = genome) 22 | data("human_pwms_v1") 23 | matches <- matchMotifs(human_pwms_v1, rowRanges(se), genome = "BSgenome.Hsapiens.UCSC.hg19") 24 | 25 | #compute deviations 26 | dev <- computeDeviations(object = se, annotations = matches) 27 | 28 | #compute variability 29 | metadata(dev)$Variability <- computeVariability(dev) 30 | 31 | #add se 32 | metadata(dev)$SummarizedExperiment <- se 33 | 34 | #add matches 35 | metadata(dev)$motifMatches <- matches 36 | 37 | saveRDS(dev, "results/chromVAR-Summarized-Experiment.rds") 38 | -------------------------------------------------------------------------------- /code/04_Run_Cicero_v2.R: -------------------------------------------------------------------------------- 1 | #Computing Gene Activity scores using Cicero and Co-Accessibility 2 | #05/02/19 3 | #Cite Satpathy*, Granja*, et al. 4 | #Massively parallel single-cell chromatin landscapes of human immune 5 | #cell development and intratumoral T cell exhaustion (2019) 6 | #Created by Jeffrey Granja 7 | library(cicero) 8 | library(data.table) 9 | library(Matrix) 10 | library(GenomicRanges) 11 | library(magrittr) 12 | library(SummarizedExperiment) 13 | library(optparse) 14 | library(yaml) 15 | library(Rcpp) 16 | set.seed(1) 17 | 18 | grToFeature <- function(gr){ 19 | peakinfo <- data.frame( 20 | row.names = paste(seqnames(gr),start(gr),end(gr),sep="_"), 21 | site_name = paste(seqnames(gr),start(gr),end(gr),sep="_"), 22 | chr = gsub("chr","",as.character(seqnames(gr))), 23 | bp1 = start(gr), 24 | bp2 = end(gr) 25 | ) 26 | return(peakinfo) 27 | } 28 | 29 | featureToGR <- function(feature){ 30 | featureSplit <- stringr::str_split(paste0(feature), pattern = "_", n = 3, simplify = TRUE) 31 | gr <- GRanges(featureSplit[,1],IRanges(as.integer(featureSplit[,2]),as.integer(featureSplit[,3]))) 32 | return(gr) 33 | } 34 | 35 | makeCDS <- function(se, binarize = TRUE){ 36 | peakinfo <- grToFeature(se) 37 | mat <- assay(se) 38 | if(binarize){ 39 | mat@x[which(mat@x > 0)] <- 1 40 | } 41 | cellinfo <- data.frame(colData(se)) 42 | cellinfo$cells <- rownames(cellinfo) 43 | cds <- suppressWarnings(newCellDataSet(mat, 44 | phenoData = methods::new("AnnotatedDataFrame", data = cellinfo), 45 | featureData = methods::new("AnnotatedDataFrame", data = peakinfo), 46 | expressionFamily=negbinomial.size(), 47 | lowerDetectionLimit=0)) 48 | fData(cds)$chr <- as.character(fData(cds)$chr) 49 | fData(cds)$bp1 <- as.numeric(as.character(fData(cds)$bp1)) 50 | fData(cds)$bp2 <- as.numeric(as.character(fData(cds)$bp2)) 51 | cds <- cds[order(fData(cds)$chr, fData(cds)$bp1),] 52 | return(cds) 53 | } 54 | 55 | 56 | sourceCpp(code=' 57 | #include 58 | 59 | using namespace Rcpp; 60 | using namespace std; 61 | 
62 | // Adapted from https://github.com/AEBilgrau/correlateR/blob/master/src/auxiliary_functions.cpp 63 | // [[Rcpp::export]] 64 | Rcpp::NumericVector rowCorCpp(IntegerVector idxX, IntegerVector idxY, Rcpp::NumericMatrix X, Rcpp::NumericMatrix Y) { 65 | 66 | if(X.ncol() != Y.ncol()){ 67 | stop("Columns of Matrix X and Y must be equal length!"); 68 | } 69 | 70 | if(max(idxX) > X.nrow()){ 71 | stop("Idx X greater than nrow of Matrix X"); 72 | } 73 | 74 | if(max(idxY) > Y.nrow()){ 75 | stop("Idx Y greater than nrow of Matrix Y"); 76 | } 77 | 78 | // Transpose Matrices 79 | X = transpose(X); 80 | Y = transpose(Y); 81 | 82 | const int nx = X.ncol(); 83 | const int ny = Y.ncol(); 84 | 85 | // Centering the matrices 86 | for (int j = 0; j < nx; ++j) { 87 | X(Rcpp::_, j) = X(Rcpp::_, j) - Rcpp::mean(X(Rcpp::_, j)); 88 | } 89 | 90 | for (int j = 0; j < ny; ++j) { 91 | Y(Rcpp::_, j) = Y(Rcpp::_, j) - Rcpp::mean(Y(Rcpp::_, j)); 92 | } 93 | 94 | // Compute 1 over the sample standard deviation 95 | Rcpp::NumericVector inv_sqrt_ss_X(nx); 96 | for (int i = 0; i < nx; ++i) { 97 | inv_sqrt_ss_X(i) = 1/sqrt(Rcpp::sum( X(Rcpp::_, i) * X(Rcpp::_, i) )); 98 | } 99 | 100 | Rcpp::NumericVector inv_sqrt_ss_Y(ny); 101 | for (int i = 0; i < ny; ++i) { 102 | inv_sqrt_ss_Y(i) = 1/sqrt(Rcpp::sum( Y(Rcpp::_, i) * Y(Rcpp::_, i) )); 103 | } 104 | 105 | //Calculate Correlations 106 | const int n = idxX.size(); 107 | Rcpp::NumericVector cor(n); 108 | for(int k = 0; k < n; k++){ 109 | cor[k] = Rcpp::sum( X(Rcpp::_, idxX[k] - 1) * Y(Rcpp::_, idxY[k] - 1) ) * inv_sqrt_ss_X(idxX[k] - 1) * inv_sqrt_ss_Y(idxY[k] - 1); 110 | } 111 | 112 | return(cor); 113 | 114 | }' 115 | ) 116 | 117 | getTxDbGenes <- function(txdb = NULL, orgdb = NULL, gr = NULL, ignore.strand = TRUE){ 118 | 119 | if (is.null(genome)) { 120 | if (is.null(txdb) | is.null(orgdb)) { 121 | stop("If no provided genome then you need txdb and orgdb!") 122 | } 123 | } 124 | 125 | if (is.null(gr)) { 126 | genes <- GenomicFeatures::genes(txdb) 127 | }else { 128 | genes <- suppressWarnings(subsetByOverlaps(GenomicFeatures::genes(txdb), gr, ignore.strand = ignore.strand)) 129 | } 130 | 131 | if (length(genes) > 1) { 132 | mcols(genes)$symbol <- suppressMessages(mapIds(orgdb, 133 | keys = mcols(genes)$gene_id, column = "SYMBOL", keytype = "ENTREZID", 134 | multiVals = "first")) 135 | genes <- sort(sortSeqlevels(genes), ignore.strand = TRUE) 136 | names(genes) <- NULL 137 | out <- genes 138 | }else { 139 | out <- GRanges(seqnames(gr), ranges = IRanges(0, 0), gene_id = 0, symbol = "none")[-1] 140 | } 141 | 142 | return(out) 143 | 144 | } 145 | 146 | library(BSgenome.Hsapiens.UCSC.hg19) 147 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 148 | library(org.Hs.eg.db) 149 | 150 | #Read input 151 | obj <- readRDS("results/scATAC-Summarized-Experiment.rds") 152 | mdata <- colData(obj) 153 | tssWindow <- 2500 154 | flank <- 250*10^3 155 | corCutOff <- 0.35 156 | bsgenome <- BSgenome.Hsapiens.UCSC.hg19 157 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 158 | orgdb <- org.Hs.eg.db 159 | 160 | #Reduced Dimensions 161 | dimred <- data.frame( 162 | row.names = colnames(obj), 163 | colData(obj)$UMAP1, 164 | colData(obj)$UMAP2 165 | ) 166 | 167 | #Get ChromSizes 168 | chromSizes <- seqlengths(bsgenome)[paste0("chr",c(1:22,"X"))] 169 | genome <- data.frame(names(chromSizes),chromSizes) 170 | rownames(genome) <- NULL 171 | 172 | #Get CDS 173 | obj <- makeCDS(obj, binarize = TRUE) 174 | obj <- detectGenes(obj) 175 | obj <- estimateSizeFactors(obj) 176 | ciceroObj <- make_cicero_cds(obj, k = 50, 
reduced_coordinates = dimred[colnames(obj),]) 177 | 178 | #Compute Correlations 179 | message("Computing grouped correlations...") 180 | gr <- featureToGR(featureData(ciceroObj)[[1]]) 181 | o <- suppressWarnings(as.matrix( findOverlaps(resize( resize(gr,1,"center"), 2*flank + 1, "center"), resize(gr,1,"center"), ignore.strand=TRUE) )) 182 | o <- data.table::as.data.table(data.frame(i = matrixStats::rowMins(o), j = matrixStats::rowMaxs(o))) 183 | o <- data.frame(o[!duplicated(o),]) 184 | o <- o[o[,1]!=o[,2],] 185 | o$cor <- rowCorCpp(o[,1], o[,2], assayData(ciceroObj)$exprs, assayData(ciceroObj)$exprs) 186 | connections <- data.frame( 187 | Peak1 = featureData(ciceroObj)[[1]][o[,1]], 188 | Peak2 = featureData(ciceroObj)[[1]][o[,2]], 189 | coaccess = o[,3] 190 | ) 191 | 192 | #Annotate CDS 193 | message("Annotating Cell Data Set...") 194 | genes <- getTxDbGenes(txdb=txdb,orgdb=orgdb) 195 | names(genes) <- genes$symbol 196 | genes <- resize(genes, 1, "start") %>% resize(tssWindow * 2 + 1, "center") 197 | geneDF <- data.frame(chromosome=seqnames(genes),start=start(genes),end=end(genes), gene=genes$symbol) 198 | obj <- annotate_cds_by_site(obj, geneDF) 199 | 200 | #Prepare for Co-Accessibility 201 | nSites <- Matrix::colSums(assayData(obj)$exprs) 202 | names(nSites) <- row.names(pData(obj)) 203 | 204 | #Cicero with Correlations 205 | message("Calculating normalized gene activities...") 206 | ciceroGA <- normalize_gene_activities(build_gene_activity_matrix(obj, connections, coaccess_cutoff = corCutOff), nSites) 207 | 208 | seCicero <- SummarizedExperiment( 209 | assays = SimpleList(gA = ciceroGA), 210 | rowRanges = genes[rownames(ciceroGA),], 211 | colData = mdata 212 | ) 213 | 214 | seCiceroLog <- SummarizedExperiment( 215 | assays = SimpleList(logGA = log2(10^6 * ciceroGA + 1)), 216 | rowRanges = genes[rownames(ciceroGA),], 217 | colData = mdata 218 | ) 219 | 220 | #Save Output 221 | saveRDS(connections, "results/Peaks-Co-Accessibility.rds") 222 | saveRDS(seCicero, "results/Cicero-Gene-Activity.rds") 223 | saveRDS(seCiceroLog, "results/Cicero-Log2-Gene-Activity.rds") 224 | 225 | 226 | -------------------------------------------------------------------------------- /code/05_Cluster_Unique_Peaks_v2.R: -------------------------------------------------------------------------------- 1 | #Identifying Cluster Specific ATAC-seq peaks 2 | #05/02/19 3 | #Cite Satpathy*, Granja*, et al. 4 | #Massively parallel single-cell chromatin landscapes of human immune 5 | #cell development and intratumoral T cell exhaustion (2019) 6 | #Created by Jeffrey Granja 7 | library(Matrix) 8 | library(SummarizedExperiment) 9 | library(matrixStats) 10 | library(readr) 11 | library(GenomicRanges) 12 | library(magrittr) 13 | library(data.table) 14 | library(edgeR) 15 | set.seed(1) 16 | 17 | groupMeans <- function (mat, groups = NULL, na.rm = TRUE, sparse = FALSE){ 18 | stopifnot(!is.null(groups)) 19 | stopifnot(length(groups) == ncol(mat)) 20 | gm <- lapply(unique(groups), function(x) { 21 | if (sparse) { 22 | Matrix::rowMeans(mat[, which(groups == x), drop = F], na.rm = na.rm) 23 | } 24 | else { 25 | rowMeans(mat[, which(groups == x), drop = F], na.rm = na.rm) 26 | } 27 | }) %>% Reduce("cbind", .) 
28 | colnames(gm) <- unique(groups) 29 | return(gm) 30 | } 31 | 32 | groupSds <- function (mat, groups = NULL, na.rm = TRUE, sparse = FALSE) { 33 | stopifnot(!is.null(groups)) 34 | stopifnot(length(groups) == ncol(mat)) 35 | gs <- lapply(unique(groups), function(x) { 36 | if (sparse) { 37 | matrixStats::rowSds(as.matrix(mat[, which(groups == x), drop = F]), na.rm = na.rm) 38 | } 39 | else { 40 | matrixStats::rowSds(mat[, which(groups == x), drop = F], na.rm = na.rm) 41 | } 42 | }) %>% Reduce("cbind", .) 43 | colnames(gs) <- unique(groups) 44 | return(gs) 45 | } 46 | 47 | createPseudoBulk <- function(mat, groups, labels, minCells = 100, maxCells = 500, minReps = 3, ceiling = 1, prior.count = 3, nSim = 1000, distMethod = "vars", seed = 1){ 48 | 49 | calcDiff <- function(mat, method = "vars"){ 50 | if(tolower(method)=="vars"){ 51 | sum(matrixStats::rowVars(mat))/ncol(mat) 52 | }else if(tolower(method)=="euclidean"){ 53 | sum(dist(t(mat))/ncol(mat)) 54 | }else{ 55 | stop("Error method not found!") 56 | } 57 | } 58 | 59 | sumCells <- function(mat, groups, maxCells = NULL, na.rm = TRUE, sparse = FALSE){ 60 | stopifnot(!is.null(groups)) 61 | stopifnot(length(groups) == ncol(mat)) 62 | gm <- lapply(unique(groups), function(x) { 63 | idx <- which(groups == x) 64 | if(!is.null(maxCells)){ 65 | idx <- sample(idx, size = min(maxCells, length(idx)), replace = FALSE) 66 | } 67 | if (sparse) { 68 | Matrix::rowSums(mat[, idx, drop = FALSE], na.rm = na.rm) 69 | } 70 | else{ 71 | rowSums(mat[, idx, drop = FALSE], na.rm = na.rm) 72 | } 73 | }) %>% Reduce("cbind", .) %>% as.matrix 74 | colnames(gm) <- unique(groups) 75 | return(gm) 76 | } 77 | 78 | message(paste0("Setting Seed = ",seed)) 79 | set.seed(seed) 80 | 81 | names(labels) <- colnames(mat) 82 | groupList <- split(labels,groups) 83 | 84 | if(minReps <= 1){ 85 | stop("Minimum 2 replicates!") 86 | } 87 | 88 | if(is.numeric(ceiling)){ 89 | message(paste0("Setting ceiling of input matrix to ", ceiling, "!")) 90 | mat@x[mat@x>ceiling]<-ceiling 91 | } 92 | 93 | #-------------------------------------------------------------------------- 94 | # Constructing Bulk Pseudo ATAC Matrix v1.0 95 | #-------------------------------------------------------------------------- 96 | 97 | pseudoAll <- lapply(seq_along(groupList), function(x){ 98 | 99 | message(sprintf("####################################\n Groups %s of %s : %s\n####################################",x,length(groupList), names(groupList)[x])) 100 | start <- Sys.time() 101 | groupx <- groupList[[x]] 102 | matx <- mat[,names(groupx)] 103 | bioReps <- names(table(groupx))[which(table(groupx)>minCells)] 104 | nToSim <- minReps - length(bioReps) 105 | 106 | if(length(bioReps) >= minReps){ 107 | 108 | #-------------------------------------------------------------------------- 109 | # If there is enough biological samples passing the minimal cells, great 110 | # good to go! Just merge into true pseudo bulk replicates! 
111 | #-------------------------------------------------------------------------- 112 | 113 | message(sprintf("Found %s Bio Reps which is more or equal than required (%s)", length(bioReps), minReps)) 114 | nBio <- length(bioReps) 115 | groupBio <- groupx[which(groupx %in% bioReps)] 116 | pseudoBio <- sumCells(matx[,names(groupBio)],groups=groupBio,sparse=TRUE,na.rm=TRUE, maxCells = maxCells) 117 | nBio <- table(groupBio)[bioReps] 118 | if(!is.null(maxCells)){ 119 | nBio[nBio > maxCells] <- maxCells 120 | } 121 | 122 | colnames(pseudoBio) <- lapply(seq_along(bioReps),function(k){ 123 | paste0(names(groupList)[x],"._.BRep_",bioReps[k],".",nBio[which(names(nBio)==bioReps[k])],".FALSE") 124 | }) 125 | pseudoMat <- pseudoBio 126 | 127 | }else if(length(bioReps) > 0 & ((length(groupx[groupx %ni% bioReps]) + 1) / min(nToSim, 2)) > minCells){ 128 | 129 | #-------------------------------------------------------------------------- 130 | # If there is at least 1 biological sample with the minimum cells but not 131 | # as many as required, we will make pseudo replicates with the true replicate 132 | # to attempt to capture real biological varation 133 | #-------------------------------------------------------------------------- 134 | 135 | message("PSA : To ensure minimum replicates, simulation must be performed!") 136 | groupBio <- groupx[which(groupx %in% bioReps)] 137 | 138 | nBio <- table(groupBio) 139 | if(length(bioReps) == 1){ 140 | pseudoBio <- Matrix::rowSums(matx[,names(groupBio)],na.rm=TRUE) 141 | }else{ 142 | pseudoBio <- sumCells(matx[,names(groupBio)],groups=groupBio,sparse=TRUE,na.rm=TRUE, maxCells = maxCells) 143 | } 144 | pseudoBioLog <- edgeR::cpm(pseudoBio, log = TRUE, prior.count = prior.count) 145 | 146 | #Determine how sampling is to be performed, ideally we could have the minimum cells * non bio cells of cells! 147 | nSplit <- floor(length(groupx[groupx %ni% bioReps])/(nToSim)) 148 | if(!is.null(maxCells)){ 149 | nSplit <- min(nSplit, maxCells) 150 | } 151 | if(nSplit < minCells){ 152 | message("Splitting cluster into overlapping cells using sampling with replacement BE CAREFUL OF LOW VARIANCE!!!!") 153 | replacement <- TRUE 154 | nSplit <- minCells 155 | }else{ 156 | replacement <- FALSE 157 | } 158 | 159 | for(i in seq_len(nSim)){ 160 | 161 | #Figure out how to split Matrix! 162 | randOrder <- sample(seq_len(ncol(matx)), ncol(matx)) 163 | if(replacement){ 164 | splitList <- lapply(seq_len(nToSim), function(x) sample(seq_len(ncol(matx)), size = nSplit, replace = replacement)) 165 | }else{ 166 | splitList <- split(randOrder, ceiling(seq_along(randOrder)/nSplit))[seq_len(nToSim)] 167 | } 168 | 169 | if(i == 1){ 170 | 171 | pseudoMin <- lapply(seq_len(nToSim), function(j){ 172 | jj <- splitList[[j]] 173 | Matrix::rowSums(matx[,jj,drop=FALSE],na.rm=TRUE) 174 | }) %>% Reduce("cbind",.) 175 | pseudoMinLog <- edgeR::cpm(pseudoMin, log = TRUE, prior.count = prior.count) 176 | diffMax <- calcDiff(cbind(pseudoBioLog, pseudoMinLog), method = distMethod) 177 | 178 | }else{ 179 | 180 | pseudoI <- lapply(seq_len(nToSim), function(j){ 181 | jj <- splitList[[j]] 182 | Matrix::rowSums(matx[,jj,drop=FALSE],na.rm=TRUE) 183 | }) %>% Reduce("cbind",.) 
184 | 185 | pseudoILog <- edgeR::cpm(pseudoI, log = TRUE, prior.count = prior.count) 186 | diffI <- calcDiff(cbind(pseudoBioLog, pseudoILog), method = distMethod) 187 | message(sprintf("Trial %s, Current Distance = %s , Max Distance = %s", i, diffI, diffMax)) 188 | 189 | if(diffI > diffMax){ 190 | message("Found new maxima pseudo to be conservative...") 191 | pseudoMin <- pseudoI 192 | pseudoMinLog <- pseudoILog 193 | diffMax <- diffI 194 | } 195 | 196 | } 197 | 198 | } 199 | 200 | pseudoBio <- as.matrix(pseudoBio) 201 | pseudoMin <- as.matrix(pseudoMin) 202 | colnames(pseudoBio) <- lapply(seq_along(bioReps),function(k){ 203 | paste0(names(groupList)[x],"._.BRep_",bioReps[k],".",nBio[which(names(nBio)==bioReps[k])],".FALSE") 204 | }) 205 | colnames(pseudoMin) <- paste0(names(groupList)[x],"._.Rep",seq_len(nToSim),".",nSplit,".",replacement) 206 | pseudoMat <- cbind(pseudoBio, pseudoMin) 207 | 208 | }else{ 209 | 210 | #-------------------------------------------------------------------------- 211 | # If there is not at least 1 sample with enough cells we will bootstrap replicates. 212 | # This is not preferred as we will have a large underestimate of true biological 213 | # variation. 214 | #-------------------------------------------------------------------------- 215 | 216 | message("PSA : No representation by at least one separate rep, please be cautious with result!") 217 | 218 | #Determine how sampling is to be performed, ideally we could have the minimum cells * non bio cells of cells! 219 | nToSim <- minReps 220 | nSplit <- floor(length(groupx[groupx %ni% bioReps])/(nToSim)) 221 | if(floor(1.5 * minCells) > length(groupx)){ 222 | nToSim <- 2 223 | nSplit <- floor(2 / 3 * length(groupx)) 224 | message(sprintf("Warning! Group size (%s) is smaller than the 3/2 * minimal number of cells (%s)",length(groupx),floor(1.5 * minCells))) 225 | message(sprintf("To deal with this, we will sample %s replicates at 2/3 the group size (%s of %s)", nToSim, nSplit, length(groupx))) 226 | } 227 | if(!is.null(maxCells)){ 228 | nSplit <- min(nSplit, maxCells) 229 | } 230 | if(nSplit < minCells){ 231 | message("Splitting cluster into overlapping cells using sampling with replacement BE CAREFUL OF LOW VARIANCE!!!!") 232 | replacement <- TRUE 233 | nSplit <- minCells 234 | }else{ 235 | replacement <- FALSE 236 | } 237 | 238 | for(i in seq_len(nSim)){ 239 | 240 | #Figure out how to split Matrix! 241 | randOrder <- sample(seq_len(ncol(matx)), ncol(matx)) 242 | if(replacement){ 243 | splitList <- lapply(seq_len(nToSim), function(x) sample(seq_len(ncol(matx)), size = minCells, replace = replacement)) 244 | }else{ 245 | splitList <- split(randOrder, ceiling(seq_along(randOrder)/nSplit))[seq_len(nToSim)] 246 | } 247 | 248 | if(i == 1){ 249 | 250 | pseudoMin <- lapply(seq_len(nToSim), function(j){ 251 | jj <- splitList[[j]] 252 | Matrix::rowSums(matx[,jj,drop=FALSE],na.rm=TRUE) 253 | }) %>% Reduce("cbind",.) 254 | pseudoMinLog <- edgeR::cpm(pseudoMin, log = TRUE, prior.count = prior.count) 255 | diffMax <- calcDiff(pseudoMinLog, method = distMethod) 256 | 257 | }else{ 258 | 259 | pseudoI <- lapply(seq_len(nToSim), function(j){ 260 | jj <- splitList[[j]] 261 | Matrix::rowSums(matx[,jj,drop=FALSE],na.rm=TRUE) 262 | }) %>% Reduce("cbind",.) 
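#Log-normalize this random split and keep it only if it is more dispersed than
#the best split found so far, so the bootstrapped pseudo-replicates are as
#conservative (maximally variable) as possible for downstream testing.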
263 | pseudoILog <- edgeR::cpm(pseudoI, log = TRUE, prior.count = prior.count) 264 | diffI <- calcDiff(pseudoILog, method = distMethod) 265 | 266 | message(sprintf("Trial %s, Current Distance = %s , Max Distance = %s", i, diffI, diffMax)) 267 | if(diffI > diffMax){ 268 | message("Found new maxima pseudo to be conservative...") 269 | pseudoMin <- pseudoI 270 | pseudoMinLog <- pseudoILog 271 | diffMax <- diffI 272 | } 273 | 274 | } 275 | } 276 | 277 | pseudoMin <- as.matrix(pseudoMin) 278 | colnames(pseudoMin) <- paste0(names(groupList)[x],"._.Rep_",seq_len(nToSim),".",nSplit,".",replacement) 279 | pseudoMat <- pseudoMin 280 | 281 | } 282 | 283 | print(Sys.time() - start) 284 | 285 | pseudoMat 286 | 287 | }) %>% Reduce("cbind",.) 288 | 289 | out <- list(pseudoMat = pseudoAll, groupMat = sumCells(mat, groups=groups, sparse=TRUE, na.rm=TRUE)) 290 | 291 | return(out) 292 | 293 | } 294 | 295 | 296 | uniqueFeatures <- function( 297 | mat, groups, padj = 0.01, minSdRatio = 0.001, 298 | minLFC = 0.25, zCutoff = 1, breakPt = "last", 299 | padjMethod = "fdr", clusterCols = TRUE, 300 | sparse = FALSE, twoWay = FALSE, groupMin = 10, 301 | minGroupSize = 1, maxGroupSize = NULL){ 302 | 303 | #---------------------- 304 | # Functions 305 | #---------------------- 306 | binarizeMatrix <- function(matMean, matSd, cutoff, method, minSdRatio, minLFC){ 307 | 308 | binarizeVector <- function(vMean, vSd, cutoff, method, minSdRatio, minLFC){ 309 | 310 | #Method 311 | if(method == "mean"){ 312 | vTest = vMean 313 | }else{ 314 | vTest <- vMean - cutoff * vSd 315 | } 316 | 317 | #Order lowest to highest 318 | idx <- order(vTest) 319 | vTest <- vTest[idx] 320 | vMean <- vMean[idx] 321 | vSd <- vSd[idx] 322 | 323 | #Set which are too low for evaluation ie low Sd that is probably due to 0s or weird offsets 324 | sdToLow <- which(vSd < minSdRatio*vMean | vSd < 10^-5) 325 | vBinarySd <- rep(1,length(vSd)) 326 | if(length(sdToLow) > 0){ 327 | vBinarySd[sdToLow] <- 0 328 | } 329 | 330 | #Initialize 331 | vMeanCutSd <- vMean + cutoff*vSd; 332 | maxValue <- vMeanCutSd[1] 333 | maxBSd <- vBinarySd[1] 334 | maxMean <- vMean[1] 335 | 336 | #Create out vector and initialize breakpoint 337 | n <- length(vMean) 338 | out <- rep(0, n) 339 | breakPoint <- 0 340 | out[1] <- breakPoint 341 | 342 | #Evaluate 343 | for(i in seq(2,n)){ 344 | 345 | #Check if break point assuming log space 346 | if((vTest[i] - maxValue) > 0 & (vMean[i] - maxMean) >= minLFC & maxBSd != 0){ 347 | breakPoint <- breakPoint + 1 348 | } 349 | 350 | #Set current value of break point 351 | out[i] <- breakPoint 352 | 353 | #Keep Max value observed 354 | if(vMeanCutSd[i] > maxValue){ 355 | maxValue <- vMeanCutSd[i] 356 | maxMean <- vMean[i] 357 | maxBSd <- vBinarySd[i] 358 | } 359 | } 360 | 361 | out <- out[order(idx)] 362 | return(out) 363 | 364 | } 365 | 366 | #Create binary matrix 367 | bMat <- matrix(NA,nrow=nrow(matMean),ncol=ncol(matMean)) 368 | for(i in seq_len(nrow(matMean))){ 369 | if(i%%5000==0){message(sprintf("%s of %s (percent = %s)", i, nrow(bMat), round(100*i/nrow(bMat),2)))} 370 | bMat[i,] <- binarizeVector(matMean[i,],matSd[i,],cutoff, method, minSdRatio, minLFC) 371 | } 372 | 373 | #Add names 374 | colnames(bMat) <- colnames(matMean) 375 | rownames(bMat) <- rownames(matMean) 376 | return(bMat) 377 | 378 | } 379 | 380 | idxRow <- seq_len(nrow(mat)) 381 | #----------------------------------------------- 382 | #Within Group Statistics 383 | #----------------------------------------------- 384 | message("Getting Within Group Stats...") 385 | intraMean 
<- groupMeans(mat, groups = groups, sparse = sparse, na.rm = TRUE) 386 | rownames(intraMean) <- rownames(mat) 387 | 388 | intraSd <- groupSds(mat, groups = groups, sparse = sparse, na.rm = TRUE) 389 | rownames(intraSd) <- rownames(mat) 390 | 391 | #----------------------------------------------- 392 | #Binarize Rows of Matrix 393 | #----------------------------------------------- 394 | message("Binarizing Features...") 395 | if (twoWay) { 396 | binarizedMat <- binarizeMatrix(intraMean, intraSd, zCutoff, "meanSd", minSdRatio, minLFC) 397 | }else { 398 | binarizedMat <- binarizeMatrix(intraMean, intraSd, zCutoff, "mean", minSdRatio, minLFC) 399 | } 400 | colnames(binarizedMat) <- colnames(intraMean) 401 | rownames(binarizedMat) <- rownames(intraMean) 402 | for(i in seq_len(nrow(binarizedMat))){ 403 | bvi <- binarizedMat[i,] 404 | if(tolower(breakPt) == "last"){ 405 | bvi[bvi!=max(bvi)] <- 0 406 | bvi[bvi>0] <- 1 407 | }else{ 408 | bvi[bvi<1] <- 0 409 | bvi[bvi>0] <- 1 410 | } 411 | binarizedMat[i,] <- bvi 412 | } 413 | message(sprintf("Successful Binarization of %s Features...", sum(rowSums(binarizedMat) > 0))) 414 | 415 | #----------------------------------------------- 416 | #Get Test Statistics 417 | #----------------------------------------------- 418 | pb <- txtProgressBar(min=0,max=100,initial=0,style=3) 419 | groupSplit <- split(colnames(mat), groups) 420 | bpval <- unlist(lapply(seq_len(nrow(binarizedMat)), function(i){ 421 | setTxtProgressBar(pb,round(i*100/nrow(binarizedMat),0)) 422 | if(any(binarizedMat[i,]>0)){ 423 | cu <- as.character(unlist(groupSplit[names(which(binarizedMat[i,]==max(binarizedMat[i,])))])) 424 | cd <- as.character(unlist(groupSplit[names(which(binarizedMat[i,]!=max(binarizedMat[i,])))])) 425 | mati <- mat[i,,drop=F] 426 | pval <- t.test(mati[,cu],mati[,cd])$p.value 427 | }else{ 428 | pval <- 1 429 | } 430 | pval 431 | })) 432 | bpadj <- p.adjust(bpval, method=padjMethod) #sum(binarizedMat + noSdMat < 1)) 433 | 434 | message(sprintf("\nFiltering by signficance %s of %s...", sum(bpadj < padj), sum(bpval < padj))) 435 | idxKeep <- which(bpadj < padj) 436 | bpadj <- bpadj[idxKeep] 437 | idxRow <- idxRow[idxKeep] 438 | binarizedMat <- binarizedMat[idxKeep, ] 439 | mat <- mat[idxKeep, ] 440 | intraMean <- intraMean[idxKeep, ] 441 | intraSd <- intraSd[idxKeep, ] 442 | 443 | #----------------------------------------------- 444 | #Determine which rows are above min group size and max group size to keep 445 | #----------------------------------------------- 446 | if (!is.null(maxGroupSize)) { 447 | idxKeep <- which(rowSums(binarizedMat) < (maxGroupSize + 1) & rowSums(binarizedMat) > (minGroupSize - 1)) 448 | }else { 449 | idxKeep <- which(rowSums(binarizedMat) > (minGroupSize - 1)) 450 | } 451 | message(sprintf("Filtering Features that are within Group Size %s of %s...", length(idxKeep), nrow(binarizedMat))) 452 | idxRow <- idxRow[idxKeep] 453 | bpadj <- bpadj[idxKeep] 454 | binarizedMat <- binarizedMat[idxKeep, ] 455 | mat <- mat[idxKeep, ] 456 | intraMean <- intraMean[idxKeep, ] 457 | intraSd <- intraSd[idxKeep, ] 458 | 459 | #----------------------------------------------- 460 | #Determine Pattern Occurences using data.table 461 | #----------------------------------------------- 462 | binarizedTable <- data.table::as.data.table(binarizedMat)[,.N,by=c(colnames(binarizedMat))] 463 | binarizedTable <- data.frame(binarizedTable[which(binarizedTable$N > groupMin),])[,which(colnames(binarizedTable) %ni% "N")] 464 | idxKeep <- unlist(lapply(seq_len(nrow(binarizedTable)), 
function(x){ 465 | idx1 <- which(binarizedTable[x,,drop=TRUE] > 0) 466 | rs1 <- which(rowSums(binarizedMat[,idx1,drop=FALSE]) == length(idx1)) 467 | rs2 <- which(rowSums(binarizedMat[,-idx1,drop=FALSE]) == 0) 468 | idxBM <- intersect(rs1,rs2) 469 | idxBM 470 | })) 471 | message(sprintf("Filtering Features Pattern Appearances %s of %s...", length(idxKeep), nrow(binarizedMat))) 472 | idxRow <- idxRow[idxKeep] 473 | bpadj <- bpadj[idxKeep] 474 | binarizedMat <- binarizedMat[idxKeep, ] 475 | mat <- mat[idxKeep, ] 476 | intraMean <- intraMean[idxKeep, ] 477 | intraSd <- intraSd[idxKeep, ] 478 | 479 | #----------------------------------------------- 480 | #Organize the output for maximum interpretation 481 | #----------------------------------------------- 482 | message(sprintf("Found %s unique elements!", length(idxKeep))) 483 | message("Finalizing Output...") 484 | colClust <- hclust(as.dist(1 - cor(intraMean))) 485 | colOrder <- unique(groups)[colClust$order] 486 | binarizedMat <- binarizedMat[, colClust$order] 487 | idxOrdered <- do.call("order", c(as.data.frame(binarizedMat)[seq_len(ncol(binarizedMat))], list(decreasing = TRUE))) 488 | binarizedMat <- binarizedMat[idxOrdered, ] 489 | idxRow <- idxRow[idxOrdered] 490 | intraMean <- intraMean[idxOrdered, colClust$order] 491 | mat <- mat[idxOrdered, order(match(groups, colOrder))] 492 | bpadj <- bpadj[idxOrdered] 493 | 494 | #----------------------------------------------- 495 | #Time to label each row 496 | #----------------------------------------------- 497 | binarizedTable <- data.frame(data.table::as.data.table(binarizedMat)[,.N,by=c(colnames(binarizedMat))]) 498 | rownames(binarizedTable) <- paste0("Feature_",seq_len(nrow(binarizedTable))) 499 | dfUniuqe <- lapply(seq_len(nrow(binarizedTable)), function(x){ 500 | idx1 <- which(binarizedTable[x,-ncol(binarizedTable),drop=TRUE] > 0) 501 | rs1 <- which(rowSums(binarizedMat[,idx1,drop=FALSE]) == length(idx1)) 502 | rs2 <- which(rowSums(binarizedMat[,-idx1,drop=FALSE])==0) 503 | idxBM <- intersect(rs1,rs2) 504 | data.frame(feature = rownames(binarizedTable)[x], rows = idxBM) 505 | }) %>% Reduce("rbind",.) 
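#Note on the objects built above: binarizedTable has one row per distinct group-membership
#pattern among the binarized features, with N giving how many features share that pattern,
#and rownames Feature_1, Feature_2, ...; dfUniuqe maps each pattern label to the row indices
#(in the reordered matrices) of the features that match that pattern exactly, i.e. 1 in every
#group of the pattern and 0 elsewhere. These are returned below as binarizedTable and
#dfFeature, and rowOrder maps the reordered rows back to rows of the original input matrix.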
506 | 507 | #----------------------------------------------- 508 | #Return Output 509 | #----------------------------------------------- 510 | out <- list(mat = mat, binaryMat = binarizedMat, groupMat = intraMean, binarizedTable = binarizedTable, dfFeature = dfUniuqe, rowOrder = idxRow, padj = bpadj) 511 | return(out) 512 | 513 | } 514 | 515 | '%ni%' <- Negate('%in%') 516 | 517 | #---------------------------- 518 | # Get Inputs 519 | #---------------------------- 520 | se <- readRDS("results/scATAC-Summarized-Experiment.rds") 521 | 522 | #---------------------------- 523 | # PseudoBulk 524 | #---------------------------- 525 | colData(se)$Group <- "PBMC" #Replicate info 526 | objPB <- createPseudoBulk( 527 | mat = assay(se), 528 | groups=paste0(colData(se)$Clusters), 529 | labels=colData(se)$Group, 530 | ceiling = 1, 531 | minCells = 100, 532 | maxCells = 500, 533 | minReps = 2, 534 | prior.count = 3, 535 | nSim = 250 536 | ) 537 | 538 | cdPB <- DataFrame( 539 | row.names=colnames(objPB[[1]]), 540 | Group = stringr::str_split(colnames(objPB[[1]]), "._.", simplify = TRUE)[,1], 541 | Type = stringr::str_split(stringr::str_split(colnames(objPB[[1]]), "._\\.", simplify = TRUE)[,2], "\\.", simplify = TRUE)[,1], 542 | Cells = as.integer(stringr::str_split(stringr::str_split(colnames(objPB[[1]]), "._\\.", simplify = TRUE)[,2], "\\.", simplify = TRUE)[,2]), 543 | Replace = stringr::str_split(stringr::str_split(colnames(objPB[[1]]), "._\\.", simplify = TRUE)[,2], "\\.", simplify = TRUE)[,3] 544 | ) 545 | 546 | sePB <- SummarizedExperiment(assays = SimpleList(counts = objPB[[1]]), rowRanges = rowRanges(se), colData = cdPB) 547 | metadata(sePB)$matCS <- objPB[[2]] 548 | 549 | saveRDS(sePB, "results/Cluster_PseudoBulk-Summarized-Experiment.rds") 550 | 551 | #---------------------------- 552 | # Unique Features 553 | #---------------------------- 554 | params <- list( 555 | padj = 0.01, 556 | minSdRatio = 0.001, 557 | minLFC = 0.25, 558 | zCutoff = 1, 559 | breakPt = "last", 560 | groupMin = 25, 561 | maxGroupSize = max(floor(length(unique(colData(se)$Group))/3), 2), 562 | date = Sys.Date() 563 | ) 564 | 565 | #Unique Features 566 | uf <- uniqueFeatures( 567 | edgeR::cpm(assay(sePB),log=TRUE,prior.count=3), 568 | groups = colData(sePB)$Group, 569 | padj = params$padj, 570 | minSdRatio = params$minSdRatio, 571 | minLFC = params$minLFC, 572 | zCutoff = params$zCutoff, 573 | breakPt = params$breakPt, 574 | groupMin = params$groupMin, 575 | maxGroupSize = params$maxGroupSize 576 | ) 577 | 578 | library(pheatmap) 579 | pdf("results/Unique_Peaks.pdf",width=4,height=8) 580 | m <- uf$mat 581 | z <- sweep(m - rowMeans(m), 1, matrixStats::rowSds(m), `/`) 582 | z[z > 2] <- 2 583 | z[z < -2] <- -2 584 | pheatmap(z, cluster_cols=FALSE, cluster_rows=FALSE, show_rownames = FALSE) 585 | dev.off() 586 | 587 | saveRDS(uf, "results/Unique_Peaks.rds") 588 | saveRDS(params, "results/Unique_Peaks_Params.rds") 589 | 590 | 591 | 592 | 593 | -------------------------------------------------------------------------------- /code/06_Analyze_UMAP_Trajectory.R: -------------------------------------------------------------------------------- 1 | #Constructing a Trajectory in UMAP projected sub-space 2 | #05/02/19 3 | #Cite Satpathy*, Granja*, et al. 
4 | #Massively parallel single-cell chromatin landscapes of human immune 5 | #cell development and intratumoral T cell exhaustion (2019) 6 | #Created by Jeffrey Granja 7 | library(Matrix) 8 | library(SummarizedExperiment) 9 | library(matrixStats) 10 | library(magrittr) 11 | library(edgeR) 12 | set.seed(1) 13 | 14 | getQuantiles <- function(x){ 15 | trunc(rank(x))/length(x) 16 | } 17 | 18 | alignTrajectory <- function(df, trajectory, filter = 0.05, dof = 250, spar = 1){ 19 | findClosest <- function(x, y, fitx, fity){ 20 | distxy <- sqrt(rowSums(cbind((fitx - x)^2 + (fity - y)^2))) 21 | idxMin <- which.min(distxy) 22 | if(idxMin==1){ 23 | idxMin <- idxMin + 1 24 | }else if(idxMin==length(fitx)){ 25 | idxMin <- idxMin - 1 26 | } 27 | if(distxy[idxMin + 1] < distxy[idxMin - 1]){ 28 | diff <- 1 29 | }else{ 30 | diff <- -1 31 | } 32 | data.frame(idx = idxMin, dist = distxy[idxMin], diff = diff) 33 | } 34 | dfAll <- data.frame() 35 | for(x in seq_along(trajectory)){ 36 | #Subset 37 | dfx <- df[df$Group==trajectory[x],] 38 | #Mean Diff Filter 39 | xmean <- colMeans(dfx[,c(1,2)]) 40 | diffx <- sqrt(colSums((t(dfx[,1:2]) - xmean)^2)) 41 | dfx <- dfx[which(diffx <= quantile(diffx,1 - filter)),] 42 | #Get diff 43 | if(x!=length(trajectory)){ 44 | xmean1 <- colMeans(df[df$Group==trajectory[x+1],c(1,2)]) 45 | diffx1 <- sqrt(colSums((t(dfx[,1:2]) - xmean1)^2)) 46 | dfx$time <- (1 - getQuantiles(diffx1)) + x 47 | }else{ 48 | xmean1 <- colMeans(df[df$Group==trajectory[x-1],c(1,2)]) 49 | diffx1 <- sqrt(colSums((t(dfx[,1:2]) - xmean1)^2)) 50 | dfx$time <- getQuantiles(diffx1) + x 51 | } 52 | dfAll <- rbind(dfAll , dfx) 53 | } 54 | sx <- smooth.spline(dfAll$time, dfAll$x, df = dof, spar = spar) 55 | sy <- smooth.spline(dfAll$time, dfAll$y, df = dof, spar = spar) 56 | dfFit <- data.frame(x = sx[[2]], y = sy[[2]], t = seq_along(sy[[2]])) 57 | dfTrajectory <- df[df$Group %in% trajectory,] 58 | dfTime <- lapply(seq_len(nrow(dfTrajectory)), function(x){ 59 | findClosest(dfTrajectory[x,1],dfTrajectory[x,2], dfFit[,1],dfFit[,2]) 60 | }) %>% Reduce("rbind",.) 
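#dfTime (built above) holds, for each cell on the trajectory, the index of its nearest point
#on the fitted spline (idx), the distance to that point (dist), and a sign (diff) indicating
#whether the cell lies closer to the next or the previous fitted point. Below, pseudotime is
#the 0-100 scaled quantile rank of idx plus a small within-point offset (diff * distQ), so
#cells assigned to the same spline point are ordered by their distance quantile.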
61 | dfTime$distQ <- getQuantiles(dfTime$dist) 62 | dfTrajectory$pseudotime <- 100*getQuantiles(dfTime$idx + matrixStats::rowProds(as.matrix(dfTime[,c("diff","distQ")]))) 63 | 64 | out <- list(trajectory=dfTrajectory, fitTrajectory = dfFit) 65 | } 66 | 67 | trajectoryStats <- function(mat, trajectory, nSim = 1000, nFeatures = 10000){ 68 | #-------------- 69 | # Functions 70 | #-------------- 71 | rankTrajectory <- function(mat, trajectory, n = 10000, method = "vec"){ 72 | if(method=="vec"){ 73 | vecRank <- c() 74 | for(i in seq_along(trajectory)[-length(trajectory)]){ 75 | if(i == 1){ 76 | rem <- c(trajectory[i]) 77 | }else{ 78 | rem <- c(trajectory[i], trajectory[seq(1,i-1)]) 79 | } 80 | trajectoryI <- trajectory[i] 81 | peaksI <- head(order(mat[,trajectoryI],decreasing=TRUE), n = 10000) 82 | distI <- rank(sqrt(colSums((mat[peaksI,] - mat[peaksI,trajectoryI])^2))[colnames(mat) %ni% rem]) 83 | vecRank <- c(vecRank, distI[trajectory[i+1]]) 84 | } 85 | vecRank 86 | }else{ 87 | matRank <- matrix(ncol=length(trajectory)-1,nrow=ncol(mat)) 88 | rownames(matRank) <- colnames(mat) 89 | colnames(matRank) <- paste0("T",seq_along(trajectory[-1])) 90 | for(i in seq_along(trajectory)[-length(trajectory)]){ 91 | if(i == 1){ 92 | rem <- c(trajectory[i]) 93 | }else{ 94 | rem <- c(trajectory[i], trajectory[seq(1,i-1)]) 95 | } 96 | trajectoryI <- trajectory[i] 97 | peaksI <- head(order(mat[,trajectoryI],decreasing=TRUE), n = 10000) 98 | distI <- sqrt(colSums((mat[peaksI,] - mat[peaksI,trajectoryI])^2))[colnames(mat) %ni% rem] 99 | matRank[names(distI),i] <- distI 100 | } 101 | matRank 102 | } 103 | } 104 | nullTracjectory <- function(trajectory, n = 1000){ 105 | nullTracjectory <- list() 106 | while(length(nullTracjectory) < n){ 107 | trajx <- sample(trajectory, length(trajectory)) 108 | if(!identical(trajx, trajectory)){ 109 | nullTracjectory[[length(nullTracjectory) + 1]] <- trajx 110 | } 111 | 112 | } 113 | nullTracjectory 114 | } 115 | #Reverse Trajectory 116 | trajectory <- rev(trajectory) 117 | # Compute Null 118 | message("Computing Null Trajectories...") 119 | nullT <- nullTracjectory(trajectory, nSim) 120 | pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3) 121 | nullRanks <- Reduce("rbind",lapply(seq_along(nullT), function(x){ 122 | setTxtProgressBar(pb, round(x * 100/length(nullT), 0)) 123 | rankX <- rankTrajectory(mat, nullT[[x]], n = nFeatures) 124 | data.frame(mean = mean(rankX), median = median(rankX)) 125 | })) 126 | # Compute Actual 127 | rankT <- rankTrajectory(mat, trajectory, n = nFeatures) 128 | rankTMat <- rankTrajectory(mat, trajectory, n = nFeatures, method = "mat") 129 | pvalMean <- sum(nullRanks$mean < mean(rankT)) / nrow(nullRanks) 130 | pvalMedian <- sum(nullRanks$median < median(rankT)) / nrow(nullRanks) 131 | out <- list( 132 | pvalMean = pvalMean, 133 | pvalMedian = pvalMedian, 134 | rankT = rankT, 135 | rankNull = nullRanks, 136 | rankTMat = rankTMat 137 | ) 138 | return(out) 139 | } 140 | 141 | groupSums <- function (mat, groups = NULL, na.rm = TRUE, sparse = FALSE){ 142 | stopifnot(!is.null(groups)) 143 | stopifnot(length(groups) == ncol(mat)) 144 | gm <- lapply(unique(groups), function(x) { 145 | if (sparse) { 146 | Matrix::rowSums(mat[, which(groups == x), drop = F], na.rm = na.rm) 147 | } 148 | else { 149 | rowSums(mat[, which(groups == x), drop = F], na.rm = na.rm) 150 | } 151 | }) %>% Reduce("cbind", .) 
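#gm is a features x groups matrix of per-group summed counts, with one column per entry of
#unique(groups); the columns are named accordingly below.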
152 | colnames(gm) <- unique(groups)
153 | return(gm)
154 | }
155 | 
156 | '%ni%' <- Negate('%in%')
157 | 
158 | #----------------------------
159 | # Get Inputs
160 | #----------------------------
161 | se <- readRDS("results/scATAC-Summarized-Experiment.rds")
162 | trajectory <- paste0("Cluster",c(2,3,4,1,6,5)) #cluster order defining the trajectory
163 | 
164 | #Align single cells to Trajectory
165 | df <- data.frame(row.names = colnames(se), x = colData(se)$UMAP1, y = colData(se)$UMAP2, Group = colData(se)$Clusters)
166 | trajAligned <- alignTrajectory(df, trajectory)
167 | 
168 | df2 <- trajAligned[[1]]
169 | dfT <- trajAligned[[2]]
170 | 
171 | pdf("results/UMAP-PseudoTime.pdf")
172 | library(ggplot2)
173 | ggplot(df2, aes(x,y,color=pseudotime)) + geom_point() +
174 | theme_bw() + viridis::scale_color_viridis() +
175 | geom_path(data=data.frame(dfT), aes(x,y,color=NULL), size= 1, 
176 | arrow = arrow(type = "open", angle = 30, length = unit(0.1, "inches")))
177 | dev.off()
178 | saveRDS(trajAligned, "results/Aligned-Trajectory.rds")
179 | 
180 | #Significance of Trajectory Order
181 | clustMeans <- edgeR::cpm(groupSums(assay(se), colData(se)$Clusters, sparse = TRUE),log=TRUE,prior.count=3) #log2 CPM of per-cluster summed counts
182 | trajStats <- trajectoryStats(clustMeans, trajectory, nSim = 500, nFeatures = 10000)
183 | rankMat <- trajStats$rankTMat
184 | rankDF <- lapply(seq_len(ncol(rankMat)), function(x){
185 | data.frame(names=names(rankMat[,x]),ranks=rankMat[,x],t=x)
186 | }) %>% Reduce("rbind",.)
187 | rankV <- lapply(seq_len(ncol(rankMat)), function(x){
188 | data.frame(ranks=rankMat[rev(trajectory)[x+1],x],t=x)
189 | }) %>% Reduce("rbind",.)
190 | 
191 | pdf("results/Trajectory_Plot_Distance.pdf", useDingbats = FALSE, width = 12, height = 12)
192 | ggplot(rankDF, aes(t,ranks,color=names)) + 
193 | geom_point(size = 8) +
194 | geom_point(data = rankV, aes(color=NULL), color = "black", size = 2) +
195 | geom_line(data = rankV, aes(color=NULL), color = "black", size = 1, 
196 | lty = "dashed", arrow = arrow(type = "open", length = unit(0.2, "inches"))) +
197 | theme_bw() +
198 | ylab("Trajectory Distance (Dist logCPM)") + xlab(paste0("Trajectory 500 Sim Pval = ",trajStats$pvalMean)) +
199 | ggtitle(paste0(rev(trajectory),collapse=" -> "))
200 | dev.off()
201 | 
202 | out <- list(trajStats = trajStats, rankDF = rankDF, rankV = rankV)
203 | saveRDS(out, "results/Trajectory-Stats.rds")
204 | 
205 | 
206 | 
207 | 
--------------------------------------------------------------------------------
/code/07_ChromVAR_For_GWAS_w_CoAccessbility_v2.R:
--------------------------------------------------------------------------------
1 | #Running GWAS chromVAR on single-cell summarized experiment using co-accessibility
2 | #05/02/19
3 | #Cite Satpathy*, Granja*, et al. 
4 | #Massively parallel single-cell chromatin landscapes of human immune
5 | #cell development and intratumoral T cell exhaustion (2019)
6 | #Created by Jeffrey Granja
7 | library(chromVAR)
8 | library(Matrix)
9 | library(matrixStats)
10 | library(SummarizedExperiment)
11 | library(magrittr)
12 | library(BiocParallel)
13 | library(BSgenome.Hsapiens.UCSC.hg19)
14 | register(SerialParam())
15 | set.seed(1)
16 | 
17 | se <- readRDS("results/scATAC-Summarized-Experiment.rds")
18 | gr <- readRDS("data/PICS_GWAS_SNPS.gr.rds")
19 | conn <- readRDS("results/Peaks-Co-Accessibility.rds")
20 | conn <- conn[conn[,3] >= 0.35,] #keep peak pairs with co-accessibility score >= 0.35
21 | peaknames <- paste(seqnames(se),start(se),end(se),sep="_")
22 | conn[,4] <- match(paste0(conn[,1]), peaknames)
23 | conn[,5] <- match(paste0(conn[,2]), peaknames)
24 | connMat <- Matrix::sparseMatrix(i=conn[,4],j=conn[,5],x=rep(1,nrow(conn)),dims=c(nrow(se),nrow(se))) #sparse peak x peak connection matrix
25 | 
26 | #Add Bias
27 | genome <- BSgenome.Hsapiens.UCSC.hg19
28 | se <- addGCBias(se, genome = genome)
29 | 
30 | #Overlap GWAS SNPs
31 | o <- lapply(split(gr, gr$disease), function(x){
32 | #extend snp +- 500 bp
33 | overlapsAny(se, resize(x,1001,"center"), ignore.strand = TRUE)
34 | }) %>% Reduce("cbind",.)
35 | ow <- which(o > 0, arr.ind=TRUE)
36 | matches <- Matrix::sparseMatrix(i=ow[,1],j=ow[,2],x=o[cbind(ow[,1],ow[,2])], dims = c(nrow(se),length(unique(gr$disease)))) #peaks x diseases overlap matrix
37 | colnames(matches) <- names(split(gr, gr$disease))
38 | 
39 | #Use connections mat!
40 | matches2 <- matches
41 | idx <- which(Matrix::rowSums(matches2)>0) #which peaks have a snp
42 | for(i in seq_along(idx)){
43 | if(i %% 100 == 0){message(sprintf("%s of %s",i,length(idx)))}
44 | #peaks co-accessible to peak with snp
45 | coi <- unique(c(which(connMat[,idx[i]]>0),which(connMat[idx[i],]>0)))
46 | if(length(coi) > 0){
47 | #create sub mat
48 | mati <- as(t(replicate(length(coi), matches[idx[i],])),"dgCMatrix")
49 | #add it to sub mat of connected peaks
50 | matches2[coi,,drop=FALSE] <- matches2[coi,,drop=FALSE] + mati
51 | }
52 | }
53 | diff <- Matrix::colSums(matches2) - Matrix::colSums(matches) #additional peaks assigned to each disease via co-accessibility
54 | print(diff)
55 | 
56 | #Make Annotation SE
57 | anno_ix <- SummarizedExperiment(assays=SimpleList(motifMatches=matches2), rowRanges=rowRanges(se))
58 | 
59 | #Compute Deviations
60 | dev <- computeDeviations(se, anno_ix)
61 | 
62 | #compute variability
63 | metadata(dev)$Variability <- computeVariability(dev)
64 | 
65 | #add matches
66 | metadata(dev)$gwasMatches <- anno_ix
67 | 
68 | #save output
69 | saveRDS(dev, "results/GWAS-Co-Accessibility-chromVAR-Summarized-Experiment.rds")
70 | 
71 | 
72 | 
73 | 
--------------------------------------------------------------------------------
/code/08_Run_scCNV_v2.R:
--------------------------------------------------------------------------------
1 | #Estimating Copy Number Variation in scATAC-seq
2 | #05/02/19
3 | #Cite Satpathy*, Granja*, et al. 
4 | #Massively parallel single-cell chromatin landscapes of human immune 5 | #cell development and intratumoral T cell exhaustion (2019) 6 | #Created by Jeffrey Granja 7 | library(Matrix) 8 | library(SummarizedExperiment) 9 | library(matrixStats) 10 | library(readr) 11 | library(GenomicRanges) 12 | library(magrittr) 13 | library(edgeR) 14 | library(Seurat) 15 | library(BSgenome.Hsapiens.UCSC.hg19) 16 | set.seed(1) 17 | 18 | "%ni%" <- Negate("%in%") 19 | 20 | countInsertions <- function(query, fragments, by = "RG"){ 21 | #Count By Fragments Insertions 22 | inserts <- c( 23 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(start(fragments), start(fragments)), RG = mcols(fragments)[,by]), 24 | GRanges(seqnames = seqnames(fragments), ranges = IRanges(end(fragments), end(fragments)), RG = mcols(fragments)[,by]) 25 | ) 26 | by <- "RG" 27 | overlapDF <- DataFrame(findOverlaps(query, inserts, ignore.strand = TRUE, maxgap=-1L, minoverlap=0L, type = "any")) 28 | overlapDF$name <- mcols(inserts)[overlapDF[, 2], by] 29 | overlapTDF <- transform(overlapDF, id = match(name, unique(name))) 30 | #Calculate Overlap Stats 31 | inPeaks <- table(overlapDF$name) 32 | total <- table(mcols(inserts)[, by]) 33 | total <- total[names(inPeaks)] 34 | frip <- inPeaks / total 35 | #Summarize 36 | sparseM <- Matrix::sparseMatrix( 37 | i = overlapTDF[, 1], 38 | j = overlapTDF[, 4], 39 | x = rep(1, nrow(overlapTDF)), 40 | dims = c(length(query), length(unique(overlapDF$name)))) 41 | colnames(sparseM) <- unique(overlapDF$name) 42 | total <- total[colnames(sparseM)] 43 | frip <- frip[colnames(sparseM)] 44 | out <- list(counts = sparseM, frip = frip, total = total) 45 | return(out) 46 | } 47 | 48 | makeWindows <- function(genome, blacklist, windowSize = 10e6, slidingSize = 2e6){ 49 | chromSizes <- GRanges(names(seqlengths(genome)), IRanges(1, seqlengths(genome))) 50 | chromSizes <- GenomeInfoDb::keepStandardChromosomes(chromSizes, pruning.mode = "coarse") 51 | windows <- slidingWindows(x = chromSizes, width = windowSize, step = slidingSize) %>% unlist %>% .[which(width(.)==windowSize),] 52 | mcols(windows)$wSeq <- as.character(seqnames(windows)) 53 | mcols(windows)$wStart <- start(windows) 54 | mcols(windows)$wEnd <- end(windows) 55 | message("Subtracting Blacklist...") 56 | windowsBL <- lapply(seq_along(windows), function(x){ 57 | if(x %% 100 == 0){ 58 | message(sprintf("%s of %s", x, length(windows))) 59 | } 60 | gr <- GenomicRanges::setdiff(windows[x,], blacklist) 61 | mcols(gr) <- mcols(windows[x,]) 62 | return(gr) 63 | }) 64 | names(windowsBL) <- paste0("w",seq_along(windowsBL)) 65 | windowsBL <- unlist(GenomicRangesList(windowsBL), use.names = TRUE) 66 | mcols(windowsBL)$name <- names(windowsBL) 67 | message("Adding Nucleotide Information...") 68 | windowSplit <- split(windowsBL, as.character(seqnames(windowsBL))) 69 | windowNuc <- lapply(seq_along(windowSplit), function(x){ 70 | message(sprintf("%s of %s", x, length(windowSplit))) 71 | chrSeq <- Biostrings::getSeq(genome,chromSizes[which(seqnames(chromSizes)==names(windowSplit)[x])]) 72 | grx <- windowSplit[[x]] 73 | aFreq <- alphabetFrequency(Biostrings::Views(chrSeq[[1]], ranges(grx))) 74 | mcols(grx)$GC <- rowSums(aFreq[, c("G","C")]) / rowSums(aFreq) 75 | mcols(grx)$AT <- rowSums(aFreq[, c("A","T")]) / rowSums(aFreq) 76 | return(grx) 77 | }) %>% GenomicRangesList %>% unlist %>% sortSeqlevels %>% sort 78 | windowNuc$N <- 1 - (windowNuc$GC + windowNuc$AT) 79 | windowNuc 80 | } 81 | 82 | scCNA <- function(windows, fragments, neighbors = 100, LFC = 1.5, FDR = 
0.1, force = FALSE, remove = c("chrM","chrX","chrY")){ 83 | 84 | #Keep only regions in filtered chromosomes 85 | windows <- GenomeInfoDb::keepStandardChromosomes(windows, pruning.mode = "coarse") 86 | fragments <- GenomeInfoDb::keepStandardChromosomes(fragments, pruning.mode = "coarse") 87 | windows <- windows[seqnames(windows) %ni% remove] 88 | fragments <- fragments[seqnames(fragments) %ni% remove] 89 | 90 | #Count Insertions in windows 91 | message("Getting Counts...") 92 | counts <- countInsertions(windows, fragments, by = "RG")[[1]] 93 | message("Summarizing...") 94 | windowSummary <- GenomicRangesList() 95 | countSummary <- matrix(nrow=length(unique(windows$name)), ncol = ncol(counts)) 96 | for(x in seq_along(unique(mcols(windows)$name))){ 97 | if(x %% 100 == 0){ 98 | message(sprintf("%s of %s", x, length(unique(mcols(windows)$name)))) 99 | } 100 | idx <- which(mcols(windows)$name == unique(mcols(windows)$name)[x]) 101 | wx <- windows[idx,] 102 | wo <- GRanges(mcols(wx)$wSeq , ranges = IRanges(mcols(wx)$wStart, mcols(wx)$wEnd))[1,] 103 | mcols(wo)$name <- mcols(wx)$name[1] 104 | mcols(wo)$effectiveLength <- sum(width(wx)) 105 | mcols(wo)$percentEffectiveLength <- 100*sum(width(wx))/width(wo) 106 | mcols(wo)$GC <- sum(mcols(wx)$GC * width(wx))/width(wo) 107 | mcols(wo)$AT <- sum(mcols(wx)$AT * width(wx))/width(wo) 108 | mcols(wo)$N <- sum(mcols(wx)$N * width(wx))/width(wo) 109 | countSummary[x,] <- Matrix::colSums(counts[idx,,drop=FALSE]) 110 | windowSummary[[x]] <- wo 111 | } 112 | windowSummary <- unlist(windowSummary) 113 | 114 | #Keep only regions with less than 0.1% N 115 | keep <- which(windowSummary$N < 0.001) 116 | windowSummary <- windowSummary[keep,] 117 | countSummary <- countSummary[keep,] 118 | 119 | #Now determine the nearest neighbors by GC content 120 | message("Computing Background...") 121 | bdgMean <- matrix(nrow=nrow(countSummary), ncol=ncol(countSummary)) 122 | bdgSd <- matrix(nrow=nrow(countSummary), ncol=ncol(countSummary)) 123 | log2FC <- matrix(nrow=nrow(countSummary), ncol=ncol(countSummary)) 124 | z <- matrix(nrow=nrow(countSummary), ncol=ncol(countSummary)) 125 | pval <- matrix(nrow=nrow(countSummary), ncol=ncol(countSummary)) 126 | 127 | for(x in seq_len(nrow(countSummary))){ 128 | if(x %% 100 == 0){ 129 | message(sprintf("%s of %s", x, nrow(countSummary))) 130 | } 131 | #Get Nearest Indices 132 | idxNN <- head(order(abs(windowSummary$GC[x] - windowSummary$GC)), neighbors + 1) 133 | idxNN <- idxNN[idxNN %ni% x] 134 | #Background 135 | if(any(colMeans(countSummary[idxNN, ])==0)){ 136 | if(force){ 137 | message("Warning! 
Background Mean = 0 Try a higher neighbor count or remove cells with 0 in colMins") 138 | }else{ 139 | stop("Background Mean = 0!") 140 | } 141 | } 142 | bdgMean[x, ] <- colMeans(countSummary[idxNN, ]) 143 | bdgSd[x, ] <- matrixStats::colSds(countSummary[idxNN, ]) 144 | log2FC[x, ] <- log2((countSummary[x, ]+1e-5) / (bdgMean[x, ]+1e-5)) 145 | z[x, ] <- (countSummary[x,] - bdgMean[x, ]) / bdgSd[x, ] 146 | pval[x, ] <- 2*pnorm(-abs(z[x, ])) 147 | } 148 | padj <- apply(pval, 2, function(x) p.adjust(x, method = "fdr")) 149 | CNA <- matrix(0, nrow=nrow(countSummary), ncol=ncol(countSummary)) 150 | CNA[which(log2FC >= LFC & padj <= FDR)] <- 1 151 | 152 | se <- SummarizedExperiment( 153 | assays = SimpleList( 154 | CNA = CNA, 155 | counts = countSummary, 156 | log2FC = log2FC, 157 | padj = padj, 158 | pval = pval, 159 | z = z, 160 | bdgMean = bdgMean, 161 | bdgSd = bdgSd 162 | ), 163 | rowRanges = windowSummary 164 | ) 165 | colnames(se) <- colnames(counts) 166 | 167 | return(se) 168 | } 169 | 170 | #---------------------------- 171 | # Get Inputs 172 | #---------------------------- 173 | blacklist <- import.bed("data/hg19.blacklist.bed") 174 | windows <- makeWindows(genome = BSgenome.Hsapiens.UCSC.hg19, blacklist = blacklist) 175 | cnaObj <- scCNA(windows, readRDS("data/PBMC_10x-Sub25M-fragments.gr.rds"), neighbors = 100, LFC = 1.5, FDR = 0.1, force = FALSE, remove = c("chrM","chrX","chrY")) 176 | saveRDS(cnaObj, "results/PMC-CNV_LFC_GC.rds") 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | --------------------------------------------------------------------------------