├── .DS_Store ├── .Rbuildignore ├── .Rhistory ├── .gitignore ├── CellTagR.Rproj ├── DESCRIPTION ├── Examples ├── .DS_Store ├── CellTagR CellTag Object V1 V2 V3.pdf ├── CellTag_UTR.fa ├── CloneHunterWhitelistTestRun.Rmd ├── CloneHunterWhitelistTestRun.nb.html ├── bar_Chart.png ├── clone.calling.permutation.test.R ├── hf1.d15.network.construction.html ├── jaccard example.png ├── jaccard wo collapsing.png ├── network construction and visualization.Rmd ├── network construction and visualization.nb.html ├── permutation_python.py ├── post_filtering.png ├── pre_filtering.png ├── sc analysis.Rmd ├── sc analysis.nb.html └── v1_v2_v3.html ├── NAMESPACE ├── R ├── .DS_Store ├── AuxiliaryFunctions.R ├── CellTagExtraction.R ├── CellTagForCollapsing.R ├── CellTagMatrixGeneration.R ├── CellTagNetworkContruction.R ├── CellTagNetworkVisualiztion.R ├── CellTagObjSet.R ├── CellTagWhitelistGeneration.R ├── CloneCalling.R ├── CreateCellTagObject.R ├── MetricBasedPlottingAndFiltering.R ├── ScCellTagMatrixProcess.R └── scripts.zip ├── README.md ├── V2-1_S2_L001_R1_001.fastq ├── inst └── extdata │ ├── .DS_Store │ ├── Demo_V1.Rds │ ├── V2-1_R1.zip │ ├── bam_v123_obj.Rds │ ├── barcodes.tsv │ ├── hf1.d28.prefiltered.Rds │ └── v1_whitelist.csv └── man ├── AddCellTagFreqSort.Rd ├── Barcode.Aggregate.Rd ├── CellTagDataForCollapsing.Rd ├── CellTagDataPostCollapsing.Rd ├── CellTagExtraction.Rd ├── CellTagMatrixCount.Rd ├── CellTagObject.Rd ├── CellTagPatternCalling.Rd ├── CellTagWhitelistFiltering.Rd ├── CloneCalling.Rd ├── JaccardAnalysis.Rd ├── MetricBasedFiltering.Rd ├── MetricPlots.Rd ├── SingleCellDataBinatization.Rd ├── SingleCellDataWhitelist.Rd ├── addData2Nodes.Rd ├── bam.process.Rd ├── convertCellTagMatrix2LinkList.Rd ├── drawSubnet.Rd ├── fastq.process.Rd └── getNodesfromLinkList.Rd /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/.DS_Store 
-------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^CellTagR\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.Rhistory: -------------------------------------------------------------------------------- 1 | panel.grid.major = element_blank(), 2 | panel.grid.minor = element_blank(), 3 | panel.background = element_blank(), 4 | axis.line = element_line(colour = "black")) 5 | background.error <- background.mtx[,c(4,5)] 6 | background.error$category <- col.path.sub[ref.meta.path[rownames(background.error), "cell.bc"], "label"] 7 | ggplot(background.error, aes(x = Error, color = category)) + 8 | geom_density() + 9 | scale_color_viridis_d(begin = 0.15, end = 0.85, option = "A") + 10 | theme(legend.position="right", 11 | legend.text = element_text(size = 10), 12 | axis.title = element_blank(), 13 | panel.grid.major = element_blank(), 14 | panel.grid.minor = element_blank(), 15 | panel.background = element_blank(), 16 | axis.line = element_line(colour = "black")) 17 | ggplot(background.error, aes(x = Lagrangian, color = category)) + 18 | geom_density() + 19 | scale_color_viridis_d(begin = 0.15, end = 0.85, option = "A") + 20 | theme(legend.position="right", 21 | legend.text = element_text(size = 10), 22 | axis.title = element_blank(), 23 | panel.grid.major = element_blank(), 24 | panel.grid.minor = element_blank(), 25 | panel.background = element_blank(), 26 | axis.line = element_line(colour = "black")) 27 | background.error$category <- paste0("Background.", background.error$category) 28 | error.ref.background <- rbind(error.path.all, background.error) 29 | ggplot(error.ref.background, aes(x = Error, color = category)) + 30 | geom_density() + 31 | scale_color_viridis_d(begin = 0.15, end = 0.85, option = "A") + 32 | theme(legend.position="right", 33 | legend.text = element_text(size 
= 10), 34 | axis.title = element_blank(), 35 | panel.grid.major = element_blank(), 36 | panel.grid.minor = element_blank(), 37 | panel.background = element_blank(), 38 | axis.line = element_line(colour = "black")) 39 | ggplot(error.ref.background, aes(x = Lagrangian, color = category)) + 40 | geom_density() + 41 | scale_color_viridis_d(begin = 0.15, end = 0.85, option = "A") + 42 | theme(legend.position="right", 43 | legend.text = element_text(size = 10), 44 | axis.title = element_blank(), 45 | panel.grid.major = element_blank(), 46 | panel.grid.minor = element_blank(), 47 | panel.background = element_blank(), 48 | axis.line = element_line(colour = "black")) 49 | background.error$category <- paste0("Background.", background.error$category) 50 | error.ref.background <- rbind(error.path.all, background.error) 51 | ggplot(error.ref.background, aes(x = Error, color = category)) + 52 | geom_density() + 53 | scale_color_viridis_d(begin = 0.15, end = 0.85, option = "A") + 54 | theme(legend.position="right", 55 | legend.text = element_text(size = 10), 56 | axis.title = element_blank(), 57 | panel.grid.major = element_blank(), 58 | panel.grid.minor = element_blank(), 59 | panel.background = element_blank(), 60 | axis.line = element_line(colour = "black")) 61 | ggplot(error.ref.background, aes(x = Lagrangian, color = category)) + 62 | geom_density() + 63 | scale_color_viridis_d(begin = 0.15, end = 0.85, option = "A") + 64 | theme(legend.position="right", 65 | legend.text = element_text(size = 10), 66 | axis.title = element_blank(), 67 | panel.grid.major = element_blank(), 68 | panel.grid.minor = element_blank(), 69 | panel.background = element_blank(), 70 | axis.line = element_line(colour = "black")) 71 | ggplot(error.path.all, aes(x = Error)) + 72 | geom_density() 73 | ggplot(background.error, aes(x = Error)) + 74 | geom_density() 75 | ggplot(background.error, aes(x = Lagrangian)) + 76 | geom_density() 77 | rslt <- table(classification.path$new.classification, 
classification.path$actual) 78 | rslt 79 | rslt <- as.data.frame(apply(rslt, 2, function(x) round(x * 100/sum(x), digits = 3))) 80 | rownames(rslt) <- paste0("Capy.", rownames(rslt)) 81 | colnames(rslt) <- paste0("Actual.", colnames(rslt)) 82 | rslt$capy <- rownames(rslt) 83 | rslt.stk <- reshape2::melt(rslt) 84 | ggplot(rslt.stk, aes(x = variable, y = capy, fill = value)) + 85 | geom_tile() + 86 | scale_fill_viridis_c(begin = 0.15, end = 0.85, option = "A") + 87 | theme(legend.position="right", 88 | legend.text = element_text(size = 10), 89 | axis.text.x = element_text(angle = 45, size = 12, hjust = 1), 90 | axis.text.y = element_text(size = 12), 91 | axis.title = element_blank(), 92 | panel.grid.major = element_blank(), 93 | panel.grid.minor = element_blank(), 94 | panel.background = element_blank(), 95 | axis.line = element_line(colour = "black")) 96 | library(ggpubr) 97 | error.p1.p2.p3 <- error.path.other[which(error.path.other$category %in% c("Path1.Term", "Path2.Term", "Path3.Term")),] 98 | ggqqplot(error.p1.p2.p3$Error) 99 | shapiro.test(error.p1.p2.p3$Error) 100 | library(ggpubr) 101 | error.p1.p2.p3 <- error.path.other[which(error.path.other$category %in% c("Path1.Term", "Path2.Term", "Path3.Term")),] 102 | ggqqplot(error.p1.p2.p3$Error) 103 | shapiro.test(error.p1.p2.p3$Error) 104 | ggqqplot(error.p1.p2.p3$Lagrangian) 105 | shapiro.test(error.p1.p2.p3$Lagrangian) 106 | library(ggpubr) 107 | error.p1.p2.p3 <- error.path.other[which(error.path.other$category %in% c("Path1.Term", "Path2.Term", "Path3.Term")),] 108 | ggqqplot(error.p1.p2.p3$Error) 109 | shapiro.test(error.p1.p2.p3$Error) 110 | ggqqplot(error.p1.p2.p3$Lagrangian) 111 | shapiro.test(error.p1.p2.p3$Lagrangian) 112 | plot(density(error.p1.p2.p3$Error)) 113 | plot(density(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == "Path1.Term")])) 114 | ggqqplot(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == "Path1.Term")]) 115 | shapiro.test(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == 
"Path1.Term")]) 116 | ggqqplot(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == "Path2.Term")]) 117 | shapiro.test(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == "Path2.Term")]) 118 | shapiro.test(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == "Path3.Term")]) 119 | ggqqplot(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == "Path3.Term")]) 120 | shapiro.test(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == "Path3.Term")]) 121 | ggqqplot(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == "Path3.Term")]) 122 | shapiro.test(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == "Path3.Term")]) 123 | plot(density(error.p1.p2.p3$Error)) 124 | lines(density(rnorm(1000, mean = major.norm.mean.2, sd = new.sigma.2)), col = "red") 125 | plot(density(error.p1.p2.p3$Error)) 126 | lines(density(rnorm(1000, mean = major.norm.mean.2, sd = new.sigma.2)), col = "red") 127 | plot(density(error.p1.p2.p3$Error)) + ylim(0,10) 128 | lines(density(rnorm(1000, mean = major.norm.mean.2, sd = new.sigma.2)), col = "red") 129 | plot(density(error.p1.p2.p3$Error)) + ylim(0,10) 130 | plot(density(error.p1.p2.p3$Error),ylim = c(0,10)) 131 | lines(density(rnorm(1000, mean = major.norm.mean.2, sd = new.sigma.2)), col = "red") 132 | plot(density(rnorm(1000, mean = major.norm.mean.2, sd = new.sigma.2))) 133 | plot(density(error.p1.p2.p3$Error)) 134 | lines(density(rnorm(1000, mean = major.norm.mean, sd = new.sigma)), col = "red") 135 | plot(density(error.p1.p2.p3$Error)) 136 | lines(density(rnorm(1000, mean = major.norm.mean, sd = new.sigma)), col = "red") 137 | plot(density(error.p1.p2.p3$Error)) 138 | lines(density(rnorm(2000, mean = major.norm.mean, sd = new.sigma)), col = "red") 139 | plot(density(error.p1.p2.p3$Error)) 140 | lines(density(rnorm(2000, mean = major.norm.mean, sd = 150)), col = "red") 141 | plot(density(error.p1.p2.p3$Error)) 142 | lines(density(rnorm(2000, mean = major.norm.mean, sd = new.sigma)), col = "red") 143 | ks.test(x = error.p1.p2.p3$Error, y 
= rnorm(2000, mean = major.norm.mean, sd = new.sigma)) 144 | library(ggpubr) 145 | error.p1.p2.p3 <- error.path.other[which(error.path.other$category %in% c("Path1.Term", "Path2.Term", "Path3.Term")),] 146 | ggqqplot(error.p1.p2.p3$Error) 147 | shapiro.test(error.p1.p2.p3$Error) 148 | ggqqplot(error.p1.p2.p3$Lagrangian) 149 | shapiro.test(error.p1.p2.p3$Lagrangian) 150 | ggqqplot(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == "Path3.Term")]) 151 | shapiro.test(error.p1.p2.p3$Error[which(error.p1.p2.p3$category == "Path3.Term")]) 152 | ## Form a major normal distribution for the error 153 | major.norm.mean <- mean(fitpro$parameters$mu) 154 | new.sigma <- mean(fitpro$parameters$sigma) 155 | plot(fitpro, ylim = c(0,0.003)) 156 | ## Form a major normal distribution for the error 157 | major.norm.mean <- mean(fitpro$parameters$mu) 158 | new.sigma <- mean(fitpro$parameters$sigma) 159 | plot(fitpro, ylim = 0.003) 160 | ## Form a major normal distribution for the error 161 | major.norm.mean <- mean(fitpro$parameters$mu) 162 | new.sigma <- mean(fitpro$parameters$sigma) 163 | plot(fitpro) 164 | lines(density(rnorm(1000, mean = major.norm.mean, sd = new.sigma)), col = "red") 165 | ## Form a major normal distribution for the lagrangian 166 | major.norm.mean.2 <- mean(fitpro.2$parameters$mu) 167 | new.sigma.2 <- mean(fitpro.2$parameters$sigma) 168 | plot(fitpro.2) 169 | lines(density(rnorm(1000, mean = major.norm.mean.2, sd = new.sigma.2)), col = "red") 170 | ## Form a major normal distribution for the error 171 | major.norm.mean <- sum(fitpro$parameters$mu * fitpro$parameters$pi) 172 | new.sigma <- sum(fitpro$parameters$sigma * fitpro$parameters$pi) 173 | plot(fitpro) 174 | lines(density(rnorm(1000, mean = major.norm.mean, sd = new.sigma)), col = "red") 175 | ## Form a major normal distribution for the lagrangian 176 | major.norm.mean.2 <- sum(fitpro.2$parameters$mu * fitpro.2$parameters$pi) 177 | new.sigma.2 <- sum(fitpro.2$parameters$sigma * fitpro.2$parameters$pi) 
178 | plot(fitpro.2) 179 | lines(density(rnorm(1000, mean = major.norm.mean.2, sd = new.sigma.2)), col = "red") 180 | ### Get some p-values in the test qp errors 181 | error.path.all$p.values <- 1 - pnorm(error.path.all$Error, mean = major.norm.mean, sd = new.sigma) 182 | error.path.all$p.values.lm.lower <- pnorm(error.path.all$Lagrangian, mean = fitpro.2$parameters$mu[1], sd = fitpro.2$parameters$sigma[1]) 183 | error.path.all$p.values.lm.mode <- pnorm(error.path.all$Lagrangian, mean = major.norm.mean.2, sd = new.sigma.2) 184 | classification.path$new.classification <- classification.path$call 185 | classification.path[rownames(error.path.all)[which(error.path.all$p.values <= 0.05 | (error.path.all$p.values.lm <= 0.005))],"new.classification"] <- "Unassigned" 186 | table(classification.path$new.classification, classification.path$actual) 187 | ### Get some p-values in the test qp errors 188 | error.path.all$p.values <- 1 - pnorm(error.path.all$Error, mean = major.norm.mean, sd = new.sigma) 189 | error.path.all$p.values.lm.lower <- pnorm(error.path.all$Lagrangian, mean = fitpro.2$parameters$mu[1], sd = fitpro.2$parameters$sigma[1]) 190 | error.path.all$p.values.lm.mode <- pnorm(error.path.all$Lagrangian, mean = major.norm.mean.2, sd = new.sigma.2) 191 | classification.path$new.classification <- classification.path$call 192 | classification.path[rownames(error.path.all)[which(error.path.all$p.values <= 0.05 | (error.path.all$p.values.lm.lower <= 0.005))],"new.classification"] <- "Unassigned" 193 | table(classification.path$new.classification, classification.path$actual) 194 | ks.test(x = error.p1.p2.p3$Lagrangian, y = rnorm(2000, mean = major.norm.mean.2, sd = new.sigma.2)) 195 | ks.test(x = error.p1.p2.p3$Lagrangian, y = rnorm(1000, mean = major.norm.mean.2, sd = new.sigma.2)) 196 | ks.test(x = error.p1.p2.p3$Lagrangian, y = rnorm(nrow(error.p1.p2.p3), mean = major.norm.mean.2, sd = new.sigma.2)) 197 | ks.test(x = error.p1.p2.p3$Error, y = 
rnorm(nrow(error.p1.p2.p3), mean = major.norm.mean, sd = new.sigma)) 198 | rslt <- table(classification.path$call, classification.path$actual) 199 | rslt 200 | rslt <- as.data.frame(apply(rslt, 2, function(x) round(x * 100/sum(x), digits = 3))) 201 | rownames(rslt) <- paste0("Capy.", rownames(rslt)) 202 | colnames(rslt) <- paste0("Actual.", colnames(rslt)) 203 | rslt$capy <- rownames(rslt) 204 | rslt.stk <- reshape2::melt(rslt) 205 | ggplot(rslt.stk, aes(x = variable, y = capy, fill = value)) + 206 | geom_tile() + 207 | scale_fill_viridis_c(begin = 0.15, end = 0.85, option = "A") + 208 | theme(legend.position="right", 209 | legend.text = element_text(size = 10), 210 | axis.text.x = element_text(angle = 45, size = 12, hjust = 1), 211 | axis.text.y = element_text(size = 12), 212 | axis.title = element_blank(), 213 | panel.grid.major = element_blank(), 214 | panel.grid.minor = element_blank(), 215 | panel.background = element_blank(), 216 | axis.line = element_line(colour = "black")) 217 | ggplot(rslt.stk, aes(x = variable, y = capy, fill = value)) + 218 | geom_tile() + 219 | scale_fill_viridis_c(begin = 0.15, end = 0.85, option = "A") + 220 | theme(legend.position="right", 221 | legend.text = element_text(size = 10), 222 | axis.text.x = element_text(angle = 45, size = 12, hjust = 1), 223 | axis.text.y = element_text(size = 12), 224 | axis.title = element_blank(), 225 | panel.grid.major = element_blank(), 226 | panel.grid.minor = element_blank(), 227 | panel.background = element_blank(), 228 | axis.line = element_line(colour = "black")) 229 | save.image("~/Desktop/Morris Lab/Manuscripts/Capybara/error evaluation/Simulation Study Notebook/111920_final_simulation_intermed_multi_unknown_workspace.RData") 230 | ref.lsk <- readRDS("~/Desktop/Morris Lab/Manuscripts/Capybara/LARRY Dataset/in vitro/lsk_reference_wo_undifferentiated.Rds") 231 | ref.df.lsk <- ref.lsk[[3]] 232 | ref.sc.lsk <- ref.lsk[[1]] 233 | View(ref.df.lsk) 234 | ref.meta.lsk <- ref.lsk[[2]] 235 | 
View(ref.meta.lsk) 236 | library(Seurat) 237 | library(dplyr) 238 | library(patchwork) 239 | library(ggplot2) 240 | library(ggpubr) 241 | library(viridis) 242 | dox <- Read10X("~/Desktop/Morris Lab/Manuscripts/Capybara/MN revision/nova_seq_110920/Dox_RA/filtered_feature_bc_matrix/") 243 | dox.ra.sag <- Read10X("~/Desktop/Morris Lab/Manuscripts/Capybara/MN revision/nova_seq_110920/Dox_SAG/filtered_feature_bc_matrix/") 244 | dox.obj <- CreateSeuratObject(counts = dox, project = "mn.dox.ra", min.cells = 3, min.features = 200) 245 | dox.ra.sag.obj <- CreateSeuratObject(counts = dox.ra.sag, project = "mn.dox.sag", min.cells = 3, min.features = 200) 246 | dox.obj[["percent.mt"]] <- PercentageFeatureSet(dox.obj, pattern = "^mt-") 247 | dox.ra.sag.obj[["percent.mt"]] <- PercentageFeatureSet(dox.ra.sag.obj, pattern = "^mt-") 248 | VlnPlot(dox.obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) 249 | VlnPlot(dox.ra.sag.obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) 250 | # FeatureScatter is typically used to visualize feature-feature relationships, but can be used 251 | # for anything calculated by the object, i.e. columns in object metadata, PC scores etc. 252 | plot1 <- FeatureScatter(dox.obj, feature1 = "nCount_RNA", feature2 = "percent.mt") 253 | plot2 <- FeatureScatter(dox.ra.sag.obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") 254 | plot1 + plot2 255 | # FeatureScatter is typically used to visualize feature-feature relationships, but can be used 256 | # for anything calculated by the object, i.e. columns in object metadata, PC scores etc. 
257 | plot1 <- FeatureScatter(dox.ra.sag.obj, feature1 = "nCount_RNA", feature2 = "percent.mt") 258 | plot2 <- FeatureScatter(dox.ra.sag.obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") 259 | plot1 + plot2 260 | dox.obj <- subset(dox.obj, subset = nFeature_RNA > 1000 & nFeature_RNA < 7000 & percent.mt <= 15) 261 | dox.ra.sag.obj <- subset(dox.ra.sag.obj, subset = nFeature_RNA > 1000 & nFeature_RNA < 7000 & percent.mt <= 15) 262 | dox.obj <- NormalizeData(dox.obj, normalization.method = "LogNormalize", scale.factor = 10000) 263 | dox.ra.sag.obj <- NormalizeData(dox.ra.sag.obj, normalization.method = "LogNormalize", scale.factor = 10000) 264 | dox.obj <- FindVariableFeatures(dox.obj, selection.method = "vst", nfeatures = 2000) 265 | # Identify the 10 most highly variable genes 266 | top10 <- head(VariableFeatures(dox.obj), 10) 267 | # plot variable features with and without labels 268 | plot1 <- VariableFeaturePlot(dox.obj) 269 | plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE) 270 | plot1 + plot2 271 | dox.ra.sag.obj <- FindVariableFeatures(dox.ra.sag.obj, selection.method = "vst", nfeatures = 2000) 272 | # Identify the 10 most highly variable genes 273 | top10 <- head(VariableFeatures(dox.ra.sag.obj), 10) 274 | # plot variable features with and without labels 275 | plot1 <- VariableFeaturePlot(dox.ra.sag.obj) 276 | plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE) 277 | plot1 + plot2 278 | all.genes <- rownames(dox.obj) 279 | dox.obj <- ScaleData(dox.obj, features = all.genes, vars.to.regress = c("nCount_RNA", "percent.mt")) 280 | all.genes <- rownames(dox.ra.sag.obj) 281 | dox.ra.sag.obj <- ScaleData(dox.ra.sag.obj, features = all.genes, vars.to.regress = c("nCount_RNA", "percent.mt")) 282 | dox.obj <- RunPCA(dox.obj, features = VariableFeatures(object = dox.obj)) 283 | dox.ra.sag.obj <- RunPCA(dox.ra.sag.obj, features = VariableFeatures(object = dox.ra.sag.obj)) 284 | VizDimLoadings(dox.obj, dims = 1:2, reduction = "pca") 
285 | DimPlot(dox.obj, reduction = "pca") 286 | DimHeatmap(dox.obj, dims = 1:15, cells = 500, balanced = TRUE) 287 | # NOTE: This process can take a long time for big datasets, comment out for expediency. More 288 | # approximate techniques such as those implemented in ElbowPlot() can be used to reduce 289 | # computation time 290 | dox.obj <- JackStraw(dox.obj, num.replicate = 100) 291 | dox.obj <- ScoreJackStraw(dox.obj, dims = 1:20) 292 | JackStrawPlot(dox.obj, dims = 1:20) 293 | ElbowPlot(dox.obj) 294 | VizDimLoadings(dox.ra.sag.obj, dims = 1:2, reduction = "pca") 295 | DimHeatmap(dox.ra.sag.obj, dims = 1:15, cells = 500, balanced = TRUE) 296 | # NOTE: This process can take a long time for big datasets, comment out for expediency. More 297 | # approximate techniques such as those implemented in ElbowPlot() can be used to reduce 298 | # computation time 299 | dox.ra.sag.obj <- JackStraw(dox.ra.sag.obj, num.replicate = 100) 300 | dox.ra.sag.obj <- ScoreJackStraw(dox.ra.sag.obj, dims = 1:20) 301 | JackStrawPlot(dox.ra.sag.obj, dims = 1:20) 302 | ElbowPlot(dox.ra.sag.obj) 303 | dox.obj <- FindNeighbors(dox.obj, dims = 1:17) 304 | dox.ra.sag.obj <- FindNeighbors(dox.ra.sag.obj, dims = 1:17) 305 | dox.obj <- FindClusters(dox.obj, resolution = 0.8) 306 | dox.ra.sag.obj <- FindClusters(dox.ra.sag.obj, resolution = 0.8) 307 | dox.obj <- RunUMAP(dox.obj, dims = 1:17) 308 | dox.ra.sag.obj <- RunUMAP(dox.ra.sag.obj, dims = 1:17) 309 | DimPlot(dox.obj, reduction = "umap", label = T, label.size = 12) 310 | DimPlot(dox.ra.sag.obj, reduction = "umap", label = T, label.size = 12) 311 | FeaturePlot(dox.obj, features = c("Pou5f1", "Nanog", "Esrrb"), reduction = "umap") 312 | FeaturePlot(dox.ra.sag.obj, features = c("Pou5f1", "Nanog", "Esrrb"), reduction = "umap") 313 | FeaturePlot(dox.obj, features = c("Tubb3", "Map2", "Mnx1", "Isl1", "Lhx3", "Nefl", "Nefm", "Slit2","Onecut2"), reduction = "umap") 314 | FeaturePlot(dox.ra.sag.obj, features = c("Tubb3", "Map2", "Mnx1", "Isl1", 
"Lhx3", "Nefl", "Nefm", "Slit2", "Onecut2"), reduction = "umap") 315 | dox.marker <- FindAllMarkers(dox.obj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) 316 | dox.marker %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC) 317 | dox.ra.sag.marker <- FindAllMarkers(dox.ra.sag.obj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) 318 | dox.ra.sag.marker %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC) 319 | FeaturePlot(dox.obj, features = c("nCount_RNA", "percent.mito"), reduction = "umap") 320 | FeaturePlot(dox.ra.sag.obj, features = c("nCount_RNA", "percent.mito"), reduction = "umap") 321 | FeaturePlot(dox.obj, features = c("nCount_RNA", "percent.mt"), reduction = "umap") 322 | FeaturePlot(dox.ra.sag.obj, features = c("nCount_RNA", "percent.mt"), reduction = "umap") 323 | marker.read <- readLines("~/Desktop/Morris Lab/Manuscripts/Capybara/MN revision/TableS2.csv") 324 | marker.read.region <- unlist(lapply(strsplit(marker.read, ";"), function(x) x[1])) 325 | marker.read.gm_id <- unlist(lapply(strsplit(marker.read, ";"), function(x) x[2])) 326 | marker.read.gene <- unlist(lapply(strsplit(marker.read, ";"), function(x) x[3])) 327 | marker.df <- data.frame(marker.read.region, marker.read.gm_id, marker.read.gene, stringsAsFactors = F) 328 | cnms <- marker.df[1,] 329 | marker.df <- marker.df[-c(1), ] 330 | colnames(marker.df) <- cnms 331 | gene.list.construct <- list() 332 | unique.region <- unique(marker.df$domain) 333 | for (i in 1:length(unique.region)) { 334 | curr.region <- unique.region[i] 335 | curr.sub <- marker.df[which(marker.df$domain == curr.region), ] 336 | curr.gene.list <- curr.sub$Genes 337 | curr.gene.list <- unique(unlist(lapply(strsplit(curr.gene.list, ", "), function(x) x))) 338 | gene.list.construct[[curr.region]] <- curr.gene.list 339 | } 340 | dox.obj <- AddModuleScore(dox.obj, features = list(gene.list.construct$dl1), ctrl = 5, name = "dl1") 341 | dox.obj <- AddModuleScore(dox.obj, features = 
list(gene.list.construct$dl1), ctrl = 5, name = "dl2") 342 | dox.obj <- AddModuleScore(dox.obj, features = list(gene.list.construct$dl1), ctrl = 5, name = "dl3") 343 | dox.obj <- AddModuleScore(dox.obj, features = list(gene.list.construct$dl1), ctrl = 5, name = "dl4") 344 | dox.obj <- AddModuleScore(dox.obj, features = list(gene.list.construct$dl1), ctrl = 5, name = "dl5") 345 | dox.obj <- AddModuleScore(dox.obj, features = list(c(gene.list.construct$dl1, gene.list.construct$dl2, 346 | gene.list.construct$dl3, gene.list.construct$dl4, 347 | gene.list.construct$dl5)), ctrl = 5, name = "dorsal_features") 348 | dox.obj <- AddModuleScore(dox.obj, features = list(c(gene.list.construct$V0)), ctrl = 5, name = "V0") 349 | dox.obj <- AddModuleScore(dox.obj, features = list(c(gene.list.construct$V1)), ctrl = 5, name = "V1") 350 | dox.obj <- AddModuleScore(dox.obj, features = list(c(gene.list.construct$V2a)), ctrl = 5, name = "V2a") 351 | dox.obj <- AddModuleScore(dox.obj, features = list(c(gene.list.construct$V2b)), ctrl = 5, name = "V2b") 352 | dox.obj <- AddModuleScore(dox.obj, features = list(c(gene.list.construct$V3)), ctrl = 5, name = "V3") 353 | dox.obj <- AddModuleScore(dox.obj, features = list(c(gene.list.construct$V3)), ctrl = 5, name = "MN") 354 | FeaturePlot(dox.obj, features = c("dorsal_features1", "dl11", "dl21", "dl31", "dl41", "dl51")) 355 | dox.ra.sag.obj <- AddModuleScore(dox.ra.sag.obj, features = list(gene.list.construct$dl1), ctrl = 5, name = "dl1") 356 | dox.ra.sag.obj <- AddModuleScore(dox.ra.sag.obj, features = list(gene.list.construct$dl1), ctrl = 5, name = "dl2") 357 | dox.ra.sag.obj <- AddModuleScore(dox.ra.sag.obj, features = list(gene.list.construct$dl1), ctrl = 5, name = "dl3") 358 | dox.ra.sag.obj <- AddModuleScore(dox.ra.sag.obj, features = list(gene.list.construct$dl1), ctrl = 5, name = "dl4") 359 | dox.ra.sag.obj <- AddModuleScore(dox.ra.sag.obj, features = list(gene.list.construct$dl1), ctrl = 5, name = "dl5") 360 | dox.ra.sag.obj <- 
AddModuleScore(dox.ra.sag.obj, features = list(c(gene.list.construct$dl1, gene.list.construct$dl2, 361 | gene.list.construct$dl3, gene.list.construct$dl4, 362 | gene.list.construct$dl5)), ctrl = 5, name = "dorsal_features") 363 | dox.ra.sag.obj <- AddModuleScore(dox.ra.sag.obj, features = list(c(gene.list.construct$V0)), ctrl = 5, name = "V0") 364 | dox.ra.sag.obj <- AddModuleScore(dox.ra.sag.obj, features = list(c(gene.list.construct$V1)), ctrl = 5, name = "V1") 365 | dox.ra.sag.obj <- AddModuleScore(dox.ra.sag.obj, features = list(c(gene.list.construct$V2a)), ctrl = 5, name = "V2a") 366 | dox.ra.sag.obj <- AddModuleScore(dox.ra.sag.obj, features = list(c(gene.list.construct$V2b)), ctrl = 5, name = "V2b") 367 | dox.ra.sag.obj <- AddModuleScore(dox.ra.sag.obj, features = list(c(gene.list.construct$V3)), ctrl = 5, name = "V3") 368 | dox.ra.sag.obj <- AddModuleScore(dox.ra.sag.obj, features = list(c(gene.list.construct$V3)), ctrl = 5, name = "MN") 369 | FeaturePlot(dox.ra.sag.obj, features = c("dorsal_features1", "dl11", "dl21", "dl31", "dl41", "dl51")) 370 | dox.meta <- dox.obj@meta.data 371 | dox.ra.sag.meta <- dox.ra.sag.obj@meta.data 372 | dox.meta$category <- "Dox Only" 373 | dox.ra.sag.meta$category <- "Dox RA SAG" 374 | meta.all <- rbind(dox.meta, dox.ra.sag.meta) 375 | meta.all.sub <- meta.all[,c(8:20)] 376 | dox.meta <- dox.obj@meta.data 377 | dox.ra.sag.meta <- dox.ra.sag.obj@meta.data 378 | dox.meta$category <- "Dox + RA" 379 | dox.ra.sag.meta$category <- "Dox + SAG" 380 | meta.all <- rbind(dox.meta, dox.ra.sag.meta) 381 | meta.all.sub <- meta.all[,c(7:19)] 382 | meta.all.sub.melt <- reshape2::melt(meta.all.sub[,c(6:13)]) 383 | ## Ref: https://stackoverflow.com/questions/17319487/median-and-quartile-on-violin-plots-in-ggplot2 384 | median.quartile <- function(x){ 385 | out <- quantile(x, probs = c(0.25,0.5,0.75)) 386 | names(out) <- c("ymin","y","ymax") 387 | return(out) 388 | } 389 | cs <- viridis(20) 390 | ggplot(meta.all.sub.melt, aes(x = variable, y = 
value, fill = category)) + 391 | geom_boxplot() + 392 | scale_fill_viridis_d(option = "A", begin = 0.5, end = 0.9) 393 | ggplot(meta.all.sub, aes(x = category, y = dorsal_features1, fill = category)) + 394 | geom_violin(trim = T) + 395 | scale_fill_viridis_d(option = "A", begin = 0.5, end = 0.9) + 396 | stat_summary(fun.y=median.quartile,geom='point', color = rep(cs[c(20,1)], each = 3)) + 397 | stat_summary(fun.y=median.quartile,geom='line', color = rep(cs[c(20,1)], each = 3)) + 398 | stat_compare_means(label = "p.signif", label.x = 1.5) 399 | save.image("~/Desktop/Morris Lab/Manuscripts/Capybara/MN revision/nova_seq_110920/Reprogrammed_mapped/112220_Dox_with_ra_or_sag_workspace.RData") 400 | saveRDS(dox.obj, "~/Desktop/Morris Lab/Manuscripts/Capybara/MN revision/nova_seq_110920/dox_ra_seurat.Rds") 401 | saveRDS(dox.ra.sag.obj, "~/Desktop/Morris Lab/Manuscripts/Capybara/MN revision/nova_seq_110920/dox_sag_seurat.Rds") 402 | library(CellTagR) 403 | test.obj <- readRDS("~/Desktop/Morris Lab/CellTagR Edit/filtered_celltag_obj.Rds") 404 | CellTagDataForCollapsing(test.obj, "~/Desktop/collpasing.txt") 405 | remove.packages("CellTagR") 406 | devtools::install_github("morris-lab/CellTagR") 407 | library(CellTagR) 408 | setwd("~/Desktop/") 409 | devtools::install("CellTagR") 410 | library(CellTagR) 411 | test.obj <- readRDS("~/Desktop/Morris Lab/CellTagR Edit/filtered_celltag_obj.Rds") 412 | CellTagDataForCollapsing(test.obj, "~/Desktop/test.txt") 413 | CellTagDataForCollapsing() 414 | CellTagDataForCollapsing 415 | devtools::install("CellTagR") 416 | library(CellTagR) 417 | CellTagDataForCollapsing(test.obj, "~/Desktop/test.txt") 418 | devtools::install("CellTagR") 419 | library(CellTagR) 420 | CellTagDataForCollapsing(test.obj, "~/Desktop/test.txt") 421 | devtools::install("CellTagR") 422 | CellTagDataForCollapsing(test.obj, "~/Desktop/test.txt") 423 | devtools::install("CellTagR") 424 | CellTagDataForCollapsing(test.obj, "~/Desktop/test.txt") 425 | devtools::document() 
426 | setwd("CellTagR/") 427 | devtools::document() 428 | GetCellTagCurrentVersionWorkingMatrix <- function(celltag.obj, slot.to.select) { 429 | curr.mtx <- slot(celltag.obj, slot.to.select) 430 | if (nrow(curr.mtx) <= 0) { 431 | return(curr.mtx) 432 | } else { 433 | curr.version <- celltag.obj@curr.version 434 | curr.mtx.sub <- curr.mtx[, which(startsWith(colnames(curr.mtx), curr.version))] 435 | colnames(curr.mtx.sub) <- gsub(pattern = paste0(curr.version, "."), replacement = "", colnames(curr.mtx.sub)) 436 | full.mtx.sub <- curr.mtx.sub[Matrix::rowSums(is.na(curr.mtx.sub)) != ncol(curr.mtx.sub),] 437 | return(full.mtx.sub) 438 | } 439 | } 440 | test.obj <- readRDS("~/Desktop/Morris Lab/CellTagR Edit/post_collapsing_hf1_d15.Rds") 441 | test.obj <- SingleCellDataBinatization(test.obj, 2) 442 | celltag.obj <- test.obj 443 | dt.mtx.whitelist.path <- system.file("extdata", "v1_whitelist.csv", package = "CellTagR") 444 | celltag.obj <- SingleCellDataWhitelist(celltag.obj, dt.mtx.whitelist.path) 445 | celltag.obj <- MetricBasedFiltering(celltag.obj, 20, comparison = "less") 446 | celltag.obj <- MetricBasedFiltering(celltag.obj, 2, comparison = "greater") 447 | filtered.whitelised.data <- GetCellTagCurrentVersionWorkingMatrix(celltag.obj, "metric.filtered.count") 448 | Jac <- simil(filtered.whitelised.data, method = "Jaccard") 449 | Jac <- proxy::simil(filtered.whitelised.data, method = "Jaccard") 450 | library(Matrix) 451 | install.packages("proxyC") 452 | library(proxyC) 453 | Jac <- proxy::simil(filtered.whitelised.data, method = "Jaccard") 454 | Jac <- proxyC::simil(filtered.whitelised.data, method = "Jaccard") 455 | Jac <- proxyC::simil(filtered.whitelised.data, method = "jaccard") 456 | Jac.2 <- proxy::simil(as.matrix(filtered.whitelised.data), method = "Jaccard") 457 | Jac.2 <- as.matrix(Jac.2) 458 | sum(abs(Jac - Jac.2)) 459 | Jac.mtx <- as.matrix(Jac) 460 | sum(abs(Jac.mtx[rownames(Jac.2), colnames(Jac.2)] - Jac.2)) 461 | View(Jac.2) 462 | View(Jac.mtx) 463 | 
diag(Jac.2) <- 1 464 | sum(abs(Jac.mtx[rownames(Jac.2), colnames(Jac.2)] - Jac.2)) 465 | as(Jac, "dgCMatrix") 466 | as(Jac, "dgTMatrix") 467 | as(as(Jac, "dgTMatrix"), "dgCMatrix") 468 | devtools::install_github("morris-lab/CellTagR") 469 | JaccardAnalysis 470 | library(CellTagR) 471 | JaccardAnalysis 472 | library(CellTagR) 473 | library(CellTagR) 474 | JaccardAnalysis 475 | remove.packages("CellTagR") 476 | devtools::install_github("morris-lab/CellTagR") 477 | library(CellTagR) 478 | JaccardAnalysis 479 | devtools::document() 480 | rm(list = c("GetCellTagCurrentVersionWorkingMatrix")) 481 | devtools::document() 482 | library(CellTagR) 483 | JaccardAnalysis() 484 | JaccardAnalysis 485 | celltag.obj <- JaccardAnalysis(celltag.obj, fast = T) 486 | celltag.obj.2 <- JaccardAnalysis(celltag.obj) 487 | Jaccard.Matrix <- celltag.obj@jaccard.mtx 488 | # Using the igraph package to facilitate the identification of membership to each clone 489 | jac.summ <- Matrix::summary(Jaccard.Matrix) 490 | lower.tri.summ <- subset(jac.summ, i>=j) 491 | test <- sparseMatrix(i = lower.tri.summ$i, 492 | j = lower.tri.summ$j, 493 | x = lower.tri.summ$x, 494 | dims = dim(Jaccard.Matrix)) 495 | test.df <- as.data.frame(Matrix::summary(test)) 496 | test.2 <- Jaccard.Matrix * lower.tri(Jaccard.Matrix) 497 | test.df.2 <- as.data.frame(Matrix::summary(test.2)) 498 | View(test.2) 499 | View(test.df) 500 | View(test.df.2) 501 | # Using the igraph package to facilitate the identification of membership to each clone 502 | jac.summ <- Matrix::summary(Jaccard.Matrix) 503 | lower.tri.summ <- subset(jac.summ, i>j) 504 | test <- sparseMatrix(i = lower.tri.summ$i, 505 | j = lower.tri.summ$j, 506 | x = lower.tri.summ$x, 507 | dims = dim(Jaccard.Matrix)) 508 | test.df <- as.data.frame(Matrix::summary(test)) 509 | View(test.df) 510 | View(test.df.2) 511 | sum(abs(test.df$x - test.df.2$x)) 512 | devtools::document() 513 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | -------------------------------------------------------------------------------- /CellTagR.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | Encoding: UTF-8 9 | 10 | AutoAppendNewline: Yes 11 | StripTrailingWhitespace: Yes 12 | 13 | BuildType: Package 14 | PackageUseDevtools: Yes 15 | PackageInstallArgs: --no-multiarch --with-keep.source 16 | PackageRoxygenize: rd,collate,namespace 17 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: CellTagR 2 | Title: Identify Clonal Identity from ScRNA-Seq and CellTag Data 3 | Version: 0.0.0.9000 4 | Authors@R: person("Samantha", "Morris", email = "s.morris@wustl.edu", role = c("aut", "cre")) 5 | Description: <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Description >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 6 | Depends: R (>= 3.5.0), 7 | gridExtra, 8 | tools, 9 | proxy, 10 | corrplot, 11 | igraph, 12 | data.table, 13 | plyr, 14 | reshape, 15 | Matrix, 16 | tidyverse, 17 | foreach, 18 | networkD3, 19 | proxyC 20 | License: MIT License 21 | Encoding: UTF-8 22 | LazyData: true 23 | RoxygenNote: 7.1.1 24 | -------------------------------------------------------------------------------- /Examples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/Examples/.DS_Store -------------------------------------------------------------------------------- /Examples/CellTagR CellTag Object V1 V2 V3.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/Examples/CellTagR CellTag Object V1 V2 V3.pdf -------------------------------------------------------------------------------- /Examples/CellTag_UTR.fa: -------------------------------------------------------------------------------- 1 | >CellTag.UTR 2 | GAATTCGATGACAGGCGCAGCTTCCGAGGGATTTGAGATCCAGACATGATAAGATACATT 3 | GATGAGTTTGGACAAACCAAAACTAGAATGCAGTGAAAAAAATGCCTTATTTGTGAAATT 4 | TGTGATGCTATTGCCTTATTTGTAACCATTATAAGCTGCAATAAACAAGTTAACA 5 | 6 | >GFP.CDS 7 | ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGAC 8 | GGCGACGTAAACGGCCACAAGTTCAGCGTGTCTGGCGAGGGCGAGGGCGATGCCACCTAC 9 | GGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACC 10 | CTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAG 11 | CAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTC 12 | TTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTG 13 | GTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCAC 14 | AAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAAC 15 | GGCATCAAGGCGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCC 16 | GACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCAC 17 | TACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTC 18 | CTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAA 19 | 20 | 21 | #GTF Entries: 22 | CellTag.UTR custom exon 1 175 . + . gene_id "CellTag.UTR"; transcript_id "celltag.utr"; 23 | GFP.CDS custom exon 1 720 . + . 
gene_id "GFP.CDS"; transcript_id "gfp.cds"; 24 | -------------------------------------------------------------------------------- /Examples/CloneHunterWhitelistTestRun.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Whitelist Regeneration Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ### Load the Package 7 | ```{r} 8 | library(roxygen2) 9 | library(devtools) 10 | setwd("~/Desktop/CloneHunterNew_5/") 11 | setwd("CloneHunterNew/") 12 | devtools::document() 13 | ``` 14 | 15 | ## V1 16 | ### Create CellTag Object 17 | ```{r} 18 | v1.whitelist <- CellTagObject("v1.whitelist", "~/Desktop/CloneHunterTest/V1_S1_L001_R1_001.fastq") 19 | ``` 20 | 21 | ### Extract the CellTag Reads 22 | ```{r} 23 | v1.whitelist <- CellTagExtraction(v1.whitelist, "v1") 24 | ``` 25 | 26 | ### Sort by CellTag Frequency 27 | ```{r} 28 | v1.whitelist <- AddCellTagFreqSort(v1.whitelist) 29 | ``` 30 | 31 | ### V1 Whitelist Generation 32 | ```{r} 33 | v1.whitelist <- CellTagWhitelistFiltering(v1.whitelist, 0.9) 34 | ``` 35 | 36 | ## V2 37 | ### Create CellTag Object 38 | ```{r} 39 | v2.whitelist <- CellTagObject("v1.whitelist", "~/Desktop/CloneHunterTest/V2-1_S2_L001_R1_001.fastq") 40 | ``` 41 | 42 | ### Extract the CellTag Reads 43 | ```{r} 44 | v2.whitelist <- CellTagExtraction(v2.whitelist, "v2") 45 | ``` 46 | 47 | ### Sort by CellTag Frequency 48 | ```{r} 49 | v2.whitelist <- AddCellTagFreqSort(v2.whitelist) 50 | ``` 51 | 52 | ### V2 Whitelist Generation 53 | ```{r} 54 | v2.whitelist <- CellTagWhitelistFiltering(v2.whitelist, 0.9) 55 | ``` 56 | 57 | ## V3 58 | ### Create CellTag Object 59 | ```{r} 60 | v3.whitelist <- CellTagObject("v3.whitelist", "~/Desktop/CloneHunterTest/V2-2_S3_L001_R1_001.fastq") 61 | ``` 62 | 63 | ### Extract the CellTag Reads 64 | ```{r} 65 | v3.whitelist <- CellTagExtraction(v3.whitelist, "v3") 66 | ``` 67 | 68 | ### Sort by CellTag Frequency 69 | ```{r} 70 | v3.whitelist <- AddCellTagFreqSort(v3.whitelist) 
71 | ``` 72 | 73 | ### V2 Whitelist Generation 74 | ```{r} 75 | v3.whitelist <- CellTagWhitelistFiltering(v3.whitelist, 0.9) 76 | ``` 77 | -------------------------------------------------------------------------------- /Examples/bar_Chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/Examples/bar_Chart.png -------------------------------------------------------------------------------- /Examples/clone.calling.permutation.test.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(parallel) 3 | 4 | meta.data <- read.table("/scratch/smlab/CellTag_paper_analysis/permutation_test/final.drop.seq.10x.meta.data.txt", sep = "\t", 5 | stringsAsFactors = F, header = T, row.names = 1) 6 | 7 | meta.data.orig <- read.table("/scratch/smlab/CellTag_paper_analysis/permutation_test/qp.comb.clone.meta.data.drop.seq.10X.txt", sep = "\t", 8 | stringsAsFactors = F, header = T, row.names = 1) 9 | 10 | clones <- meta.data.orig[,c("hf1.v1", "hf1.v2.1", "hf1.v2.2", "hf2.v1", "hf2.v2.1", "hf2.v2.2")] 11 | 12 | hf1 <- clones[, c("hf1.v1", "hf1.v2.1", "hf1.v2.2")] 13 | hf2 <- clones[, c("hf2.v1", "hf2.v2.1", "hf2.v2.2")] 14 | 15 | colnames(hf1) <- c("v1.1", "v2.1", "v2.2") 16 | colnames(hf2) <- c("v1.1", "v2.1", "v2.2") 17 | 18 | hf2.not.na.v1.1 <- which(!is.na(hf2$v1.1)) 19 | hf2.not.na.v2.1 <- which(!is.na(hf2$v2.1)) 20 | hf2.not.na.v2.2 <- which(!is.na(hf2$v2.2)) 21 | index.v1 <- intersect(which(is.na(hf1$v1.1)), hf2.not.na.v1.1) 22 | hf1$v1.1[index.v1] <- 2000 + hf2$v1.1[index.v1] 23 | hf1$v2.1[hf2.not.na.v2.1] <- 2000 + hf2$v2.1[hf2.not.na.v2.1] 24 | hf1$v2.2[hf2.not.na.v2.2] <- 2000 + hf2$v2.2[hf2.not.na.v2.2] 25 | 26 | all.clones <- hf1 27 | # v2.1.unique.clones <- unique(all.clones$v2.1) 28 | # clone.2.1 <- all.clones[,2] 29 | # clone.2.1.count <- as.data.frame(table(clone.2.1)) 30 | # 
over.10.freq <- clone.2.1.count[which(clone.2.1.count$Freq > 10), ] 31 | 32 | v1.1.unique.clones <- unique(all.clones$v1.1) 33 | clone.1.1 <- all.clones[,1] 34 | clone.1.1.count <- as.data.frame(table(clone.1.1)) 35 | over.10.freq.1.1 <- clone.1.1.count[which(clone.1.1.count$Freq > 10), ] 36 | 37 | 38 | # high.number.clone.2.1 <- hf1[which(hf1$v2.1 %in% over.10.freq$clone.2.1), ] 39 | high.number.clone.1.1 <- hf1[which(hf1$v1.1 %in% over.10.freq.1.1$clone.1.1), ] 40 | 41 | #### 42 | # Fast sampling only
# Draw the random barcode sets used by the permutation test for one clone.
#
# clone.id          id of the clone, matched against clone.info$v1.1
# clone.info        data frame with cell barcodes as row names and a v1.1
#                   clone-id column
# over.threshold.df data frame whose row names are clone ids (as character)
#                   and whose Freq column holds the clone size
# subset.df         optional data frame; when given, its row names form an
#                   additional pool to sample from
#
# Returns a list of three elements, each a set of ceiling(nrow(clone.info) /
# clone size) draws of `clone size` barcodes:
#   [[1]] draws from all barcodes starting with "_10X"
#   [[2]] draws from the barcodes of the clone itself
#   [[3]] draws from rownames(subset.df), or NULL when subset.df is NULL
sampling <- function(clone.id, clone.info, over.threshold.df, subset.df=NULL) {
  curr.count <- over.threshold.df[as.character(clone.id), "Freq"]
  curr.cell.barcode <- rownames(clone.info)[which(clone.info$v1.1 == clone.id)]
  replicate.num <- ceiling(nrow(clone.info)/curr.count)
  barcode.names <- rownames(clone.info)[which(startsWith(rownames(clone.info), "_10X"))]
  # Must exist even without subset.df: it is always part of the returned list,
  # and leaving it unassigned made the default call fail with
  # "object 'perm.subset' not found".
  perm.subset <- NULL
  if (!is.null(subset.df)){
    bc.nams <- rownames(subset.df)
    perm.subset <- replicate(replicate.num, sample(bc.nams, curr.count))
  }
  perm <- replicate(replicate.num, sample(barcode.names, curr.count))
  clone.perm <- replicate(replicate.num, sample(curr.cell.barcode, curr.count))
  return(list(perm, clone.perm, perm.subset))
}
56 | 57 | # clones.id <- as.numeric(as.character(over.10.freq$clone.2.1)) 58 | # over.10.freq$clone.2.1 <- as.integer(as.character(over.10.freq$clone.2.1)) 59 | # rownames(over.10.freq) <- over.10.freq$clone.2.1 60 | 61 | clones.id <- as.numeric(as.character(over.10.freq.1.1$clone.1.1)) 62 | over.10.freq.1.1$clone.1.1 <- as.integer(as.character(over.10.freq.1.1$clone.1.1)) 63 | rownames(over.10.freq.1.1) <- over.10.freq.1.1$clone.1.1 64 | num.to.rep <- as.data.frame(seq(1, 50)) 65 | 66 | hf1.w.tp <- cbind(hf1, timepoints = unlist(lapply(strsplit(rownames(hf1), "-"), function(x) x[length(x)]))) 67 | hf1.w.tp.10x.only <- hf1.w.tp[which(startsWith(rownames(hf1.w.tp), "_10X")), ] 68 | hf1.w.tp.10x.only$timepoints <-
as.integer(as.character(hf1.w.tp.10x.only$timepoints)) 69 | hf1.aft.tp.3 <- hf1.w.tp.10x.only[which(hf1.w.tp.10x.only$timepoints >= 3), ] 70 | rslt <- apply(num.to.rep,1, 71 | function(x) { 72 | sampling.ls <- mclapply(over.10.freq.1.1$clone.1.1, sampling, 73 | clone.info = hf1, over.threshold.df = over.10.freq.1.1, subset.df = hf1.w.tp.10x.only, 74 | mc.cores = 24) 75 | return(sampling.ls) 76 | }) 77 | 78 | 79 | save(rslt, file = "/scratch/smlab/CellTag_paper_analysis/permutation_test/sampling_output_v1.RData") 80 | 81 | # load("/scratch/smlab/CellTag_paper_analysis/permutation_test/sampling_output.RData") 82 | # Calculate the percentages
# For one permutation (column `col.num`), compute the percentage of sampled
# cells that fall in cluster 5 of the "res.0.8" column of
# `meta.data.original`, separately for each of the three barcode matrices held
# in `x`.
# Returns a numeric vector of length 3, in the same order as the elements of
# `x`.
percentage.perm.calc <- function(col.num, meta.data.original, x) {
  cluster.5.percent <- function(sample.mtx) {
    sampled.barcodes <- sample.mtx[, col.num]
    sampled.clusters <- meta.data.original[sampled.barcodes, "res.0.8"]
    sum(sampled.clusters == 5, na.rm = TRUE) * 100 / length(sampled.clusters)
  }
  vapply(1:3, function(k) cluster.5.percent(x[[k]]), numeric(1))
}
98 | 99 | percentage.ls <- lapply(rslt, 100 | function(x) { 101 | rep <- 102 | lapply(x, 103 | function(x) { 104 | curr.cell.bar <- x[[2]][1,1] 105 | clone.id <- hf1.w.tp.10x.only[curr.cell.bar, ]$v1.1 106 | curr.cell.barcode <- rownames(hf1.w.tp.10x.only)[which(hf1.w.tp.10x.only$v1.1 == clone.id)] 107 | clone.cluster.0.8 <- meta.data.orig[curr.cell.barcode, "res.0.8"] 108 | percent.null <- length(which(clone.cluster.0.8 == 5)) * 100/length(clone.cluster.0.8) 109 | perc.calc.rslt.ls <- mclapply(seq(1,
ncol(x[[1]])), percentage.perm.calc, 110 | meta.data.original = meta.data.orig, x = x, 111 | mc.cores = 24) 112 | percent.perm <- unlist(lapply(perc.calc.rslt.ls, function(x) x[1])) 113 | percent.clone.perm <- unlist(lapply(perc.calc.rslt.ls, function(x) x[2])) 114 | percent.subset.perm <- unlist(lapply(perc.calc.rslt.ls, function(x) x[3])) 115 | return(list(clone.id, percent.null, percent.perm, percent.clone.perm, percent.subset.perm)) 116 | } 117 | ) 118 | return(rep) 119 | } 120 | ) 121 | 122 | save(percentage.ls, file = "/scratch/smlab/CellTag_paper_analysis/permutation_test/percentage_over_10_v1.RData") 123 | 124 | perm.test.super.ls <- lapply(percentage.ls, 125 | function(x) { 126 | p.value.ls <- lapply(x, 127 | function(y) { 128 | clone.id <- y[[1]] 129 | null.percent <- y[[2]] 130 | real.distribution <- y[[5]] 131 | curr.p <- sum(real.distribution > null.percent)/length(real.distribution) 132 | return(data.frame(clone.num = clone.id, p.val = curr.p)) 133 | }) 134 | p.value.df <- rbindlist(p.value.ls) 135 | return(p.value.df) 136 | }) 137 | 138 | perm.df <- data.frame() 139 | for (i in 1:length(perm.test.super.ls)) { 140 | curr.df <- perm.test.super.ls[[i]] 141 | if (ncol(perm.df) == 0) { 142 | perm.df <- curr.df 143 | } else { 144 | perm.df <- cbind(perm.df, curr.df[,2]) 145 | } 146 | } 147 | 148 | clone.vec <- perm.df$clone.num 149 | perm.df <- as.data.frame(perm.df[,-1]) 150 | rownames(perm.df) <- clone.vec 151 | perm.df <- cbind(perm.df, avg = rowMeans(perm.df)) 152 | 153 | p.val.df <- data.frame(clone.id = rownames(perm.df), avg.p = perm.df[rownames(perm.df),]$avg) 154 | 155 | save(p.val.df, file = "/scratch/smlab/CellTag_paper_analysis/permutation_test/p_value_v1.RData") -------------------------------------------------------------------------------- /Examples/jaccard example.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/Examples/jaccard example.png -------------------------------------------------------------------------------- /Examples/jaccard wo collapsing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/Examples/jaccard wo collapsing.png -------------------------------------------------------------------------------- /Examples/network construction and visualization.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Network Construction Test" 3 | output: html_notebook 4 | --- 5 | 6 | ### Load the package 7 | ```{r} 8 | library(roxygen2) 9 | library(devtools) 10 | setwd("~/Desktop/CloneHunterNew_5/CloneHunterNew/") 11 | devtools::document() 12 | 13 | library(tidyverse) 14 | library(foreach) 15 | library(networkD3) 16 | ``` 17 | 18 | ### Read in the CellTag Object 19 | ```{r} 20 | bam.test.obj <- readRDS("~/Desktop/bam_v123_obj.Rds") 21 | ``` 22 | 23 | ### Calculate the Linked list 24 | ```{r} 25 | bam.test.obj <- convertCellTagMatrix2LinkList(bam.test.obj) 26 | ``` 27 | 28 | ### Get the nodes 29 | ```{r} 30 | bam.test.obj <- getNodesfromLinkList(bam.test.obj) 31 | ``` 32 | 33 | ### Add additional information 34 | ```{r} 35 | additional_data <- data.frame(sample(1:10, size = length(rownames(bam.test.obj@celltag.aggr.final)), replace = TRUE), row.names = rownames(bam.test.obj@celltag.aggr.final)) 36 | colnames(additional_data) <- "Cluster" 37 | 38 | bam.test.obj <- addData2Nodes(bam.test.obj, additional_data) 39 | ``` 40 | 41 | ### Network visualization and plot 42 | 43 | ```{r, fig.width=10, fig.height=10} 44 | bam.test.obj <- drawSubnet(tag = "CellTagV1_2", overlay = "Cluster", celltag.obj = bam.test.obj) 45 | bam.test.obj@network 46 | saveNetwork(bam.test.obj@network, 
"~/Desktop/hf1.d15.network.construction.html") 47 | ``` 48 | 49 | ### Stacked bar charts 50 | ```{r} 51 | bar.data <- bam.test.obj@celltag.aggr.final 52 | bar.data$Cell.BC <- rownames(bar.data) 53 | 54 | bar.data <- gather(bar.data, key = "CellTag", value = "Clone", 1:3, na.rm = FALSE) 55 | ``` 56 | 57 | ### ggplot 58 | ```{r} 59 | ggplot(data = bar.data) + 60 | geom_bar(mapping = aes(x = CellTag, fill = factor(Clone)), position = "fill", show.legend = FALSE) + 61 | scale_y_continuous(labels = scales::percent_format()) + 62 | theme_bw() 63 | ``` 64 | 65 | 66 | -------------------------------------------------------------------------------- /Examples/permutation_python.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import collections 3 | import numpy as np 4 | import math as mt 5 | import time 6 | 7 | def calculate_proportion(sp, cluster_num_list): 8 | clusters_curr = sp.loc[:,"res.0.8"] 9 | unique_curr, counts_curr = np.unique(clusters_curr, return_counts=True) 10 | count_dict_curr = dict(zip(unique_curr, counts_curr)) 11 | curr_total = sum(count_dict_curr.values()) 12 | cluster_dict = {} 13 | for c_n in cluster_num_list: 14 | if c_n in count_dict_curr.keys(): 15 | cluster_dict[c_n] = count_dict_curr[c_n] * 100/curr_total 16 | else: 17 | cluster_dict[c_n] = 0 18 | 19 | return cluster_dict 20 | 21 | 22 | def randomized_test(clones, orig): 23 | clone_sample_randoms = {} 24 | for c_1 in clones: 25 | curr_clone_cells = orig.loc[df['v2.2'] == c_1] 26 | clone_size = len(curr_clone_cells.index) 27 | rep_num = int(mt.ceil(len(orig.index)/clone_size)) 28 | curr_clone_random_percentage = {} 29 | for rep in range(rep_num): 30 | # Sample without replacement 31 | curr_sample = orig.sample(n=clone_size) 32 | curr_percentage_dict = calculate_proportion(curr_sample, cluster_ls) 33 | curr_clone_random_percentage[rep] = curr_percentage_dict 34 | 35 | clone_sample_randoms[c_1] = curr_clone_random_percentage 36 | 37 | 
return clone_sample_randoms 38 | 39 | 40 | df = pd.read_table("meta.clone.clean.integrated.v1.v2.1.v2.2.txt", sep="\t") 41 | v1 = df.loc[:,"v1.1"] 42 | v2 = df.loc[:,"v2.1"] 43 | v22 = df.loc[:, "v2.2"] 44 | clusters = df.loc[:,"res.0.8"] 45 | 46 | not_na_v1 = pd.notnull(v1) 47 | not_na_v2 = pd.notnull(v2) 48 | not_na_v22 = pd.notnull(v22) 49 | 50 | v1_not_na = v1[not_na_v1] 51 | v2_not_na = v2[not_na_v2] 52 | v22_not_na = v22[not_na_v22] 53 | 54 | unique, counts = np.unique(v22_not_na, return_counts=True) 55 | count_dict = dict(zip(unique, counts)) 56 | grt_5_count_dict = {} 57 | 58 | for key, value in count_dict.items(): 59 | if value > 5: 60 | grt_5_count_dict[key] = value 61 | 62 | clones = list(grt_5_count_dict.keys()) 63 | cluster_list, counts_cls = np.unique(clusters, return_counts=True) 64 | cluster_dict = dict(zip(cluster_list, counts_cls)) 65 | cluster_ls = list(cluster_dict.keys()) 66 | 67 | replication_number = 50 68 | time_vec = [] 69 | replication_dict = {} 70 | for j in range(replication_number): 71 | print(j) 72 | start_time = time.time() 73 | curr_replicate_rslt = randomized_test(clones, df) 74 | end_time = time.time() 75 | replication_dict[j] = curr_replicate_rslt 76 | time_vec.append(end_time - start_time) 77 | print(end_time - start_time) 78 | 79 | # Format: {clone id1: {0:[], 1:[], 3:[], 4:[], 6:[]}, clone id2: {0:[], 1:[], 3:[], 4:[], 6:[]}, ...} 80 | rearrange_dict = {} 81 | for k,v in replication_dict.items(): 82 | for sk,sv in v.items(): 83 | curr_c = sk 84 | curr_pct_dict = {} 85 | for ssk,ssv in sv.items(): 86 | for sssk,sssv in ssv.items(): 87 | if sssk not in curr_pct_dict.keys(): 88 | curr_pct_dict[sssk] = [sssv] 89 | else: 90 | curr_pct_dict[sssk].append(sssv) 91 | #print(len(curr_pct_dict[0])) 92 | if curr_c not in rearrange_dict.keys(): 93 | rearrange_dict[curr_c] = curr_pct_dict 94 | else: 95 | for pct_k,pct_v in curr_pct_dict.items(): 96 | rearrange_dict[curr_c][pct_k].extend(pct_v) 97 | 98 | p_val_grt_overall = {} 99 | 
p_val_less_overall = {} 100 | for key_1,val_1 in rearrange_dict.items(): 101 | curr_cl = key_1 102 | grt_p = {} 103 | less_p = {} 104 | for ky,vl in val_1.items(): 105 | grt_p[ky] = sum(i > clone_null_pct[curr_cl][ky] for i in vl)/len(vl) 106 | less_p[ky] = sum(j < clone_null_pct[curr_cl][ky] for j in vl)/len(vl) 107 | p_val_grt_overall[curr_cl] = grt_p 108 | p_val_less_overall[curr_cl] = less_p 109 | 110 | grt_df = pd.DataFrame(p_val_grt_overall) 111 | less_df = pd.DataFrame(p_val_less_overall) 112 | 113 | clusters_all = {} 114 | for k1,vl in rearrange_dict.items(): 115 | for k2,vl2 in vl.items(): 116 | if k2 in clusters_all.keys(): 117 | clusters_all[k2].extend(vl2) 118 | else: 119 | clusters_all[k2] = vl2 120 | 121 | clster_df = pd.DataFrame(clusters_all) 122 | null_df = pd.DataFrame(clone_null_pct) 123 | 124 | null_df.to_csv("permutation_clean_null_v2_2_ca.txt", sep = "\t") 125 | clster_df.to_csv("percentages_all_clusters_v2_2_ca.txt", sep = "\t") 126 | grt_df.to_csv("p_value_hyper_v2_2_ca.txt", sep = "\t") 127 | less_df.to_csv("p_value_hypo_v2_2_ca.txt", sep = "\t") 128 | 129 | -------------------------------------------------------------------------------- /Examples/post_filtering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/Examples/post_filtering.png -------------------------------------------------------------------------------- /Examples/pre_filtering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/Examples/pre_filtering.png -------------------------------------------------------------------------------- /Examples/sc analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Single-Cell CellTag Data Analysis" 3 | output: html_notebook 4 
| --- 5 | 6 | # Load the CloneHunter package if already installed while carrying out whitelisting 7 | ```{r} 8 | library(CellTagR) 9 | ``` 10 | 11 | ## The following two steps are skipped for time saving and demo purposes 12 | ### Create a CellTag Object 13 | ```{r, eval=FALSE} 14 | bam.test.obj <- CellTagObject(object.name = "", fastq.bam.directory = "") 15 | ``` 16 | 17 | ### Extract CellTag Information 18 | ```{r, eval=FALSE} 19 | bam.test.obj <- CellTagExtraction(celltag.obj = bam.test.obj, celltag.version = "v1") 20 | ``` 21 | 22 | ### Load the demo object 23 | ```{r} 24 | bam.test.obj <- readRDS("~/Desktop/presentation/Demo/demo_object.Rds") 25 | head(bam.test.obj@bam.parse.rslt[["v1"]]) 26 | bam.test.obj@celltag.stats 27 | ``` 28 | 29 | ### Generate the count matrix 30 | ```{r} 31 | bam.test.obj <- CellTagMatrixCount(celltag.obj = bam.test.obj, barcodes.file = "~/Desktop/presentation/Demo/barcodes.tsv") 32 | dim(bam.test.obj@raw.count) 33 | ``` 34 | 35 | ### Generate file for collapsing 36 | ```{r} 37 | bam.test.obj <- CellTagDataForCollapsing(celltag.obj = bam.test.obj, output.file = "~/Desktop/collapsing.txt") 38 | ``` 39 | 40 | ### Process the collapsing result 41 | ```{r} 42 | bam.test.obj <- CellTagDataPostCollapsing(celltag.obj = bam.test.obj, collapsed.rslt.file = "~/Desktop/test_starcode_out_2.txt") 43 | dim(bam.test.obj@collapsed.count) 44 | ``` 45 | 46 | ### generate the Binary matrix 47 | ```{r} 48 | bam.test.obj <- SingleCellDataBinarization(bam.test.obj, 2) 49 | ``` 50 | 51 | ### Look at the metric plots 52 | ```{r, fig.width=10, fig.height=10} 53 | MetricPlots(bam.test.obj) 54 | ``` 55 | 56 | ### Whitelist based filtering 57 | ```{r} 58 | bam.test.obj <- SingleCellDataWhitelist(bam.test.obj, "~/Desktop/Morris Lab/CloneHunter/inst/extdata/v1_whitelist.csv") 59 | dim(bam.test.obj@whitelisted.count) 60 | ``` 61 | 62 | ### Metric Based Filtering 63 | ```{r} 64 | bam.test.obj <- MetricBasedFiltering(bam.test.obj, 20, comparison = "less") 65 | 
bam.test.obj <- MetricBasedFiltering(bam.test.obj, 2, comparison = "greater") 66 | dim(bam.test.obj@metric.filtered.count) 67 | ``` 68 | 69 | ### Metric Plots Again to Check for Additional Filtering 70 | ```{r, fig.width=10, fig.height=10} 71 | MetricPlots(bam.test.obj) 72 | ``` 73 | 74 | ### Jaccard Analysis 75 | ```{r} 76 | bam.test.obj <- JaccardAnalysis(bam.test.obj) 77 | ``` 78 | 79 | ### Clone Calling 80 | ```{r} 81 | bam.test.obj <- CloneCalling(celltag.obj = bam.test.obj, correlation.cutoff=0.7) 82 | bam.test.obj@clone.composition[["v1"]] 83 | bam.test.obj@clone.size.info[["v1"]] 84 | ``` 85 | 86 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(AddCellTagFreqSort) 4 | export(Barcode.Aggregate) 5 | export(CellTagDataForCollapsing) 6 | export(CellTagDataPostCollapsing) 7 | export(CellTagExtraction) 8 | export(CellTagMatrixCount) 9 | export(CellTagObject) 10 | export(CellTagPatternCalling) 11 | export(CellTagWhitelistFiltering) 12 | export(CloneCalling) 13 | export(JaccardAnalysis) 14 | export(MetricBasedFiltering) 15 | export(MetricPlots) 16 | export(SingleCellDataBinarization) 17 | export(SingleCellDataWhitelist) 18 | export(addData2Nodes) 19 | export(bam.process) 20 | export(convertCellTagMatrix2LinkList) 21 | export(drawSubnet) 22 | export(fastq.process) 23 | export(getNodesfromLinkList) 24 | exportClasses(CellTag) 25 | exportMethods(show) 26 | -------------------------------------------------------------------------------- /R/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/R/.DS_Store -------------------------------------------------------------------------------- /R/AuxiliaryFunctions.R: 
--------------------------------------------------------------------------------
#' Fastq Process Function
#'
#' This function scans a raw fastq sequencing file for reads that contain the
#' CellTag motif and extracts, for every matching read, the full tag region and
#' the 8nt CellTag inside it.
#' @param fastq.file Path to the input fastq file
#' @param pattern The pattern (regular expression) to seek for
#' @param short.nt.before.tag A short sequence before the 8nt tag to help more specific identification
#' @param short.nt.after.tag A short sequence after the 8nt tag to help more specific identification
#' @return A list of two character vectors with one entry per matching read, in
#'   file order: (1) the full tag regions (flanking sequences plus the 8nt tag)
#'   and (2) the 8nt tags only.
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' fastq.process("data.fastq", "CCGGT[ATCG]{8}GAATTC", "CCGGT", "GAATTC")
#'
fastq.process <- function(fastq.file, pattern, short.nt.before.tag, short.nt.after.tag) {
  # Must stay a multiple of 4 so that every chunk starts on a record boundary
  lines.per.chunk <- 1000000
  con <- file(fastq.file, "r")
  # Release the connection even when reading or matching fails part-way
  on.exit(close(con), add = TRUE)

  print("Reading File......")
  # Rough estimate of the number of chunks, only used to scale the progress
  # bar. Clamped to 1 so that tiny or empty files do not produce an invalid
  # (max <= min) progress bar.
  fq.size <- file.size(fastq.file)
  total <- max(fq.size / (lines.per.chunk * 101), 1)
  pb <- txtProgressBar(min = 0, max = total, style = 3)
  on.exit(close(pb), add = TRUE)

  before.len <- nchar(short.nt.before.tag)
  full.len <- before.len + 8 + nchar(short.nt.after.tag)

  # Collect per-chunk results and flatten once at the end instead of
  # re-copying ever-growing vectors on each iteration
  full.tag.chunks <- list()
  only.tag.chunks <- list()
  count <- 0
  repeat {
    curr.lines <- readLines(con, lines.per.chunk)
    if (length(curr.lines) == 0) break

    # The sequence is the 2nd line of each 4-line fastq record; indexing by
    # the actual chunk length avoids NA padding on a short final chunk
    curr.seqs <- curr.lines[seq_along(curr.lines) %% 4 == 2]
    reg.rslt <- regexpr(pattern, curr.seqs, ignore.case = TRUE, perl = TRUE)
    contain.idx <- which(reg.rslt > 0)

    start.loc <- reg.rslt[contain.idx]
    curr.full.tag <- substr(curr.seqs[contain.idx], start = start.loc, stop = start.loc + full.len - 1)
    only.tag <- substr(curr.full.tag, start = before.len + 1, stop = before.len + 8)

    full.tag.chunks[[length(full.tag.chunks) + 1]] <- curr.full.tag
    only.tag.chunks[[length(only.tag.chunks) + 1]] <- only.tag

    # The size estimate can undershoot; never push the bar past its maximum
    count <- min(count + 1, total)
    setTxtProgressBar(pb, count)
  }

  list(unlist(full.tag.chunks), unlist(only.tag.chunks))
}

#' Bam File Process Function
#'
#' This function extracts CellTags from the bam sequencing file, provides cell barcode, umi and their corresponding celltag information.
#' @param bam.file The input bam data directory
#' @param pattern The pattern to seek for
#' @param short.nt.before.tag A short sequence before the 8nt tag to help more specific identification
#' @param short.nt.after.tag A short sequence after the 8nt tag to help more specific identification
#' @param technique The single-cell technology that produced the bam file: "10x", "dropseq" or "zumi" (case-insensitive). It selects which bam tags hold the cell barcode and the UMI.
#' @return A data table with columns Cell.BC, UMI and Cell.Tag, one row per read containing the motif
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' bam.process("data.bam", "CCGGT[ATCG]{8}GAATTC", "CCGGT", "GAATTC", "10x")
#'
bam.process <- function(bam.file, pattern, short.nt.before.tag, short.nt.after.tag, technique) {
  # Bam tags to load per technology, and which of them carry barcode / UMI.
  # Checked before anything is opened so that an unsupported technology does
  # not leave an open bam handle behind.
  tag.map <- list(
    "10x" = list(tags = c("CB", "GN", "UB", "CR"), cell.bc = "CB", umi = "UB"),
    "dropseq" = list(tags = c("XC", "GN", "XM", "GE"), cell.bc = "XC", umi = "XM"),
    "zumi" = list(tags = c("BC", "GN", "UB", "CR"), cell.bc = "BC", umi = "UB")
  )
  curr.tags <- tag.map[[tolower(technique)]]
  if (is.null(curr.tags)) {
    stop("We don't support your current single-cell sequencing technology. Please contact us to add.")
  }

  # Install Rsamtools
  if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
  if (!requireNamespace("Rsamtools", quietly = TRUE)) {
    BiocManager::install("Rsamtools")
  }
  library(Rsamtools)

  # Get the bam file
  bamFile <- BamFile(bam.file)
  # Rough estimate of the number of chunks, only used to scale the progress bar
  bam.size <- file.size(bam.file)
  total <- max(bam.size / (1000000 * 82.99), 1)
  print(paste0("Reading ", bam.file, " ..."))
  pb <- txtProgressBar(min = 0, max = total, style = 3)
  on.exit(close(pb), add = TRUE)

  # Number of records to read at once
  yieldSize(bamFile) <- 1000000
  open(bamFile)
  # Close the bam handle even if parsing fails part-way
  on.exit(close(bamFile), add = TRUE)
  parameters <- ScanBamParam(what = scanBamWhat(), tag = curr.tags$tags)

  before.len <- nchar(short.nt.before.tag)
  full.len <- before.len + 8 + nchar(short.nt.after.tag)

  # Per-chunk tables are bound once at the end rather than rbind-ing inside
  # the loop
  parsed.chunks <- list()
  count <- 0
  repeat {
    curr.read <- scanBam(bamFile, param = parameters)[[1]]
    if (length(curr.read$qname) <= 0) break

    curr.seqs <- as.character(curr.read$seq)
    # Check if the sequences contain the celltag motif
    reg.rslt <- regexpr(pattern, curr.seqs, ignore.case = TRUE, perl = TRUE)
    contain.idx <- which(reg.rslt > 0)
    if (length(contain.idx) > 0) {
      curr.cell.bc <- curr.read$tag[[curr.tags$cell.bc]]
      curr.umi <- curr.read$tag[[curr.tags$umi]]
      # A chunk without the barcode or UMI tag cannot be attributed; skip it
      if (!(is.null(curr.cell.bc) || is.null(curr.umi))) {
        start.loc <- reg.rslt[contain.idx]
        curr.full.tag <- substr(curr.seqs[contain.idx], start = start.loc, stop = start.loc + full.len - 1)
        only.tag <- substr(curr.full.tag, start = before.len + 1, stop = before.len + 8)
        parsed.chunks[[length(parsed.chunks) + 1]] <- data.table(Cell.BC = curr.cell.bc[contain.idx],
                                                                 UMI = curr.umi[contain.idx],
                                                                 Cell.Tag = only.tag)
      }
    }
    count <- min(count + 1, total)
    setTxtProgressBar(pb, count)
  }

  if (length(parsed.chunks) == 0) {
    return(data.table())
  }
  rbindlist(parsed.chunks)
}

#' CellTag Pattern Calling Function
#'
#' This function provides motif patterns corresponding to the input celltag version
#' @param celltag.version Which CellTag version are you investigating? One of "v1", "v2" or "v3".
#' @return A character vector of length 3: the motif pattern, the nucleotides to look for before the 8nt tag, and the nucleotides to look for after it
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' CellTagPatternCalling("v1")
#'
CellTagPatternCalling <- function(celltag.version) {
  nt.before.tag <- c(v1 = "GGT", v2 = "GTGATG", v3 = "TGTACG")
  # An unknown version used to produce the silent garbage pattern
  # "NA[ATCG]{8}GAATTC"; fail loudly instead
  if (length(celltag.version) != 1 || !(celltag.version %in% names(nt.before.tag))) {
    stop("Unknown CellTag version. Supported versions: ",
         paste(names(nt.before.tag), collapse = ", "), call. = FALSE)
  }
  short.nt.before.tag <- nt.before.tag[[celltag.version]]
  short.nt.after.tag <- "GAATTC"

  pattern <- paste0(short.nt.before.tag, "[ATCG]{8}", short.nt.after.tag)
  c(pattern, short.nt.before.tag, short.nt.after.tag)
}
181 | 182 | #' CellTag Barcode Aggregation function 183 | #' 184 | #' This function allows barcode aggregation of multiple-file processing.
#' @param file.list files in a list to aggregate in order same as the BAM files
#' @param output.file where to save this aggregated output file. Should be a .tsv file.
#' @return No return value. Called for its side effect of writing the prefixed barcodes to output.file
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' Barcode.Aggregate(list("barcodes_1.tsv", "barcodes_2.tsv"), output.file = "barcode_aggr.tsv")
#'
Barcode.Aggregate <- function(file.list, output.file) {
  # Prefix every barcode with "Sample-<i>_", where i is the position of its
  # file in file.list. seq_along keeps an empty list from being indexed.
  prefixed.bc <- lapply(seq_along(file.list), function(i) {
    curr.bc <- read.table(file.list[[i]], header = FALSE, stringsAsFactors = FALSE)
    paste0("Sample-", i, "_", curr.bc[, 1])
  })
  final.bc <- as.character(unlist(prefixed.bc))
  write.table(as.data.frame(final.bc), output.file, sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
  invisible(NULL)
}


# Return the part of a count-matrix slot that belongs to the current CellTag
# version: columns named "<curr.version>.<CellTag>" with the prefix stripped,
# and rows that are NA in every selected column removed.
# An empty slot (zero rows) is returned unchanged.
GetCellTagCurrentVersionWorkingMatrix <- function(celltag.obj, slot.to.select) {
  curr.mtx <- slot(celltag.obj, slot.to.select)
  if (nrow(curr.mtx) <= 0) {
    return(curr.mtx)
  }

  version.prefix <- paste0(celltag.obj@curr.version, ".")
  is.curr.version <- startsWith(colnames(curr.mtx), version.prefix)
  # drop = FALSE keeps a matrix even when a single column (or row) is selected.
  curr.mtx.sub <- curr.mtx[, is.curr.version, drop = FALSE]
  # Strip the prefix positionally; the previous gsub() treated "." as a regex
  # wildcard and was not anchored to the start of the name.
  colnames(curr.mtx.sub) <- substring(colnames(curr.mtx.sub), nchar(version.prefix) + 1)

  all.na.row <- Matrix::rowSums(is.na(curr.mtx.sub)) == ncol(curr.mtx.sub)
  curr.mtx.sub[!all.na.row, , drop = FALSE]
}

# Store final.to.set in the given count-matrix slot under the current version.
# Columns are renamed to "<curr.version>.<name>". With replace = TRUE the slot
# is overwritten; otherwise columns of other versions are kept, columns of the
# current version are replaced, and rows missing on either side are NA-filled.
SetCellTagCurrentVersionWorkingMatrix <- function(celltag.obj, slot.to.set, final.to.set, replace = FALSE) {
  version.prefix <- paste0(celltag.obj@curr.version, ".")
  cop.final <- final.to.set
  colnames(cop.final) <- paste0(version.prefix, colnames(cop.final))

  if (replace) {
    slot(celltag.obj, slot.to.set) <- cop.final
    return(celltag.obj)
  }

  curr.existing.mtx <- slot(celltag.obj, slot.to.set)
  if (sum(dim(curr.existing.mtx)) <= 0) {
    slot(celltag.obj, slot.to.set) <- cop.final
    return(celltag.obj)
  }

  # Drop whatever is already stored for the current version so it is replaced
  # rather than duplicated.
  keep.col <- !startsWith(colnames(curr.existing.mtx), version.prefix)
  curr.existing.mtx <- curr.existing.mtx[, keep.col, drop = FALSE]

  new.rownames <- unique(c(rownames(curr.existing.mtx), rownames(cop.final)))

  # NA padding for cells present in the existing matrix but not in the new one.
  diff.rnms <- setdiff(new.rownames, rownames(cop.final))
  cop.comp.mtx <- matrix(NA, nrow = length(diff.rnms), ncol = ncol(cop.final))
  rownames(cop.comp.mtx) <- diff.rnms
  colnames(cop.comp.mtx) <- colnames(cop.final)

  # NA padding for cells present in the new matrix but not in the existing one.
  diff.rnms.2 <- setdiff(new.rownames, rownames(curr.existing.mtx))
  cem.comp.mtx <- matrix(NA, nrow = length(diff.rnms.2), ncol = ncol(curr.existing.mtx))
  rownames(cem.comp.mtx) <- diff.rnms.2
  colnames(cem.comp.mtx) <- colnames(curr.existing.mtx)

  to.merge.mtx.cop <- rbind(cop.final, cop.comp.mtx)
  to.merge.mtx.cem <- rbind(curr.existing.mtx, cem.comp.mtx)

  if (ncol(to.merge.mtx.cem) <= 0) {
    new.mtx <- to.merge.mtx.cop[, colnames(cop.final), drop = FALSE]
  } else {
    new.mtx <- cbind(to.merge.mtx.cop[new.rownames, , drop = FALSE],
                     to.merge.mtx.cem[new.rownames, , drop = FALSE])
  }

  slot(celltag.obj, slot.to.set) <- new.mtx
  return(celltag.obj)
}


# --------------------------------------------------------------------------------
# /R/CellTagExtraction.R:
# --------------------------------------------------------------------------------
#' CellTag Extraction Function
#'
#' This function extracts CellTags from the raw fastq/bam sequencing file. If it is a fastq file, provides counts of each CellTag and sorts them in descending order. If it is a bam file, returns the barcode, umi, celltag information.
#' @param celltag.obj A CellTag object initialized with path to the fastq/bam file
#' @param celltag.version The CellTag version to extract
#' @param technique The technique used for scRNA-seq, Default to 10x. Passed on to bam.process.
#' @return A CellTag object with attribute (bam.parse.rslt) filled for bam input, or attributes (fastq.full.celltag, fastq.only.celltag) filled for fastq input
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' CellTagExtraction(bam.test.obj)
#'
CellTagExtraction <- function(celltag.obj, celltag.version, technique = "10x") {
  celltag.obj@curr.version <- celltag.version
  if (file_test("-f", celltag.obj@fastq.bam.dir)) {
    fastq.bam.input <- celltag.obj@fastq.bam.dir
  } else {
    fastq.bam.input <- list.files(celltag.obj@fastq.bam.dir, full.names = TRUE)
  }

  # Classify every input file. Testing the vector of extensions with `||`
  # fails as soon as a directory holds more than one extension (for example a
  # .bam next to its .bai index), so files are selected by type instead.
  file.extensions <- file_ext(fastq.bam.input)
  is.fastq <- endsWith(file.extensions, "fastq") | endsWith(file.extensions, "fq")
  is.bam <- endsWith(file.extensions, "bam")
  if (!any(is.fastq) && !any(is.bam)) {
    stop("No fastq/fq or bam file found at: ", celltag.obj@fastq.bam.dir, call. = FALSE)
  }
  if (any(is.fastq) && any(is.bam)) {
    stop("Both fastq and bam files found at: ", celltag.obj@fastq.bam.dir,
         ". Please provide a single input type.", call. = FALSE)
  }

  # Register the version; an already registered version is re-processed.
  if (celltag.obj@curr.version %in% celltag.obj@celltag.version) {
    print("This CellTag has already been processed!")
  } else {
    celltag.obj@celltag.version <- c(celltag.obj@celltag.version, celltag.obj@curr.version)
  }

  p.calling <- CellTagPatternCalling(celltag.version)

  if (any(is.fastq)) {
    fastq.files <- fastq.bam.input[is.fastq]
    if (length(fastq.files) > 1) {
      stop("Please process the whitelist files one at a time!")
    }
    rslt <- fastq.process(fastq.file = fastq.files, pattern = p.calling[1], p.calling[2], p.calling[3])
    celltag.obj@fastq.full.celltag[[celltag.version]] <- rslt[[1]]
    celltag.obj@fastq.only.celltag[[celltag.version]] <- rslt[[2]]
  } else {
    bam.files <- fastq.bam.input[is.bam]
    rslt <- NULL
    for (i in seq_along(bam.files)) {
      curr.rslt <- bam.process(bam.file = bam.files[i], pattern = p.calling[1], p.calling[2], p.calling[3], technique)
      # With several BAM files, cell barcodes are made unique per file with a
      # "Sample-<i>_" prefix. A file without any hit has nothing to prefix.
      if (length(bam.files) > 1 && nrow(curr.rslt) > 0) {
        curr.rslt$Cell.BC <- paste0("Sample-", i, "_", curr.rslt$Cell.BC)
      }
      if (is.null(rslt)) {
        rslt <- curr.rslt
      } else {
        rslt <- rbind(rslt, curr.rslt, fill = TRUE)
      }
    }
    celltag.obj@bam.parse.rslt[[celltag.version]] <- rslt
  }

  return(celltag.obj)
}


# --------------------------------------------------------------------------------
# /R/CellTagForCollapsing.R:
# --------------------------------------------------------------------------------
#' CellTag Starcode Prior Collapsing
#'
#' This function generates the .txt file that will be fed into starcode - https://github.com/gui11aume/starcode - to collapse similar CellTags.
#' @param celltag.obj A CellTag object with the raw count matrix filled.
#' @param output.file The filepath and name to save the table for collapsing (usually a .txt file). With multi-sample data one file per sample is written, named <output.file without extension>_<sample prefix>.txt
#' @return A CellTag object with collapsing mapping table stored in pre.starcode slot
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' CellTagDataForCollapsing(bam.test.obj, "./collapsing.txt")
#'
CellTagDataForCollapsing <- function(celltag.obj, output.file) {
  # n-th piece of every string after splitting on sep (NA when absent).
  nth.part <- function(x, sep, n) {
    vapply(strsplit(x, sep), function(parts) parts[n], character(1))
  }

  # Get the data out from the CellTag object
  umi.matrix <- GetCellTagCurrentVersionWorkingMatrix(celltag.obj, "raw.count")

  for.collapse <- as.data.frame(Matrix::summary(umi.matrix))
  for.collapse$i <- rownames(umi.matrix)[for.collapse$i]
  for.collapse$j <- colnames(umi.matrix)[for.collapse$j]

  # X2 = cell barcode, X1 = CellTag, value = UMI count
  colnames(for.collapse) <- c("X2", "X1", "value")
  for.collapse$X1 <- as.character(for.collapse$X1)
  for.collapse$X2 <- as.character(for.collapse$X2)
  for.collapse <- for.collapse[which(for.collapse$value > 0), ]

  # Multi-sample data is recognised by the "Sample-<i>_" barcode prefix added
  # by CellTagExtraction. The number of files in fastq.bam.dir is not a
  # reliable signal (index files, or a directory that has changed since).
  is.multi.sample <- nrow(for.collapse) > 0 &&
    all(grepl("^Sample-[0-9]+_", for.collapse$X2))

  # Create the concatenation column: CellTag followed by the cell barcode
  # without its sample prefix and without its "-<n>" suffix.
  if (is.multi.sample) {
    bare.bc <- nth.part(for.collapse$X2, "_", 2)
    for.collapse$concat <- paste0(for.collapse$X1, nth.part(bare.bc, "-", 1))
    sample.list.prefix <- unique(nth.part(for.collapse$X2, "_", 1))
    # Remove only the extension. Splitting on "." returned an empty base name
    # for relative paths such as "./collapsing.txt".
    output.base <- tools::file_path_sans_ext(output.file)
    saved.files <- character(0)
    for (curr.prefix in sample.list.prefix) {
      in.sample <- startsWith(for.collapse$X2, paste0(curr.prefix, "_"))
      for.collapse.sub <- for.collapse[in.sample, c("concat", "value")]
      filename.to.save <- paste0(output.base, "_", curr.prefix, ".txt")
      write.table(for.collapse.sub, filename.to.save, sep = "\t", row.names = FALSE, quote = FALSE, col.names = FALSE)
      saved.files <- c(saved.files, filename.to.save)
    }
  } else {
    for.collapse$concat <- paste0(for.collapse$X1, nth.part(for.collapse$X2, "-", 1))
    for.collapse.sub <- for.collapse[, c("concat", "value")]
    write.table(for.collapse.sub, output.file, sep = "\t", row.names = FALSE, quote = FALSE, col.names = FALSE)
    saved.files <- output.file
  }
  # Set CellTag object
  celltag.obj@pre.starcode[[celltag.obj@curr.version]] <- for.collapse
  # Print the path(s) actually written
  cat("The file for collapsing is stored at: ", paste(saved.files, collapse = ", "), "\n")
  return(celltag.obj)
}

#' CellTag Starcode Post Collapsing
#'
#' This function processes the result generated from starcode - https://github.com/gui11aume/starcode.
#' @param celltag.obj A CellTag object with the pre-starcode mapping matrix filled.
#' @param collapsed.rslt.file File path to the collapsed result file
#' @param replace.option If TRUE, the collapsed.count slot is overwritten instead of being merged with what it already holds. Default to FALSE.
#' @return A CellTag object with collapsed count matrix stored in collapsed.count slot
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' CellTagDataPostCollapsing(bam.test.obj, "./collapsing_result.txt")
#'
CellTagDataPostCollapsing <- function(celltag.obj, collapsed.rslt.file, replace.option = FALSE) {
  ultimate.collapsing.df <- data.frame()
  for (i in seq_along(collapsed.rslt.file)) {
    # Process this one by one
    curr.file.dir <- collapsed.rslt.file[i]
    print(paste0("Processing ", curr.file.dir))
    # Read in the collapsed result: V1 = centroid, V2 = count, V3 = members
    collapsed <- read.table(curr.file.dir, sep = "\t", header = FALSE, stringsAsFactors = FALSE)
    if (ncol(collapsed) < 3) {
      # Without the member column every cluster was silently skipped.
      stop("The collapsed result file '", curr.file.dir,
           "' must have three tab-separated columns (centroid, count, cluster members); ",
           "run starcode with --print-clusters.", call. = FALSE)
    }
    # Read in the table that was fed to starcode; with several result files
    # the sample is taken from the last "_"-separated part of the file name.
    collapsing <- celltag.obj@pre.starcode[[celltag.obj@curr.version]]
    if (length(collapsed.rslt.file) > 1) {
      curr.sample.parts <- strsplit(basename(curr.file.dir), "_")[[1]]
      curr.sample <- strsplit(curr.sample.parts[length(curr.sample.parts)], "[.]")[[1]][1]
      collapsing <- collapsing[which(startsWith(collapsing$X2, paste0(curr.sample, "_"))), ]
    }
    rownames(collapsing) <- collapsing$concat
    colnames(collapsing)[c(1:2)] <- c("Cell.Barcode", "CellTag")
    new.collapsing.df <- collapsing

    # A concat string is an 8-nt CellTag followed by the cell barcode.
    cell.bc <- substring(collapsed$V1, 9)
    cell.ct <- substring(collapsed$V1, 1, 8)
    # TRUE when every member of the cluster comes from the centroid's cell.
    cell.same <- vapply(seq_len(nrow(collapsed)), function(k) {
      all(endsWith(strsplit(collapsed$V3[k], ",")[[1]], cell.bc[k]))
    }, logical(1))

    cell.same.index <- which(cell.same)
    cell.diff.indx <- which(!cell.same)

    pb <- txtProgressBar(min = 0, max = length(cell.bc), style = 3)

    # Single-cell clusters need no re-assignment, so their rows are built in
    # one step rather than appended one at a time.
    final.collapsing.df <- data.frame(row.names = collapsed$V1[cell.same.index],
                                      concat = collapsed$V1[cell.same.index],
                                      CellTag = cell.ct[cell.same.index],
                                      value = collapsed$V2[cell.same.index],
                                      stringsAsFactors = FALSE)
    pb.count <- length(cell.same.index)
    setTxtProgressBar(pb, pb.count)

    # Clusters mixing several cells: only members from the centroid's cell are
    # merged into the centroid; members from other cells are kept as they are.
    for (cdi in cell.diff.indx) {
      pb.count <- pb.count + 1
      setTxtProgressBar(pb, pb.count)

      curr.row <- collapsed[cdi,]
      curr.centroid <- curr.row$V1
      curr.collapse.set <- strsplit(curr.row$V3, ",")[[1]]
      curr.ct <- cell.ct[cdi]
      curr.bc <- cell.bc[cdi]

      same.concat <- curr.collapse.set[which(endsWith(curr.collapse.set, curr.bc))]
      curr.to.collapse <- setdiff(same.concat, curr.centroid)

      if (length(curr.to.collapse) > 0) {
        for (curr.for.c in curr.to.collapse) {
          curr.for.c.ct <- substring(curr.for.c, 1, 8)
          if (curr.for.c.ct != curr.ct) {
            ind <- which(collapsing$concat == curr.for.c)
            ind.cent <- which(collapsing$concat == curr.centroid)
            new.collapsing.df[ind, "concat"] <- curr.centroid
            new.collapsing.df[ind, "CellTag"] <- collapsing[ind.cent[1], "CellTag"]
            new.collapsing.df[ind, "Cell.Barcode"] <- collapsing[ind.cent[1], "Cell.Barcode"]
          }
        }
        curr.centroid.sub <- new.collapsing.df[which(new.collapsing.df$concat == curr.centroid), ]
        curr.count.new <- sum(curr.centroid.sub$value)
        curr.new.row <- data.frame(row.names = curr.centroid, concat = curr.centroid, CellTag = curr.ct,
                                   value = curr.count.new, stringsAsFactors = FALSE)
      } else {
        curr.new.row <- new.collapsing.df[same.concat, c("concat", "CellTag", "value")]
      }
      curr.diff.rows <- new.collapsing.df[setdiff(curr.collapse.set, same.concat), c("concat", "CellTag", "value")]

      final.collapsing.df <- rbind(final.collapsing.df, curr.new.row)
      final.collapsing.df <- rbind(final.collapsing.df, curr.diff.rows)
    }

    # Members that were not in the pre-starcode table come back as NA rows.
    final.collapsing.df <- final.collapsing.df[!is.na(final.collapsing.df$concat), ]
    rownames(final.collapsing.df) <- final.collapsing.df$concat
    final.collapsing.df <- cbind(final.collapsing.df, collapsing[rownames(final.collapsing.df), c("Cell.Barcode", "concat")])

    if (nrow(ultimate.collapsing.df) <= 0) {
      ultimate.collapsing.df <- final.collapsing.df
    } else {
      ultimate.collapsing.df <- rbind(ultimate.collapsing.df, final.collapsing.df)
    }
    rownames(ultimate.collapsing.df) <- NULL
    close(pb)
  }

  df <- transform(ultimate.collapsing.df, Cell.Barcode = factor(Cell.Barcode), CellTag = factor(CellTag))

  celltag.count.sparse <- sparseMatrix(as.integer(df$Cell.Barcode), as.integer(df$CellTag), x = df$value)
  colnames(celltag.count.sparse) <- levels(df$CellTag)
  rownames(celltag.count.sparse) <- levels(df$Cell.Barcode)

  # Save the new matrix to the object
  new.obj <- SetCellTagCurrentVersionWorkingMatrix(celltag.obj, "collapsed.count", as(celltag.count.sparse, "dgCMatrix"), replace = replace.option)
  return(new.obj)
}

# --------------------------------------------------------------------------------
# /R/CellTagMatrixGeneration.R:
# --------------------------------------------------------------------------------
#' CellTag Matrix Generation Function
#'
#' This function uses the extracted information from data processed before and generates a Cell Barcode x CellTag matrix
#' @param celltag.obj A CellTag object with bam file result filled
#' @param barcodes.file A .tsv output file from 10x CellRanger pipeline. It contains a list of all cell barcodes identified in the filtered dataset.
#' @param replace.option If TRUE, the raw.count slot is overwritten instead of being merged with what it already holds. Default to FALSE.
#' @return A CellTag object with the attribute (raw.count) filled
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' CellTagMatrixCount(bam.test.obj, "barcodes.tsv")
#'
CellTagMatrixCount <- function(celltag.obj, barcodes.file, replace.option = FALSE) {
  # Read in the cell barcodes identified during alignment
  barcodeList <- fread(barcodes.file, header = FALSE)[[1]]
  celltagData <- celltag.obj@bam.parse.rslt[[celltag.obj@curr.version]]
  if (is.null(celltagData)) {
    stop("No parsed bam result found for CellTag version '", celltag.obj@curr.version,
         "'. Please run CellTagExtraction on a bam file first.", call. = FALSE)
  }
  # Filter based on filtered barcodes
  celltagData <- celltagData[which(celltagData$Cell.BC %in% barcodeList), ]

  # Number of unique UMIs for each Cell Barcode / CellTag combination.
  # uniqueN is equivalent to length(unique(UMI)).
  celltagCounts <- celltagData[, .(UMI.Count = uniqueN(UMI)), .(Cell.BC, Cell.Tag)]

  # Cells from the barcode list without any CellTag read are added with a
  # count of 0 so that they still get a row in the matrix.
  # NOTE(review): this builds one row per (missing cell, CellTag) pair, which
  # grows with cells x tags; kept as-is so the resulting matrix is unchanged.
  missingCells <- barcodeList[!(barcodeList %in% celltagCounts$Cell.BC)]
  missingCells <- setDT(expand.grid(Cell.BC = missingCells, Cell.Tag = unique(celltagCounts$Cell.Tag)))
  # rep() keeps the assignment valid when there is no missing cell at all.
  missingCells$UMI.Count <- rep(0, nrow(missingCells))
  # Bind the missing cells to the data.table containing the CellTag UMI counts.
  alltagCounts <- rbind(celltagCounts, missingCells, fill = TRUE)
  # Keep only cells which are in our barcode list.
  alltagCounts <- alltagCounts[Cell.BC %in% barcodeList, ]

  # Generate dgCMatrix
  ### Reference for code
  ## https://datawookie.netlify.app/blog/2016/01/casting-a-wide-and-sparse-matrix-in-r/
  df <- as.data.frame(alltagCounts)
  df <- transform(df, Cell.BC = factor(Cell.BC), Cell.Tag = factor(Cell.Tag))

  celltag.count.sparse <- sparseMatrix(as.integer(df$Cell.BC), as.integer(df$Cell.Tag), x = df$UMI.Count)
  colnames(celltag.count.sparse) <- levels(df$Cell.Tag)
  rownames(celltag.count.sparse) <- levels(df$Cell.BC)

  # Filter CellTags in which no UMIs are counted. The previous code tried to
  # delete them as columns of the long-format table, where CellTags are
  # values of the Cell.Tag column, so the filter is applied to the matrix.
  tag.totals <- Matrix::colSums(celltag.count.sparse)
  celltag.count.sparse <- celltag.count.sparse[, tag.totals > 0, drop = FALSE]

  new.obj <- SetCellTagCurrentVersionWorkingMatrix(celltag.obj, "raw.count", as(celltag.count.sparse, "dgCMatrix"), replace = replace.option)

  return(new.obj)
}

# --------------------------------------------------------------------------------
# /R/CellTagNetworkContruction.R:
# --------------------------------------------------------------------------------
#' Convert CellTag Matrix to Link List
#'
#' This function converts the CellTag Matrix to a link list, which is further used for network construction and visualization
#' @param celltag.obj A CellTag object with all clone information filled
#' @return A CellTag object with the attributes (celltag.aggr.final, network.link.list) filled
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' convertCellTagMatrix2LinkList(bam.test.obj)
#'
convertCellTagMatrix2LinkList <- function(celltag.obj){
  # celltag_data is a data frame (N x 3), one row per cell, with the columns
  # c("CellTagV1", "CellTagV2", "CellTagV3") holding the clone id per version.
  celltag.dt <- celltag.obj@clone.composition
  v1.df <- as.data.frame(celltag.dt$v1)
  v2.df <- as.data.frame(celltag.dt$v2)
  v3.df <- as.data.frame(celltag.dt$v3)
  rownames(v1.df) <- v1.df$cell.barcode
  rownames(v2.df) <- v2.df$cell.barcode
  rownames(v3.df) <- v3.df$cell.barcode

  all.cells <- unique(c(celltag.dt$v1$cell.barcode, celltag.dt$v2$cell.barcode, celltag.dt$v3$cell.barcode))
  celltag_data <- data.frame(row.names = all.cells)
  celltag_data[rownames(v1.df), "CellTagV1"] <- v1.df[rownames(v1.df),"clone.id"]
  celltag_data[rownames(v2.df), "CellTagV2"] <- v2.df[rownames(v2.df),"clone.id"]
  celltag_data[rownames(v3.df), "CellTagV3"] <- v3.df[rownames(v3.df),"clone.id"]

  celltag.obj@celltag.aggr.final <- celltag_data

  ### 1.Preprocessing celltag data ####
  message("Preprocessing data..")
  # pick up cells that have one or more celltag, and remove cells that do not have any celltag.
  Cells_with_tag <- rownames(celltag_data)[!(is.na(celltag_data$CellTagV1) &
                                               is.na(celltag_data$CellTagV2) &
                                               is.na(celltag_data$CellTagV3))]

  message(paste0(" Cells that have CellTagV1: ", sum(!is.na(celltag_data$CellTagV1))))
  message(paste0(" Cells that have CellTagV2: ", sum(!is.na(celltag_data$CellTagV2))))
  message(paste0(" Cells that have CellTagV3: ", sum(!is.na(celltag_data$CellTagV3))))

  # remove non-tagged cells
  celltag_data <- celltag_data[Cells_with_tag, ]

  # convert NA to "e" (placeholder for "no clone for this version")
  for (i in c("CellTagV1", "CellTagV2", "CellTagV3")) {
    celltag_data[is.na(celltag_data[ ,i]),i] <- "e"
  }

  ### 2. Constructing LinkList ###
  message("Constructing link list..")

  # Link from the clone that most cells of clone child.id (of child.tag)
  # belong to in parent.tag, or NULL when none of them has a parent.tag clone.
  majorityParentLink <- function(child.tag, child.id, parent.tag) {
    parent.ids <- celltag_data[celltag_data[, child.tag] == child.id, parent.tag]
    parent.ids <- parent.ids[parent.ids != "e"]
    if (length(parent.ids) == 0) {
      return(NULL)
    }
    top.parent <- names(which.max(table(parent.ids)))
    data.frame(source = paste0(parent.tag, "_", top.parent),
               target = paste0(child.tag, "_", child.id),
               tag = parent.tag,
               stringsAsFactors = FALSE)
  }

  # Clone ids present for a version, without the "e" placeholder. Dropping
  # the first unique value instead is only right when the first cell happens
  # to be untagged for that version.
  cloneIds <- function(tag) {
    setdiff(unique(celltag_data[, tag]), "e")
  }

  ## first, clonal population that share the same celltag is combined to make subnetwork.
  ## then, subnetworks will be combined further if they are originated from same mother.

  # 2.1 find connection between "celltag" -> "cells"
  # A cell carrying several versions gets one link per version.
  linkList <- data.frame()
  for (tag in c("CellTagV3", "CellTagV2", "CellTagV1")) {
    tagged.cells <- rownames(celltag_data)[celltag_data[, tag] != "e"]
    if (length(tagged.cells) > 0) {
      linkList <- rbind(linkList,
                        data.frame(source = paste0(tag, "_", celltag_data[tagged.cells, tag]),
                                   target = tagged.cells,
                                   tag = tag,
                                   stringsAsFactors = FALSE))
    }
  }

  # 2.2 hidden link ["CellTagV2" -> "CellTagV3"], or ["CellTagV1" -> "CellTagV3"]
  hiddenlink_D13 <- do.call(rbind, lapply(cloneIds("CellTagV3"), function(clone.id) {
    link <- majorityParentLink("CellTagV3", clone.id, "CellTagV2")
    if (is.null(link)) {
      link <- majorityParentLink("CellTagV3", clone.id, "CellTagV1")
    }
    link
  }))

  # 2.3 hidden link ["CellTagV1" -> "CellTagV2"]
  hiddenlink_D3 <- do.call(rbind, lapply(cloneIds("CellTagV2"), function(clone.id) {
    majorityParentLink("CellTagV2", clone.id, "CellTagV1")
  }))

  # 2.4 integrating all links
  linkList <- rbind(linkList, hiddenlink_D3)
  linkList <- rbind(linkList, hiddenlink_D13)

  # change cell name, like.. "TTCTCCTGTATCACCA-7" -> "TTCTCCTGTATCACCA-7_V1"
  # cells that have multiple cell tags show up multiple times, so the version
  # is appended to keep their node names distinct. Cell nodes are recognised
  # by the "-" in their name.
  linkList$target_unmodified <- linkList$target
  node_cell <- grep("-", linkList$target)
  cell.tags <- linkList[node_cell, "tag"]
  # Part of the tag after its first "g" ("CellTagV1" -> "V1"), "" without one.
  tag.suffix <- ifelse(grepl("g", cell.tags, fixed = TRUE), sub("^[^g]*g", "", cell.tags), "")
  linkList[node_cell, "target"] <- paste0(linkList[node_cell, "target"], "_", tag.suffix)

  message("finished")

  celltag.obj@network.link.list <- linkList
  return(celltag.obj)

}

#' Get Nodes from Link List
#'
#' This function extracts the node information from the generated link list.
#' @param celltag.obj A CellTag object with link list filled
#' @return A CellTag object with the attribute (nodes) filled
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' getNodesfromLinkList(bam.test.obj)
#'
getNodesfromLinkList <- function(celltag.obj){
  # This function construct Nodes list from linkList.
  # Use "convertCellTagMatrix2LinkList" function before running this function to get linkList.
  linkList <- celltag.obj@network.link.list

  nodes <- union(linkList$target, linkList$source)
  Nodes <- data.frame(nodes, row.names = nodes, stringsAsFactors = FALSE)

  # A node is a clone node when one of its "_"-separated parts is a CellTag
  # version label; every other node is a cell.
  clone.labels <- c("CellTagV1", "CellTagV2", "CellTagV3")
  node.parts <- strsplit(nodes, "_")
  is.clone.node <- vapply(node.parts, function(parts) any(clone.labels %in% parts), logical(1))
  first.part <- vapply(node.parts, function(parts) parts[1], character(1))

  # Cell nodes take their values from the link that targets them; match()
  # keeps one value per node even if a target were to occur more than once.
  first.link <- match(nodes, linkList$target)

  # tag: version label for clone nodes, tag of the incoming link for cells
  node.tag <- linkList$tag[first.link]
  node.tag[is.clone.node] <- first.part[is.clone.node]
  names(node.tag) <- nodes

  # node_name_unmodified: original cell barcode for cells, node name otherwise
  node.unmodified <- linkList$target_unmodified[first.link]
  node.unmodified[is.clone.node] <- nodes[is.clone.node]
  names(node.unmodified) <- nodes

  Nodes$tag <- node.tag
  Nodes$node_name_unmodified <- node.unmodified

  celltag.obj@nodes <- Nodes
  return(celltag.obj)
}

#' Add Additional Information to the Nodes
#'
#' This function adds auxiliary information to the nodes. Such information can include cluster information, cell type information and so on. The information should be stored as a data frame when passing in to the function.
#' @param celltag.obj A CellTag object with nodes filled
#' @param additional_data A data frame with auxiliary information about the nodes (rownames = the nodes names)
#' @return A CellTag object with the attribute (nodes) modified.
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' addData2Nodes(bam.test.obj, cluster.info)
#'
addData2Nodes <- function(celltag.obj, additional_data){

  # Nodes: data frame
  # additional_data: data frame
  #
  # the rownames of additional_data should be same format as "node_name_unmodified" in Nodes

  Nodes <- celltag.obj@nodes
  no.col <- ncol(additional_data)
  # drop = FALSE keeps a data frame when additional_data has a single column.
  matched.data <- additional_data[Nodes$node_name_unmodified, , drop = FALSE]
  new.nodes <- cbind(Nodes, matched.data)
  if (no.col > 0) {
    colnames(new.nodes)[(ncol(new.nodes) - no.col + 1):ncol(new.nodes)] <- colnames(additional_data)
  }

  celltag.obj@nodes <- new.nodes
  return(celltag.obj)
}

# --------------------------------------------------------------------------------
# /R/CellTagNetworkVisualiztion.R:
# --------------------------------------------------------------------------------
# Return the given node(s) together with every node that shares a link with
# one of them, in either direction.
returnDirectlyConnectedNodes <- function(node, linkList){
  outgoing <- linkList[linkList$source %in% node,]
  incoming <- linkList[linkList$target %in% node,]

  tmp_nodes <- union(outgoing$target, incoming$source)
  tmp_nodes <- union(tmp_nodes, node)
  return(tmp_nodes)
}



# Return every node reachable from the given node(s) by following links in
# either direction. The neighbourhood is expanded until it stops growing; a
# fixed number of expansion rounds cuts off nodes that are further away.
returnAllConnectedNodes <- function(node, linkList){
  node <- unique(node)
  repeat {
    expanded <- returnDirectlyConnectedNodes(node, linkList)
    # expanded always contains node, so equal size means nothing was added.
    if (length(expanded) == length(node)) {
      return(expanded)
    }
    node <- expanded
  }
}


# Build the force-directed network widget for the given links and nodes.
# overlay is the name of the Nodes column used to group (colour) the nodes.
drawNetworkGraph <- function(linkList, Nodes, overlay){

  rownames(Nodes) <- seq_len(nrow(Nodes))

  # forceNetwork addresses nodes by zero-based row position.
  ref <- seq_len(nrow(Nodes))
  names(ref) <- Nodes$nodes
  linkList$source1 <- ref[as.character(linkList$source)] - 1
  linkList$target1 <- ref[as.character(linkList$target)] - 1

  # A link whose end point is not among Nodes has no position to refer to.
  unresolved <- is.na(linkList$source1) | is.na(linkList$target1)
  if (any(unresolved)) {
    warning(sum(unresolved), " link(s) dropped because an end point is missing from the nodes.", call. = FALSE)
    linkList <- linkList[!unresolved, , drop = FALSE]
  }

  linkList$Value <- rep(1, nrow(linkList))
  #linkList$Colour <- c("#CD6155", "#566573")[as.numeric(linkList[,3] > 0) + 1]

  a <- forceNetwork(Links = linkList, Nodes = Nodes, zoom = TRUE, opacityNoHover = 0.5,
                    Source = "source1", Target = "target1", arrows = TRUE,
                    NodeID = "nodes", Value = "Value", #linkColour = linkList$Colour,
                    Group = overlay, opacity = 0.9)

  return(a)

}

#' Draw the Network
#'
#' This function generates a force-directed network based on the link list and nodes information.
#' @param celltag.obj A CellTag object with link list and nodes filled
#' @param tag Which tags would you like to plot?
#' @param overlay What information would you like to overlay with the network? This should be one of the column names of the node information.
#' @return A CellTag object with the attribute (network) modified.
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' drawSubnet(bam.test.obj, "CellTagV1_2", "Cluster")
#'
drawSubnet <- function(celltag.obj, tag, overlay){
  # e.g. tag; "CellTagV1_2"
  # e.g. overlay: "cluster" or "tag" or "SuperClone"
  Nodes <- celltag.obj@nodes
  linkList <- celltag.obj@network.link.list

  # Every node connected to the requested tag, and the links among them.
  no <- returnAllConnectedNodes(tag, linkList)
  sub_link <- linkList[(linkList$source %in% no) | (linkList$target %in% no),]
  sub_Nodes <- Nodes[Nodes$nodes %in% no ,]

  a <- drawNetworkGraph(sub_link, sub_Nodes, overlay)

  celltag.obj@network <- a
  return(celltag.obj)
}

# --------------------------------------------------------------------------------
# /R/CellTagObjSet.R:
# --------------------------------------------------------------------------------
# S4 container that carries the intermediate and final results of the CellTag
# workflow from one processing step to the next.
#' @export
setClass("CellTag",
         slots = list(obj.name = "character",
                      fastq.bam.dir = "character",
                      curr.version = "character",
                      celltag.version = "character",
                      fastq.full.celltag = "ANY",
                      fastq.only.celltag = "ANY",
                      celltag.freq.stats = "ANY",
                      whitelist = "ANY",
                      bam.parse.rslt = "ANY",
                      celltag.stats = "ANY",
                      pre.starcode = "ANY",
                      raw.count = "dgCMatrix",
                      collapsed.count = "dgCMatrix",
                      whitelisted.count = "dgCMatrix",
                      metric.filtered.count = "dgCMatrix",
                      binary.mtx = "dgCMatrix",
                      jaccard.mtx = "dsTMatrix",
                      clone.composition = "ANY",
                      clone.size.info = "ANY",
                      celltag.aggr.final = "data.frame",
                      network.link.list = "ANY",
                      nodes = "ANY",
                      network = "ANY"))
# Print the object name and the dimensions of the main count matrices.
#' @export
setMethod("show",
          "CellTag",
          function(object) {
            cat("Object name: ", object@obj.name, "\n")
            cat("Raw CellTag Counts = ", (ncol(object@raw.count)), "\n")
            cat("Raw Number of Cells with CellTag = ", nrow(object@raw.count), "\n")
            cat("Collapsed CellTag Counts = ", ncol(object@collapsed.count), "\n")
            cat("Whitelisted CellTag Counts = ", (ncol(object@whitelisted.count)), "\n")
            cat("Whitelisted Number of Cells with CellTag = ", nrow(object@whitelisted.count), "\n")
          })

# --------------------------------------------------------------------------------
# /R/CellTagWhitelistGeneration.R:
# --------------------------------------------------------------------------------
#' CellTag Whitelist Filtering Function
#'
#' This function conducts whitelist filtering such that only CellTags with count number over their certain percentile would be considered for clone calling
#' @param celltag.obj A CellTag Object with CellTag frequency table counted and sorted
#' @param percentile A fraction cutoff percentile for filtering the CellTags e.g. 0.9 for 90th percentile
#' @param output.dir Which directory would you like to store these files? If NULL, save to the same directory as the fastq/bam file
#' @return A CellTag Object with attribute (whitelist) filled.
#' @details If \code{output.dir} is an existing directory, the whitelist is written to
#'   \code{<curr.version>_whitelist.csv} inside it. Any other non-NULL value is used as the
#'   output file path itself.
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' CellTagWhitelistFiltering(bam.test.obj, 0.9)
#'
CellTagWhitelistFiltering <- function(celltag.obj, percentile, output.dir = NULL) {
  stopifnot(is.numeric(percentile), length(percentile) == 1)

  # Load table and calculate cutoff: floor(<percentile quantile of Count> / 10)
  count.sorted.table <- celltag.obj@celltag.freq.stats[[celltag.obj@curr.version]]
  count.cutoff <- quantile(count.sorted.table$Count, probs = percentile)
  count.true.cut <- floor(count.cutoff/10)
  n.passing <- sum(count.sorted.table$Count >= count.true.cut)

  # Plot the counts and mark how many CellTags pass the cutoff
  plot(count.sorted.table$Count, main="CellTag Whitelist",xlab="CellTag",ylab="Reads")
  abline(v=n.passing, col="red", lty=2)
  cat(paste0("Abline Threshold: ", n.passing), "\n")

  # Subset the ones pass filtering
  whitelist <- subset(count.sorted.table, Count>=count.true.cut)

  # Resolve the output file. write.csv() needs a file name, so a directory
  # argument gets the default file name appended instead of being used as-is.
  whitelist.file.name <- paste0(celltag.obj@curr.version, "_whitelist.csv")
  if (is.null(output.dir)) {
    output.dir <- paste0(dirname(celltag.obj@fastq.bam.dir), "/", whitelist.file.name)
  } else if (dir.exists(output.dir)) {
    output.dir <- file.path(output.dir, whitelist.file.name)
  }
  write.csv(whitelist, output.dir, quote = FALSE, row.names = FALSE)

  cat("File is saved: ", output.dir, "\n")

  celltag.obj@whitelist[[celltag.obj@curr.version]] <- whitelist
  return(celltag.obj)
}

#' CellTag Frequency Sort Table
#'
#' This function counts and sorts the identified CellTags from Fastq file
#' @param celltag.obj A CellTag Object with CellTags extracted
#' @return A CellTag Object with attribute (celltag.freq.stats) filled.
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' AddCellTagFreqSort(bam.test.obj)
#'
AddCellTagFreqSort <- function(celltag.obj) {
  version <- celltag.obj@curr.version
  # Tabulate how many times each extracted CellTag occurs
  tag.freq <- as.data.table(table(celltag.obj@fastq.only.celltag[[version]]), stringsAsFactors = FALSE)
  # Most frequent CellTags first
  tag.freq.sorted <- tag.freq[order(-tag.freq$N), ]
  colnames(tag.freq.sorted) <- c("CellTag", "Count")
  # Store the sorted table under the current CellTag version
  celltag.obj@celltag.freq.stats[[version]] <- tag.freq.sorted
  return(celltag.obj)
}

# --------------------------------------------------------------------------------
# /R/CloneCalling.R:
# --------------------------------------------------------------------------------

#' Jaccard Analysis Function
#'
#' This function conducts Jaccard analysis to calculate the Jaccard similarity between cells.
#' @param celltag.obj A CellTag object with the counts filtered based on metrics
#' @param plot.corr Would you like to plot the correlation matrix?
#' @param fast If TRUE, the similarity is computed with proxyC::simil directly on the stored matrix and no plot is drawn. If FALSE (default), proxy::simil is run on a dense copy of the matrix.
#' @return A CellTag object with attribute (jaccard.mtx) filled
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' JaccardAnalysis(bam.test.obj)
#'
JaccardAnalysis <- function(celltag.obj, plot.corr = TRUE, fast = FALSE) {
  filtered.whitelised.data <- GetCellTagCurrentVersionWorkingMatrix(celltag.obj, "metric.filtered.count")
  # Calculating the Jaccard matrix
  if (fast) {
    Jac <- proxyC::simil(filtered.whitelised.data, method = "jaccard")
  } else {
    Jac <- proxy::simil(as.matrix(filtered.whitelised.data), method = "Jaccard")
    Jac <- as(Jac, "dsTMatrix")
  }

  # The plot is only available on the non-fast path. `&&` is the scalar,
  # short-circuiting operator that an `if` condition calls for.
  if ((!fast) && plot.corr) {
    diag(Jac) <- 1
    corrplot(Jac, method="color", order="hclust", hclust.method ="ward.D2", cl.lim=c(0,1), tl.cex=0.1)
  }

  celltag.obj@jaccard.mtx <- Jac
  return(celltag.obj)
}

#' Clone Calling Function
#'
#' This function conducts clone calling based on the Jaccard results.
#' @param celltag.obj A CellTag object with the jaccard matrix generated
#' @param correlation.cutoff Correlation cutoff for clone membership
#' @return A CellTag object with attributes (clone.composition & clone.size.info) filled.
#' @details If no pair of cells exceeds \code{correlation.cutoff}, a warning is issued and
#'   empty clone tables are stored.
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' CloneCalling(bam.test.obj, 0.7)
#'
CloneCalling <- function(celltag.obj, correlation.cutoff) {
  Jaccard.Matrix <- celltag.obj@jaccard.mtx

  # Using the igraph package to facilitate the identification of membership to each clone
  # Swap the row/column indices of the stored entries, then keep only those
  # with i > j (strictly below the diagonal)
  jac.summ <- Matrix::summary(Jaccard.Matrix)
  jac.lower.i <- jac.summ$j
  jac.summ$j <- jac.summ$i
  jac.summ$i <- jac.lower.i
  lower.tri.summ <- subset(jac.summ, i>j) # Exclude diagonal

  test <- sparseMatrix(i = lower.tri.summ$i,
                       j = lower.tri.summ$j,
                       x = lower.tri.summ$x,
                       dims = dim(Jaccard.Matrix),
                       dimnames = dimnames(Jaccard.Matrix))

  # Keep the cell pairs whose similarity is above the cutoff
  test.df <- as.data.frame(Matrix::summary(test))
  test.df.sub <- test.df[which(test.df$x > correlation.cutoff), ]

  if (nrow(test.df.sub) == 0) {
    # No pair passes the cutoff: seq(1, 0) would give c(1, 0) and index a
    # non-existent clone, so build the empty result explicitly instead
    warning("No pair of cells has a Jaccard similarity above the cutoff; no clones were called.",
            call. = FALSE)
    df.comb <- data.table(clone.id = integer(0), cell.barcode = character(0))
  } else {
    check.corelation <- test.df.sub[,c(1,2)]
    colnames(check.corelation) <- c("row", "col")
    check.corelation <- as.matrix(check.corelation)

    # Connected components of the pair graph are the clones
    graph.cor <- graph.data.frame(check.corelation, directed = FALSE)
    groups.cor <- split(unique(as.vector(check.corelation)), clusters(graph.cor)$membership)
    # Translate matrix indices back to cell barcodes
    conv.groups.cor <- lapply(groups.cor,
                              function(list.cor){
                                rownames(test)[list.cor]})

    # Put clones into tables
    df.conv <- lapply(seq_along(conv.groups.cor),
                      function(x) {
                        data.table(clone.id = x,
                                   cell.barcode = conv.groups.cor[[x]])
                      })

    df.comb <- rbindlist(df.conv)
  }

  # Calculate the size of each clone
  counts <- table(df.comb$clone.id)
  counts <- as.data.frame(counts)
  colnames(counts) <- c("Clone.ID", "Frequency")

  celltag.obj@clone.composition[[celltag.obj@curr.version]] <- df.comb
  celltag.obj@clone.size.info[[celltag.obj@curr.version]] <- counts
  return(celltag.obj)
}

# --------------------------------------------------------------------------------
# /R/CreateCellTagObject.R:
# --------------------------------------------------------------------------------
#' Create a New CellTag Object
#'
#' This function creates a CellTag object that contains the basic information required for the object
#' @param object.name The name of the object (stored in the obj.name slot)
#' @param fastq.bam.directory The input fastq/bam data file path (stored in the fastq.bam.dir slot)
#' @return A CellTag Object with open attributes that can be filled as analysis moving along
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' CellTagObject("hf1.d15.test", "hf1.d15.bam")
#'
CellTagObject <- function(object.name, fastq.bam.directory) {
  # Only the name and the input path are set here; all other slots keep
  # their prototype values
  new("CellTag", obj.name = object.name, fastq.bam.dir = fastq.bam.directory)
}

# --------------------------------------------------------------------------------
# /R/MetricBasedPlottingAndFiltering.R:
# --------------------------------------------------------------------------------

#' Metric-Base Filtering Function
#'
#' This function applies further filtering on scRNA-seq data with CellTags based on cutoff values identified from the metric plots.
#' @param celltag.obj A CellTag Object with count matrix generated
#' @param cutoff The cutoff decided from the metric plots
#' @param comparison Would you like to maintain the part less than/greater than the cutoff? Default to less. Choices can be greater or less.
#' @param replace.option Passed on as the \code{replace} argument of SetCellTagCurrentVersionWorkingMatrix when the filtered matrix is stored. Default to FALSE.
#' @return A CellTag Object with attribute (metric.filtered.count) filled
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' MetricBasedFiltering(bam.test.object, 20, "less")
#'
MetricBasedFiltering <- function(celltag.obj, cutoff, comparison = "less", replace.option = FALSE) {
  whitelisted.ct.data <- GetCellTagCurrentVersionWorkingMatrix(celltag.obj, "whitelisted.count")
  metric.filter.ct.data <- GetCellTagCurrentVersionWorkingMatrix(celltag.obj, "metric.filtered.count")
  # Filter the already metric-filtered matrix when there is one; otherwise
  # start from the whitelisted matrix
  if (ncol(metric.filter.ct.data) <= 0) {
    whitelisted.celltag.data <- as.matrix(whitelisted.ct.data)
  } else {
    whitelisted.celltag.data <- as.matrix(metric.filter.ct.data)
  }
  # Number of CellTags carried by each cell (row sums)
  CellTags.per.cell.whitelisted.pf <- Matrix::rowSums(whitelisted.celltag.data)

  # Positions of the cells to keep; any `comparison` other than "less" keeps
  # the cells at or above the cutoff
  if (comparison == "less") {
    cell.filter <- which(CellTags.per.cell.whitelisted.pf <= (cutoff))
  } else {
    cell.filter <- which(CellTags.per.cell.whitelisted.pf >= (cutoff))
  }
  # Filter celltag dataset. Positional indexing does not depend on row names
  # being present or unique, and drop = FALSE keeps a matrix even when a
  # single cell passes the filter.
  celltags.whitelisted.new <- whitelisted.celltag.data[cell.filter, , drop = FALSE]

  new.obj <- SetCellTagCurrentVersionWorkingMatrix(celltag.obj, "metric.filtered.count", as(celltags.whitelisted.new, "dgCMatrix"), replace = replace.option)

  return(new.obj)
}

#' CellTag Metric Plotting Function
#'
#' This function provides some metric plots for further downstream celltag filtering in the scRNA-seq dataset.
#' @param celltag.obj A CellTag Object
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' MetricPlots(bam.test.obj)
#'
MetricPlots <- function(celltag.obj) {

  obj.metric.filtered.count <- GetCellTagCurrentVersionWorkingMatrix(celltag.obj, "metric.filtered.count")
  obj.whitelisted.count <- GetCellTagCurrentVersionWorkingMatrix(celltag.obj, "whitelisted.count")

  # Use the most processed matrix available:
  # metric-filtered > whitelisted > binary
  if (ncol(obj.metric.filtered.count) <= 0) {
    if (ncol(obj.whitelisted.count) <= 0) {
      celltag.data <- GetCellTagCurrentVersionWorkingMatrix(celltag.obj, "binary.mtx")
    } else {
      celltag.data <- obj.whitelisted.count
    }
  } else {
    celltag.data <- obj.metric.filtered.count
  }

  # Row sums: per-cell totals; column sums: per-CellTag totals
  CellTags.per.cell.whitelisted.pf <- Matrix::rowSums(celltag.data)
  CellTags.per.cell.avg <- mean(CellTags.per.cell.whitelisted.pf)
  CellTags.frequency.whitelisted.pf <- Matrix::colSums(celltag.data)
  CellTags.freq.avg <- mean(CellTags.frequency.whitelisted.pf)

  # 2 x 2 panel layout; restore the caller's layout when the function exits
  old.par <- par(mfrow=c(2,2))
  on.exit(par(old.par), add = TRUE)

  plot(CellTags.per.cell.whitelisted.pf, main = "CellTag Counts of Individual Cells", xlab = "Cell Index", ylab = "CellTag Counts")
  # One point per CellTag (column sums), so the x axis indexes CellTags
  plot(CellTags.frequency.whitelisted.pf, main = "CellTag Occurrence Frequency Across All Cells", xlab = "CellTag Index", ylab = "CellTag Frequency")
  hist(CellTags.per.cell.whitelisted.pf, main = "Histogram of CellTag Counts of Individual Cells", xlab = "CellTag Counts", ylab = "Count")
  hist(CellTags.frequency.whitelisted.pf, main = "Histogram of CellTag Occurrence Frequency Across All Cells", xlab = "CellTag Occurrence Frequency", ylab = "Count")
  cat("Average: ", CellTags.per.cell.avg, "\n")
  cat("Frequency: ", CellTags.freq.avg, "\n")
}

# --------------------------------------------------------------------------------
# /R/ScCellTagMatrixProcess.R:
# --------------------------------------------------------------------------------

#' Single-cell RNA-seq
#' Binarization Function
#'
#' This function binarize the single-cell celltag data based on a given cutoff. It will generate a binary matrix, which will be stored as a slot in the CellTag Object. The binary matrix will be further used for future processing of the single-cell data.
#' @param celltag.obj A CellTag object with the raw count matrix generated
#' @param tag.cutoff How many tags would you like to be used as a cutoff to say that the cells are tagged?
#' @param replace.option Passed on as the \code{replace} argument of SetCellTagCurrentVersionWorkingMatrix when the binary matrix is stored. Default to FALSE.
#' @return A CellTag object with the attribute (binary.mtx) filled.
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' SingleCellDataBinarization(bam.test.obj, 2)
#'
SingleCellDataBinarization <- function(celltag.obj, tag.cutoff, replace.option = FALSE) {
  # Prefer the collapsed counts; fall back to the raw counts when the
  # collapsed matrix is empty
  count.mtx <- GetCellTagCurrentVersionWorkingMatrix(celltag.obj, "collapsed.count")
  if (sum(dim(count.mtx)) <= 0) {
    count.mtx <- GetCellTagCurrentVersionWorkingMatrix(celltag.obj, "raw.count")
  }

  # Entries below the cutoff become 0, every remaining positive entry becomes 1
  count.mtx[count.mtx < tag.cutoff] <- 0
  count.mtx[count.mtx > 0] <- 1

  binarized.obj <- SetCellTagCurrentVersionWorkingMatrix(celltag.obj, "binary.mtx", as(count.mtx, "dgCMatrix"), replace = replace.option)
  return(binarized.obj)
}

#' Single-cell RNA-seq Whitelisting Function
#'
#' The whitelist is a list of CellTag generated based on assessment of CellTag library. It helps reduce the effect from sequencing error in CellTags. This function conducts whitelist filtering through the single-cell dataset. It will filter out CellTags that are not included in the whitelist.
#' @param celltag.obj A CellTag object with the binary matrix generated
#' @param whitels.cell.tag.file file director to the whitelisted cell tags
#' @param replace.option Passed on as the \code{replace} argument of SetCellTagCurrentVersionWorkingMatrix when the whitelisted matrix is stored. Default to FALSE.
#' @return A CellTag object with the attribute (whitelisted.count) filled
#' @keywords single-cell RNA-seq data, CellTagging
#' @export
#' @examples
#' SingleCellDataWhitelist(bam.test.obj, "~/Desktop/My_Favourite_Whitelist.csv")
#'
SingleCellDataWhitelist <- function(celltag.obj, whitels.cell.tag.file, replace.option = FALSE) {
  # Binary matrix in the format below
  # row - cells
  # col - celltag
  CellTags <- GetCellTagCurrentVersionWorkingMatrix(celltag.obj, "binary.mtx")

  # Pick the field separator from the file extension:
  # .csv -> comma, .txt/.tsv -> tab, anything else -> single space
  if (endsWith(whitels.cell.tag.file, ".csv")) {
    separator <- ","
  } else if (endsWith(whitels.cell.tag.file, ".txt") || endsWith(whitels.cell.tag.file, ".tsv")) {
    separator <- "\t"
  } else {
    separator <- " "
  }

  # Filter the matrix using whitelist; the first column holds the CellTags
  whitelist <- read.delim(whitels.cell.tag.file, sep = separator, header = TRUE, stringsAsFactors = FALSE)
  whitelist.names <- whitelist[,1]
  whitelist <- intersect(whitelist.names, colnames(CellTags))

  # Select the whitelisted CellTag columns directly. drop = FALSE keeps a
  # matrix even when exactly one CellTag is whitelisted, and no dense copy or
  # transpose of the full matrix is needed.
  celltags.whitelisted <- CellTags[, whitelist, drop = FALSE]

  new.obj <- SetCellTagCurrentVersionWorkingMatrix(celltag.obj, "whitelisted.count", as(celltags.whitelisted, "dgCMatrix"), replace = replace.option)
  return(new.obj)
}


# --------------------------------------------------------------------------------
# /R/scripts.zip:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/R/scripts.zip
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # R Package - CellTagR 2 | 3 | ## Important Notices 4 | We recently fixed the Binarization function name from ```SingleCellDataBinatization``` to ```SingleCellDataBinarization```. Please update your code accordingly, if you are using the latest version of CellTagR. 5 | 6 | We recently found that inside the setter function, the column names of the filtered count matrix are possibly shuffled around during the second round of filtering, thus some CellTags were associated with the wrong cell barcodes. This could lead to inaccurate clone-calling. We suggest users reinstall the package and empty the slot with the following line of code and restart the pipeline from this step: https://github.com/morris-lab/CellTagR#6-additional-filtering. 7 | ```r 8 | celltag.obj@metric.filtered.count <- as(matrix(NA, 0, 0), "dgCMatrix") 9 | ``` 10 | 11 | ## Description 12 | This is a wrapped R package of the workflow (https://github.com/morris-lab/CellTagWorkflow) with additional assessment of the complexity of the CellTag Library sequences. Additionally, the previous version of this package can be found at https://github.com/morris-lab/PreviousCloneHunter. ***Note: This has been changed and improved. Analysis with previous version will not be compatible.*** This package depends on R (R >= 3.5.0). This can be used as an alternative approach for this pipeline. For details regarding development and usage of CellTag, please refer to the following papers - *Biddy et al., Nature, 2018*, https://www.nature.com/articles/s41586-018-0744-4, *Kong et al., Nature Protocols, 2020*, https://www.nature.com/articles/s41596-019-0247-2 13 | 14 | Install devtools 15 | ```r 16 | install.packages("devtools") 17 | ``` 18 | Install the package from GitHub.
19 | ```r 20 | library("devtools") 21 | devtools::install_github("morris-lab/CellTagR") 22 | ``` 23 | Load the package 24 | ```r 25 | library("CellTagR") 26 | ``` 27 | 28 | ## Assessment of CellTag Library Complexity via Sequencing 29 | In this first section, we evaluate the CellTag library complexity using sequencing. Following is an example using the sequencing data we generated in lab for pooled CellTag library V2. 30 | ### 1. Read in the fastq sequencing data and extract the CellTags 31 | The extracted CellTags will be stored as an attribute (fastq.full.celltag & fastq.only.celltag) in the resulting object. 32 | ```r 33 | # Read in the data file that come with the package 34 | fpath <- system.file("extdata", "V2-1_R1.zip", package = "CellTagR") 35 | extract.dir <- "." 36 | # Extract the dataset 37 | unzip(fpath, overwrite = FALSE, exdir = ".") 38 | full.fpath <- paste0(extract.dir, "/", "V2-1_S2_L001_R1_001.fastq") 39 | # Set up the CellTag Object 40 | test.obj <- CellTagObject(object.name = "v2.whitelist.test", fastq.bam.directory = full.fpath) 41 | # Extract the CellTags 42 | test.obj <- CellTagExtraction(celltag.obj = test.obj, celltag.version = "v2") 43 | ``` 44 | 45 | ### 2. Count the CellTags and sort based on the occurrence of each CellTag 46 | ```r 47 | # Count and Sort the CellTags in descending order of occurrence 48 | test.obj <- AddCellTagFreqSort(test.obj) 49 | # Check the stats 50 | test.obj@celltag.freq.stats 51 | ``` 52 | 53 | ### 3. Generation of a whitelist for the CellTag library 54 | Here, we generating the whitelist for this CellTag library - CellTag V2. This will remove the CellTags with an occurrence number below the threshold. The threshold (using 90th percentile as an example) is determined: floor[(90th quantile)/10]. The percentile can be changed while calling the function. A plot of CellTag reads will be plotted and it can be used to further choose the percentile. 
If the output directory is offered, whitelist files will be stored in the provided directory. Otherwise, whitelist files will be saved under the same directory as the fastq files with name as _whitelist.csv (Example: v2_whitelist.csv). 55 | 56 | ```r 57 | # Generate the whitelist 58 | test.obj <- CellTagWhitelistFiltering(celltag.obj = test.obj, percentile = 0.9, output.dir = NULL) 59 | ``` 60 | The generated whitelist for each library can be used to filter and clean the single-cell CellTag UMI matrices. 61 | 62 | ## Single-Cell CellTag Extraction and Quantification 63 | In this section, we are presenting an alternative approach that utilizes this package to carry out CellTag extraction, quantification, and generation of UMI count matrices. This can be also accomplished via the workflow supplied - https://github.com/morris-lab/CellTagWorkflow. 64 | #### Note: Using the package could be slow for the extraction part. For reference, it took approximately an hour to extract from a 40Gb BAM file using a maximum of 8Gb of memory. 65 | 66 | ### 1. Download the BAM file 67 | Here we follow the same step as in https://github.com/morris-lab/CellTagWorkflow to download the a BAM file from the Sequence Read Archive (SRA) server. Again, this file is quite large. Hence, it might take a while to download. The file can be downloaded using wget in terminal as well as in R. 68 | ```r 69 | # bash 70 | wget https://sra-pub-src-1.s3.amazonaws.com/SRR7347033/hf1.d15.possorted_genome_bam.bam.1 71 | ``` 72 | OR 73 | ```r 74 | download.file("https://sra-pub-src-1.s3.amazonaws.com/SRR7347033/hf1.d15.possorted_genome_bam.bam.1", "./hf1.d15.bam") 75 | ``` 76 | 77 | ### (RECOMMENDED) Optional Step: BAM File Filtering 78 | ***NOTE:*** If BAM file filtering is **NOT** required (although, we strongly recommend this), skip this step and move to *Step 2 - Create a CellTag Object*, in which the entire BAM file will be used. 
Otherwise, before generating a CellTag object and extracting the CellTags, we will carry out the following BAM filtering step, from which a subset of reads in the BAM file will be searched during CellTag extraction. 79 | 80 | In this step, we will filter the BAM file to reduce the possibility that false positive CellTags will be identified. Briefly, the 17-20 bp sequence that comprises the CellTag barcode may appear by chance in other regions of the transcriptome. These may be identified as CellTags and cells expressing these transcripts may be falsely called as clones. By filtering reads in the BAM file to only include those which are unmapped as well as those mapped to GFP or (optionally) the CellTag UTR, we reduce the chances of extracting false positive CellTags. 81 | 82 | We recommend adding the CellTag UTR and GFP CDS as transgenes to the reference genome used during alignment. These sequences and corresponding GTF entries are stored [here](https://github.com/morris-lab/CellTagR/blob/master/Examples/CellTag_UTR.fa). More information on adding a marker gene to a reference can be found here: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_mr. 83 | 84 | ### I. Filter unmapped reads 85 | First, we will use samtools to efficiently filter unmapped reads. 86 | 87 | ```r 88 | # bash 89 | samtools view -b -f 4 ./hf1.d15.bam > ./hf1.d15.filtered.bam 90 | ``` 91 | 92 | ### II. Filter transgene reads 93 | Next, we will filter reads aligned to GFP or the CellTag UTR. 94 | 95 | ```r 96 | # bash 97 | samtools view -b ./hf1.d15.bam GFP >> ./hf1.d15.filtered.bam 98 | ``` 99 | 100 | If the CellTag UTR was not included in the reference, the following line may be omitted. 101 | 102 | ```r 103 | # bash 104 | samtools view -b ./hf1.d15.bam CellTag.UTR >> ./hf1.d15.filtered.bam 105 | ``` 106 | 107 | ### 2.
Create a CellTag Object 108 | In this step, we will initialize a CellTag object with a object name and the path to where the bam file is stored **if only one bam file is processed.** 109 | 110 | ```r 111 | # Set up the CellTag Object 112 | bam.test.obj <- CellTagObject(object.name = "bam.cell.tag.obj", fastq.bam.directory = "./hf1.d15.filtered.bam") 113 | ``` 114 | 115 | **Update: CellTagR now enables read-in of multiple BAM files at a time.** When multiple BAM files need to be processed, ***please use a folder that contains ONLY BAM files and put the fastq.bam.directory as the path of the folder.*** For instance, two bam files need to be processed named as *bam1.bam* and *bam2.bam*. They will be put into a folder named as *beautiful_bams* in the *Desktop*. Then, the input will be *fastq.bam.directory="~/Desktop/beautiful_bams/"* as below. 116 | 117 | ```r 118 | ## NOT RUN 119 | # Set up the CellTag Object 120 | # bam.test.obj <- CellTagObject(object.name = "bam.cell.tag.obj", fastq.bam.directory = "~/Desktop/beautiful_bams/") 121 | ``` 122 | 123 | ***Note: The following tutorials are only intended for processing ONE CellTag version. To obtain information for all three versions of CellTags, running the following pipeline is required for each CellTag version independently, i.e. finishing process for V1 and then repeating the procedure for V2, and so on. After running the pipeline for each CellTag version, the clonal information of each will be stored in the same object, which can be used to carry out network construction and visualization.*** 124 | 125 | ### 3. Extract the CellTags from the BAM file 126 | In this step, we will extract the CellTag information from the BAM file, which contains information including cell barcodes, CellTag and Unique Molecular Identifiers (UMI). The result generated from this extraction will be a data table containing the following information. 
The result will then be saved into the slot "bam.parse.rslt" in the object in the following format. 127 | 128 | |Cell Barcode|Unique Molecular Identifier|CellTag Motif| 129 | |:----------:|:-:|:---------:| 130 | |Cell.BC|UMI|Cell.Tag| 131 | ```r 132 | # Extract the CellTag information 133 | bam.test.obj <- CellTagExtraction(bam.test.obj, celltag.version = "v1") 134 | # Check the bam file result 135 | head(bam.test.obj@bam.parse.rslt[["v1"]]) 136 | ``` 137 | **Update: CellTagR now enables read-in of multiple BAM files at a time.** Extraction with multiple samples will automatically add prefixes to different samples in the order of BAM file given, i.e. Sample-\_\. The order of BAM file processing will be printed as it processes along. Prefixes assignments from users will be coming soon! 138 | 139 | ### 4. Quantify the CellTag UMI Counts and Generate UMI Count Matrices 140 | In this step, we will quantify the CellTag UMI counts and generate the UMI count matrices. This function will take in two inputs, including the barcode tsv file generated by 10X and celltag object processed from Step 2. The barcode tsv file can be either filtered or raw. **However, note that using the raw barcodes file could require a large amount of memory for using this function**. If filtered barcode files are used, **only cell barcodes that appear in the filtered barcode file** will be preserved. The result will also be saved as a *dgCMatrix* in a slot - "raw.count" - under the object. At the same time, initial CellTag statistics will be saved as another slot under the object. The matrix will be in the format as following. 
***If multiple BAM files, please follow the updated.*** 141 | 142 | ||CellTag Motif 1|CellTag Motif 2|\|CellTag Motif N| 143 | |:----------:|:-:|:---------:|:--:|:--:| 144 | |Cell.BC|Motif 1|Motif 2|\|Motif N| 145 | 146 | ```r 147 | # Generate the sparse count matrix 148 | bam.test.obj <- CellTagMatrixCount(celltag.obj = bam.test.obj, barcodes.file = "./barcodes.tsv") 149 | # Check the dimension of the raw count matrix 150 | dim(bam.test.obj@raw.count) 151 | ``` 152 | 153 | **Update: CellTagR now enables read in of multiple BAM files at a time.** An aggregated barcode file needs to be generated for multiple BAM file processed with proper prefixes. Please use the *Barcode.Aggregate* function to generate a aggregated barcode file. This function takes in an **ordered** list of barcodes files. The order should be the same as the BAM file order. 154 | 155 | ```r 156 | Barcode.Aggregate(list("barcode_1.tsv", "barcode_2.tsv"), "./barcodes_all.tsv") 157 | # Generate the sparse count matrix 158 | bam.test.obj <- CellTagMatrixCount(celltag.obj = bam.test.obj, barcodes.file = "./barcodes_all.tsv") 159 | # Check the dimension of the raw count matrix 160 | dim(bam.test.obj@raw.count) 161 | ``` 162 | 163 | The generated CellTag UMI count matrices can then be used in the following steps for clone identification. 164 | 165 | ## Single-cell CellTag UMI Count Matrix Processing 166 | In this section, we are presenting an alternative approach that utilizes this package we established to carry out clone calling with single-cell CellTag UMI count matrices. In this pipeline below, we are using a subset of dataset generated from the full data (Full data can be found here: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE99915). Briefly, in our lab, we reprogram mouse embryonic fibroblasts (MEFs) to induced endoderm progenitors (iEPs). This dataset is a single-cell dataset that contains cells collected from different time points during the process. 
This subset is a part of the first replicate of the data. It contains cells collected at Day 15 with three different CellTag libraries - V1, V2 & V3. 167 | 168 | ### 1. Read in the single-cell CellTag UMI count matrix 169 | We generated this object from the above steps, using BAM files. As above, BAM files take a long time to process. Hence, in this repository, we include a sample object saved as .Rds file from the previous steps, in which raw count matrix is included in the slot - "raw.count" 170 | ```r 171 | # Read the RDS file and get the object 172 | dt.mtx.path <- system.file("extdata", "Demo_V1.Rds", package = "CellTagR") 173 | bam.test.obj <- readRDS(dt.mtx.path) 174 | ``` 175 | 176 | ### (RECOMMENDED) Optional Step: CellTag Error Correction 177 | ***NOTE:*** If CellTag error correction is **NOT** required (although, we strongly recommend this), skip this step and move to *Step 2 - binarization*, in which the raw matrix will be used. Otherwise, before binarization and additional filtering, we will carry out the following error correction step via Starcode, from which a collapsed matrix will be used further for binarization. 178 | 179 | In this step, we will identify CellTags with similar sequences and collapse similar CellTags to the centroid CellTag. For more information and installation, please refer to starcode software - https://github.com/gui11aume/starcode. Briefly, starcode clusters DNA sequences based on the Levenshtein distances between each pair of sequences, from which we collapse similar CellTag sequences to correct for potential errors occurred during single-cell RNA-sequencing process. Default maximum distance from starcode was used to cluster the CellTags. 180 | 181 | ### I. Prepare for the data to be collapsed 182 | First, we will prepare the data to the format that is accepted by starcode. This function accepts two inputs including the CellTag object with raw count matrix generated and a path to where to save the output text file. 
The output will be a text file with each line containing one sequence to collapse with others. In this function, we concatenate the CellTag with cell barcode and use the combined sequences as input to execute Starcode. The file to be used for Starcode will be stored under the provided directory. 183 | ```r 184 | # Generating the collapsing file 185 | bam.test.obj <- CellTagDataForCollapsing(celltag.obj = bam.test.obj, output.file = "~/Desktop/collapsing.txt") 186 | ``` 187 | 188 | **Update: CellTagR now enables read-in of multiple BAM files at a time.** Multiple files with their prefixes used before will be incorporated into the output files. Hence multiple files will be generated for collapsing in the given directory. For instance, if there are 2 samples and to be saved on Desktop, they will be named as *collapsing_Sample-1.txt* and *collapsing_Sample-2.txt*. 189 | 190 | ### II. Run Starcode to cluster CellTags 191 | Following the instruction for Starcode, we will run the following command to generate the result from starcode. **Make sure to run each file generated for each sample if multiple are processed** 192 | 193 | ```r 194 | ./starcode -s --print-clusters ~/Desktop/collapsing.txt > ~/Desktop/collapsing_result.txt 195 | ``` 196 | 197 | ***Please use a folder containing ONLY the collapsing results! And please name the collapsing results corresponding to their sample names if multiple samples are processed.*** For example, use the name *collapsing_result_Sample-1.txt* for *collapsing_Sample-1.txt*. 198 | 199 | ### III. Extract information from Starcode result and collapse similar CellTags 200 | With the collapsed results, we will regenerate the CellTag x Cell Barcode matrix. The collapsed matrix will be stored in a slot - "collapsed.count" - in the CellTag object. This function takes two inputs including the CellTag Object to modify and the path to the result file from collapsing.
***If multiple BAM files generated the collapsing result, check the update below*** 201 | 202 | ```r 203 | # Recount and generate collapsed matrix 204 | bam.test.obj <- CellTagDataPostCollapsing(celltag.obj = bam.test.obj, collapsed.rslt.file = "~/Desktop/collapsing_result.txt") 205 | # Preview the first rows of the collapsed count matrix. 206 | head(bam.test.obj@collapsed.count) 207 | ``` 208 | 209 | ***Update: If multiple BAM files generated the collapsing results, run the following lines.*** Example: the result files are saved on the desktop in the folder named *star_collapse*. 210 | 211 | ```r 212 | collapsed.rslt.dir <- "~/Desktop/star_collapse" 213 | # Recount and generate collapsed matrix 214 | bam.test.obj <- CellTagDataPostCollapsing(celltag.obj = bam.test.obj, collapsed.rslt.file = list.files(collapsed.rslt.dir, full.names = T)) 215 | # Preview the first rows of the collapsed count matrix. 216 | head(bam.test.obj@collapsed.count) 217 | ``` 218 | 219 | Below is an example Jaccard Analysis result with Error Correction using Starcode collapsing (top - without collapsing, bottom - with collapsing): 220 |

221 | 222 |

223 | 224 |

225 | 226 |

227 | 228 | ### 2. Binarize the single-cell CellTag UMI count matrix 229 | Here, we binarize the count matrix to contain 0 or 1, where 0 indicates no such CellTag found in a single cell and 1 reports CellTag expression. The suggested cutoff that marks presence or absence is at least 2 counts per CellTag per Cell. For details regarding cutoff choice, please refer to the paper - https://www.nature.com/articles/s41586-018-0744-4. The binary matrix will be stored in a slot - 'binary.mtx' - as a *dgCMatrix*. **Note: If collapsing was performed, binarization will be based on the collapsed count matrix. Otherwise, it will be based on the raw count matrix** 230 | ```r 231 | # Calling binarization 232 | bam.test.obj <- SingleCellDataBinarization(bam.test.obj, 2) 233 | ``` 234 | 235 | ### 3. Metric plots to facilitate for additional filtering 236 | We then generate scatter plots for the number of total celltag counts in each cell and the number each CellTag across all cells. These plots assist filtering and cleaning of the data. 237 | ```r 238 | MetricPlots(bam.test.obj) 239 | ``` 240 | Below is an example plot that you could obtain from this object 241 |

242 | 243 |

244 | 245 | ### 4. Apply the whitelisted CellTags generated from assessment 246 | Based on the whitelist generated earlier, we filter the UMI count matrix to contain only whitelisted CellTags for the current version under processing. The function takes in two inputs including the CellTag object with binarization performed and the path to the whitelist csv file. The whitelist result will be saved in a slot - "whitelisted.count". 247 | ```r 248 | # Locate the whitelist csv file and apply it to the object 249 | dt.mtx.whitelist.path <- system.file("extdata", "v1_whitelist.csv", package = "CellTagR") 250 | bam.test.obj <- SingleCellDataWhitelist(bam.test.obj, dt.mtx.whitelist.path) 251 | ``` 252 | 253 | ### 5. Check metric plots after whitelist filtering 254 | Recheck the metric plots, as in Step 3. 255 | ```r 256 | MetricPlots(bam.test.obj) 257 | ``` 258 | 259 | ### 6. Additional filtering 260 | #### Filter out cells with more than 20 CellTags 261 | ```r 262 | bam.test.obj <- MetricBasedFiltering(bam.test.obj, 20, comparison = "less") 263 | ``` 264 | #### Filter out cells with less than 2 CellTags 265 | ```r 266 | bam.test.obj <- MetricBasedFiltering(bam.test.obj, 2, comparison = "greater") 267 | ``` 268 | ### 7. Last check of metric plots 269 | ```r 270 | MetricPlots(bam.test.obj) 271 | ``` 272 | Example plot of last check! 273 |

274 | 275 |

276 | If it looks good, proceed to the following steps to call the clones. 277 | 278 | ### 8. Clone Calling 279 | #### I. Jaccard Analysis 280 | This calculates pairwise Jaccard similarities among cells using the filtered CellTag UMI count matrix. This function takes the CellTag object with metric filtering carried out. This will generate a Jaccard similarity matrix, which is saved as a part of the object in a slot - "jaccard.mtx". It also plots a correlation heatmap with cells ordered by hierarchical clustering. 281 | 282 | ```r 283 | bam.test.obj <- JaccardAnalysis(bam.test.obj) 284 | ``` 285 | ##### Note: For large sparse matrix, a fast version can be chosen using the parameter *fast*. 286 | ```r 287 | bam.test.obj <- JaccardAnalysis(bam.test.obj, fast = T) 288 | ``` 289 | #### II. Clone Calling 290 | Based on the Jaccard similarity matrix, we can call clones of cells. A clone will be selected if the correlations inside of the clones passes the cutoff given (here, 0.7 is used. It can be changed based on the heatmap/correlation matrix generated above). Using this part, a list containing the clonal identities of all cells and the count information for each clone will be stored in the object in slots - "clone.composition" and "clone.size.info". 291 | 292 | ##### Clonal Identity Table `clone.composition` 293 | 294 | |clone.id|cell.barcode| 295 | |:-------:|:------:| 296 | |Clonal ID|Cell BC | 297 | 298 | ##### Count Table `clone.size.info` 299 | |Clone.ID|Frequency| 300 | |:------:|:-------:| 301 | |Clonal ID|The cell number in the clone| 302 | 303 | ```r 304 | # Call clones 305 | bam.test.obj <- CloneCalling(celltag.obj = bam.test.obj, correlation.cutoff=0.7) 306 | # Check them out!! 
307 | bam.test.obj@clone.composition[["v1"]] 308 | bam.test.obj@clone.size.info[["v1"]] 309 | ``` 310 | 311 | ## Network Construction And Visualization 312 | Having all three CellTag versions analyzed and stored in one CellTag object, we will construct a network of each individual clone connecting to its descendants. As well as connections between clones, cells in each clone will be visualized on the network as leaf nodes. In the network, each center node denotes a clone. Connections between those nodes suggest a "parent-child" relationship between the clones. Each leaf node denotes a cell. Connections between leaf nodes and center nodes suggest a "belonging" relationship. Additionally, we allow users to further construct a stacked bar chart to facilitate further analysis of the dynamics of different timepoints. 313 | 314 | ***Note:*** Here, we provide a demo object in .Rds format that is generated with all three versions processed. The R notebook used to process all three versions is included in the Examples folder. 315 | 316 | ### 1. Read in the object 317 | ```r 318 | # Read the RDS file and get the object 319 | dt.mtx.path <- system.file("extdata", "bam_v123_obj.Rds", package = "CellTagR") 320 | bam.test.obj <- readRDS(dt.mtx.path) 321 | ``` 322 | 323 | ### 2. Calculate the link list 324 | Here, we convert the CellTag Matrix into a form of link list, which will be further used to construct the linkages in the network. 325 | ```r 326 | bam.test.obj <- convertCellTagMatrix2LinkList(bam.test.obj) 327 | ``` 328 | The link list is saved in the slot - "network.link.list", in the following format. 329 | 330 | |source|target|tag|target_unmodified| 331 | |:-------:|:------:|:------:|:------:| 332 | |The Source Node|The Target Node|Associated CellTag|Original Target Name| 333 | 334 | In the source node, the data is formatted as `<CellTag version>_<clone number>` (for example, `CellTagV1_3`). These are the centroid nodes for the network. The clone number can be found in the previously filled slot - "clone.composition".
In the target node, there are two possibilities. One of possible targets are cells that belong to the centroid clone. The others are clones that are related to the centroid clone, which will suggest "parent-child" relationship between clones. For example, in the table below, the first row describes the belonging relationship of cell with barcode "AAGCCGCAGCTAGCCC-1" to Clone3 from CellTag V1, while the second row indicates a "parent-child" relationship between Clone 1 from CellTag V1 and Clone 42 from CellTag V2. 335 | 336 | |source|target|tag|target_unmodified| 337 | |:-------:|:------:|:------:|:------:| 338 | |CellTagV1_3|AAGCCGCAGCTAGCCC-1_V1|CellTagV1|AAGCCGCAGCTAGCCC-1| 339 | |CellTagV1_1|CellTagV2_42|CellTagV1|CellTagV2_42| 340 | 341 | ### 3. Get nodes from the link list 342 | This will obtain all the nodes that are involved in this network. 343 | ```r 344 | bam.test.obj <- getNodesfromLinkList(bam.test.obj) 345 | ``` 346 | 347 | ### 4. Add additional information 348 | For each leaf node (each cell), other information, such as cluster/cell types, can be available via other analysis. In this step, we will add these information into each node such that these information can be visualized on the network as well. In this scenario, for demo purposes, we used a simulation data frame to serve as a mock cluster information for each node. 349 | ```r 350 | # Simulate some additional data 351 | additional_data <- data.frame(sample(1:10, size = length(rownames(bam.test.obj@celltag.aggr.final)), replace = TRUE), row.names = rownames(bam.test.obj@celltag.aggr.final)) 352 | colnames(additional_data) <- "Cluster" 353 | # Add the data to the object 354 | bam.test.obj <- addData2Nodes(bam.test.obj, additional_data) 355 | ``` 356 | 357 | ### 5. Network visualization and plot 358 | Here, we will visualize the network! 
359 | ```r 360 | # Network Visualization 361 | bam.test.obj <- drawSubnet(tag = "CellTagV1_2", overlay = "Cluster", celltag.obj = bam.test.obj) 362 | bam.test.obj@network 363 | ``` 364 | 365 | Additionally, the network can be saved to an HTML file, allowing better visualization and overview. Please make sure pandoc is installed, as it is needed to output this network. 366 | ```r 367 | saveNetwork(bam.test.obj@network, "~/Desktop/presentation/Demo/hf1.d15.network.construction.html") 368 | ``` 369 | 370 | ### 6. Stack bar chart generation 371 | An important aspect of using CellTagging is to analyze the clonal dynamics of a population of cells. Here, we provide a stacked bar chart option to provide some insights. 372 | ```r 373 | # Get the data for plotting 374 | bar.data <- bam.test.obj@celltag.aggr.final 375 | bar.data$Cell.BC <- rownames(bar.data) 376 | 377 | bar.data <- gather(bar.data, key = "CellTag", value = "Clone", 1:3, na.rm = FALSE) 378 | 379 | # Using ggplot to plot 380 | ggplot(data = bar.data) + 381 | geom_bar(mapping = aes(x = CellTag, fill = factor(Clone)), position = "fill", show.legend = FALSE) + 382 | scale_y_continuous(labels = scales::percent_format()) + 383 | theme_bw() 384 | ``` 385 | Below is a sample bar chart! 386 |

387 | 388 |

389 | 390 | ## Contact Us 391 | -------------------------------------------------------------------------------- /inst/extdata/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/inst/extdata/.DS_Store -------------------------------------------------------------------------------- /inst/extdata/Demo_V1.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/inst/extdata/Demo_V1.Rds -------------------------------------------------------------------------------- /inst/extdata/V2-1_R1.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/inst/extdata/V2-1_R1.zip -------------------------------------------------------------------------------- /inst/extdata/bam_v123_obj.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/inst/extdata/bam_v123_obj.Rds -------------------------------------------------------------------------------- /inst/extdata/hf1.d28.prefiltered.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/CellTagR/de6f6fb041e7c7b85915554c6eb675c087e4b2eb/inst/extdata/hf1.d28.prefiltered.Rds -------------------------------------------------------------------------------- /man/AddCellTagFreqSort.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CellTagWhitelistGeneration.R 3 | \name{AddCellTagFreqSort} 4 | \alias{AddCellTagFreqSort} 5 | \title{CellTag 
Frequency Sort Table} 6 | \usage{ 7 | AddCellTagFreqSort(celltag.obj) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag Object with CellTags extracted} 11 | } 12 | \value{ 13 | A CellTag Object with attribute (celltag.freq.stats) filled. 14 | } 15 | \description{ 16 | This function counts and sorts the identified CellTags from Fastq file 17 | } 18 | \examples{ 19 | CellTagWhitelistFiltering(bam.test.obj) 20 | 21 | } 22 | \keyword{CellTagging} 23 | \keyword{RNA-seq} 24 | \keyword{data,} 25 | \keyword{single-cell} 26 | -------------------------------------------------------------------------------- /man/Barcode.Aggregate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AuxiliaryFunctions.R 3 | \name{Barcode.Aggregate} 4 | \alias{Barcode.Aggregate} 5 | \title{CellTag Barcode Aggregation function} 6 | \usage{ 7 | Barcode.Aggregate(file.list, output.file) 8 | } 9 | \arguments{ 10 | \item{file.list}{files in a list to aggregate in order same as the BAM files} 11 | 12 | \item{output.file}{where to save this aggregated output file. Should be a .tsv file.} 13 | } 14 | \value{ 15 | A list containing the pattern, nucleotides to look for before/after the motif 16 | } 17 | \description{ 18 | This function allows barcode aggregation of multiple-file processing. 
19 | } 20 | \examples{ 21 | Barcode.Aggregate(list("barcodes_1.tsv", "barcodes_2.tsv"), output.file = "barcode_aggr.tsv") 22 | 23 | } 24 | \keyword{CellTagging} 25 | \keyword{RNA-seq} 26 | \keyword{data,} 27 | \keyword{single-cell} 28 | -------------------------------------------------------------------------------- /man/CellTagDataForCollapsing.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CellTagForCollapsing.R 3 | \name{CellTagDataForCollapsing} 4 | \alias{CellTagDataForCollapsing} 5 | \title{CellTag Starcode Prior Collapsing} 6 | \usage{ 7 | CellTagDataForCollapsing(celltag.obj, output.file) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object with the raw count matrix filled.} 11 | 12 | \item{output.file}{The filepath and name to save the table for collapsing (usually a .txt file)} 13 | } 14 | \value{ 15 | A CellTag object with collapsing mapping table stored in pre.starcode slot 16 | } 17 | \description{ 18 | This function generate the .txt file that will be fed into starcode - https://github.com/gui11aume/starcode - to collapse similar CellTags. 
19 | } 20 | \examples{ 21 | CellTagDataForCollapsing(bam.test.obj, "./collapsing.txt") 22 | 23 | } 24 | \keyword{CellTagging} 25 | \keyword{RNA-seq} 26 | \keyword{data,} 27 | \keyword{single-cell} 28 | -------------------------------------------------------------------------------- /man/CellTagDataPostCollapsing.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CellTagForCollapsing.R 3 | \name{CellTagDataPostCollapsing} 4 | \alias{CellTagDataPostCollapsing} 5 | \title{CellTag Starcode Post Collapsing} 6 | \usage{ 7 | CellTagDataPostCollapsing(celltag.obj, collapsed.rslt.file) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object with the pre-starcode mapping matrix filled.} 11 | 12 | \item{collapsed.rslt.file}{File path to the collapsed result file} 13 | } 14 | \value{ 15 | A CellTag object with collapsed count matrix stored in collapsed.count slot 16 | } 17 | \description{ 18 | This function processes the result generated from starcode - https://github.com/gui11aume/starcode. 
19 | } 20 | \examples{ 21 | CellTagDataPostCollapsing(bam.test.obj, "./collapsing_result.txt") 22 | 23 | } 24 | \keyword{CellTagging} 25 | \keyword{RNA-seq} 26 | \keyword{data,} 27 | \keyword{single-cell} 28 | -------------------------------------------------------------------------------- /man/CellTagExtraction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CellTagExtraction.R 3 | \name{CellTagExtraction} 4 | \alias{CellTagExtraction} 5 | \title{CellTag Extraction Function} 6 | \usage{ 7 | CellTagExtraction(celltag.obj, celltag.version, technique = "10x") 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object initialized with path to the fastq/bam file} 11 | 12 | \item{celltag.version}{The CellTag version to extract} 13 | 14 | \item{technique}{The technique used for scRNA-seq, Default to 10x. Currently enabled for 10x and dropseq.} 15 | } 16 | \value{ 17 | A CellTag object with attribute (bam.parse.rslt) filled 18 | } 19 | \description{ 20 | This function extracts CellTags from the raw fastq/bam sequencing file. If it is a fastq file, provides counts of each CellTag and sorts them in desending order. If it is a bam file, returns the barcode, umi, celltag information. 
21 | } 22 | \examples{ 23 | CellTagExtraction(bam.test.obj) 24 | 25 | } 26 | \keyword{CellTagging} 27 | \keyword{RNA-seq} 28 | \keyword{data,} 29 | \keyword{single-cell} 30 | -------------------------------------------------------------------------------- /man/CellTagMatrixCount.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CellTagMatrixGeneration.R 3 | \name{CellTagMatrixCount} 4 | \alias{CellTagMatrixCount} 5 | \title{CellTag Matrix Generation Function} 6 | \usage{ 7 | CellTagMatrixCount(celltag.obj, barcodes.file) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object with bam file result filled} 11 | 12 | \item{barcodes.file}{A .tsv output file from 10x CellRanger pipeline. It contains a list of all cell barcodes identified in the filtered dataset.} 13 | } 14 | \value{ 15 | A CellTag object with the attribute (raw.count) filled 16 | } 17 | \description{ 18 | This function uses the extract information from data processed before and generate a Cell Barcode x CellTag matrix 19 | } 20 | \examples{ 21 | CellTagMatrixCount(bam.test.obj, "barcodes.tsv") 22 | 23 | } 24 | \keyword{CellTagging} 25 | \keyword{RNA-seq} 26 | \keyword{data,} 27 | \keyword{single-cell} 28 | -------------------------------------------------------------------------------- /man/CellTagObject.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CreateCellTagObject.R 3 | \name{CellTagObject} 4 | \alias{CellTagObject} 5 | \title{Create a New CellTag Object} 6 | \usage{ 7 | CellTagObject(object.name, fastq.bam.directory) 8 | } 9 | \arguments{ 10 | \item{object.name}{The name of the object} 11 | 12 | \item{fastq.bam.input}{The input fastq/bam data file path} 13 | 14 | \item{celltag.version}{Which version of CellTags are you working with?} 15 | } 16 | 
\value{ 17 | A CellTag Object with open attributes that can be filled as the analysis moves along 18 | } 19 | \description{ 20 | This function creates a CellTag object that contains the basic information required for the object 21 | } 22 | \examples{ 23 | CellTagObject("hf1.d15.test", "hf1.d15.bam") 24 | 25 | } 26 | \keyword{CellTagging} 27 | \keyword{RNA-seq} 28 | \keyword{data,} 29 | \keyword{single-cell} 30 | -------------------------------------------------------------------------------- /man/CellTagPatternCalling.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AuxiliaryFunctions.R 3 | \name{CellTagPatternCalling} 4 | \alias{CellTagPatternCalling} 5 | \title{CellTag Pattern Calling Function} 6 | \usage{ 7 | CellTagPatternCalling(celltag.version) 8 | } 9 | \arguments{ 10 | \item{celltag.version}{Which CellTag version are you investigating?} 11 | } 12 | \value{ 13 | A list containing the pattern, nucleotides to look for before/after the motif 14 | } 15 | \description{ 16 | This function provides motif patterns corresponding to the input celltag version 17 | } 18 | \examples{ 19 | CellTagPatternCalling("v1") 20 | 21 | } 22 | \keyword{CellTagging} 23 | \keyword{RNA-seq} 24 | \keyword{data,} 25 | \keyword{single-cell} 26 | -------------------------------------------------------------------------------- /man/CellTagWhitelistFiltering.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CellTagWhitelistGeneration.R 3 | \name{CellTagWhitelistFiltering} 4 | \alias{CellTagWhitelistFiltering} 5 | \title{CellTag Whitelist Filtering Function} 6 | \usage{ 7 | CellTagWhitelistFiltering(celltag.obj, percentile, output.dir = NULL) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag Object with CellTag frequency table counted and
sorted} 11 | 12 | \item{percentile}{A fraction cutoff percentile for filtering the CellTags e.g. 0.9 for 90th percentile} 13 | 14 | \item{output.dir}{Which directory would you like to store these files? If NULL, save to the same directory as the fastq/bam file} 15 | } 16 | \value{ 17 | A CellTag Object with attribute (whitelist) filled. 18 | } 19 | \description{ 20 | This function conducts whitelist filtering such that only CellTags with count number over their certain percentile would be considered for clone calling 21 | } 22 | \examples{ 23 | CellTagWhitelistFiltering(bam.test.obj, 0.9) 24 | 25 | } 26 | \keyword{CellTagging} 27 | \keyword{RNA-seq} 28 | \keyword{data,} 29 | \keyword{single-cell} 30 | -------------------------------------------------------------------------------- /man/CloneCalling.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CloneCalling.R 3 | \name{CloneCalling} 4 | \alias{CloneCalling} 5 | \title{Clone Calling Function} 6 | \usage{ 7 | CloneCalling(celltag.obj, correlation.cutoff) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object with the jaccard matrix generated} 11 | 12 | \item{correlation.cutoff}{Correlation cutoff for clone membership} 13 | } 14 | \value{ 15 | A CellTag object with attributes (clone.composition & clone.size.info) filled. 16 | } 17 | \description{ 18 | This function conducts clone calling based on the Jaccard results. 
19 | } 20 | \examples{ 21 | CloneCalling(bam.test.obj, 0.7) 22 | 23 | } 24 | \keyword{CellTagging} 25 | \keyword{RNA-seq} 26 | \keyword{data,} 27 | \keyword{single-cell} 28 | -------------------------------------------------------------------------------- /man/JaccardAnalysis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CloneCalling.R 3 | \name{JaccardAnalysis} 4 | \alias{JaccardAnalysis} 5 | \title{Jaccard Analysis Function} 6 | \usage{ 7 | JaccardAnalysis(celltag.obj, plot.corr = TRUE, fast = FALSE) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object with the counts filtered based on metrics} 11 | 12 | \item{plot.corr}{Would you like to plot the correlation matrix?} 13 | } 14 | \value{ 15 | A CellTag object with attribute (jaccard.mtx) filled 16 | } 17 | \description{ 18 | This function conducts Jaccard analysis to calculate the Jaccard similarity between cells. 19 | } 20 | \examples{ 21 | JaccardAnalysis(bam.test.obj) 22 | 23 | } 24 | \keyword{CellTagging} 25 | \keyword{RNA-seq} 26 | \keyword{data,} 27 | \keyword{single-cell} 28 | -------------------------------------------------------------------------------- /man/MetricBasedFiltering.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/MetricBasedPlottingAndFiltering.R 3 | \name{MetricBasedFiltering} 4 | \alias{MetricBasedFiltering} 5 | \title{Metric-Base Filtering Function} 6 | \usage{ 7 | MetricBasedFiltering(celltag.obj, cutoff, comparison = "less") 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag Object with count matrix generated} 11 | 12 | \item{cutoff}{The cutoff decided from the metric plots} 13 | 14 | \item{comparison}{Would you like to maintain the part less than/greater than the cutoff? Default to less. 
Choices can be greater or less.} 15 | } 16 | \value{ 17 | A CellTag Object with attribute (metric.filtered.count) filled 18 | } 19 | \description{ 20 | This function applies further filtering on scRNA-seq data with CellTags based on cutoff values identified from the metric plots. 21 | } 22 | \examples{ 23 | MetricBasedFiltering(bam.test.object, 20, "less") 24 | 25 | } 26 | \keyword{CellTagging} 27 | \keyword{RNA-seq} 28 | \keyword{data,} 29 | \keyword{single-cell} 30 | -------------------------------------------------------------------------------- /man/MetricPlots.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/MetricBasedPlottingAndFiltering.R 3 | \name{MetricPlots} 4 | \alias{MetricPlots} 5 | \title{CellTag Metric Plotting Function} 6 | \usage{ 7 | MetricPlots(celltag.obj) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag Object} 11 | } 12 | \description{ 13 | This function provides some metric plots for further downstream celltag filtering in the scRNA-seq dataset. 
14 | } 15 | \examples{ 16 | MetricPlots(bam.test.obj) 17 | 18 | } 19 | \keyword{CellTagging} 20 | \keyword{RNA-seq} 21 | \keyword{data,} 22 | \keyword{single-cell} 23 | -------------------------------------------------------------------------------- /man/SingleCellDataBinatization.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ScCellTagMatrixProcess.R 3 | \name{SingleCellDataBinatization} 4 | \alias{SingleCellDataBinatization} 5 | \title{Single-cell RNA-seq Binarization Function} 6 | \usage{ 7 | SingleCellDataBinatization(celltag.obj, tag.cutoff) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object with the raw count matrix generated} 11 | 12 | \item{tag.cutoff}{How many tags would you like to be used as a cutoff to say that the cells are tagged?} 13 | } 14 | \value{ 15 | A CellTag object with the attribute (binary.mtx) filled. 16 | } 17 | \description{ 18 | This function binarize the single-cell celltag data based on a given cutoff. It will generate a binary matrix, which will be stored as a slot in the CellTag Object. The binary matrix will be further used for future processing of the single-cell data. 
19 | } 20 | \examples{ 21 | SingleCellDataBinatization(bam.test.obj, 2) 22 | 23 | } 24 | \keyword{CellTagging} 25 | \keyword{RNA-seq} 26 | \keyword{data,} 27 | \keyword{single-cell} 28 | -------------------------------------------------------------------------------- /man/SingleCellDataWhitelist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ScCellTagMatrixProcess.R 3 | \name{SingleCellDataWhitelist} 4 | \alias{SingleCellDataWhitelist} 5 | \title{Single-cell RNA-seq Whitelisting Function} 6 | \usage{ 7 | SingleCellDataWhitelist(celltag.obj, whitels.cell.tag.file) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object with the binary matrix generated} 11 | 12 | \item{whitels.cell.tag.file}{file director to the whitelisted cell tags} 13 | } 14 | \value{ 15 | A CellTag object with the attribute (whitelisted.count) filled 16 | } 17 | \description{ 18 | The whitelist is a list of CellTag generated based on assessment of CellTag library. It helps reduce the effect from sequencing error in CellTags. This function conducts whitelist filtering through the single-cell dataset. It will filter out CellTags that are not included in the whitelist. 
19 | } 20 | \examples{ 21 | SingleCellDataWhitelist(bam.test.obj, "~/Desktop/My_Favourite_Whitelist.csv") 22 | 23 | } 24 | \keyword{CellTagging} 25 | \keyword{RNA-seq} 26 | \keyword{data,} 27 | \keyword{single-cell} 28 | -------------------------------------------------------------------------------- /man/addData2Nodes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CellTagNetworkContruction.R 3 | \name{addData2Nodes} 4 | \alias{addData2Nodes} 5 | \title{Add Additional Information to the Nodes} 6 | \usage{ 7 | addData2Nodes(celltag.obj, additional_data) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object with nodes filled} 11 | 12 | \item{additional_data}{A data frame with auxillary information about the nodes (rownames = the nodes names)} 13 | } 14 | \value{ 15 | A CellTag object with the attribute (nodes) modified. 16 | } 17 | \description{ 18 | This function add auxillary information to the nodes. Such information can include cluster information, cell type information and so on. The information should be stored as a data frame when passing in to the funtion. 
19 | } 20 | \examples{ 21 | addData2Nodes(bam.test.obj, cluster.info) 22 | 23 | } 24 | \keyword{CellTagging} 25 | \keyword{RNA-seq} 26 | \keyword{data,} 27 | \keyword{single-cell} 28 | -------------------------------------------------------------------------------- /man/bam.process.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AuxiliaryFunctions.R 3 | \name{bam.process} 4 | \alias{bam.process} 5 | \title{Bam File Process Function} 6 | \usage{ 7 | bam.process( 8 | bam.file, 9 | pattern, 10 | short.nt.before.tag, 11 | short.nt.after.tag, 12 | technique 13 | ) 14 | } 15 | \arguments{ 16 | \item{bam.file}{The input bam data directory} 17 | 18 | \item{pattern}{The pattern to seek for} 19 | 20 | \item{short.nt.before.tag}{A short sequence before the 8nt tag to help more specific identification} 21 | 22 | \item{short.nt.after.tag}{A short sequence after the 8nt tag to help more specific identification} 23 | } 24 | \value{ 25 | A data table contains cell barcode, celltag and umi information 26 | } 27 | \description{ 28 | This function extracts CellTags from the bam sequencing file, provides cell barcode, umi and their corresponding celltag information. 
29 | } 30 | \examples{ 31 | bam.process("data.fastq", "CCGGT[ATCG]{8}GAATTC", "CCGGT", "GAATTC") 32 | 33 | } 34 | \keyword{CellTagging} 35 | \keyword{RNA-seq} 36 | \keyword{data,} 37 | \keyword{single-cell} 38 | -------------------------------------------------------------------------------- /man/convertCellTagMatrix2LinkList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CellTagNetworkContruction.R 3 | \name{convertCellTagMatrix2LinkList} 4 | \alias{convertCellTagMatrix2LinkList} 5 | \title{Convert CellTag Matrix to Link List} 6 | \usage{ 7 | convertCellTagMatrix2LinkList(celltag.obj) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object with all clone information filled} 11 | } 12 | \value{ 13 | A CellTag object with the attribute (network.link.list) filled 14 | } 15 | \description{ 16 | This function convert the CellTag Matrix to a link list, which is further used for network construction and visualizetion 17 | } 18 | \examples{ 19 | convertCellTagMatrix2LinkList(bam.test.obj) 20 | 21 | } 22 | \keyword{CellTagging} 23 | \keyword{RNA-seq} 24 | \keyword{data,} 25 | \keyword{single-cell} 26 | -------------------------------------------------------------------------------- /man/drawSubnet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CellTagNetworkVisualiztion.R 3 | \name{drawSubnet} 4 | \alias{drawSubnet} 5 | \title{Draw the Network} 6 | \usage{ 7 | drawSubnet(celltag.obj, tag, overlay) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object with link list and nodes filled} 11 | 12 | \item{tag}{Which tags would you like to plot?} 13 | 14 | \item{overlay}{What information would you like to overlay with the network? 
This should be one of the column names of the node information.} 15 | } 16 | \value{ 17 | A CellTag object with the attribute (network) modified. 18 | } 19 | \description{ 20 | This function generates a force-directed network based on the link list and nodes information. 21 | } 22 | \examples{ 23 | drawSubnet(bam.test.obj, "CellTagV1_2", "Cluster") 24 | 25 | } 26 | \keyword{CellTagging} 27 | \keyword{RNA-seq} 28 | \keyword{data,} 29 | \keyword{single-cell} 30 | -------------------------------------------------------------------------------- /man/fastq.process.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AuxiliaryFunctions.R 3 | \name{fastq.process} 4 | \alias{fastq.process} 5 | \title{Fastq Process Function} 6 | \usage{ 7 | fastq.process(fastq.file, pattern, short.nt.before.tag, short.nt.after.tag) 8 | } 9 | \arguments{ 10 | \item{fastq.file}{The input fastq/bam data directory} 11 | 12 | \item{pattern}{The pattern to search for} 13 | 14 | \item{short.nt.before.tag}{A short sequence before the 8nt tag to make identification more specific} 15 | 16 | \item{short.nt.after.tag}{A short sequence after the 8nt tag to make identification more specific} 17 | } 18 | \value{ 19 | A list containing count tables of CellTags. If requested to save fullTag counts, i.e. save.fullTag.counts = TRUE, return a list of both 8nt tags and full sequences count. Otherwise, a list of 8nt tag counts. 20 | } 21 | \description{ 22 | This function extracts CellTags from the raw fastq sequencing file, provides counts of each CellTag and sorts them in descending order. 
23 | } 24 | \examples{ 25 | fastq.process("data.fastq", "CCGGT[ATCG]{8}GAATTC", "CCGGT", "GAATTC") 26 | 27 | } 28 | \keyword{CellTagging} 29 | \keyword{RNA-seq} 30 | \keyword{data,} 31 | \keyword{single-cell} 32 | -------------------------------------------------------------------------------- /man/getNodesfromLinkList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CellTagNetworkContruction.R 3 | \name{getNodesfromLinkList} 4 | \alias{getNodesfromLinkList} 5 | \title{Get Nodes from Link List} 6 | \usage{ 7 | getNodesfromLinkList(celltag.obj) 8 | } 9 | \arguments{ 10 | \item{celltag.obj}{A CellTag object with link list filled} 11 | } 12 | \value{ 13 | A CellTag object with the attribute (nodes) filled 14 | } 15 | \description{ 16 | This function extracts the node information from the generated link list. 17 | } 18 | \examples{ 19 | getNodesfromLinkList(bam.test.obj) 20 | 21 | } 22 | \keyword{CellTagging} 23 | \keyword{RNA-seq} 24 | \keyword{data,} 25 | \keyword{single-cell} 26 | --------------------------------------------------------------------------------