├── .DS_Store ├── .Rbuildignore ├── .Rhistory ├── .gitignore ├── Capybara.Rproj ├── DESCRIPTION ├── NAMESPACE ├── R ├── .DS_Store ├── Binarization.R ├── Classification.R ├── EmpiricalPValueCalculation.R ├── Helper.R ├── HighResolutionReferenceConstruction.R ├── MultiIDFilterQP.R ├── SingleCellQuadraticProgramming.R ├── SingleCellQuadraticProgrammingAux.R └── TransitionScoreCalculation.R ├── README.md ├── examples ├── .DS_Store ├── Monocle_hat_colin.png ├── Step 1 - ARCHS4 Mining.R ├── Step 2 - bulk cleaning.R ├── bulk class mca pancreatic.png ├── bulk_composition_pancreatic.pdf ├── cardiac_TS_plots.png ├── cardiac_bulk_v2.png ├── features.tsv ├── gene_info.tsv ├── pancreatic dot plot.pdf └── pancreatic dot plot.png ├── inst ├── .DS_Store └── extdata │ ├── .DS_Store │ ├── Bulk Reference RPKM.Rds │ ├── Bulk Reference Raw.Rds │ ├── MCA Adult Background.Rds │ ├── MCA Embryonic Background.Rds │ ├── MCA_CellAssignments.csv │ ├── baron_dataset.zip │ └── features.tsv └── man ├── binarization.mann.whitney.Rd ├── binary.to.classification.Rd ├── calc.scale.ratio.Rd ├── construct.high.res.reference.Rd ├── gene.intersect.sub.Rd ├── get.least.connected.Rd ├── get.mid.connected.Rd ├── get.most.connected.Rd ├── multi.id.curate.qp.Rd ├── normalize.dt.Rd ├── perc.calc.aux.Rd ├── percentage.calc.Rd ├── ref.construction.Rd ├── sample.func.Rd ├── sc.quad.prog.run.Rd ├── single.round.QP.analysis.Rd ├── top.genes.Rd └── transition.score.Rd /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/.DS_Store -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^Capybara\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.Rhistory: -------------------------------------------------------------------------------- 1 | baron.expr <- data.frame() 2 | baron.meta <- data.frame() 3 | count <- 1 4 | for (f in baron.fns) { 5 | curr.df <- read.csv(f, row.names = 1, stringsAsFactors = F, check.names = F) 6 | curr.bc <- paste0(rownames(curr.df), "_", curr.df$barcode, "_Sample_", count) 7 | rownames(curr.df) <- curr.bc 8 | if (nrow(baron.meta) <= 0) { 9 | baron.meta <- data.frame(row.names = curr.bc, cell.type = curr.df[curr.bc, "assigned_cluster"], barcode = curr.df[curr.bc, "barcode"], 10 | stringsAsFactors = F, check.names = F) 11 | } else { 12 | curr.meta <- data.frame(row.names = curr.bc, cell.type = curr.df[curr.bc, "assigned_cluster"], barcode = curr.df[curr.bc, "barcode"], 13 | stringsAsFactors = F, check.names = F) 14 | baron.meta <- rbind(baron.meta, curr.meta) 15 | } 16 | curr.expr <- curr.df[, c(3:ncol(curr.df))] 17 | if (nrow(baron.expr) <= 0) { 18 | baron.expr <- curr.expr 19 | } else { 20 | baron.expr <- rbind(baron.expr, curr.expr) 21 | } 22 | count <- count +1 23 | } 24 | count.mtx <- t(baron.expr) 25 | write.csv(count.mtx, "~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/baron_et_al_pancreatic.csv", quote = F) 26 | write.csv(baron.meta, "~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/baron_et_al_pancreatic_meta.csv", quote = F) 27 | single.round.QP.analysis(ref.df, count.mtx, n.cores = 4, save.to.path = "~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/", save.to.filename = "02_MCA_Based_scClassifier_reference_mix90_test_normalize_select", unix.par = TRUE) 28 | background.mtx <- read.csv("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/01_MCA_Based_scClassifier_reference_mix90_normalize_select_scale.csv", header = T, row.names = 1, stringsAsFactors = F) 29 | mtx.test <- read.csv("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/02_MCA_Based_scClassifier_reference_mix90_test_normalize_select_scale.csv", header = T, row.names = 1, stringsAsFactors = F) 30 | # reference Permutation 31 | col.sub <- ncol(background.mtx) - 2 32 | ref.perc.list <- percentage.calc(background.mtx[,c(1:col.sub)], background.mtx[,c(1:col.sub)]) 33 | # Test Permutation 34 | perc.list <- percentage.calc(as.matrix(mtx.test[,c(1:col.sub)]), as.matrix(background.mtx[,c(1:col.sub)])) 35 | bin.count <- binarization.mann.whitney(mtx = mtx.test[,c(1:col.sub)], ref.perc.ls = ref.perc.list, ref.meta = ref.list[[2]], perc.ls = perc.list) 36 | classification <- binary.to.classification(bin.count[,c(1:col.sub)]) 37 | rownames(classification) <- classification$barcode 38 | classification$actual <- baron.meta[rownames(classification), "cell.type"] 39 | table.freq <- table(classification$actual, classification$call) 40 | table.freq.perc <- apply(table.freq, 1, function(x) round(x * 100/sum(x), digits = 3)) 41 | rownames(table.freq.perc)[14] <- "beta" 42 | table.freq.sub <- as.data.frame(table.freq.perc[c(1,2,4,5,6,7,8,11,12,14), c(1,2,5,8,3,4,6,7,9,10,13)]) 43 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub) 44 | table.freq.melt <- melt(table.freq.sub) 45 | table.freq.melt$Capybara.Call <- factor(table.freq.melt$Capybara.Call, 46 | levels = c("B.cell", "beta", "Ductal.cell", "Endothelial.cell", 47 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), 48 | ordered = T) 49 | table.freq.melt$variable <- factor(table.freq.melt$variable, 50 | levels = c("B_cell", "beta", "ductal", "endothelial", 51 | "macrophage", "T_cell", "immune_other", "activated_stellate", "alpha", "delta", "gamma"), 52 | ordered = T) 53 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 54 | geom_point(aes(colour = variable)) + 55 | scale_size_area(name = "value", max_size=12) + 56 | scale_color_viridis_c(option = "A", begin = 0.15, end = 0.85) + 57 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 58 | theme(legend.position = "right", 59 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 60 | axis.text.y = element_text(size = 12, face = "bold.italic"), 61 | axis.ticks = element_blank(), 62 | axis.title = element_blank(), 63 | title = element_text(face = "bold.italic", size = 14), 64 | panel.grid.major = element_blank(), 65 | panel.grid.minor = element_blank(), 66 | panel.background = element_blank(), 67 | axis.line = element_blank()) 68 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 69 | geom_point(aes(colour = variable)) + 70 | scale_size_area(name = "value", max_size=12) + 71 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 72 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 73 | theme(legend.position = "right", 74 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 75 | axis.text.y = element_text(size = 12, face = "bold.italic"), 76 | axis.ticks = element_blank(), 77 | axis.title = element_blank(), 78 | title = element_text(face = "bold.italic", size = 14), 79 | panel.grid.major = element_blank(), 80 | panel.grid.minor = element_blank(), 81 | panel.background = element_blank(), 82 | axis.line = element_blank()) 83 | View(table.freq.perc) 84 | rownames(table.freq.perc)[14] <- "beta" 85 | rownames(table.freq.perc)[16] <- "beta" 86 | View(table.freq.perc) 87 | table.freq.sub <- as.data.frame(table.freq.perc["B.cell", "beta", "Ductal.cell", "Endothelial.cell", 88 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell", c(1,2,5,8,3,4,6,7,9,10,13)]) 89 | table.freq.sub <- as.data.frame(table.freq.perc["B.cell", "beta", "Ductal.cell", "Endothelial.cell", 90 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)]) 91 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub) 92 | table.freq.sub <- as.data.frame(table.freq.perc[c("B.cell", "beta", "Ductal.cell", "Endothelial.cell", 93 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)]) 94 | View(table.freq.perc) 95 | table.freq <- table(classification$actual, classification$call) 96 | table.freq.perc <- apply(table.freq, 1, function(x) round(x * 100/sum(x), digits = 3)) 97 | rownames(table.freq.perc)[16] <- "beta" 98 | View(table.freq.perc) 99 | table.freq.sub <- as.data.frame(table.freq.perc[c("B.cell", "beta", "Ductal.cell", "Endothelial.cell", 100 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)]) 101 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub) 102 | table.freq.melt <- melt(table.freq.sub) 103 | table.freq.melt$Capybara.Call <- factor(table.freq.melt$Capybara.Call, 104 | levels = c("B.cell", "beta", "Ductal.cell", "Endothelial.cell", 105 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), 106 | ordered = T) 107 | table.freq.melt$variable <- factor(table.freq.melt$variable, 108 | levels = c("B_cell", "beta", "ductal", "endothelial", 109 | "macrophage", "T_cell", "immune_other", "activated_stellate", "alpha", "delta", "gamma"), 110 | ordered = T) 111 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 112 | geom_point(aes(colour = value)) + 113 | scale_size_area(name = "value", max_size=12) + 114 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 115 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 116 | theme(legend.position = "right", 117 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 118 | axis.text.y = element_text(size = 12, face = "bold.italic"), 119 | axis.ticks = element_blank(), 120 | axis.title = element_blank(), 121 | title = element_text(face = "bold.italic", size = 14), 122 | panel.grid.major = element_blank(), 123 | panel.grid.minor = element_blank(), 124 | panel.background = element_blank(), 125 | axis.line = element_blank()) 126 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 127 | geom_point(aes(colour = value)) + 128 | scale_size_area(name = "value", max_size=12) + 129 | scale_color_viridis_c(option = "A", begin = 0.15, end = 0.85) + 130 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 131 | theme(legend.position = "right", 132 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 133 | axis.text.y = element_text(size = 12, face = "bold.italic"), 134 | axis.ticks = element_blank(), 135 | axis.title = element_blank(), 136 | title = element_text(face = "bold.italic", size = 14), 137 | panel.grid.major = element_blank(), 138 | panel.grid.minor = element_blank(), 139 | panel.background = element_blank(), 140 | axis.line = element_blank()) 141 | table.freq.sub <- as.data.frame(table.freq.perc[c("B.cell", "beta", "Ductal.cell", "Endothelial.cell", 142 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)]) 143 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub) 144 | table.freq.melt <- melt(table.freq.sub) 145 | table.freq.melt$Capybara.Call <- factor(table.freq.melt$Capybara.Call, 146 | levels = c("B.cell", "beta", "Ductal.cell", "Endothelial.cell", 147 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), 148 | ordered = T) 149 | table.freq.melt$variable <- factor(table.freq.melt$variable, 150 | levels = c("B_cell", "beta", "ductal", "endothelial", 151 | "macrophage", "T_cell", "immune_other", "activated_stellate", "alpha", "delta", "gamma"), 152 | ordered = T) 153 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 154 | geom_point(aes(colour = variable)) + 155 | scale_size_area(name = "value", max_size=12) + 156 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 157 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 158 | theme(legend.position = "right", 159 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 160 | axis.text.y = element_text(size = 12, face = "bold.italic"), 161 | axis.ticks = element_blank(), 162 | axis.title = element_blank(), 163 | title = element_text(face = "bold.italic", size = 14), 164 | panel.grid.major = element_blank(), 165 | panel.grid.minor = element_blank(), 166 | panel.background = element_blank(), 167 | axis.line = element_blank()) 168 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 169 | geom_point(aes(colour = variable)) + 170 | scale_size_area(name = "value", max_size=12) + 171 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 172 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 173 | theme(legend.position = "right", 174 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 175 | axis.text.y = element_text(size = 12, face = "bold.italic"), 176 | axis.ticks = element_blank(), 177 | axis.title = element_blank(), 178 | title = element_text(face = "bold.italic", size = 14), 179 | panel.grid.major = element_blank(), 180 | panel.grid.minor = element_blank(), 181 | panel.background = element_blank(), 182 | axis.line = element_line(colour = "black", size = 1)) 183 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 184 | geom_point(aes(colour = variable)) + 185 | scale_size_area(name = "value", max_size=12) + 186 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 187 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 188 | guides(fill = FALSE, color = FALSE) + 189 | theme(legend.position = "right", 190 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 191 | axis.text.y = element_text(size = 12, face = "bold.italic"), 192 | axis.ticks = element_blank(), 193 | axis.title = element_blank(), 194 | title = element_text(face = "bold.italic", size = 14), 195 | panel.grid.major = element_blank(), 196 | panel.grid.minor = element_blank(), 197 | panel.background = element_blank(), 198 | axis.line = element_line(colour = "black", size = 1)) 199 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 200 | geom_point(aes(colour = variable)) + 201 | scale_size_area(name = "value", max_size=12) + 202 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 203 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 204 | guides(fill = FALSE, color = FALSE) + 205 | theme(legend.position = "bottom", 206 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 207 | axis.text.y = element_text(size = 12, face = "bold.italic"), 208 | axis.ticks = element_blank(), 209 | axis.title = element_blank(), 210 | title = element_text(face = "bold.italic", size = 14), 211 | panel.grid.major = element_blank(), 212 | panel.grid.minor = element_blank(), 213 | panel.background = element_blank(), 214 | axis.line = element_line(colour = "black", size = 1)) 215 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 216 | geom_point(aes(colour = variable)) + 217 | scale_size_area(name = "Percentage", max_size=12) + 218 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 219 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 220 | guides(fill = FALSE, color = FALSE) + 221 | theme(legend.position = "bottom", 222 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 223 | axis.text.y = element_text(size = 12, face = "bold.italic"), 224 | axis.ticks = element_blank(), 225 | axis.title = element_blank(), 226 | title = element_text(face = "bold.italic", size = 14), 227 | panel.grid.major = element_blank(), 228 | panel.grid.minor = element_blank(), 229 | panel.background = element_blank(), 230 | axis.line = element_line(colour = "black", size = 1)) 231 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/dot plot.pdf", width = 6, height = 8, paper = "special") 232 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 233 | geom_point(aes(colour = variable)) + 234 | scale_size_area(name = "Percentage", max_size=12) + 235 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 236 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 237 | guides(fill = FALSE, color = FALSE) + 238 | theme(legend.position = "bottom", 239 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 240 | axis.text.y = element_text(size = 12, face = "bold.italic"), 241 | axis.ticks = element_blank(), 242 | axis.title = element_blank(), 243 | title = element_text(face = "bold.italic", size = 14), 244 | panel.grid.major = element_blank(), 245 | panel.grid.minor = element_blank(), 246 | panel.background = element_blank(), 247 | axis.line = element_line(colour = "black", size = 1)) 248 | dev.off() 249 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/dot plot.pdf", width = 7, height = 9, paper = "special") 250 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 251 | geom_point(aes(colour = variable)) + 252 | scale_size_area(name = "Percentage", max_size=12) + 253 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 254 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 255 | guides(fill = FALSE, color = FALSE) + 256 | theme(legend.position = "bottom", 257 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 258 | axis.text.y = element_text(size = 12, face = "bold.italic"), 259 | axis.ticks = element_blank(), 260 | axis.title = element_blank(), 261 | title = element_text(face = "bold.italic", size = 14), 262 | panel.grid.major = element_blank(), 263 | panel.grid.minor = element_blank(), 264 | panel.background = element_blank(), 265 | axis.line = element_line(colour = "black", size = 1)) 266 | dev.off() 267 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/dot plot.pdf", width = 8, height = 9, paper = "special") 268 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 269 | geom_point(aes(colour = variable)) + 270 | scale_size_area(name = "Percentage", max_size=12) + 271 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 272 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 273 | guides(fill = FALSE, color = FALSE) + 274 | theme(legend.position = "bottom", 275 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 276 | axis.text.y = element_text(size = 12, face = "bold.italic"), 277 | axis.ticks = element_blank(), 278 | axis.title = element_blank(), 279 | title = element_text(face = "bold.italic", size = 14), 280 | panel.grid.major = element_blank(), 281 | panel.grid.minor = element_blank(), 282 | panel.background = element_blank(), 283 | axis.line = element_line(colour = "black", size = 1)) 284 | dev.off() 285 | baron.expr <- read.csv("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/baron_et_al_pancreatic.csv", header = T, row.names = 1, stringsAsFactors = F) 286 | View(baron.meta) 287 | View(mca) 288 | View(mca.meta) 289 | # Background cells 290 | mca <- read.csv("~/Box/Morris Lab/Classifier Analysis/Reference datasets/MCA/MCA_CellAssignments.csv", 291 | row.names = 1, header = T, stringsAsFactors = F) 292 | View(mca) 293 | # Fetal bulk testing 294 | MCA.fetal.tissue <- c("Embryonic-Mesenchyme", "Embryonic-Stem-Cell", "Fetal_Brain", 295 | "Fetal_Intestine", "Fetal-Liver", "Fetal_Lung", "Fetal_Stomache", 296 | "Neonatal_Brain", "Neonatal-Calvaria", "Neonatal-Heart", "Neonatal-Muscle", 297 | "Neonatal-Rib", "Neonatal-Skin", "Trophoblast-Stem-Cell", "NeonatalPancreas") 298 | library(devtools) 299 | library(roxygen2) 300 | setwd("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Capybara/") 301 | document() 302 | install(".") 303 | library(Capybara) 304 | syste 305 | system.file(system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara")) 306 | system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara") 307 | read.csv(system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara")) 308 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) + 309 | geom_bar(stat = "identity") + 310 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) + 311 | scale_fill_manual( 312 | name = "Mapped MCA Tissue", 313 | values = c(RColorBrewer::brewer.pal(12, "Paired"), 314 | RColorBrewer::brewer.pal(8, "Set2"), 315 | RColorBrewer::brewer.pal(8, "Set3")) 316 | ) + 317 | labs(y = "Percentage of Cells") + 318 | ggtitle("Bulk Classification to MCA Tissues") + 319 | theme(legend.position = "none", 320 | axis.ticks = element_blank(), 321 | axis.title.x = element_blank(), 322 | axis.text = element_text(face = "bold.italic", size = 12), 323 | title = element_text(face = "bold.italic", size = 14), 324 | panel.grid.major = element_blank(), 325 | panel.grid.minor = element_blank(), 326 | panel.background = element_blank(), 327 | axis.line.x = element_blank(), 328 | axis.line.y = element_line(color = "black", size =1)) 329 | comp.raw <- count.in.cat 330 | comp.raw <- comp.raw[order(-comp.raw$perc), ] 331 | comp.raw$Var1 <- rownames(comp.raw) 332 | comp.raw$Var1 <- factor(comp.raw$Var1, comp.raw$Var1, ordered = T) 333 | comp.raw$label <- "Mouse Pancreatic Cells" 334 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) + 335 | geom_bar(stat = "identity") + 336 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) + 337 | scale_fill_manual( 338 | name = "Mapped MCA Tissue", 339 | values = c(RColorBrewer::brewer.pal(12, "Paired"), 340 | RColorBrewer::brewer.pal(8, "Set2"), 341 | RColorBrewer::brewer.pal(8, "Set3")) 342 | ) + 343 | labs(y = "Percentage of Cells") + 344 | ggtitle("Bulk Classification to MCA Tissues") + 345 | theme(legend.position = "none", 346 | axis.ticks = element_blank(), 347 | axis.title.x = element_blank(), 348 | axis.text = element_text(face = "bold.italic", size = 12), 349 | title = element_text(face = "bold.italic", size = 14), 350 | panel.grid.major = element_blank(), 351 | panel.grid.minor = element_blank(), 352 | panel.background = element_blank(), 353 | axis.line.x = element_blank(), 354 | axis.line.y = element_line(color = "black", size =1)) 355 | background.mca <- readRDS("~/Box/Morris Lab/Classifier Analysis/ARCHS4 Reference/MCA ARCHS4/06_qp_top_90_each_category_backgroung.Rds") 356 | background.mtx <- background.mca 357 | mtx.test <- qp.paga.rslt.sub[,colnames(background.mtx)] 358 | ### cor test 359 | ref.test <- t(background.mtx) 360 | mtx.test.cor <- t(qp.paga.rslt.sub) 361 | corr.mtx <- WGCNA::cor(ref.test, mtx.test.cor) 362 | correlation.cutoff <- quantile(corr.mtx, 0.90) 363 | new.corr.bin <- corr.mtx 364 | new.corr.bin[which(new.corr.bin >= correlation.cutoff)] <- 1 365 | new.corr.bin[which(new.corr.bin < correlation.cutoff)] <- 0 366 | new.corr.bin <- as.data.frame(new.corr.bin) 367 | new.corr.bin$cell.bc.ref <- rownames(new.corr.bin) 368 | new.corr.bin.melt <- reshape2::melt(new.corr.bin) 369 | new.corr.bin.melt.sub <- new.corr.bin.melt[which(new.corr.bin.melt$value > 0),] 370 | new.corr.bin.melt.sub$cell.type <- unlist(lapply(strsplit(new.corr.bin.melt.sub$cell.bc.ref, "_"), function(x) x[1])) 371 | count.in.cat <- c() 372 | unique.cat <- unique(unlist(lapply(strsplit(rownames(new.corr.bin), "_"), function(x) x[1]))) 373 | for (uc in unique.cat) { 374 | curr.subset <- new.corr.bin[which(startsWith(rownames(new.corr.bin), uc)), c(1:1886)] 375 | count.in.cat[uc] <- sum(colSums(curr.subset) >= nrow(curr.subset) * 0.7) 376 | } 377 | count.in.cat <- as.data.frame(count.in.cat) 378 | count.in.cat$perc <- round(count.in.cat$count.in.cat *100/sum(count.in.cat$count.in.cat), digits = 3) 379 | final.cell.types.adult <- rownames(count.in.cat)[which(count.in.cat$count.in.cat > 100)] 380 | comp.raw <- count.in.cat 381 | comp.raw <- comp.raw[order(-comp.raw$perc), ] 382 | comp.raw$Var1 <- rownames(comp.raw) 383 | comp.raw$Var1 <- factor(comp.raw$Var1, comp.raw$Var1, ordered = T) 384 | comp.raw$label <- "Mouse Pancreatic Cells" 385 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) + 386 | geom_bar(stat = "identity") + 387 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) + 388 | scale_fill_manual( 389 | name = "Mapped MCA Tissue", 390 | values = c(RColorBrewer::brewer.pal(12, "Paired"), 391 | RColorBrewer::brewer.pal(8, "Set2"), 392 | RColorBrewer::brewer.pal(8, "Set3")) 393 | ) + 394 | labs(y = "Percentage of Cells") + 395 | ggtitle("Bulk Classification to MCA Tissues") + 396 | theme(legend.position = "none", 397 | axis.ticks = element_blank(), 398 | axis.title.x = element_blank(), 399 | axis.text = element_text(face = "bold.italic", size = 12), 400 | title = element_text(face = "bold.italic", size = 14), 401 | panel.grid.major = element_blank(), 402 | panel.grid.minor = element_blank(), 403 | panel.background = element_blank(), 404 | axis.line.x = element_blank(), 405 | axis.line.y = element_line(color = "black", size =1)) 406 | View(comp.raw) 407 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/bulk_composition_pancreatic.pdf", width = 3, height = 8, paper = "special") 408 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) + 409 | geom_bar(stat = "identity") + 410 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) + 411 | scale_fill_manual( 412 | name = "Mapped MCA Tissue", 413 | values = c(RColorBrewer::brewer.pal(12, "Paired"), 414 | RColorBrewer::brewer.pal(8, "Set2"), 415 | RColorBrewer::brewer.pal(8, "Set3")) 416 | ) + 417 | labs(y = "Percentage of Cells") + 418 | ggtitle("Bulk Classification to \n MCA Tissues") + 419 | theme(legend.position = "none", 420 | axis.ticks = element_blank(), 421 | axis.title.x = element_blank(), 422 | axis.text = element_text(face = "bold.italic", size = 12), 423 | title = element_text(face = "bold.italic", size = 14), 424 | panel.grid.major = element_blank(), 425 | panel.grid.minor = element_blank(), 426 | panel.background = element_blank(), 427 | axis.line.x = element_blank(), 428 | axis.line.y = element_line(color = "black", size =1)) 429 | dev.off() 430 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/bulk_composition_pancreatic.pdf", width = 4, height = 8, paper = "special") 431 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) + 432 | geom_bar(stat = "identity") + 433 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) + 434 | scale_fill_manual( 435 | name = "Mapped MCA Tissue", 436 | values = c(RColorBrewer::brewer.pal(12, "Paired"), 437 | RColorBrewer::brewer.pal(8, "Set2"), 438 | RColorBrewer::brewer.pal(8, "Set3")) 439 | ) + 440 | labs(y = "Percentage of Cells") + 441 | ggtitle("Bulk Classification to \n MCA Tissues") + 442 | theme(legend.position = "none", 443 | axis.ticks = element_blank(), 444 | axis.title.x = element_blank(), 445 | axis.text = element_text(face = "bold.italic", size = 12), 446 | title = element_text(face = "bold.italic", size = 14), 447 | panel.grid.major = element_blank(), 448 | panel.grid.minor = element_blank(), 449 | panel.background = element_blank(), 450 | axis.line.x = element_blank(), 451 | axis.line.y = element_line(color = "black", size =1)) 452 | dev.off() 453 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) + 454 | geom_bar(stat = "identity") + 455 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) + 456 | scale_fill_manual( 457 | name = "Mapped MCA Tissue", 458 | values = c(RColorBrewer::brewer.pal(12, "Paired"), 459 | RColorBrewer::brewer.pal(8, "Set2"), 460 | RColorBrewer::brewer.pal(8, "Set3")) 461 | ) + 462 | labs(y = "Percentage of Cells") + 463 | ggtitle("Bulk Classification to \nMCA Tissues") + 464 | theme(legend.position = "none", 465 | axis.ticks = element_blank(), 466 | axis.title.x = element_blank(), 467 | axis.text = element_text(face = "bold.italic", size = 12), 468 | title = element_text(face = "bold.italic", size = 14), 469 | panel.grid.major = element_blank(), 470 | panel.grid.minor = element_blank(), 471 | panel.background = element_blank(), 472 | axis.line.x = element_blank(), 473 | axis.line.y = element_line(color = "black", size =1)) 474 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/bulk_composition_pancreatic.pdf", width = 4, height = 8, paper = "special") 475 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) + 476 | geom_bar(stat = "identity") + 477 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) + 478 | scale_fill_manual( 479 | name = "Mapped MCA Tissue", 480 | values = c(RColorBrewer::brewer.pal(12, "Paired"), 481 | RColorBrewer::brewer.pal(8, "Set2"), 482 | RColorBrewer::brewer.pal(8, "Set3")) 483 | ) + 484 | labs(y = "Percentage of Cells") + 485 | ggtitle("Bulk Classification to \nMCA Tissues") + 486 | theme(legend.position = "none", 487 | axis.ticks = element_blank(), 488 | axis.title.x = element_blank(), 489 | axis.text = element_text(face = "bold.italic", size = 12), 490 | title = element_text(face = "bold.italic", size = 14), 491 | panel.grid.major = element_blank(), 492 | panel.grid.minor = element_blank(), 493 | panel.background = element_blank(), 494 | axis.line.x = element_blank(), 495 | axis.line.y = element_line(color = "black", size =1)) 496 | dev.off() 497 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 498 | geom_point(aes(colour = variable)) + 499 | scale_size_area(name = "Percentage", max_size=12) + 500 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 501 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 502 | guides(fill = FALSE, color = FALSE) + 503 | theme(legend.position = "bottom", 504 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 505 | axis.text.y = element_text(size = 12, face = "bold.italic"), 506 | axis.ticks = element_blank(), 507 | axis.title = element_blank(), 508 | title = element_text(face = "bold.italic", size = 14), 509 | panel.grid.major = element_blank(), 510 | panel.grid.minor = element_blank(), 511 | panel.background = element_blank(), 512 | axis.line = element_line(colour = "black", size = 1)) 513 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | -------------------------------------------------------------------------------- /Capybara.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: Capybara 2 | Title: A tool to measure cell identity and fate transitions 3 | Version: 0.0.0.9000 4 | Authors@R: 5 | person(given = "Sam", 6 | family = "Morris", 7 | role = c("aut", "cre"), 8 | email = "s.morris@wustl.edu", 9 | comment = c(ORCID = "YOUR-ORCID-ID")) 10 | Description: What the package does (one paragraph). 11 | Depends: R (>= 3.5.0), 12 | pheatmap, 13 | viridis, 14 | tools, 15 | psych, 16 | MVA, 17 | data.table, 18 | pbmcapply, 19 | reshape, 20 | Matrix, 21 | tidyverse, 22 | snow, 23 | parallel, 24 | plyr, 25 | dplyr, 26 | ggplot2, 27 | quadprog, 28 | limma, 29 | WGCNA, 30 | glmnet 31 | License: What license it uses 32 | Encoding: UTF-8 33 | LazyData: true 34 | RoxygenNote: 7.1.2 35 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(binarization.mann.whitney) 4 | export(binary.to.classification) 5 | export(calc.scale.ratio) 6 | export(construct.high.res.reference) 7 | export(gene.intersect.sub) 8 | export(get.least.connected) 9 | export(get.mid.connected) 10 | export(get.most.connected) 11 | export(multi.id.curate.qp) 12 | export(normalize.dt) 13 | export(perc.calc.aux) 14 | export(percentage.calc) 15 | export(ref.construction) 16 | export(sample.func) 17 | export(sc.quad.prog.run) 18 | export(single.round.QP.analysis) 19 | export(top.genes) 20 | export(transition.score) 21 | -------------------------------------------------------------------------------- /R/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/R/.DS_Store -------------------------------------------------------------------------------- /R/Binarization.R: -------------------------------------------------------------------------------- 1 | #' Binarization and Identity Calling from Identity Score 2 | #' 3 | #' This function calls single identity or multiple identities of query cells from empirical p-values. Inferred cell types are marked by 1 in a binarized matrix. 4 | #' @param mtx The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types 5 | #' @param ref.perc.ls Emprical p-values for reference cells 6 | #' @param ref.meta The celltype meta information for reference cells 7 | #' @param perc.ls Emprical p-values for query cells 8 | #' @param bulk If the reference data type is bulk RNA-seq. The default is bulk = FALSE 9 | #' @param map.df bulk mapping. The default is bulk = FALSE 10 | #' @param init.class initial classification. The default is init.class = NULL 11 | #' @keywords binarization, identity calling 12 | #' @note 13 | #' @export 14 | #' @examples 15 | #' 16 | binarization.mann.whitney <- function(mtx, ref.perc.ls, ref.meta, perc.ls, bulk = FALSE, map.df = NULL, init.class = NULL) { 17 | p_vals <- matrix(nrow = nrow(mtx), ncol = (ncol(mtx) - 1)) 18 | type_rank_in_order_mtx <- c() 19 | which_type <- c() 20 | 21 | all.indx <- seq(1,length(perc.ls)) 22 | 23 | benchmark <- list() 24 | benchmark.final <- c() 25 | 26 | for (i in 1:length(ref.perc.ls)) { 27 | curr.cell <- names(ref.perc.ls[[i]]) 28 | perc_mtx <- as.data.frame(ref.perc.ls[[i]][[1]]) 29 | if (curr.cell %in% rownames(ref.meta)) { 30 | curr.ct <- ref.meta[curr.cell, "cell.type"] 31 | if (endsWith(curr.ct, " ")) curr.ct <- substr(curr.ct, start = 1, stop = (nchar(curr.ct) - 1)) 32 | curr.cell.cell.type <- gsub(" ", ".", curr.ct) 33 | curr.cell.cell.type <- gsub("−", ".", curr.cell.cell.type) 34 | curr.cell.cell.type <- gsub("-", ".", curr.cell.cell.type) 35 | curr.cell.cell.type <- gsub("/", ".", curr.cell.cell.type) 36 | curr.cell.cell.type <- gsub("&", ".", curr.cell.cell.type) 37 | curr.cell.cell.type <- gsub("\\(", ".", curr.cell.cell.type) 38 | curr.cell.cell.type <- gsub("\\)", ".", curr.cell.cell.type) 39 | if (!bulk) { 40 | curr.bm <- paste0("frxn_cell.type_", curr.cell.cell.type) 41 | } else { 42 | if (is.null(map.df)) { 43 | stop("No bulk mapping found!") 44 | } 45 | curr.bm <- map.df[which(map.df$tissue == curr.ct), "corresponding"] 46 | } 47 | curr.celltype.perm <- perc_mtx[, curr.bm] 48 | if (length(benchmark) <= 0) { 49 | benchmark[[curr.bm]] <- curr.celltype.perm 50 | } else { 51 | benchmark[[curr.bm]] <- c(benchmark[[curr.bm]], curr.celltype.perm) 52 | } 53 | } 54 | } 55 | 56 | for (i in 1:length(benchmark)) { 57 | curr.name <- names(benchmark)[i] 58 | curr.benchmark <- benchmark[[i]] 59 | curr.benchmark <- curr.benchmark[which(curr.benchmark < 1)] 60 | curr.outliers <- boxplot.stats(curr.benchmark)$out 61 | higher.end.outliers <- curr.outliers[which(curr.outliers > getmode(curr.benchmark))] 62 | if (length(higher.end.outliers) > 0) benchmark[[i]] <- curr.benchmark[which(curr.benchmark <= min(higher.end.outliers))] 63 | benchmark.final[curr.name] <- max(benchmark[[i]][which(benchmark[[i]] < 1)]) 64 | } 65 | 66 | if (!("Unknown" %in% names(table(init.class$init.class)))) {init.class = NULL} 67 | 68 | for (i in all.indx) { 69 | perc_mtx<-as.matrix((perc.ls[[i]][[1]])) 70 | vec<-as.vector(perc_mtx) 71 | rank<-rank(vec,ties.method = "average") 72 | type_rank<-as.matrix(t(colSums(matrix(rank, nrow=50)))) 73 | colnames(type_rank)<-colnames(perc_mtx) 74 | type_rank_in_order<-colnames(type_rank)[order(type_rank,decreasing = F)] 75 | type_rank_in_order_mtx<-rbind(type_rank_in_order_mtx,type_rank_in_order) 76 | 77 | if (!is.null(init.class)) { 78 | if(init.class[names(perc.ls[[i]]), "init.class"] == "Unknown") { 79 | which_type <- c(which_type, 0) 80 | } else { 81 | if (init.class[names(perc.ls[[i]]), "init.class"] == "Unknown.Progenitor") { 82 | which_type <- c(which_type, -1) 83 | } else { 84 | for (j in 1:(length(colnames(type_rank))-1)) { 85 | 86 | a<-type_rank_in_order[1] 87 | b<-type_rank_in_order[j+1] 88 | 89 | test<-wilcox.test(x=perc_mtx[,a],y=perc_mtx[,b],alternative = "less") 90 | p_vals[i,j]<-test$p.value 91 | if(test$p.value<.05 & (init.class[names(perc.ls[[i]]), "init.class"] == "Single-ID")){ 92 | which_type<-c(which_type,j) 93 | break 94 | } else { 95 | if (j == (ncol(type_rank_in_order_mtx) - 1) | (init.class[names(perc.ls[[i]]), "init.class"] == "Multi-ID")) { 96 | which_type <- c(which_type, j+1) 97 | } 98 | } 99 | } 100 | } 101 | } 102 | } else { 103 | if(mean(perc_mtx[,type_rank_in_order[1]]) >= (benchmark.final[type_rank_in_order[1]] + 0.01)) { 104 | which_type <- c(which_type, 0) 105 | } else { 106 | for (j in 1:(length(colnames(type_rank))-1)) { 107 | 108 | a<-type_rank_in_order[1] 109 | b<-type_rank_in_order[j+1] 110 | 111 | test<-wilcox.test(x=perc_mtx[,a],y=perc_mtx[,b],alternative = "less") 112 | p_vals[i,j]<-test$p.value 113 | if(test$p.value<.05){ 114 | which_type<-c(which_type,j) 115 | break 116 | } else { 117 | if (j == (ncol(type_rank_in_order_mtx) - 1)) { 118 | which_type <- c(which_type, j+1) 119 | } 120 | } 121 | } 122 | } 123 | } 124 | } 125 | 126 | type_rank_in_order_mtx <- as.matrix(type_rank_in_order_mtx) 127 | binary.mtx<-matrix(0,nrow = nrow(mtx), ncol = ncol(mtx)) 128 | colnames(binary.mtx)<-colnames(mtx) 129 | cellnames<-c() 130 | for (i in 1:length(which_type)) { 131 | cellnames <- c(cellnames, names(perc.ls[[i]])) 132 | if (which_type[i] > 0) { 133 | for (j in 1:which_type[i]) { 134 | binary.mtx[i,type_rank_in_order_mtx[i,j]] <- 1 135 | } 136 | } 137 | if (which_type[i] < 0) { 138 | for (j in 1:which_type[i]) { 139 | binary.mtx[i,type_rank_in_order_mtx[i,j]] <- -1 140 | } 141 | } 142 | } 143 | rownames(binary.mtx)<-cellnames 144 | 145 | return(binary.mtx) 146 | } 147 | -------------------------------------------------------------------------------- /R/Classification.R: -------------------------------------------------------------------------------- 1 | #' Generate classification results from the binarized result matrix 2 | #' 3 | #' This function genrate classification table from the binarized result matrix 4 | #' @param bin.count.rslt The binarized classification result matrix 5 | #' @keywords calssification, result 6 | #' @note 7 | #' @export 8 | #' @examples 9 | #' 10 | binary.to.classification <- function(bin.count.rslt) { 11 | class.rslt <- data.frame() 12 | bin.count.rsums <- rowSums(bin.count.rslt) 13 | for (i in 1:nrow(bin.count.rslt)) { 14 | curr.cell.bc <- rownames(bin.count.rslt)[i] 15 | if (bin.count.rsums[i] > 1) { 16 | curr.df <- data.frame(barcode = curr.cell.bc, call = "Multi_ID", stringsAsFactors = F) 17 | } else { 18 | if (bin.count.rsums[i] == 0) { 19 | curr.df <- data.frame(barcode = curr.cell.bc, call = "Unknown", stringsAsFactors = F) 20 | } 21 | else { 22 | if (bin.count.rsums[i] < 0) { 23 | curr.df <- data.frame(barcode = curr.cell.bc, call = "Unknown.Progenitor", stringsAsFactors = F) 24 | } else { 25 | idx <- which(bin.count.rslt[i,] == 1) 26 | identity <- unlist(lapply(strsplit(colnames(bin.count.rslt), "frxn_cell.type_"), function(x) x[length(x)]))[idx] 27 | curr.df <- data.frame(barcode = curr.cell.bc, call = identity[length(identity)], stringsAsFactors = F) 28 | } 29 | } 30 | } 31 | if (nrow(class.rslt) <= 0) { 32 | class.rslt <- curr.df 33 | } else { 34 | class.rslt <- rbind(class.rslt, curr.df) 35 | } 36 | } 37 | return(class.rslt) 38 | } 39 | -------------------------------------------------------------------------------- /R/EmpiricalPValueCalculation.R: -------------------------------------------------------------------------------- 1 | #' Empirical p-value of Identity Score Calculation 2 | #' 3 | #' This function resamples from a sample dataset and returns an empirical p-value of identity score 4 | #' @param dens.x The background dataset that we compare the identity score of our sample with 5 | #' @param curr.val The identity score of the sample dataset to be evaluated its significance 6 | #' @param n The number of times of resampling. The default is n=1000 7 | #' @keywords resampling, empirical p-value 8 | #' @note 9 | #' @export 10 | #' @examples 11 | #' 12 | sample.func <- function(dens.x, curr.val, prob, n = 1000) { 13 | samp <- sample(1:length(dens.x), n, replace = T, prob = prob) 14 | samp.val <- dens.x[samp] 15 | perc <- sum(samp.val > curr.val)/n 16 | return(perc) 17 | } 18 | 19 | #' Emprical p-values for query cells 20 | #' 21 | #' This function returns a list of emprical p-value matrices. Every matrix in the list contains emprical p-values of all possilbe celltypes for a single query cells under test.times resampled backgrounds. 22 | #' @param mtx The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types 23 | #' @param bkgd.mtx The matrix of identity score of reference cells. The row number is the total number of reference cells and the column number is the number of total number of possible cell types 24 | #' @keywords resampling, empirical p-value, all query cells 25 | #' @note 26 | #' @export 27 | #' @examples 28 | #' 29 | percentage.calc <- function(mtx, bkgd.mtx) { 30 | cnms <- colnames(mtx) 31 | cell.names <- rownames(mtx) 32 | mtx.cell.num <- nrow(mtx) 33 | mtx.ct.num <- ncol(mtx) 34 | test.times <- 50 35 | 36 | perc.ls <- pbmclapply(as.list(seq_len(mtx.cell.num)), perc.calc.aux, cell.names = cell.names, mtx.ct.num = mtx.ct.num, 37 | background.mtx = bkgd.mtx, mtx = mtx, cnms = cnms, mc.cores = 4) 38 | 39 | return(perc.ls) 40 | } 41 | 42 | #' Emprical p-values for a query cell 43 | #' 44 | #' This function returns emprical p-values of all possilbe celltypes for a query cells under test.times resampled backgrounds. 45 | #' @param l The index of the sample cell of query 46 | #' @param cell.names A vector of sample cells' names 47 | #' @param mtx.ct.num The number of all possible cell types 48 | #' @param background.mtx The matrix of identity score of reference cells. The row number is the total number of reference cells and the column number is the number of total number of possible cell types 49 | #' @param mtx The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types 50 | #' @param cnms The names of all possible cell types 51 | #' @param test.times The number of times resampling the background. The default is test.times=50 52 | #' @keywords resampling, empirical p-value, single query 53 | #' @note 54 | #' @export 55 | #' @examples 56 | #' 57 | perc.calc.aux <- function(l, cell.names, mtx.ct.num, background.mtx, mtx, cnms, test.times = 50) { 58 | curr.cell <- cell.names[l] 59 | perc.vec <- lapply(seq(1, mtx.ct.num), 60 | function(x) { 61 | dens <- density(background.mtx[,x], n = nrow(mtx)) 62 | curr.num <- mtx[l, x] 63 | if (curr.num <= .Machine$double.eps * 2) { 64 | pc.v <- rep(1, test.times) 65 | } else { 66 | pc.v <- apply(as.data.frame(seq_len(test.times)), 1, function(x) sample.func(dens.x = dens$x, curr.val = curr.num, prob = dens$y, n = 1000)) 67 | } 68 | return(list(cnms[x], pc.v)) 69 | }) 70 | comb.perc <- rbindlist(perc.vec) 71 | comb.perc$label <- rep(seq(1,test.times), length(unique(comb.perc$V1))) 72 | 73 | comb.df <- dcast(comb.perc, label~V1, value.var = "V2") 74 | comb.df <- comb.df[, -c(1)] 75 | curr.list <- list() 76 | curr.list[[curr.cell]] <- comb.df 77 | return(curr.list) 78 | } 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /R/Helper.R: -------------------------------------------------------------------------------- 1 | #' Get the Most Connected Cells 2 | #' 3 | #' This function returns index of cells that are the most connected within a cell type 4 | #' @param mtx The normalized reference count matrix 5 | #' @param n.sample The number of reference cells to be included within each pseudo-bulk 6 | #' @export 7 | #' 8 | get.most.connected <- function(mtx, n.sample) { 9 | corr.mtx <- WGCNA::cor(mtx) 10 | corr.mtx.upper <- corr.mtx * upper.tri(corr.mtx) 11 | corr.mtx.melt <- melt(corr.mtx.upper) 12 | corr.mtx.melt.pos <- corr.mtx.melt[which(corr.mtx.melt$value > 0), ] 13 | corr.mtx.melt.pos.sort <- corr.mtx.melt.pos[order(-corr.mtx.melt.pos$value), ] 14 | corr.mtx.melt.pos.sort$X1 <- as.character(corr.mtx.melt.pos.sort$X1) 15 | corr.mtx.melt.pos.sort$X2 <- as.character(corr.mtx.melt.pos.sort$X2) 16 | 17 | sample.list <- c() 18 | count.line <- 1 19 | while(length(sample.list) < n.sample) { 20 | sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort$X1[count.line], corr.mtx.melt.pos.sort$X2[count.line])))) 21 | count.line <- count.line + 1 22 | } 23 | 24 | return(sample.list[1:n.sample]) 25 | } 26 | 27 | 28 | #' Get the Least Connected Cells 29 | #' 30 | #' This function returns index of cells that are the least connected within a cell type 31 | #' @param mtx The normalized reference count matrix 32 | #' @param n.sample The number of reference cells to be included within each pseudo-bulk 33 | #' @export 34 | #' 35 | get.least.connected <- function(mtx, n.sample) { 36 | corr.mtx <- WGCNA::cor(mtx) 37 | corr.mtx.upper <- corr.mtx * upper.tri(corr.mtx) 38 | corr.mtx.melt <- melt(corr.mtx.upper) 39 | corr.mtx.melt.pos <- corr.mtx.melt[which(corr.mtx.melt$value > 0), ] 40 | corr.mtx.melt.pos.sort <- corr.mtx.melt.pos[order(corr.mtx.melt.pos$value), ] 41 | corr.mtx.melt.pos.sort$X1 <- as.character(corr.mtx.melt.pos.sort$X1) 42 | corr.mtx.melt.pos.sort$X2 <- as.character(corr.mtx.melt.pos.sort$X2) 43 | 44 | sample.list <- c() 45 | count.line <- 1 46 | while(length(sample.list) < n.sample) { 47 | sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort$X1[count.line], corr.mtx.melt.pos.sort$X2[count.line])))) 48 | count.line <- count.line + 1 49 | } 50 | 51 | return(sample.list[1:n.sample]) 52 | } 53 | 54 | #' Get the Medium Connected Cells 55 | #' 56 | #' This function returns index of cells that are the most connected within a cell type 57 | #' @param mtx The normalized reference count matrix 58 | #' @param n.sample The number of reference cells to be included within each pseudo-bulk 59 | #' @export 60 | #' 61 | get.mid.connected <- function(mtx, n.sample) { 62 | corr.mtx <- WGCNA::cor(mtx) 63 | corr.mtx.upper <- corr.mtx * upper.tri(corr.mtx) 64 | corr.mtx.melt <- melt(corr.mtx.upper) 65 | corr.mtx.melt.pos <- corr.mtx.melt[which(corr.mtx.melt$value > 0), ] 66 | corr.mtx.melt.pos.sort <- corr.mtx.melt.pos[order(corr.mtx.melt.pos$value), ] 67 | corr.mtx.melt.pos.sort$X1 <- as.character(corr.mtx.melt.pos.sort$X1) 68 | corr.mtx.melt.pos.sort$X2 <- as.character(corr.mtx.melt.pos.sort$X2) 69 | 70 | corr.mtx.melt.pos.sort.most <- corr.mtx.melt.pos[order(-corr.mtx.melt.pos$value), ] 71 | corr.mtx.melt.pos.sort.most$X1 <- as.character(corr.mtx.melt.pos.sort.most$X1) 72 | corr.mtx.melt.pos.sort.most$X2 <- as.character(corr.mtx.melt.pos.sort.most$X2) 73 | 74 | sample.list <- c() 75 | count.line <- 1 76 | while(length(sample.list) < n.sample/2) { 77 | sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort$X1[count.line], corr.mtx.melt.pos.sort$X2[count.line])))) 78 | count.line <- count.line + 1 79 | } 80 | count.line <- 1 81 | while(length(sample.list) < n.sample) { 82 | sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort.most$X1[count.line], corr.mtx.melt.pos.sort.most$X2[count.line])))) 83 | count.line <- count.line + 1 84 | } 85 | 86 | return(sample.list[1:n.sample]) 87 | } 88 | 89 | 90 | # Create the function. 91 | getmode <- function(v) { 92 | uniqv <- unique(v) 93 | uniqv[which.max(tabulate(match(v, uniqv)))] 94 | } 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /R/HighResolutionReferenceConstruction.R: -------------------------------------------------------------------------------- 1 | #' Systematic construction of a high-resolution reference 2 | #' 3 | #' Create a pseudo-bulk reference by sampling 90-cells from each cell type to maintain cellular resolution while increasing transcriptional resolution 4 | #' @param ref.mtx The single-cell reference dataset 5 | #' @param coldata.df The metadata (cell type information) for cells in the high-resolution reference 6 | #' @param cell.num.for.ref The number of cell numbers used to build the reference for each cell type. The default is cell.num.for.ref = 90 7 | #' @keywords pseudo-bulk reference 8 | #' @note 9 | #' @export 10 | #' @examples 11 | #' 12 | construct.high.res.reference <- function(ref.mtx, coldata.df, criteria,cell.num.for.ref = 90) { 13 | 14 | ct.freq <- as.data.frame(table(coldata.df[,criteria]), stringsAsFactors = F) 15 | ref.meta <- data.frame() 16 | ref.sc <- data.frame() 17 | mca.counts.all.involved.sub<-ref.mtx 18 | for (j in 1:nrow(ct.freq)) { 19 | curr.ct.at.test <- as.character(ct.freq[j,1]) 20 | curr.cell.involve <- rownames(coldata.df)[which(coldata.df[,criteria] == curr.ct.at.test)] 21 | curr.cell.index <- which(colnames(mca.counts.all.involved.sub) %in% curr.cell.involve) 22 | curr.mtx.sub <- as.matrix(mca.counts.all.involved.sub[, curr.cell.index]) 23 | if (ct.freq$Freq[j] >= cell.num.for.ref) { 24 | sample.ref.cell <- c(get.most.connected(log1p(normalize.dt(curr.mtx.sub)), cell.num.for.ref/2), 25 | get.least.connected(log1p(normalize.dt(curr.mtx.sub)), cell.num.for.ref/2)) 26 | } else { 27 | sample.ref.cell <- sample(curr.cell.involve, size = cell.num.for.ref, replace = T) 28 | } 29 | new.bc <- paste0("Cell_", seq(((j-1) * cell.num.for.ref + 1), j * cell.num.for.ref)) 30 | curr.meta <- data.frame(row.names = new.bc, cell.type = curr.ct.at.test, cell.bc = sample.ref.cell, stringsAsFactors = F) 31 | curr.sc <- curr.mtx.sub[, sample.ref.cell] 32 | colnames(curr.sc) <- new.bc 33 | if (ncol(ref.sc) <= 0) { 34 | ref.meta <- curr.meta 35 | ref.sc <- curr.sc 36 | } else { 37 | ref.meta <- rbind(ref.meta, curr.meta) 38 | ref.sc <- cbind(ref.sc, curr.sc) 39 | } 40 | } 41 | ref.df <- ref.construction(ref.sc, ref.meta, "cell.type") 42 | return(list(ref.sc, ref.meta, ref.df)) 43 | } 44 | 45 | #' Reference Construction 46 | #' 47 | #' This function constructs reference from single-cell resolution reference data to be used for quadratic programming calculation 48 | #' @param sc The single-cell resolution dataset 49 | #' @param sc.aux The auxiliary data frame that annotate the single-cell resolutiond dataset 50 | #' @param criteria The column name to use for construction of the reference 51 | #' @export 52 | #' @examples 53 | #' ref.construction(single.ref.mtx, single.aux.df, "cell.type") 54 | ref.construction <- function(sc, sc.aux, criteria) { 55 | criteria.col <- sc.aux[, criteria] 56 | ref.df <- data.frame() 57 | uniq.crit <- unique(criteria.col) 58 | 59 | for (i in 1:length(uniq.crit)) { 60 | curr.crit <- uniq.crit[i] 61 | sc.bc <- colnames(sc)[which(sc.aux[, criteria] == curr.crit)] 62 | curr.sc.sub <- sc[, sc.bc] 63 | curr.df <- as.data.frame(rowSums(curr.sc.sub)) 64 | if (ncol(ref.df) <= 0) { 65 | ref.df <- curr.df 66 | } else { 67 | ref.df <- cbind(ref.df, curr.df) 68 | } 69 | colnames(ref.df)[i] <- paste0(criteria, "_", curr.crit) 70 | } 71 | return(ref.df) 72 | } 73 | 74 | -------------------------------------------------------------------------------- /R/MultiIDFilterQP.R: -------------------------------------------------------------------------------- 1 | #' Multi-ID Score-Based Filter 2 | #' 3 | #' This function filters the multiple identity cells based on their QP scores, where we assume that a low QP score (less than 10E-3) are not a true identity to consider 4 | #' @param binary.counts The binary count matrix, which is the output from binarization with Mann Whitney. 5 | #' @param classification The classification result, which is the output from binary to classification. 6 | #' @param qp.matrix The matrix that contains QP scores calculated for the sample cells 7 | #' @param qp.threshold The threshold to cut off for the QP scores in the multiple identity listed cells 8 | #' @return A list contain 2 elements, the first is the curated and filtered multiple identity data frame and the second is the new classification data frame. 9 | #' @keywords Multiple identities 10 | #' @export 11 | #' @examples 12 | #' multi.id.curate.qp(multi.id.meta) 13 | multi.id.curate.qp <- function(binary.counts, classification, qp.matrix, qp.threshold = 10^-3) { 14 | 15 | bin.count.std.multi <- as.data.frame(binary.counts[classification$barcode[which(classification$call == "Multi_ID")], ]) 16 | bin.count.std.multi$cell.bc <- rownames(bin.count.std.multi) 17 | 18 | bin.count.std.multi.melt <- reshape2::melt(bin.count.std.multi) 19 | bin.count.std.multi.melt <- bin.count.std.multi.melt[which(bin.count.std.multi.melt$value > 0), ] 20 | 21 | classification.new <- classification 22 | 23 | qp.column <- c() 24 | for (i in 1:nrow(bin.count.std.multi.melt)) { 25 | curr.cell <- as.character(bin.count.std.multi.melt$cell.bc[i]) 26 | curr.cell.type <- as.character(bin.count.std.multi.melt$variable[i]) 27 | qp.column <- c(qp.column, qp.matrix[curr.cell, curr.cell.type]) 28 | } 29 | bin.count.std.multi.melt$qp.score <- qp.column 30 | bin.count.std.multi.melt.remain <- bin.count.std.multi.melt[which(bin.count.std.multi.melt$qp.score >= qp.threshold), ] 31 | 32 | cell.occurrence <- as.data.frame(table(bin.count.std.multi.melt.remain$cell.bc)) 33 | single.id.cell <- as.character(cell.occurrence$Var1[which(cell.occurrence$Freq == 1)]) 34 | 35 | actual.multi <- bin.count.std.multi.melt.remain 36 | if (length(single.id.cell) > 0) { 37 | single.id.cell.bin <- bin.count.std.multi.melt.remain[which(bin.count.std.multi.melt.remain$cell.bc %in% single.id.cell), ] 38 | rownames(single.id.cell.bin) <- single.id.cell.bin$cell.bc 39 | classification.new[rownames(single.id.cell.bin), "call"] <- unlist(lapply(strsplit(as.character(single.id.cell.bin$variable), "frxn_cell.type_"), function(x) x[2])) 40 | actual.multi <- bin.count.std.multi.melt.remain[-which(bin.count.std.multi.melt.remain$cell.bc %in% single.id.cell), ] 41 | } 42 | 43 | return(list(actual.multi, classification.new)) 44 | } 45 | 46 | -------------------------------------------------------------------------------- /R/SingleCellQuadraticProgramming.R: -------------------------------------------------------------------------------- 1 | #' Single Round of Quadratic Programming 2 | #' 3 | #' This function runs preprocessing, including log normalization, gene intersection and scaling. Further, run one round of quadratic programming. See function sc.quad.prog.run for detailed description 4 | #' @param ref The reference transcriptome taht contains the transcriptome of each potential cell type 5 | #' @param sc.data the transcriptome profile of cells from single-cell RNA-sequencing 6 | #' @param scale.bulk.sc either scale or non-scale. Scaling is recommended to make the reference comparable to the single-cell. Default to scale. 7 | #' @param force.eq either 0 or 1. Setting to 0 assumes the 1st constraint as inequality. Setting to 1 assumes equality. Default to 0 8 | #' @param unix.par boolean value, either TRUE or FALSE. If using unix/linux based systems, this command can be set to TRUE to parallelize use parallel package. Default to FALSE 9 | #' @param windows.par boolean value, either TRUE or FALSE. If using Windows based systems, this command can be set to TRUE to parallelize use snow package. Default to FALSE 10 | #' @param n.cores the number of cores to use for parallel processes. If no parallelization selected, no parallelization will be implemented. Only 1 core will be used 11 | #' @param save.to.path which directory would you like to save your file? 12 | #' @param save.to.filename prefix to the filename to save to. The final filename will be constructed inside of the function by tagging the following string _scale.csv for scaling and _non_scale.csv for not scaling. 13 | #' @param bulk.norm boolean value, either TRUE or FALSE. Would you like to normalize the reference? 14 | #' @param norm.sc boolean value, either TRUE or FALSE. Would you like to normalize the single-cell dataset? 15 | #' @param log.bulk boolean value, either TRUE or FALSE. Would you like to log the reference? 16 | #' @param log.sc boolean value, either TRUE or FALSE. Would you like to log the single-cell dataset? 17 | #' @keywords quadratic programming, scRNA-seq, 18 | #' @note This function calls the quadratic programming referenced to Treutlein et. al. 19 | #' @export 20 | #' @examples 21 | #' single.round.QP.analysis(ref = ref.transcriptome, sc.data = sc.transcriptome, force.eq = 1, save.to.path = "~/Desktop/", save.to.filename = "my_favorite") 22 | single.round.QP.analysis <- function(ref, sc.data, scale.bulk.sc = "scale", unix.par = FALSE, windows.par = FALSE, force.eq = 0, 23 | n.cores = 1, save.to.path, save.to.filename, bulk.norm = T, norm.sc = T, log.bulk = T, log.sc = T) { 24 | norm.bulk <- as.matrix(ref) 25 | norm.sc.mtx <- as.matrix(sc.data) 26 | 27 | # Normalized the bulk and single-cell data 28 | if (bulk.norm) norm.bulk <- normalize.dt(ref) 29 | if (norm.sc) norm.sc.mtx <- normalize.dt(sc.data) 30 | 31 | # Calculate the scaling ratio 32 | scale.ratio.sc <- calc.scale.ratio(ref, sc.data) 33 | scale.norm.sc <- norm.bulk/scale.ratio.sc 34 | 35 | # Calculate gene intersections 36 | norm.ls.sc <- gene.intersect.sub(norm.bulk, norm.sc.mtx) 37 | scale.ls.sc <- gene.intersect.sub(scale.norm.sc, norm.sc.mtx) 38 | 39 | log.norm.ls.sc <- norm.ls.sc 40 | log.scale.ls.sc <- scale.ls.sc 41 | # Finish the log-normalization 42 | if (log.bulk & log.sc) { 43 | log.norm.ls.sc <- lapply(norm.ls.sc, log1p) 44 | log.scale.ls.sc <- lapply(scale.ls.sc, log1p) 45 | } else { 46 | if (log.bulk) { 47 | log.norm.ls.sc[[1]] <- log1p(log.norm.ls.sc[[1]]) 48 | log.scale.ls.sc[[1]] <- log1p(log.scale.ls.sc[[1]]) 49 | } 50 | if (log.sc) { 51 | log.norm.ls.sc[[2]] <- log1p(log.norm.ls.sc[[2]]) 52 | log.scale.ls.sc[[2]] <- log1p(log.scale.ls.sc[[2]]) 53 | } 54 | } 55 | 56 | # Run QP and save to file 57 | if (scale.bulk.sc == "scale") { 58 | qp.rslt <- sc.quad.prog.run(as.matrix(log.scale.ls.sc[[1]]), 59 | single.cell.transcriptome = log.scale.ls.sc[[2]], 60 | unix.parallel = unix.par, windows.parallel = windows.par, 61 | parallel.cores = n.cores, force.eq = force.eq) 62 | write.csv(qp.rslt, paste0(save.to.path, save.to.filename, "_scale.csv"), quote = F, row.names = F) 63 | } else { 64 | if (scale.bulk.sc == "non-scale") { 65 | qp.rslt <- sc.quad.prog.run(as.matrix(log.norm.ls.sc[[1]]), 66 | single.cell.transcriptome = log.norm.ls.sc[[2]], 67 | unix.parallel = unix.par, windows.parallel = windows.par, 68 | parallel.cores = n.cores, force.eq = force.eq) 69 | write.csv(qp.rslt, paste0(save.to.path, save.to.filename, "_non_scale.csv"), quote = F, row.names = F) 70 | } else { 71 | qp.rslt.scl <- sc.quad.prog.run(as.matrix(log.scale.ls.sc[[1]]), 72 | single.cell.transcriptome = log.scale.ls.sc[[2]], 73 | unix.parallel = unix.par, windows.parallel = windows.par, 74 | parallel.cores = n.cores, force.eq = force.eq) 75 | qp.rslt.non.scl <- sc.quad.prog.run(as.matrix(log.norm.ls.sc[[1]]), 76 | single.cell.transcriptome = log.norm.ls.sc[[2]], 77 | unix.parallel = unix.par, windows.parallel = windows.par, 78 | parallel.cores = n.cores, force.eq = force.eq) 79 | write.csv(qp.rslt.scl, paste0(save.to.path, save.to.filename, "_scale.csv"), quote = F, row.names = F) 80 | write.csv(qp.rslt.non.scl, paste0(save.to.path, save.to.filename, "_non_scale.csv"), quote = F, row.names = F) 81 | } 82 | } 83 | } 84 | 85 | #' Single-Cell Quadratic Programming Calculation 86 | #' 87 | #' This function runs quadratic programming to identify the probability of the cells in single-cell RNA-seq belonging to cell types in the reference transcriptome 88 | #' @param bulk.transcriptome The reference transcriptome taht contains the transcriptome of each potential cell type 89 | #' @param single.cell.transcriptome the transcriptome profile of cells from single-cell RNA-sequencing 90 | #' @param force.eq either 0 or 1. Setting to 0 assumes the 1st constraint as inequality. Setting to 1 assumes equality. Default to 0 91 | #' @param unix.parallel boolean value, either TRUE or FALSE. If using unix/linux based systems, this command can be set to TRUE to parallelize use parallel package. Default to FALSE 92 | #' @param windows.parallel boolean value, either TRUE or FALSE. If using Windows based systems, this command can be set to TRUE to parallelize use snow package. Default to FALSE 93 | #' @param parallel.cores the number of cores to use for parallel processes. If no parallelization selected, no parallelization will be implemented. Only 1 core will be used 94 | #' @keywords quadratic programming, scRNA-seq 95 | #' @note Code reference from Treutlein et. al., Dissecting direct reprogramming from fibroblast to neuron using single-cell RNA-seq 96 | #' @export 97 | #' @examples 98 | #' sc.quad.prog.run(ref.transcriptome, sc.transcriptome, force.eq = 1) 99 | sc.quad.prog.run <- function(bulk.transcriptome, single.cell.transcriptome, force.eq = 0, 100 | unix.parallel = FALSE, windows.parallel = FALSE, parallel.cores = 4) { 101 | # Check if the sizes of the matrices are comfortable with each other 102 | if (nrow(bulk.transcriptome) != nrow(single.cell.transcriptome)) { 103 | print("The number of genes included in the bulk expression is not the same as the single cell transcriptome") 104 | return() 105 | } 106 | 107 | # Initialize a data frame to hold the fraction identity matrix 108 | identity.matx <- data.frame() 109 | # Initialize the cell types given 110 | given.cell.typs <- colnames(bulk.transcriptome) 111 | 112 | # If using unix/linux based system, can use this process to parallel the the program 113 | # Set unix.parallel parameter to be TRUE, but set windows.parallel to be FALSE 114 | if (unix.parallel) { 115 | identy.mx.ls <- mclapply(as.list(seq_len(ncol(single.cell.transcriptome))), quad.prog.calc, 116 | bulk.transcriptome = bulk.transcriptome, 117 | single.cell.transcriptome = single.cell.transcriptome, 118 | force.eq = force.eq, mc.cores = parallel.cores) 119 | identity.ls <- lapply(identy.mx.ls, function(identity) c(identity[[2]], identity[[1]]$solution, 120 | identity[[1]]$Lagrangian[1], identity[[3]])) 121 | 122 | identity.matx <- matrix(unlist(identity.ls), byrow = TRUE, 123 | nrow = length(identity.ls), ncol = length(identity.ls[[1]])) 124 | } else { 125 | # If using Windows based system, can use this process to parallel the the program 126 | # Set windows.parallel parameter to be TRUE, but set unix.parallel to be FALSE 127 | # NOTE: This might be slower than applying parallelization under Unix/Linux based system 128 | if (windows.parallel) { 129 | cl<-makeCluster(parallel.cores) 130 | identy.mx.ls <- clusterApply(cl, seq_len(ncol(single.cell.transcriptome)), quad.prog.calc, 131 | bulk.transcriptome = bulk.transcriptome, 132 | single.cell.transcriptome = single.cell.transcriptome, 133 | force.eq = force.eq) 134 | stopCluster(cl) 135 | identity.ls <- lapply(identy.mx.ls, function(identity) c(identity[[2]], identity[[1]]$solution, 136 | identity[[1]]$Lagrangian[1], identity[[3]])) 137 | 138 | identity.matx <- matrix(unlist(identity.ls), byrow = TRUE, 139 | nrow = length(identity.ls), ncol = length(identity.ls[[1]])) 140 | 141 | # If not using parallel for this case, just run the following process in a loop for serial 142 | # All parallel parameters option should be set to FALSE 143 | } else { 144 | for (i in 1:ncol(single.cell.transcriptome)){ 145 | identity<-c() 146 | quad.rslt <- quad.prog.calc(i, bulk.transcriptome, single.cell.transcriptome, force.eq) 147 | QP <- quad.rslt[[1]] 148 | Error <- quad.rslt[[3]] 149 | 150 | identity <- c(colnames(single.cell.transcriptome)[i], QP$solution, QP$Lagrangian[1],Error) 151 | 152 | if (nrow(identity.matx) == 0) { 153 | identity.matx <- as.data.frame(t(as.matrix(identity))) 154 | } else { 155 | identity.matx <- rbind(identity.matx, as.data.frame(t(as.matrix(identity)))) 156 | } 157 | } 158 | } 159 | } 160 | 161 | # Set the column names of the returning data frame 162 | col.frx.names <- paste("frxn_", given.cell.typs, sep = "") 163 | colnames(identity.matx)<-c("cell_name", col.frx.names,"Lagrangian","Error") 164 | return(identity.matx) 165 | } 166 | 167 | quad.prog.calc <- function(col.num, bulk.transcriptome, single.cell.transcriptome, force.eq) { 168 | library(quadprog) 169 | Y <- as.matrix(single.cell.transcriptome[, col.num]) 170 | Rinv <- solve(chol(t(bulk.transcriptome) %*% bulk.transcriptome)); 171 | C <- cbind(rep(-1, ncol(bulk.transcriptome)), diag(ncol(bulk.transcriptome))) 172 | n <- nrow(Rinv) 173 | q <- ncol(bulk.transcriptome) 174 | b <- c(-1, rep(0, q)) 175 | d <- t(Y) %*% bulk.transcriptome 176 | 177 | QP<-solve.QP(Dmat = Rinv, factorized = TRUE, dvec = d, bvec = b, Amat = C, meq = force.eq) 178 | Error<-sum(abs(Y- bulk.transcriptome %*% QP$solution)) 179 | 180 | return(list(QP, colnames(single.cell.transcriptome)[col.num], Error)) 181 | } 182 | 183 | -------------------------------------------------------------------------------- /R/SingleCellQuadraticProgrammingAux.R: -------------------------------------------------------------------------------- 1 | #' Top Variable Gene Identificaiton 2 | #' 3 | #' This function identifies top n variable genes using method described in KeyGenes algorithm. In brief, this algorithm use cross validation based on LASSO regression. For detailed explanation, please refer to the paper listed in the note. 4 | #' @param input.dir The path to where the dataset is saved. Dataset should be saved as a tab-delimited table with row = gene, column = cell. 5 | #' @param output.dir The path to where the output should be saved. The output will be saved as a tab-delimited table with NO row names or column names. 6 | #' @param top.number.count The number of top variable genes to extract. Default to 500 genes 7 | #' @keywords top variable gene extraction 8 | #' @note Code reference from Roost et. al., KeyGenes, a Tool to Probe Tissue Differentiation Using a Human Fetal Transcriptional Atlas, Cell Stem Cell Reports, 2015 9 | #' @export 10 | #' @examples 11 | #' top.genes("~/Desktop/sample_data.txt", "~/Desktop/sample_gene_list_output.txt", top.number.count = 1000) 12 | top.genes <- function(input.dir, output.dir, top.number.count = 500) { 13 | dataset <- input.dir 14 | top.dir <- output.dir 15 | 16 | DS <- read.table(dataset, sep="\t", header=TRUE, row.names=1, check.names=FALSE) 17 | X <- voom(as.matrix(DS), normalize.method="none")$E 18 | vars <- apply(X, 1, var) 19 | top <- sort.list(-vars)[1:top.number.count] 20 | X <- X[top,] 21 | 22 | topl <- as.data.frame(rownames(X)) 23 | colnames(topl) <- c(paste("Top", top.number.count, sep = "", collapse = "")) 24 | write.table(topl, top.dir, sep="\t", quote=F, row.names = FALSE, col.names = TRUE) 25 | return() 26 | } 27 | 28 | #' Normalization of Single-Cell RNA-Seq Data 29 | #' 30 | #' This function normalizes single-cell RNA-seq data using its raw counts. The normalization is performed to remove variation due to difference in read depth and coverage. 31 | #' @param dt.st The dataset to normalize 32 | #' @keywords normalization 33 | #' @export 34 | #' @examples 35 | #' normalize.dt(single.cell.mtx) 36 | normalize.dt <- function(dt.st) { 37 | # Calculate column sums 38 | csums <- Matrix::colSums(dt.st) 39 | # Calculate averages of the sums 40 | cavg <- mean(csums) 41 | # Calculate normalized bulk 42 | norm.dt.st <- dt.st 43 | for (i in 1:length(csums)) {norm.dt.st[,i] <- (norm.dt.st[,i]/csums[i]) * cavg} 44 | return(norm.dt.st) 45 | } 46 | 47 | #' Scale Ratio Calculation 48 | #' 49 | #' This function calculate the scale ratio between the single-cell transcriptome and reference dataset to minimize the Lagrangian multiplier. In the other word, reduce the restriction. 50 | #' @param bulk The reference dataset 51 | #' @param sc The single-cell dataset 52 | #' @keywords Scale ratio calculation 53 | #' @export 54 | #' @examples 55 | #' calc.scale.ratio(reference.dt, single.cell.mtx) 56 | calc.scale.ratio <- function(bulk, sc) { 57 | # Calculate column sums 58 | csums.bulk <- colSums(bulk) 59 | csums.sc <- Matrix::colSums(sc) 60 | # Calculate averages of the sums 61 | cavg.bulk <- mean(csums.bulk) 62 | cavg.sc <- mean(csums.sc) 63 | # Calculate scaling ratio for each tissue 64 | scale.ratio.sc <- cavg.bulk/cavg.sc 65 | return(scale.ratio.sc) 66 | } 67 | 68 | #' Find intersection genes 69 | #' 70 | #' This function identifies the intersection genes between reference and single-cell data 71 | #' @param bulk The reference dataset 72 | #' @param sc The single-cell dataset 73 | #' @export 74 | #' @examples 75 | #' gene.intersect.sub(reference.dt, single.cell.mtx) 76 | gene.intersect.sub <- function(bulk, sc) { 77 | # Find genes with all zero-expression within sc/bulk 78 | rsums.bulk <- Matrix::rowSums(bulk) 79 | rsums.sc <- Matrix::rowSums(sc) 80 | # Identify blank genes 81 | bulk.blank.genes <- which(rsums.bulk == 0) 82 | sc.blank.genes <- which(rsums.sc == 0) 83 | # Remove those genes 84 | norm.bulk.sub <- bulk 85 | norm.sc.sub <- sc 86 | if (length(bulk.blank.genes) > 0) {norm.bulk.sub <- bulk[-(bulk.blank.genes),]} 87 | if (length(sc.blank.genes) > 0) {norm.sc.sub <- sc[-(sc.blank.genes),]} 88 | # Find intersection genes 89 | genes.inter <- intersect(rownames(norm.bulk.sub), rownames(norm.sc.sub)) 90 | # Find the intersected sub dataset of bulk (normalized bulk) 91 | norm.bulk.sub.sub <- norm.bulk.sub[genes.inter, ] 92 | norm.sc.sub.sub <- norm.sc.sub[genes.inter, ] 93 | return(list(norm.bulk.sub.sub, norm.sc.sub.sub)) 94 | } 95 | 96 | 97 | -------------------------------------------------------------------------------- /R/TransitionScoreCalculation.R: -------------------------------------------------------------------------------- 1 | #' Transition Score Calculation 2 | #' 3 | #' This function calculates the transition scores for each cell state that is connected by cells with multiple identities. 4 | #' @param multi.id.meta A data frame that contains cells with multiple identities. Column 1 - cell barcode, Column 2 - cell type call, Column 3 - counts, Column 4 - corresponding QP scores. 5 | #' @return A data frame that contains the calculated transition scores for each identity 6 | #' @keywords Transition metric 7 | #' @export 8 | #' @examples 9 | #' transition.score(multi.id.meta) 10 | transition.score <- function(multi.id.meta) { 11 | 12 | unique.ct <- unique(as.character(multi.id.meta[,2])) 13 | ct.entropy <- data.frame() 14 | for (i in 1:length(unique.ct)) { 15 | curr.cell <- unique.ct[i] 16 | actual.sub <- multi.id.meta[which(as.character(multi.id.meta[,2]) == curr.cell), ] 17 | curr.entro <- sum(-actual.sub$qp.score*log(actual.sub$qp.score, base = max(2, nrow(actual.sub)))) 18 | curr.df <- data.frame(cell.ct = curr.cell, entropy = curr.entro, stringsAsFactors = F) 19 | if (nrow(ct.entropy) <= 0) { 20 | ct.entropy <- curr.df 21 | } else { 22 | ct.entropy <- rbind(ct.entropy, curr.df) 23 | } 24 | } 25 | 26 | rownames(ct.entropy) <- unlist(lapply(strsplit(ct.entropy$cell.ct, "frxn_cell.type_"), function(x) x[2])) 27 | 28 | return(ct.entropy) 29 | } 30 | 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # R Package - Capybara 2 | 3 | 4 | Capybara is a tool to measure cell identity and fate transitions. This approach is designed to measure cell identity as a continuum, at a single-cell resolution. Capybara enables classification of discrete identities as well as cells with multiple identities. This package has a dependency on R version (R >= 3.5.0). In addition to the following tutorial, we provide the notebooks for production of figures in the morris-lab/Capybara_reproducibility (https://github.com/morris-lab/Capybara_reproducibility) repository. There could exist some variations due to difference in threshold selections. For details regarding the methods, usage and application, please refer to the following paper: Kong et al., Cell Stem Cell, 2022 (https://www.sciencedirect.com/science/article/pii/S1934590922000996?dgcid=coauthor) 5 | 6 | ## Installation 7 | ### Dependencies 8 | 9 | Most dependencies can be installed along with Capybara through CRAN. The following dependencies may need to be installed manually through BioConductor (Instructions can also be found here: https://bioconductor.org/). 10 | 11 | Install BiocManager 12 | ```r 13 | install.packages("BiocManager") 14 | ``` 15 | Install Bioconductor dependency packages 16 | ```r 17 | BiocManager::install(c("limma","impute", "AnnotationDbi","GO.db", "preprocessCore")) 18 | ``` 19 | 20 | ### Install the package 21 | Install devtools 22 | ```r 23 | install.packages("devtools") 24 | ``` 25 | Install the package from GitHub. 26 | ```r 27 | library("devtools") 28 | devtools::install_github("morris-lab/Capybara") 29 | ``` 30 | Load the package 31 | ```r 32 | library("Capybara") 33 | ``` 34 | 35 | ## Step 1: Tissue-Level Classification 36 | ### Application of quadratic programming on reference and sample single-cell dataset using a bulk reference 37 | 38 | Bulk transcriptome profiles of all tissues are mined from ARCHS4, a platform that contains most published RNA-seq and ChiP-seq datasets (Lachmann et al., 2018). ARCHS4 obtains raw datasets from the Gene Expression Omnibus (GEO), realigned and processed through a uniform pipeline. We filtered to contain only poly-A and total RNA-seq data from C57BL/6 mice. With further filtering and preprocessing (details can be found in the method section of the paper), we landed with a reference of a total of 30 tissues. We provide our mined bulk references, including a matrix in raw counts and a matrix in reads per kilobase per million (RPKM), as a part of the Capybara package. Selection of your preferred normalization method can be applied to raw counts. Here, we will demonstrate the usage of the bulk raw counts in the pipeline. 39 | 40 | **1. Load the bulk reference** 41 | ```r 42 | # File path 43 | bulk.raw.path <- system.file("extdata", "Bulk Reference Raw.Rds", package = "Capybara") 44 | bulk.rpkm.path <- system.file("extdata", "Bulk Reference RPKM.Rds", package = "Capybara") 45 | # Read the matrices 46 | bulk.raw <- readRDS(bulk.raw.path) 47 | bulk.rpkm <- readRDS(bulk.rpkm.path) 48 | ``` 49 | 50 | With the bulk reference, we next load the single-cell reference, such as a cell atlas, and the single-cell sample to be used. The datasets to be used should be in a matrix form with each row representing a gene and each column representing a cell. Here, we use the Mouse Cell Atlas (MCA) as background and single-cell RNA-seq data of mouse pancreatic cells (Baron et al., 2016) as examples for demonstration. MCA can be obtained from https://figshare.com/articles/MCA_DGE_Data/5435866. We included the mouse pancreatic dataset in the package. 51 | 52 | **2.Load the single-cell sample dataset and the corresponding meta data** 53 | 54 | *Note: The meta data of this file contains 2 columns, where the first column represents cell.type and the second column represents barcode.* 55 | 56 | ```r 57 | # Read in the pancreatic data file that come with the package 58 | fpath <- system.file("extdata", "baron_dataset.zip", package = "Capybara") 59 | extract.dir <- "." 60 | # Extract the dataset 61 | unzip(fpath, overwrite = FALSE, exdir = ".") 62 | # Identify the full path 63 | full.fpath.meta <- paste0(extract.dir, "/", "baron_et_al_pancreatic_meta.csv") 64 | full.fpath.raw <- paste0(extract.dir, "/", "baron_et_al_pancreatic.csv") 65 | # Load the count matrix and the meta data 66 | baron.expr <- read.csv(full.fpath.raw, header = T, row.names = 1, stringsAsFactors = F) 67 | baron.meta <- read.csv(full.fpath.meta, header = T, row.names = 1, stringsAsFactors = F) 68 | ``` 69 | 70 | **3. Application of QP on the sample single-cell data** 71 | 72 | Notice: For Windows users, please set unix.par=F and n.cores=1 73 | 74 | ```r 75 | single.round.QP.analysis(bulk.raw, baron.expr, scale.bulk.sc = "scale", unix.par = TRUE, 76 | force.eq = 1, n.cores = 4, save.to.path = "./", 77 | save.to.filename = "baron_bulk_classification_qp") 78 | ``` 79 | 80 | **4. Load the single-cell reference meta data** 81 | 82 | *Note: The meta data of Mouse Cell Atlas contains 6 columns, including Cell.name, ClusterID, Tissue, Batch, Cell.Barcode, and Annotation. The annotation is what we used for high-resolution reference construction. We've included the version of meta data we used along with the package.* 83 | 84 | ```r 85 | # Read the meta data 86 | mca.meta.fpath <- system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara") 87 | mca <- read.csv(mca.meta.fpath, row.names = 1, header = T, stringsAsFactors = F) 88 | # Clean up the meta data 89 | mca.meta <- data.frame(row.names = mca$Cell.name, 90 | tissue = mca$Tissue, 91 | cell.type = mca$Annotation, 92 | stringsAsFactors = F) 93 | ``` 94 | 95 | **5. Load the single-cell reference atlas and apply QP tissue-by-tissue** 96 | 97 | *Due to the large size of MCA count data, we did* ***NOT*** *include the counts along with the package. We further separated MCA into fetal/neonatal/embryonic and adult categories. The counts data were organized in the following manner.* 98 | 99 | > Folder: MCA Counts 100 | 101 | >> Tissue_1 Folder 102 | 103 | >>> count.csv 104 | 105 | >> Tissue_2 Folder 106 | 107 | >>> count.csv 108 | 109 | >> ... 110 | 111 | ```r 112 | # List all possible files and tissues in the Mouse Cell Atlas 113 | file.ls <- list.files("./MCA_Counts/", full.names = T) 114 | base.nms <- basename(file.ls) 115 | 116 | # Identify the tissues 117 | unq.tissue <- unique(base.nms) 118 | 119 | # Set a path to save all QP files for all tissues 120 | general.path.to.save <- "./MCA_All_Tissue_QP/" 121 | for (k in 1:length(unq.tissue)) { 122 | curr.tissue <- unq.tissue[k] 123 | curr.filename <- paste0("0", k, "_", curr.tissue, "_Bulk_ARCHS4") 124 | 125 | file.base.name <- base.nms[which(startsWith(base.nms, curr.tissue))][1] 126 | file.full <- file.ls[which(startsWith(base.nms, curr.tissue))][1] 127 | 128 | print(curr.tissue) 129 | 130 | sc.data <- read.csv(paste0(file.full, "/count.csv"), header = T, row.names = 1, stringsAsFactors = F) 131 | 132 | if (all(is.na(sc.data))) { 133 | print("There is no data in this counting matrix!") 134 | } else { 135 | single.round.QP.analysis(bulk.raw, sc.data, scale.bulk.sc = "scale", unix.par = TRUE, 136 | force.eq = 1, n.cores = 4, save.to.path = general.path.to.save, 137 | save.to.filename = curr.filename) 138 | } 139 | } 140 | ``` 141 | 142 | **6. Selection of 90 cells from each tissue to construct a QP background** 143 | 144 | With all QP scores calculated on the bulk transcriptome profiles of all tissues, we select 90 most relevant cells of each tissue in the MCA (90 highest scored cells in the MCA to each bulk tissue) as a QP background. We use this QP background further to map our sample single-cell data. Assuming that each cell in each cell type of the MCA takes a unique combination of QP scores to each tissue in ARCHS4, cells in the sample that share similar combination to those in MCA are marked to relate to the corresponding tissue in the MCA. Here, we demonstrate how we constructed the backgrounds as following. We included the background matrices along with the packages such that it can be directly used for convenience. 145 | 146 | **(a) Get QP scores for all annotated cells** 147 | ```r 148 | # Read the QP files from the directory 149 | qp.files.to.read.clean <- list.files("./MCA_All_Tissue_QP/", full.names = T) 150 | 151 | full.qp.mtx.known.annotation <- data.frame() 152 | full.qp.mtx.unknown.annotation <- data.frame() 153 | for (i in 1:length(qp.files.to.read.clean)) { 154 | curr.file <- qp.files.to.read.clean[i] 155 | curr.qp.rslt <- read.csv(curr.file, header = T, row.names = 1, stringsAsFactors = F) 156 | 157 | cells.to.keep <- intersect(rownames(mca.meta), rownames(curr.qp.rslt)) 158 | cells.unlabel <- setdiff(rownames(curr.qp.rslt), cells.to.keep) 159 | 160 | curr.sub.mtx.to.keep <- curr.qp.rslt[cells.to.keep, ] 161 | curr.sub.mtx.unlabel <- curr.qp.rslt[cells.unlabel, ] 162 | 163 | if (nrow(full.qp.mtx.known.annotation) <= 0) { 164 | full.qp.mtx.known.annotation <- curr.sub.mtx.to.keep 165 | full.qp.mtx.unknown.annotation <- curr.sub.mtx.unlabel 166 | } else { 167 | full.qp.mtx.known.annotation <- rbind(full.qp.mtx.known.annotation, curr.sub.mtx.to.keep) 168 | full.qp.mtx.unknown.annotation <- rbind(full.qp.mtx.unknown.annotation, curr.sub.mtx.unlabel) 169 | } 170 | } 171 | 172 | full.qp.mtx.known.annotation.qp.score.only <- full.qp.mtx.known.annotation[,c(1:(ncol(full.qp.mtx.known.annotation) - 2))] 173 | ``` 174 | 175 | **(b) Selection of 90 cells** 176 | ```r 177 | # Create a map between MCA and ARCHS4 178 | map.df <- data.frame(mca.tissue = c("Embryonic-Mesenchyme", "Embryonic-Stem-Cell", "Trophoblast-Stem-Cell", "Fetal_Brain", 179 | "Neonatal-Calvaria","Fetal_Intestine", "Fetal-Liver", "Fetal_Lung", "Fetal_Stomache", 180 | "Neonatal-Heart", "Neonatal-Muscle", 181 | "Neonatal-Rib", "Neonatal-Skin", "NeonatalPancreas"), 182 | corresponding = c("frxn_embryo", "frxn_embryo", "frxn_embryo", "frxn_brain","frxn_brain", 183 | "frxn_small.intestine", "frxn_liver", 184 | "frxn_lung", "frxn_stomach", "frxn_heart", "frxn_muscle", "frxn_muscle", 185 | "frxn_skin", "frxn_pancreas"), 186 | stringsAsFactors = F) 187 | 188 | # Identify top 90 cells for each tissue 189 | tm.tissue <- unique(map.df$tm.tissue) 190 | cell.selector <- c() 191 | n.sample <- 90 192 | for (i in 1:length(tm.tissue)) { 193 | curr.tissue <- tm.tissue[i] 194 | cell.names <- rownames(mca.meta)[which(mca.meta$tissue == curr.tissue)] 195 | curr.qp.subset <- full.qp.mtx.known.annotation.qp.score.only[cell.names, ] 196 | curr.map <- map.df$corresponding[which(map.df$tm.tissue == curr.tissue)] 197 | if (length(curr.map) <= 1){ 198 | curr.qp.subset.sub <- data.frame(score = curr.qp.subset[,curr.map], cell.name = cell.names, stringsAsFactors = F) 199 | } else { 200 | curr.qp.subset.sub <- data.frame(score = rowSums(curr.qp.subset[,curr.map]), cell.name = cell.names, stringsAsFactors = F) 201 | } 202 | curr.qp.subset.sub.sort <- curr.qp.subset.sub[order(-curr.qp.subset.sub$score), ] 203 | cells.to.incl <- curr.qp.subset.sub.sort$cell.name[1:n.sample] 204 | 205 | cell.selector <- c(cell.selector, cells.to.incl) 206 | } 207 | saveRDS(full.qp.mtx.known.annotation.qp.score.only[cell.selector, ], "./MCA_embryonic_background.RDS") 208 | ``` 209 | 210 | *Note: This constructed QP background can be saved and reused and does not need to be reconstructed every time.* 211 | 212 | ### Identification of tissue correlate in the reference to the sample single-cell dataset 213 | 214 | To find the correlated tissue in the reference to the sample single-cell dataset, we use a correlation based method. In brief, we calculate Pearson's correlation of the QP scores in a pairwise manner between each cell in the sample and each cell in the reference. Recall the assumption that cells in the sample that share similar combination of QP scores to those in MCA are marked to relate to the corresponding tissue in the MCA. If there is a significant percentage of reference cells of a tissue (over 70%) mapped to a cell, we record the tissue label. Then the frequency of each tissue label is calculated. Tissues with a frequency at least 0.5% (for cell number > 10,000) or at least 100 cells will be selected for further analysis. 215 | 216 | **1. Load the QP background matrix** 217 | ```r 218 | background.qp.fpath <- system.file("extdata", "MCA Adult Background.Rds", package = "Capybara") 219 | background.mtx <- readRDS(background.qp.fpath) 220 | ``` 221 | 222 | **2. Load the QP scores of the sample** 223 | 224 | ```r 225 | ## Load QP results 226 | qp.rslt <- read.csv("./baron_bulk_classification_qp_scale.csv", row.names = 1, header = T, stringsAsFactors = F) 227 | 228 | ## Reshape the data 229 | qp.rslt.sub <- qp.rslt[,c(1:(ncol(qp.rslt) - 2))] 230 | ``` 231 | 232 | **3. Correlation calculation** 233 | 234 | *Note: we use WGCNA to calculate the correlation* 235 | 236 | ```r 237 | mtx.test <- t(qp.rslt.sub[, colnames(background.mtx)]) 238 | ref.test <- t(background.mtx) 239 | 240 | # Pearson's Correlation Calculation 241 | corr.mtx <- WGCNA::cor(ref.test, mtx.test) 242 | ``` 243 | 244 | **4. Binarization based on correlation** 245 | 246 | We perform binarization based on the correlation estimates. 247 | 248 | ```r 249 | # Setup a correlation cutoff to the 90th quantile of the correlation matrix 250 | correlation.cutoff <- quantile(corr.mtx, 0.90) 251 | 252 | # Binarization based on the correlation 253 | new.corr.bin <- corr.mtx 254 | new.corr.bin[which(new.corr.bin >= correlation.cutoff)] <- 1 255 | new.corr.bin[which(new.corr.bin < correlation.cutoff)] <- 0 256 | new.corr.bin <- as.data.frame(new.corr.bin) 257 | ``` 258 | 259 | **5. Counting the tissues and select the final tissue types** 260 | 261 | Count the frequency of occurrence of each tissue in the tissue list. 262 | 263 | ```r 264 | # Count 265 | count.in.cat <- c() 266 | unique.cat <- unique(unlist(lapply(strsplit(rownames(new.corr.bin), "_"), function(x) x[1]))) 267 | for (uc in unique.cat) { 268 | curr.subset <- new.corr.bin[which(startsWith(rownames(new.corr.bin), uc)), c(1:1886)] 269 | count.in.cat[uc] <- sum(colSums(curr.subset) >= nrow(curr.subset) * 0.7) 270 | } 271 | 272 | count.in.cat <- as.data.frame(count.in.cat) 273 | count.in.cat$perc <- round(count.in.cat$count.in.cat *100/sum(count.in.cat$count.in.cat), digits = 3) 274 | 275 | # Check frequency 276 | final.cell.types.adult <- rownames(count.in.cat)[which(count.in.cat$count.in.cat > 100)] 277 | ``` 278 | 279 | Below is a composition example for this pancreatic dataset, where we identify 3 major tissues, including stomach, pancreas, and small instestine. 280 |

281 | 282 |

283 | 284 | 285 | ## Step 2: Generation of a High-Resolution Custom Reference, and Continuous Identity Measurement 286 | 287 | After tissue-level classification, relevant cell types are selected from cell atlas and built as a single cell reference dataset. As an alternative, users could also use their own single-cell reference dataset to benchmark their samples. 288 | 289 | ### Systematic construction of a high-resolution reference 290 | 291 | To alleviate the effect of technical variations, we construct pseudo-bulk references for each reference cell type. By default, 90 cells of each cell type would be used to build the reference. The construct.high.res.reference function returns a list containing expression matrix and meta data of cells used to build the reference, as well as the constructed pseudo-bulk reference. 292 | 293 | **Get the counts of the cell types involved in the tissues selected** 294 | 295 | To obtain the exact same structure of the MCA data, please download the data here (https://wustl.box.com/s/z46vm5yq7r1lw3353o8ttz1xo8pu3dvw). 296 | 297 | ```r 298 | mca.meta$cell.bc.tissue <- unlist(lapply(strsplit(rownames(mca.meta), "_"), function(x) x[1])) 299 | pancreatic.all.meta <- mca.meta[which(mca.meta$cell.bc.tissue %in% final.cell.types.adult), ] 300 | 301 | mca.counts.all.involved <- NULL 302 | tissues.to.read <- unique(pancreatic.all.meta$tissue) 303 | general.path <- "~/Box/Morris Lab/Classifier Analysis/Reference datasets/MCA/MCA_Counts/" 304 | for (i in 1:length(tissues.to.read)) { 305 | curr.t <- tissues.to.read[i] 306 | curr.path.to.read <- paste0(general.path, curr.t, "/count.csv") 307 | curr.count <- read.csv(curr.path.to.read, header = T, row.names = 1, stringsAsFactors = F) 308 | if (is.null(mca.counts.all.involved)) { 309 | mca.counts.all.involved <- curr.count 310 | } else { 311 | mca.counts.all.involved <- cbind(mca.counts.all.involved, curr.count) 312 | } 313 | } 314 | 315 | ## Meta data filtering 316 | pancreatic.all.meta$cell.type <- gsub("Dendrtic cell", "Dendritic cell", pancreatic.all.meta$cell.type) 317 | pancreatic.all.meta$cell.type.1 <- gsub("\\([^)]*\\)", "", pancreatic.all.meta$cell.type) 318 | pancreatic.all.meta$cell.type.alone <- unlist(lapply(strsplit(pancreatic.all.meta$cell.type.1, "_"), function(x) x[1])) 319 | 320 | ## Filter out cell types with less than 30 cells 321 | cell.type.alone.freq <- as.data.frame(table(pancreatic.all.meta$cell.type.alone)) 322 | cell.type.over.30 <- cell.type.alone.freq$Var1[which(cell.type.alone.freq$Freq >= 30)] 323 | pancreatic.sub.meta <- pancreatic.all.meta[which(pancreatic.all.meta$cell.type.alone %in% as.character(cell.type.over.30)),] 324 | coldata.df <- pancreatic.sub.meta 325 | ``` 326 | 327 | If the data is obtained from MCA website (https://figshare.com/articles/MCA_DGE_Data/5435866), please download the compressed file with rmbatch_dge. The extraction of the compressed file will provide a folder contains space-delimited text files, containing the single-cell matrices for each tissue in the MCA from different animals. Please follow the next few lines for the processing of such files. Special thanks to Danyi_ZHENG for sharing the tutorial with these MCA files (Detailed tutorial can be found here - https://github.com/Danyi-ZHENG/Capybara_MCA_tutorial/blob/main/Capybara_tutorial_MCA_220415.R) 328 | 329 | ```r 330 | mca.meta$cell.bc.tissue <- unlist(lapply(strsplit(rownames(mca.meta), "_"), function(x) x[1])) 331 | pancreatic.all.meta <- mca.meta[which(mca.meta$cell.bc.tissue %in% final.cell.types.adult), ] 332 | 333 | mca.counts.all.involved <- NULL 334 | tissues.to.read <- unique(pancreatic.all.meta$tissue) 335 | 336 | curr.dir <- "../rmbatch_dge/" 337 | curr.dir.files <- list.files(curr.dir) 338 | curr.dir.files.sub <- unlist(lapply(strsplit(curr.dir.files, "_"), function(x) x[1])) 339 | 340 | for (i in 1:length(tissues.to.read)) { 341 | 342 | print(i) 343 | 344 | curr.t <- tissues.to.read[i] 345 | acceptable.files.starts <- c(curr.t, paste0(curr.t, seq(1,10))) 346 | file.index <- which(curr.dir.files.sub %in% acceptable.files.starts) 347 | 348 | curr.files.to.read <- curr.dir.files[file.index] 349 | curr.path.to.read <- paste0(curr.dir, "/", curr.files.to.read) 350 | 351 | curr.tissue.count.mtx <- NULL 352 | 353 | for (curr.f in curr.path.to.read) { 354 | curr.count <- read.table(curr.f, header = T, row.names = 1, stringsAsFactors = F) 355 | if (is.null(curr.tissue.count.mtx)) { 356 | curr.tissue.count.mtx <- curr.count 357 | } else { 358 | gene.intersect <- intersect(rownames(curr.tissue.count.mtx), rownames(curr.count)) 359 | curr.tissue.count.mtx <- cbind(curr.tissue.count.mtx[gene.intersect,], curr.count[gene.intersect,]) 360 | } 361 | } 362 | 363 | 364 | if (is.null(mca.counts.all.involved)) { 365 | mca.counts.all.involved <- curr.tissue.count.mtx 366 | } else { 367 | gene.intersect <- intersect(rownames(mca.counts.all.involved), rownames(curr.tissue.count.mtx)) 368 | mca.counts.all.involved <- cbind(mca.counts.all.involved[gene.intersect,], curr.tissue.count.mtx[gene.intersect,]) 369 | } 370 | } 371 | 372 | ## Meta data filtering 373 | pancreatic.all.meta$cell.type <- gsub("Dendrtic cell", "Dendritic cell", pancreatic.all.meta$cell.type) 374 | pancreatic.all.meta$cell.type.1 <- gsub("\\([^)]*\\)", "", pancreatic.all.meta$cell.type) 375 | pancreatic.all.meta$cell.type.alone <- unlist(lapply(strsplit(pancreatic.all.meta$cell.type.1, "_"), function(x) x[1])) 376 | 377 | ## Filter out cell types with less than 30 cells 378 | cell.type.alone.freq <- as.data.frame(table(pancreatic.all.meta$cell.type.alone)) 379 | cell.type.over.30 <- cell.type.alone.freq$Var1[which(cell.type.alone.freq$Freq >= 30)] 380 | pancreatic.sub.meta <- pancreatic.all.meta[which(pancreatic.all.meta$cell.type.alone %in% as.character(cell.type.over.30)),] 381 | coldata.df <- pancreatic.sub.meta 382 | ``` 383 | 384 | **Construction** 385 | 386 | ``` r 387 | # Construction of a high-resolution reference 388 | ref.list <- construct.high.res.reference(mca.counts.all.involved, coldata.df = coldata.df, criteria = "cell.type.alone") 389 | # Get expression matrix and meta data of cells used to build the reference, as well as the constructed pseudo-bulk reference 390 | ref.df <- ref.list[[3]] 391 | ref.meta <- ref.list[[2]] 392 | ref.sc <- ref.list[[1]] 393 | ``` 394 | 395 | ### Application of quadratic programming on the self-established reference with the sample 396 | 397 | ``` r 398 | # Measure cell identity in the reference dataset as a background 399 | single.round.QP.analysis(ref.df, ref.list[[1]], n.cores = 4, save.to.path = "./", save.to.filename = "01_MCA_Based_scClassifier_reference_mix90_normalize_select", unix.par = TRUE) 400 | 401 | # Measure cell identity in the query dataset 402 | single.round.QP.analysis(ref.df, baron.expr, n.cores = 4, save.to.path = "./", save.to.filename = "02_MCA_Based_scClassifier_reference_mix90_test_normalize_select", unix.par = TRUE) 403 | ``` 404 | 405 | ## Step 3: Discrete Cell Type Classification and Multiple Identity Scoring 406 | 407 | ### Empirical p-value calculation 408 | With the constructed single-cell reference, we apply QP to both the sample and reference single-cell datasets to generate continuous measurements of cell identity. The result of this step includes two lists of p-value matrices: one for the reference and the other for the sample. For each cell, each column of the p-value matrix denotes a cell type, while each row describes each round of 50 (default). 409 | ``` r 410 | # Read in background and testing identity scores 411 | background.mtx <- read.csv("./01_MCA_Based_scClassifier_reference_mix90_normalize_select_scale.csv", header = T, row.names = 1, stringsAsFactors = F) 412 | mtx.test <- read.csv("./02_MCA_Based_scClassifier_reference_mix90_test_normalize_select_scale.csv", header = T, row.names = 1, stringsAsFactors = F) 413 | 414 | col.sub <- ncol(background.mtx) - 2 415 | 416 | # Conduct reference randomization to get empirical p-value matrix 417 | ref.perc.list <- percentage.calc(background.mtx[,c(1:col.sub)], background.mtx[,c(1:col.sub)]) 418 | 419 | # Conduct test randomization to get empirical p-value matrix 420 | perc.list <- percentage.calc(as.matrix(mtx.test[,c(1:col.sub)]), as.matrix(background.mtx[,c(1:col.sub)])) 421 | ``` 422 | 423 | ### Binarization with Mann-Whitney 424 | A randomized test is performed using the background distributions as null to compute the occurrence probability or empirical p-values of each identity score. This test shapes the likelihood identity score occurrence as a continuous distribution, in which the cell type with the lowest likelihood rank is the classified identity. Capybara is also able to identify cells that harbor multiple identities, potentially representing cells transitioning between defined cell identities. To capture multiple cell identities, we use a Mann-Whitney (MW) test to compare the occurrence probabilities of the cell type with the lowest likelihood rank to that of other cell types, following the order from the second-lowest to the highest rank-sum. From this test, we calculate a p-value to determine whether two identities are equally likely to represent the identity of a specific cell. We stop our comparison when we identify the first cell type that is significantly (p-value < 0.05) less likely to represent one of the cell identities. A binarized matrix will be returned with each row representing a query cell and each column representing a possible cell type. 1 means inferred cell type in the matrix. 425 | 426 | ``` r 427 | # Binarization of inference results 428 | bin.count <- binarization.mann.whitney(mtx = mtx.test[,c(1:col.sub)], ref.perc.ls = ref.perc.list, ref.meta = ref.list[[2]], perc.ls = perc.list) 429 | ``` 430 | 431 | ### Classification 432 | 433 | Finally, we return a classification table of each query cell and its inferred cell type. Cells with multiple inferred identities are marked as "Multi_ID". Cells with no significant inferred identity are marked as "unassigned". 434 | 435 | ``` r 436 | classification <- binary.to.classification(bin.count[,c(1:col.sub)]) 437 | rownames(classification) <- classification$barcode 438 | ``` 439 | 440 | ### Check the Classification Result 441 | 442 | We check the classification results by comparing the labels that are shared between the reference and manual annotation of Baron et al., 2016. Further, we visualize the agreement using ggplot2. 443 | 444 | ```r 445 | rownames(baron.meta) <- gsub("-", ".",rownames(baron.meta)) 446 | classification$actual <- baron.meta[rownames(classification), "cell.type"] 447 | 448 | table.freq <- table(classification$actual, classification$call) 449 | table.freq.perc <- apply(table.freq, 1, function(x) round(x * 100/sum(x), digits = 3)) 450 | 451 | rownames(table.freq.perc)[nrow(table.freq.perc] <- "beta" 452 | 453 | table.freq.sub <- as.data.frame(table.freq.perc[c("B.cell", "beta", "Ductal.cell", "Endothelial.cell", 454 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", 455 | "Multi_ID", "Endocrine.cell"), 456 | c("B_cell", "beta", "ductal", "endothelial", 457 | "macrophage", "T_cell", "immune_other", "activated_stellate", 458 | "alpha", "delta", "gamma")]) 459 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub) 460 | table.freq.melt <- melt(table.freq.sub) 461 | 462 | table.freq.melt$Capybara.Call <- factor(table.freq.melt$Capybara.Call, 463 | levels = c("B.cell", "beta", "Ductal.cell", "Endothelial.cell", 464 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", 465 | "Multi_ID", "Endocrine.cell"), 466 | ordered = T) 467 | table.freq.melt$variable <- factor(table.freq.melt$variable, 468 | levels = c("B_cell", "beta", "ductal", "endothelial", 469 | "macrophage", "T_cell", "immune_other", "activated_stellate", 470 | "alpha", "delta", "gamma"), 471 | ordered = T) 472 | 473 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) + 474 | geom_point(aes(colour = variable)) + 475 | scale_size_area(name = "Percentage", max_size=12) + 476 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) + 477 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") + 478 | guides(fill = FALSE, color = FALSE) + 479 | theme(legend.position = "bottom", 480 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90), 481 | axis.text.y = element_text(size = 12, face = "bold.italic"), 482 | axis.ticks = element_blank(), 483 | axis.title = element_blank(), 484 | title = element_text(face = "bold.italic", size = 14), 485 | panel.grid.major = element_blank(), 486 | panel.grid.minor = element_blank(), 487 | panel.background = element_blank(), 488 | axis.line = element_line(colour = "black", size = 1)) 489 | ``` 490 | 491 | Below is a dot plot example for this pancreatic dataset to show agreement. 492 |

493 | 494 |

495 | 496 | ## Analysis of Cells with Multiple Identities 497 | 498 | A unique aspect of Capybara is the classificaiton of cells with multiple identities, which are key to characterize cell fate transitions in a continuous process. Cells with multiple identities label transition harbors, while the discrete cell identities that connect these cells mark potential pivotal states/hallmarks during the continuous processes. In Capybara, we further develop a 'transition metric', a transition score, to measure the flux through the mixed cell identities. It is worth noting that the intention of this score is not to measure potential of each identity but to measure the dynamics going through each discrete state. For details, please refer to the paper. Here, we use an example of cardiomyocyte reprogramming (Stone et al., 2019) to demonstrate the preprocessing of data, classification, analysis of cells with multiple identities and calculation of transition scores. 499 | 500 | ### 1. Download the data 501 | 502 | The dataset for the cardiomyocyte reprogramming can be found here under GEO: GSE131328. This dataset contains 6 timepoints of this reprogramming process, Day -1, 1, 2, 3, 7, and 14, where Day -1 marks the day of transduction of three transcription factors and Day 14 cells were sorted using a-MHC reporter (Stone et al., 2019). The data can be downloaded in terminal as well as in R. 503 | 504 | ``` 505 | wget https://www.ncbi.nlm.nih.gov/geo/download/acc=GSE133452&format=file&file=GSE133452%5Fm1%5F1%5F2%5F3%5F7%5F14P%5Fpaper%2Ecsv%2Egz 506 | ``` 507 | 508 | or 509 | 510 | ```r 511 | download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE133452&format=file&file=GSE133452%5Fm1%5F1%5F2%5F3%5F7%5F14P%5Fpaper%2Ecsv%2Egz", "./cardiomyocyte_reprogramming_m1_14p.csv.gz") 512 | unzip("./cardiomyocyte_reprogramming_m1_14p.csv.gz", overwrite = FALSE, exdir = ".") 513 | ``` 514 | 515 | ### 2. Preprocessing of the data with Seurat 516 | 517 | In this step, we preprocess the data with Seurat to filter the data and obtain a UMAP embedding of the data. For details of Seurat processing, please refer to the instructions or vignettes here - https://satijalab.org/seurat/vignettes.html. 518 | 519 | ```r 520 | # Read in the file path for all features and genes 521 | feature.file.path <- system.file("extdata", "features.tsv", package = "Capybara") 522 | 523 | # Load the data 524 | stone.et.al <- read.csv("./cardiomyocyte_reprogramming_m1_14p.csv", row.names = 1, header = T, stringsAsFactors = F) 525 | feature.df <- read.table(feature.file.path, header = F, row.names = 1, stringsAsFactors = F) 526 | 527 | # Map the gene names fr 528 | gene.name.subset <- feature.df[intersect(stone.et.al$X, rownames(feature.df)), ] 529 | stone.et.al.subset <- stone.et.al[which(stone.et.al$X %in% rownames(feature.df)), ] 530 | stone.et.al.subset$gene.name <- gene.name.subset[stone.et.al.subset$X, "V2"] 531 | stone.et.al.subset <- stone.et.al.subset[-which(duplicated(stone.et.al.subset$gene.name)), ] 532 | rnm <- stone.et.al.subset$gene.name 533 | stone.et.al.final <- stone.et.al.subset[, -c(1,ncol(stone.et.al.subset))] 534 | rownames(stone.et.al.final) <- rnm 535 | 536 | # Create Seurat object 537 | sc.data.stone <- CreateSeuratObject(counts = stone.et.al.final, project = "cardiac.reprog", min.cells = 3, min.features = 200) 538 | 539 | # Calculate mitochondria content 540 | sc.data.stone[["percent.mt"]] <- PercentageFeatureSet(sc.data.stone, pattern = "mt-") 541 | 542 | # Visualize QC metrics as a violin plot and scatter plot 543 | VlnPlot(sc.data.stone, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) 544 | 545 | plot1 <- FeatureScatter(sc.data.stone, feature1 = "nCount_RNA", feature2 = "percent.mt") 546 | plot2 <- FeatureScatter(sc.data.stone, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") 547 | CombinePlots(plots = list(plot1, plot2)) 548 | 549 | # Filter the dataset based on number of features 550 | sc.data.stone <- subset(sc.data.stone, subset = nFeature_RNA > 200 & nFeature_RNA < 5500) 551 | 552 | # Log normalize the data 553 | sc.data.stone <- NormalizeData(sc.data.stone, normalization.method = "LogNormalize", scale.factor = 10000) 554 | 555 | # Variable gene identification 556 | sc.data.stone <- FindVariableFeatures(sc.data.stone, selection.method = "vst", nfeatures = 2000) 557 | 558 | # Scale the data 559 | all.genes <- rownames(sc.data.stone) 560 | sc.data.stone <- ScaleData(sc.data.stone, features = all.genes) 561 | 562 | # PCA 563 | sc.data.stone <- RunPCA(sc.data.stone, features = VariableFeatures(object = sc.data.stone)) 564 | 565 | # JackStraw procedure and Elbow plot to select number of PCs 566 | sc.data.stone <- JackStraw(sc.data.stone, num.replicate = 100) 567 | sc.data.stone <- ScoreJackStraw(sc.data.stone, dims = 1:20) 568 | 569 | JackStrawPlot(sc.data.stone, dims = 1:20) 570 | ElbowPlot(sc.data.stone) 571 | 572 | # Identify neighbors and clusters 573 | sc.data.stone <- FindNeighbors(sc.data.stone, dims = 1:18) 574 | sc.data.stone <- FindClusters(sc.data.stone, resolution = 0.8) 575 | 576 | # UMAP embedding 577 | sc.data.stone <- RunUMAP(sc.data.stone, dims = 1:18) 578 | ``` 579 | 580 | ### 3. Classification 581 | 582 | Here, we perform the same classification pipeline as described above in the first section. 583 | 584 | #### Step 1. Tissue Classification 585 | 586 | ***Load the bulk data*** 587 | 588 | ```r 589 | # File path 590 | bulk.raw.path <- system.file("extdata", "Bulk Reference Raw.Rds", package = "Capybara") 591 | bulk.rpkm.path <- system.file("extdata", "Bulk Reference RPKM.Rds", package = "Capybara") 592 | # Read the matrices 593 | bulk.raw <- readRDS(bulk.raw.path) 594 | bulk.rpkm <- readRDS(bulk.rpkm.path) 595 | ``` 596 | 597 | ***Application of Quadratic Programming using Bulk*** 598 | 599 | ```r 600 | single.round.QP.analysis(bulk.raw, stone.et.al, scale.bulk.sc = "scale", unix.par = TRUE, 601 | force.eq = 1, n.cores = 4, save.to.path = "./", 602 | save.to.filename = "stone_bulk_classification_qp") 603 | ``` 604 | 605 | ***Correlation Analysist*** 606 | 607 | ```r 608 | ## Load QP results 609 | qp.rslt <- read.csv("./stone_bulk_classification_qp_scale.csv", row.names = 1, header = T, stringsAsFactors = F) 610 | 611 | ## Reshape the data 612 | qp.rslt.sub <- qp.rslt[,c(1:(ncol(qp.rslt) - 2))] 613 | 614 | ## Background matrix 615 | background.qp.fpath <- system.file("extdata", "MCA Embryonic Background.Rds", package = "Capybara") 616 | background.mca <- readRDS(background.qp.fpath) 617 | background.mtx <- background.mca[[2]] 618 | 619 | ## Correlation Analysis 620 | mtx.test <- t(qp.rslt.sub[, colnames(background.mtx)]) 621 | ref.test <- t(background.mtx) 622 | 623 | ## Pearson's Correlation Calculation 624 | corr.mtx <- WGCNA::cor(ref.test, mtx.test) 625 | 626 | ## Setup a correlation cutoff to the 90th quantile of the correlation matrix 627 | correlation.cutoff <- quantile(corr.mtx, 0.90) 628 | 629 | ## Binarization based on the correlation 630 | new.corr.bin <- corr.mtx 631 | new.corr.bin[which(new.corr.bin >= correlation.cutoff)] <- 1 632 | new.corr.bin[which(new.corr.bin < correlation.cutoff)] <- 0 633 | new.corr.bin <- as.data.frame(new.corr.bin) 634 | ``` 635 | 636 | ***Mapping to Tissues in Mouse Cell Atlas (MCA)*** 637 | 638 | ```r 639 | # Count 640 | count.in.cat <- c() 641 | unique.cat <- unique(unlist(lapply(strsplit(rownames(new.corr.bin), "_"), function(x) x[1]))) 642 | for (uc in unique.cat) { 643 | curr.subset <- new.corr.bin[which(startsWith(rownames(new.corr.bin), uc)), c(1:30729)] 644 | count.in.cat[uc] <- sum(colSums(curr.subset) >= nrow(curr.subset) * 0.80) 645 | } 646 | 647 | count.in.cat <- as.data.frame(count.in.cat) 648 | count.in.cat$perc <- round(count.in.cat$count.in.cat *100/sum(count.in.cat$count.in.cat), digits = 3) 649 | 650 | final.cell.types.fetal <- rownames(count.in.cat)[which(count.in.cat$count.in.cat > 100)] 651 | ``` 652 | 653 | Below is the composition for this cardiac reprogramming dataset, where we identify 4 major tissues. 654 |

655 | 656 |

657 | 658 | #### Step 2. Construction of Reference at High-Resolution and Continuous Identity Measurements 659 | 660 | ***Get the counts of cell types in the selected tissues from MCA*** 661 | 662 | ```r 663 | # Background cells 664 | mca <- read.csv("~/Box/Morris Lab/Classifier Analysis/Reference datasets/MCA/MCA_CellAssignments.csv", 665 | row.names = 1, header = T, stringsAsFactors = F) 666 | mca.meta <- data.frame(row.names = mca$Cell.name, 667 | tissue = mca$Tissue, 668 | cell.bc.tissue = unlist(lapply(strsplit(mca$Cell.name, "_"), function(x) x[1])), 669 | cell.type = mca$Annotation, 670 | stringsAsFactors = F) 671 | 672 | cardiac.rp.all.meta <- mca.meta[which(mca.meta$cell.bc.tissue %in% final.cell.types.fetal), ] 673 | 674 | mca.counts.all.involved <- NULL 675 | tissues.to.read <- unique(cardiac.rp.all.meta$tissue) 676 | general.path <- "~/Box/Morris Lab/Classifier Analysis/Reference datasets/MCA/MCA_Counts/" 677 | for (i in 1:length(tissues.to.read)) { 678 | curr.t <- tissues.to.read[i] 679 | curr.path.to.read <- paste0(general.path, curr.t, "/count.csv") 680 | curr.count <- read.csv(curr.path.to.read, header = T, row.names = 1, stringsAsFactors = F) 681 | if (is.null(mca.counts.all.involved)) { 682 | mca.counts.all.involved <- curr.count 683 | } else { 684 | mca.counts.all.involved <- cbind(mca.counts.all.involved, curr.count) 685 | } 686 | } 687 | 688 | ## meta data cleaning 689 | cardiac.rp.all.meta$cell.type.1 <- gsub("\\([^)]*\\)", "", cardiac.rp.all.meta$cell.type) 690 | cardiac.rp.all.meta$cell.type.alone <- unlist(lapply(strsplit(cardiac.rp.all.meta$cell.type.1, "_"), function(x) x[1])) 691 | 692 | cardiac.rp.all.meta$cell.type.1 <- tolower(cardiac.rp.all.meta$cell.type.1) 693 | coldata.df <- cardiac.rp.all.meta 694 | ``` 695 | 696 | ***Construction*** 697 | ```r 698 | # Construction of a high-resolution reference 699 | ref.list <- construct.high.res.reference(mca.counts.all.involved, coldata.df = coldata.df, criteria = "cell.type.1") 700 | # Get expression matrix and meta data of cells used to build the reference, as well as the constructed pseudo-bulk reference 701 | ref.df <- ref.construction(ref.list[[1]], ref.list[[2]], "cell.type") 702 | ``` 703 | 704 | ***Application of Quadratic Programming*** 705 | ```r 706 | single.round.QP.analysis(ref.df, ref.list[[1]], n.cores = 4, save.to.path = "./", save.to.filename = "stone_et_al_reference_MCA", unix.par = TRUE) 707 | single.round.QP.analysis(ref.df, stone.et.al, n.cores = 4, save.to.path = "./", save.to.filename = "stone_et_al_test_MCA", unix.par = TRUE) 708 | ``` 709 | 710 | #### Step 3. Discrete Cell Type Classification and Multiple Identity scoring 711 | 712 | ***Empirical p-value Calculation*** 713 | ```r 714 | # Read in background and testing identity scores 715 | background.mtx <- read.csv("./stone_et_al_reference_MCA_scale.csv", header = T, row.names = 1, stringsAsFactors = F) 716 | mtx.test <- read.csv("./stone_et_al_test_MCA_scale.csv", header = T, row.names = 1, stringsAsFactors = F) 717 | 718 | col.sub <- ncol(background.mtx) - 2 719 | 720 | # Conduct reference randomization to get empirical p-value matrix 721 | ref.perc.list <- percentage.calc(background.mtx[,c(1:col.sub)], background.mtx[,c(1:col.sub)]) 722 | 723 | # Conduct test randomization to get empirical p-value matrix 724 | perc.list <- percentage.calc(as.matrix(mtx.test[,c(1:col.sub)]), as.matrix(background.mtx[,c(1:col.sub)])) 725 | ``` 726 | 727 | ***Binarization and Classificationn*** 728 | ```r 729 | # Binarization of inference results 730 | bin.count <- binarization.mann.whitney(mtx = mtx.test[,c(1:col.sub)], ref.perc.ls = ref.perc.list, ref.meta = ref.list[[2]], perc.ls = perc.list) 731 | # Classificationn 732 | classification <- binary.to.classification(bin.count[,c(1:col.sub)]) 733 | rownames(classification) <- classification$barcode 734 | ``` 735 | 736 | ### 4. Filter Cells with Multiple Identities based on the QP scores 737 | 738 | Different interpretations can be taken on the cells with multiple identities. Here, we interpret these cells as in transition. However, some cells with multiple identities may be incorrectly labelled, which is represented by close to zero (we defined as <10E-3, this threshold can be modified as an input parameter of the function) QP score for one of the labelled identities. Hence, we first filter the cells with multiple identities such that each cell receives relative significant QP scores for each shared identity. This function will return a list where \[\[1\]\] represents a data frame of actual multiple identity cells and \[\[2\]\] represents an updated classification data frame. In the data frame of actual multiple identity cells, the QP measurements for each identity are also included for calculation of transition scores. 739 | 740 | ```r 741 | multi.classification.list <- multi.id.curate.qp(binary.counts = bin.count, classification = classification, qp.matrix = mtx.test) 742 | # Reassign variables 743 | actual.multi <- multi.classification.list[[1]] 744 | new.classification <- multi.classification.list[[2]] 745 | ``` 746 | 747 | ### 5. Calculate Transition Scores 748 | 749 | ***Calculation*** 750 | 751 | Cells with multiple identities label critical transition states in different trajectories. Building on this concept, we also measure the strength and frequency of connections to the "hub" identities, i.e. discrete identities where the multiple identity cells are connected to. This provides a metric that we define as a "transition score". The calculation of transition scores only involves cells with multiple identities. In brief, we interpret QP scores as probabilities of the cell transitioning to each discrete cell identity, which we further use as a measure of transition probability. Using this measurement of transition probability, we calculate the amount of information that the terminal cell state has received based on information theory. For detailed methods, please refer to the paper. Here, we demonstrate the calculation of transition scores for discrete cell identity states that are involved in cells with multiple identities in this cardiomyocyte reprogramming process. This function takes an input of the multiple identity data frame calculated from above and outputs a data frame with each cell state to their scores. 752 | 753 | 754 | ```r 755 | score.df <- transition.score(actual.multi) 756 | ``` 757 | 758 | ***Visualization*** 759 | 760 | Here, we visualize the transition scores on the UMAP, comparing to their time points of collection. Additionally, we compared the scores between time points using box plots. Other visualization methods, such as violin plots, can also be adapted. 761 | 762 | Below is visualization results of cardiac reprogramming process. 763 |

764 | 765 |

766 | 767 | 768 | *Note: this will be continuously updating* 769 | -------------------------------------------------------------------------------- /examples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/.DS_Store -------------------------------------------------------------------------------- /examples/Monocle_hat_colin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/Monocle_hat_colin.png -------------------------------------------------------------------------------- /examples/Step 1 - ARCHS4 Mining.R: -------------------------------------------------------------------------------- 1 | ############################################### 2 | # Step 1: ARCHS4 Mining 3 | ############################################### 4 | 5 | library(rhdf5) 6 | 7 | destination_file = "mouse_matrix_download.h5" 8 | url = "https://s3.amazonaws.com/mssm-seq-matrix/mouse_matrix_download.h5" 9 | 10 | # Check if gene expression file was already downloaded, if not in current directory download file form repository 11 | if(!file.exists(destination_file)){ 12 | print("Downloading compressed gene expression matrix.") 13 | download.file(url, destination_file, quiet = FALSE, mode = 'wb') 14 | } 15 | 16 | # Get the meta data from the h5 file 17 | samples = h5read("mouse_matrix_download.h5", "meta") 18 | 19 | sample_meta <- data.frame(channel_count = samples$Sample_channel_count, 20 | accession = samples$Sample_geo_accession, 21 | characteristics = samples$Sample_characteristics_ch1, 22 | organism = samples$Sample_organism_ch1, 23 | source = samples$Sample_source_name_ch1, 24 | molecule_info = samples$Sample_molecule_ch1, 25 | series_id = samples$Sample_series_id, stringsAsFactors=F) 26 | 27 | # Use only total RNA and polyA RNA-seq data 28 | total.RNA.indx <- which(sample_meta$molecule_info == "total RNA") 29 | poly.A.indx <- which(sample_meta$molecule_info == "polyA RNA") 30 | sample_meta_pa_total <- sample_meta[c(total.RNA.indx, poly.A.indx),] 31 | 32 | ### Clean some of the labels 33 | sample_meta_pa_total$source[which(tolower(sample_meta_pa_total$source) %in% c("stomachs", "stomach wt", "stomach tissue"))] <- "stomach" 34 | sample_meta_pa_total$source[which(tolower(sample_meta_pa_total$source) %in% c("mus musculus strain c57bl/6j urinary bladder tissue adult (8 weeks)", "bladder"))] <- "urinary bladder" 35 | sample_meta_pa_total$source[which(tolower(sample_meta_pa_total$source) %in% c("bone lining cells", "wild type_calvarial frontal bone", "wild type_calvarial parietal bone", "wt mandibular bone"))] <- "bone" 36 | sample_meta_pa_total$source[which(tolower(sample_meta_pa_total$source) %in% c("primary esophagus"))] <- "esophagus" 37 | 38 | # Limit to these tissue labels 39 | tissue.to.choose <- c("aorta", "bone", "bone marrow", "marrow", "brain", "colon", 40 | "large intestine", "diaphragm", "embryo", "esophagus", "brown fat", "white fat", 41 | "adipose", "gall bladder", "heart", "kidney", "lung", "ovary", 42 | "pancreas", "prostate", "skin", "small intestine", "spleen", "muscle", 43 | "stomach", "testis", "thymus", "tongue", "trachea", 44 | "urinary bladder", "uterus", "liver", "mammary gland") 45 | indx <- which(tolower(sample_meta_pa_total$source) %in% tissue.to.choose) 46 | 47 | sample_tissue_sub <- sample_meta_pa_total[indx,] 48 | sample_tissue_sub$source <- tolower(sample_tissue_sub$source) 49 | 50 | # Get the strain infomation in place 51 | strains <- c() 52 | treatment <- c() 53 | genotype <- c() 54 | for (i in 1:nrow(sample_tissue_sub)) { 55 | curr_characteristics <- sample_tissue_sub$characteristics[i] 56 | curr_parts <- strsplit(curr_characteristics, "Xx-xX")[[1]] 57 | strain.parts <- which(startsWith(tolower(curr_parts), "strain")) 58 | treatment.parts <- which(startsWith(tolower(curr_parts), "treatment")) 59 | genotype.parts <- which(startsWith(tolower(curr_parts), "genotype")) 60 | 61 | curr_strain <- " " 62 | curr_treatment <- " " 63 | curr_genotype <- " " 64 | if (length(strain.parts) > 0) { 65 | curr_strain <- strsplit(tolower(curr_parts[strain.parts]), "strain: ")[[1]][2] 66 | } 67 | if (length(treatment.parts) > 0) { 68 | curr_treatment <- strsplit(tolower(curr_parts[treatment.parts]), "treatment: ")[[1]][2] 69 | } 70 | if (length(genotype.parts) > 0) { 71 | curr_genotype <- strsplit(tolower(curr_parts[genotype.parts]), "genotype: ")[[1]][2] 72 | } 73 | strains <- c(strains, curr_strain) 74 | treatment <- c(treatment, curr_treatment) 75 | genotype <- c(genotype, curr_genotype) 76 | } 77 | 78 | # subset to only include c57bl6 79 | sample_tissue_sub$strain <- strains 80 | sample_tissue_sub$treatment <- treatment 81 | sample_tissue_sub$genotype <- genotype 82 | 83 | sample_tissue_strain_sub <- sample_tissue_sub[which(tolower(sample_tissue_sub$strain) %in% c("c57bl6", "c57bl6j", "c57bl6n")),] 84 | 85 | saveRDS(sample_tissue_sub, "01_full_accession_information_data_frame.Rds") 86 | 87 | # Retrieve information from compressed data 88 | samples_geo = samples$Sample_geo_accession 89 | genes <- samples$genes 90 | 91 | sample_locations = which(samples_geo %in% sample_tissue_strain_sub$accession) 92 | 93 | # Read the expression data 94 | expression = h5read(destination_file, "data/expression", index=list(1:length(genes), sample_locations)) 95 | H5close() 96 | 97 | rownames(expression) = genes 98 | colnames(expression) = samples_geo[sample_locations] 99 | 100 | # Use Ensembl to extract gene length information based on Ensembl ID 101 | library(biomaRt) 102 | gene.info <- read.table("gene_info.tsv", header=T, stringsAsFactors=F) 103 | 104 | mart <- useMart("ensembl",dataset="mmusculus_gene_ensembl") 105 | ensembl.gene <- gene.info$gene.ensembl[which(!is.na(gene.info$gene.ensembl))] 106 | ensembl.map.rslt <- getBM(attributes = c("mgi_symbol", "start_position", "end_position", "ensembl_gene_id_version", "external_gene_name", "ensembl_gene_id"), 107 | filters = "ensembl_gene_id", values = ensembl.gene, mart = mart) 108 | ensembl.map.rslt <- unique(ensembl.map.rslt[,c(2:6)]) 109 | rownames(ensembl.map.rslt) <- ensembl.map.rslt$ensembl_gene_id 110 | # Calculate length of the gene 111 | ensembl.map.rslt$gene.length <- abs(ensembl.map.rslt$end_position - ensembl.map.rslt$start_position)/1000 112 | 113 | gene.info$gene.length <- ensembl.map.rslt[gene.info$gene.ensembl, "gene.length"] 114 | 115 | gene.info.sub <- gene.info[which(!is.na(gene.info$gene.length)), ] 116 | rownames(gene.info.sub) <- gene.info.sub$gene.sym 117 | 118 | # RPKM Calculation 119 | raw.count.sub <- expression[which(rownames(expression) %in% gene.info.sub$gene.sym),] 120 | raw.count.sub <- as.data.frame(raw.count.sub) 121 | raw.count.sub$gene.length <- gene.info.sub[rownames(raw.count.sub), "gene.length"] 122 | rpkm.count <- raw.count.sub[,c(1:(ncol(raw.count.sub) - 1))] 123 | # Calculate per million size factor and divide each sample by its size factor 124 | size.factors <- colSums(raw.count.sub[,c(1:(ncol(raw.count.sub) - 1))])/1000000 125 | for (i in 1:(ncol(rpkm.count))) { 126 | rpkm.count[,i] <- rpkm.count[,i]/size.factors[colnames(rpkm.count)[i]] 127 | rpkm.count[,i] <- rpkm.count[,i]/raw.count.sub$gene.length 128 | } 129 | 130 | saveRDS(rpkm.count, "03_rpkm_count_Tissue_bulk.Rds") 131 | saveRDS(raw.count.sub[,c(1:(ncol(raw.count.sub) - 1))], "03_raw_count_Tissue_bulk.Rds") 132 | 133 | 134 | -------------------------------------------------------------------------------- /examples/Step 2 - bulk cleaning.R: -------------------------------------------------------------------------------- 1 | ############################################### 2 | # Step 2: ARCHS4 Bulk Cleaning 3 | ############################################### 4 | library(viridis) 5 | library(pheatmap) 6 | library(ggplot2) 7 | library(reshape2) 8 | 9 | ### function to get the most similar n samples 10 | get.most.connected <- function(mtx, n.sample) { 11 | corr.mtx <- WGCNA::cor(mtx) 12 | corr.mtx.upper <- corr.mtx * upper.tri(corr.mtx) 13 | corr.mtx.melt <- melt(corr.mtx.upper) 14 | corr.mtx.melt.pos <- corr.mtx.melt[which(corr.mtx.melt$value > 0), ] 15 | corr.mtx.melt.pos.sort <- corr.mtx.melt.pos[order(-corr.mtx.melt.pos$value), ] 16 | corr.mtx.melt.pos.sort$Var1 <- as.character(corr.mtx.melt.pos.sort$Var1) 17 | corr.mtx.melt.pos.sort$Var2 <- as.character(corr.mtx.melt.pos.sort$Var2) 18 | 19 | sample.list <- c() 20 | count.line <- 1 21 | while(length(sample.list) < n.sample) { 22 | sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort$Var1[count.line], corr.mtx.melt.pos.sort$Var2[count.line])))) 23 | count.line <- count.line + 1 24 | } 25 | 26 | return(sample.list[1:90]) 27 | } 28 | 29 | ### Load the raw data/rpkm data from the step 1 30 | raw.counts <- readRDS("03_raw_count_Tissue_bulk.Rds") 31 | rpkm.count <- readRDS("03_rpkm_count_Tissue_bulk.Rds") 32 | accession.full <- readRDS("01_full_accession_information_data_frame.Rds") 33 | rownames(accession.full) <- accession.full$accession 34 | 35 | ### Subset the accession 36 | accession.sub <- accession.full[colnames(raw.counts), ] 37 | accession.sub$source <- tolower(accession.sub$source) 38 | accession.freq <- as.data.frame(table(accession.sub$source, accession.sub$series_id)) 39 | 40 | accession.sub$merge <- paste0(accession.sub$source, "_", accession.sub$series_id) 41 | accession.sub.freq <- as.data.frame(table(accession.sub$merge)) 42 | accession.sub.freq$Var1 <- as.character(accession.sub.freq$Var1) 43 | accession.sub.freq$tissue <- unlist(lapply(strsplit(accession.sub.freq$Var1, "_"), function(x) x[1])) 44 | 45 | ### Select for the most associated 90 samples across different GEO RNA-seq data 46 | tissue.uniq <- unique(accession.sub$source) 47 | tissue.uniq <- tissue.uniq[which(!is.na(tissue.uniq))] 48 | series.to.keep <- c() 49 | sample.accession.to.keep <- c() 50 | new.accession <- data.frame() 51 | new.sample.accession <- data.frame() 52 | sample.count <- data.frame() 53 | sample.rpkm <- data.frame() 54 | n.sample <- 90 55 | for (tis in tissue.uniq) { 56 | curr.freq.sub <- accession.sub.freq[which(accession.sub.freq$tissue == tis), ] 57 | curr.freq.sub.sort <- curr.freq.sub[order(-curr.freq.sub$Freq), ] 58 | curr.series.to.keep <- strsplit(curr.freq.sub.sort$Var1[1], "_")[[1]][2] 59 | curr.geo.to.keep <- accession.sub[which(accession.sub$series_id == curr.series.to.keep & accession.sub$source == tis), ] 60 | curr.geo.to.keep$tissue <- tis 61 | print(tis) 62 | print(nrow(curr.geo.to.keep)) 63 | series.to.keep <- c(series.to.keep, curr.series.to.keep) 64 | # sample.accession.to.keep <- c(sample.accession.to.keep, curr.geo.to.keep) 65 | 66 | if (nrow(curr.geo.to.keep) >= n.sample) { 67 | geo.sample <- get.most.connected(log2(rpkm.count[,curr.geo.to.keep$accession] + 1), n.sample = n.sample) 68 | } else { 69 | geo.sample <- sample(curr.geo.to.keep$accession, n.sample, replace = T) 70 | } 71 | 72 | curr.count.sample <- raw.counts[, geo.sample] 73 | curr.rpkm.sample <- rpkm.count[, geo.sample] 74 | 75 | colnames(curr.count.sample) <- paste0(tis, "_", seq(1,n.sample)) 76 | colnames(curr.rpkm.sample) <- paste0(tis, "_", seq(1,n.sample)) 77 | 78 | if (nrow(new.accession) <= 0) { 79 | new.accession <- curr.geo.to.keep 80 | new.sample.accession <- curr.geo.to.keep[geo.sample,] 81 | sample.count <- curr.count.sample 82 | sample.rpkm <- curr.rpkm.sample 83 | } else { 84 | new.accession <- rbind(new.accession, curr.geo.to.keep) 85 | new.sample.accession <- rbind(new.sample.accession, curr.geo.to.keep[geo.sample, ]) 86 | sample.count <- cbind(sample.count, curr.count.sample) 87 | sample.rpkm <- cbind(sample.rpkm, curr.rpkm.sample) 88 | } 89 | } 90 | 91 | accession.new.freq <- as.data.frame(table(new.accession$source)) 92 | 93 | ### Look at the correlation between samples 94 | corr.sample.rpkm <- WGCNA::cor(sample.rpkm) 95 | corr.sample.rpkm.log <- WGCNA::cor(log2(sample.rpkm + 1)) 96 | 97 | my.col <- data.frame(row.names = rownames(corr.sample.rpkm), 98 | tissue = unlist(lapply(strsplit(rownames(corr.sample.rpkm), "_"), function(x) x[1])), 99 | stringsAsFactors = F) 100 | 101 | ### Heatmap plot to see if the samples selected are well correlated as well as well distinguished from other tissues 102 | pheatmap::pheatmap(corr.sample.rpkm, color = viridis(20, option = "A"), show_colnames = F, show_rownames = F, cluster_cols = F, cluster_rows = F, clustering_method = "ward.D2", 103 | annotation_col = my.col, cellheight = 0.2, cellwidth = 0.2, file = "~/Desktop/test.pdf") 104 | pheatmap::pheatmap(corr.sample.rpkm.log, color = viridis(20, option = "A"), show_colnames = F, show_rownames = F, cluster_cols = F, cluster_rows = F, clustering_method = "ward.D2", 105 | annotation_col = my.col, cellheight = 0.2, cellwidth = 0.2, file = "~/Desktop/test_log.pdf") 106 | 107 | saveRDS(sample.count, "~/Box/Morris Lab/Classifier Analysis/ARCHS4 Reference/05_90_sampled_raw_counts.Rds") 108 | saveRDS(sample.rpkm, "~/Box/Morris Lab/Classifier Analysis/ARCHS4 Reference/05_90_sampled_raw_rpkm.Rds") 109 | 110 | ### Compute the final averaged bulk dataset 111 | final.rpkm.tissue <- data.frame() 112 | final.raw.count.tissue <- data.frame() 113 | uniq.tissue <- unique(my.col$tissue) 114 | 115 | for (tis in uniq.tissue) { 116 | curr.rpkm.tis <- sample.rpkm[, rownames(my.col)[which(my.col$tissue == tis)]] 117 | curr.rpkm.mean <- as.data.frame(rowMeans(curr.rpkm.tis)) 118 | 119 | curr.count.tis <- sample.count[, rownames(my.col)[which(my.col$tissue == tis)]] 120 | curr.count.mean <- as.data.frame(rowMeans(curr.count.tis)) 121 | 122 | colnames(curr.rpkm.mean) <- tis 123 | colnames(curr.count.mean) <- tis 124 | 125 | if (ncol(final.rpkm.tissue) <= 0) { 126 | final.rpkm.tissue <- curr.rpkm.mean 127 | final.raw.count.tissue <- curr.count.mean 128 | } else { 129 | final.rpkm.tissue <- cbind(final.rpkm.tissue, curr.rpkm.mean) 130 | final.raw.count.tissue <- cbind(final.raw.count.tissue, curr.count.mean) 131 | } 132 | } 133 | 134 | ### Check if the averaged bulk dataset has tissues that are distinguishable from each other 135 | rpkm.tissue.cor <- WGCNA::cor(log2(final.rpkm.tissue + 1)) 136 | pheatmap::pheatmap(rpkm.tissue.cor, color = viridis(20, option = "A"), show_colnames = T, show_rownames = T, cluster_cols = F, cluster_rows = F, clustering_method = "ward.D2", 137 | cellheight = 15, cellwidth = 15, file = "~/Desktop/test_log.pdf") 138 | 139 | saveRDS(final.rpkm.tissue, "~/Box/Morris Lab/Classifier Analysis/ARCHS4 Reference/05_90_sample_final_rpkm_tissue_collapsed.Rds") 140 | saveRDS(final.raw.count.tissue, "~/Box/Morris Lab/Classifier Analysis/ARCHS4 Reference/05_90_sample_final_raw_count_tissue_collapsed.Rds") 141 | 142 | -------------------------------------------------------------------------------- /examples/bulk class mca pancreatic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/bulk class mca pancreatic.png -------------------------------------------------------------------------------- /examples/bulk_composition_pancreatic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/bulk_composition_pancreatic.pdf -------------------------------------------------------------------------------- /examples/cardiac_TS_plots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/cardiac_TS_plots.png -------------------------------------------------------------------------------- /examples/cardiac_bulk_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/cardiac_bulk_v2.png -------------------------------------------------------------------------------- /examples/pancreatic dot plot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/pancreatic dot plot.pdf -------------------------------------------------------------------------------- /examples/pancreatic dot plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/pancreatic dot plot.png -------------------------------------------------------------------------------- /inst/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/.DS_Store -------------------------------------------------------------------------------- /inst/extdata/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/.DS_Store -------------------------------------------------------------------------------- /inst/extdata/Bulk Reference RPKM.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/Bulk Reference RPKM.Rds -------------------------------------------------------------------------------- /inst/extdata/Bulk Reference Raw.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/Bulk Reference Raw.Rds -------------------------------------------------------------------------------- /inst/extdata/MCA Adult Background.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/MCA Adult Background.Rds -------------------------------------------------------------------------------- /inst/extdata/MCA Embryonic Background.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/MCA Embryonic Background.Rds -------------------------------------------------------------------------------- /inst/extdata/baron_dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/baron_dataset.zip -------------------------------------------------------------------------------- /man/binarization.mann.whitney.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Binarization.R 3 | \name{binarization.mann.whitney} 4 | \alias{binarization.mann.whitney} 5 | \title{Binarization and Identity Calling from Identity Score} 6 | \usage{ 7 | binarization.mann.whitney( 8 | mtx, 9 | ref.perc.ls, 10 | ref.meta, 11 | perc.ls, 12 | bulk = FALSE, 13 | map.df = NULL, 14 | init.class = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{mtx}{The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types} 19 | 20 | \item{ref.perc.ls}{Emprical p-values for reference cells} 21 | 22 | \item{ref.meta}{The celltype meta information for reference cells} 23 | 24 | \item{perc.ls}{Emprical p-values for query cells} 25 | 26 | \item{bulk}{If the reference data type is bulk RNA-seq. The default is bulk = FALSE} 27 | 28 | \item{map.df}{bulk mapping. The default is bulk = FALSE} 29 | 30 | \item{init.class}{initial classification. The default is init.class = NULL} 31 | } 32 | \description{ 33 | This function calls single identity or multiple identities of query cells from empirical p-values. Inferred cell types are marked by 1 in a binarized matrix. 34 | } 35 | \note{ 36 | 37 | } 38 | \examples{ 39 | 40 | } 41 | \keyword{binarization,} 42 | \keyword{calling} 43 | \keyword{identity} 44 | -------------------------------------------------------------------------------- /man/binary.to.classification.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Classification.R 3 | \name{binary.to.classification} 4 | \alias{binary.to.classification} 5 | \title{Generate classification results from the binarized result matrix} 6 | \usage{ 7 | binary.to.classification(bin.count.rslt) 8 | } 9 | \arguments{ 10 | \item{bin.count.rslt}{The binarized classification result matrix} 11 | } 12 | \description{ 13 | This function genrate classification table from the binarized result matrix 14 | } 15 | \note{ 16 | 17 | } 18 | \examples{ 19 | 20 | } 21 | \keyword{calssification,} 22 | \keyword{result} 23 | -------------------------------------------------------------------------------- /man/calc.scale.ratio.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SingleCellQuadraticProgrammingAux.R 3 | \name{calc.scale.ratio} 4 | \alias{calc.scale.ratio} 5 | \title{Scale Ratio Calculation} 6 | \usage{ 7 | calc.scale.ratio(bulk, sc) 8 | } 9 | \arguments{ 10 | \item{bulk}{The reference dataset} 11 | 12 | \item{sc}{The single-cell dataset} 13 | } 14 | \description{ 15 | This function calculate the scale ratio between the single-cell transcriptome and reference dataset to minimize the Lagrangian multiplier. In the other word, reduce the restriction. 16 | } 17 | \examples{ 18 | calc.scale.ratio(reference.dt, single.cell.mtx) 19 | } 20 | \keyword{Scale} 21 | \keyword{calculation} 22 | \keyword{ratio} 23 | -------------------------------------------------------------------------------- /man/construct.high.res.reference.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/HighResolutionReferenceConstruction.R 3 | \name{construct.high.res.reference} 4 | \alias{construct.high.res.reference} 5 | \title{Systematic construction of a high-resolution reference} 6 | \usage{ 7 | construct.high.res.reference( 8 | ref.mtx, 9 | coldata.df, 10 | criteria, 11 | cell.num.for.ref = 90 12 | ) 13 | } 14 | \arguments{ 15 | \item{ref.mtx}{The single-cell reference dataset} 16 | 17 | \item{coldata.df}{The metadata (cell type information) for cells in the high-resolution reference} 18 | 19 | \item{cell.num.for.ref}{The number of cell numbers used to build the reference for each cell type. The default is cell.num.for.ref = 90} 20 | } 21 | \description{ 22 | Create a pseudo-bulk reference by sampling 90-cells from each cell type to maintain cellular resolution while increasing transcriptional resolution 23 | } 24 | \note{ 25 | 26 | } 27 | \examples{ 28 | 29 | } 30 | \keyword{pseudo-bulk} 31 | \keyword{reference} 32 | -------------------------------------------------------------------------------- /man/gene.intersect.sub.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SingleCellQuadraticProgrammingAux.R 3 | \name{gene.intersect.sub} 4 | \alias{gene.intersect.sub} 5 | \title{Find intersection genes} 6 | \usage{ 7 | gene.intersect.sub(bulk, sc) 8 | } 9 | \arguments{ 10 | \item{bulk}{The reference dataset} 11 | 12 | \item{sc}{The single-cell dataset} 13 | } 14 | \description{ 15 | This function identifies the intersection genes between reference and single-cell data 16 | } 17 | \examples{ 18 | gene.intersect.sub(reference.dt, single.cell.mtx) 19 | } 20 | -------------------------------------------------------------------------------- /man/get.least.connected.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Helper.R 3 | \name{get.least.connected} 4 | \alias{get.least.connected} 5 | \title{Get the Least Connected Cells} 6 | \usage{ 7 | get.least.connected(mtx, n.sample) 8 | } 9 | \arguments{ 10 | \item{mtx}{The normalized reference count matrix} 11 | 12 | \item{n.sample}{The number of reference cells to be included within each pseudo-bulk} 13 | } 14 | \description{ 15 | This function returns index of cells that are the least connected within a cell type 16 | } 17 | -------------------------------------------------------------------------------- /man/get.mid.connected.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Helper.R 3 | \name{get.mid.connected} 4 | \alias{get.mid.connected} 5 | \title{Get the Medium Connected Cells} 6 | \usage{ 7 | get.mid.connected(mtx, n.sample) 8 | } 9 | \arguments{ 10 | \item{mtx}{The normalized reference count matrix} 11 | 12 | \item{n.sample}{The number of reference cells to be included within each pseudo-bulk} 13 | } 14 | \description{ 15 | This function returns index of cells that are the most connected within a cell type 16 | } 17 | -------------------------------------------------------------------------------- /man/get.most.connected.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Helper.R 3 | \name{get.most.connected} 4 | \alias{get.most.connected} 5 | \title{Get the Most Connected Cells} 6 | \usage{ 7 | get.most.connected(mtx, n.sample) 8 | } 9 | \arguments{ 10 | \item{mtx}{The normalized reference count matrix} 11 | 12 | \item{n.sample}{The number of reference cells to be included within each pseudo-bulk} 13 | } 14 | \description{ 15 | This function returns index of cells that are the most connected within a cell type 16 | } 17 | -------------------------------------------------------------------------------- /man/multi.id.curate.qp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/MultiIDFilterQP.R 3 | \name{multi.id.curate.qp} 4 | \alias{multi.id.curate.qp} 5 | \title{Multi-ID Score-Based Filter} 6 | \usage{ 7 | multi.id.curate.qp( 8 | binary.counts, 9 | classification, 10 | qp.matrix, 11 | qp.threshold = 10^-3 12 | ) 13 | } 14 | \arguments{ 15 | \item{binary.counts}{The binary count matrix, which is the output from binarization with Mann Whitney.} 16 | 17 | \item{classification}{The classification result, which is the output from binary to classification.} 18 | 19 | \item{qp.matrix}{The matrix that contains QP scores calculated for the sample cells} 20 | 21 | \item{qp.threshold}{The threshold to cut off for the QP scores in the multiple identity listed cells} 22 | } 23 | \value{ 24 | A list contain 2 elements, the first is the curated and filtered multiple identity data frame and the second is the new classification data frame. 25 | } 26 | \description{ 27 | This function filters the multiple identity cells based on their QP scores, where we assume that a low QP score (less than 10E-3) are not a true identity to consider 28 | } 29 | \examples{ 30 | multi.id.curate.qp(multi.id.meta) 31 | } 32 | \keyword{Multiple} 33 | \keyword{identities} 34 | -------------------------------------------------------------------------------- /man/normalize.dt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SingleCellQuadraticProgrammingAux.R 3 | \name{normalize.dt} 4 | \alias{normalize.dt} 5 | \title{Normalization of Single-Cell RNA-Seq Data} 6 | \usage{ 7 | normalize.dt(dt.st) 8 | } 9 | \arguments{ 10 | \item{dt.st}{The dataset to normalize} 11 | } 12 | \description{ 13 | This function normalizes single-cell RNA-seq data using its raw counts. The normalization is performed to remove variation due to difference in read depth and coverage. 14 | } 15 | \examples{ 16 | normalize.dt(single.cell.mtx) 17 | } 18 | \keyword{normalization} 19 | -------------------------------------------------------------------------------- /man/perc.calc.aux.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/EmpiricalPValueCalculation.R 3 | \name{perc.calc.aux} 4 | \alias{perc.calc.aux} 5 | \title{Emprical p-values for a query cell} 6 | \usage{ 7 | perc.calc.aux( 8 | l, 9 | cell.names, 10 | mtx.ct.num, 11 | background.mtx, 12 | mtx, 13 | cnms, 14 | test.times = 50 15 | ) 16 | } 17 | \arguments{ 18 | \item{l}{The index of the sample cell of query} 19 | 20 | \item{cell.names}{A vector of sample cells' names} 21 | 22 | \item{mtx.ct.num}{The number of all possible cell types} 23 | 24 | \item{background.mtx}{The matrix of identity score of reference cells. The row number is the total number of reference cells and the column number is the number of total number of possible cell types} 25 | 26 | \item{mtx}{The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types} 27 | 28 | \item{cnms}{The names of all possible cell types} 29 | 30 | \item{test.times}{The number of times resampling the background. The default is test.times=50} 31 | } 32 | \description{ 33 | This function returns emprical p-values of all possilbe celltypes for a query cells under test.times resampled backgrounds. 34 | } 35 | \note{ 36 | 37 | } 38 | \examples{ 39 | 40 | } 41 | \keyword{empirical} 42 | \keyword{p-value,} 43 | \keyword{query} 44 | \keyword{resampling,} 45 | \keyword{single} 46 | -------------------------------------------------------------------------------- /man/percentage.calc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/EmpiricalPValueCalculation.R 3 | \name{percentage.calc} 4 | \alias{percentage.calc} 5 | \title{Emprical p-values for query cells} 6 | \usage{ 7 | percentage.calc(mtx, bkgd.mtx) 8 | } 9 | \arguments{ 10 | \item{mtx}{The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types} 11 | 12 | \item{bkgd.mtx}{The matrix of identity score of reference cells. The row number is the total number of reference cells and the column number is the number of total number of possible cell types} 13 | } 14 | \description{ 15 | This function returns a list of emprical p-value matrices. Every matrix in the list contains emprical p-values of all possilbe celltypes for a single query cells under test.times resampled backgrounds. 16 | } 17 | \note{ 18 | 19 | } 20 | \examples{ 21 | 22 | } 23 | \keyword{all} 24 | \keyword{cells} 25 | \keyword{empirical} 26 | \keyword{p-value,} 27 | \keyword{query} 28 | \keyword{resampling,} 29 | -------------------------------------------------------------------------------- /man/ref.construction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/HighResolutionReferenceConstruction.R 3 | \name{ref.construction} 4 | \alias{ref.construction} 5 | \title{Reference Construction} 6 | \usage{ 7 | ref.construction(sc, sc.aux, criteria) 8 | } 9 | \arguments{ 10 | \item{sc}{The single-cell resolution dataset} 11 | 12 | \item{sc.aux}{The auxiliary data frame that annotate the single-cell resolutiond dataset} 13 | 14 | \item{criteria}{The column name to use for construction of the reference} 15 | } 16 | \description{ 17 | This function constructs reference from single-cell resolution reference data to be used for quadratic programming calculation 18 | } 19 | \examples{ 20 | ref.construction(single.ref.mtx, single.aux.df, "cell.type") 21 | } 22 | -------------------------------------------------------------------------------- /man/sample.func.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/EmpiricalPValueCalculation.R 3 | \name{sample.func} 4 | \alias{sample.func} 5 | \title{Empirical p-value of Identity Score Calculation} 6 | \usage{ 7 | sample.func(dens.x, curr.val, prob, n = 1000) 8 | } 9 | \arguments{ 10 | \item{dens.x}{The background dataset that we compare the identity score of our sample with} 11 | 12 | \item{curr.val}{The identity score of the sample dataset to be evaluated its significance} 13 | 14 | \item{n}{The number of times of resampling. The default is n=1000} 15 | } 16 | \description{ 17 | This function resamples from a sample dataset and returns an empirical p-value of identity score 18 | } 19 | \note{ 20 | 21 | } 22 | \examples{ 23 | 24 | } 25 | \keyword{empirical} 26 | \keyword{p-value} 27 | \keyword{resampling,} 28 | -------------------------------------------------------------------------------- /man/sc.quad.prog.run.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SingleCellQuadraticProgramming.R 3 | \name{sc.quad.prog.run} 4 | \alias{sc.quad.prog.run} 5 | \title{Single-Cell Quadratic Programming Calculation} 6 | \usage{ 7 | sc.quad.prog.run( 8 | bulk.transcriptome, 9 | single.cell.transcriptome, 10 | force.eq = 0, 11 | unix.parallel = FALSE, 12 | windows.parallel = FALSE, 13 | parallel.cores = 4 14 | ) 15 | } 16 | \arguments{ 17 | \item{bulk.transcriptome}{The reference transcriptome taht contains the transcriptome of each potential cell type} 18 | 19 | \item{single.cell.transcriptome}{the transcriptome profile of cells from single-cell RNA-sequencing} 20 | 21 | \item{force.eq}{either 0 or 1. Setting to 0 assumes the 1st constraint as inequality. Setting to 1 assumes equality. Default to 0} 22 | 23 | \item{unix.parallel}{boolean value, either TRUE or FALSE. If using unix/linux based systems, this command can be set to TRUE to parallelize use parallel package. Default to FALSE} 24 | 25 | \item{windows.parallel}{boolean value, either TRUE or FALSE. If using Windows based systems, this command can be set to TRUE to parallelize use snow package. Default to FALSE} 26 | 27 | \item{parallel.cores}{the number of cores to use for parallel processes. If no parallelization selected, no parallelization will be implemented. Only 1 core will be used} 28 | } 29 | \description{ 30 | This function runs quadratic programming to identify the probability of the cells in single-cell RNA-seq belonging to cell types in the reference transcriptome 31 | } 32 | \note{ 33 | Code reference from Treutlein et. al., Dissecting direct reprogramming from fibroblast to neuron using single-cell RNA-seq 34 | } 35 | \examples{ 36 | sc.quad.prog.run(ref.transcriptome, sc.transcriptome, force.eq = 1) 37 | } 38 | \keyword{programming,} 39 | \keyword{quadratic} 40 | \keyword{scRNA-seq} 41 | -------------------------------------------------------------------------------- /man/single.round.QP.analysis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SingleCellQuadraticProgramming.R 3 | \name{single.round.QP.analysis} 4 | \alias{single.round.QP.analysis} 5 | \title{Single Round of Quadratic Programming} 6 | \usage{ 7 | single.round.QP.analysis( 8 | ref, 9 | sc.data, 10 | scale.bulk.sc = "scale", 11 | unix.par = FALSE, 12 | windows.par = FALSE, 13 | force.eq = 0, 14 | n.cores = 1, 15 | save.to.path, 16 | save.to.filename, 17 | bulk.norm = T, 18 | norm.sc = T, 19 | log.bulk = T, 20 | log.sc = T 21 | ) 22 | } 23 | \arguments{ 24 | \item{ref}{The reference transcriptome taht contains the transcriptome of each potential cell type} 25 | 26 | \item{sc.data}{the transcriptome profile of cells from single-cell RNA-sequencing} 27 | 28 | \item{scale.bulk.sc}{either scale or non-scale. Scaling is recommended to make the reference comparable to the single-cell. Default to scale.} 29 | 30 | \item{unix.par}{boolean value, either TRUE or FALSE. If using unix/linux based systems, this command can be set to TRUE to parallelize use parallel package. Default to FALSE} 31 | 32 | \item{windows.par}{boolean value, either TRUE or FALSE. If using Windows based systems, this command can be set to TRUE to parallelize use snow package. Default to FALSE} 33 | 34 | \item{force.eq}{either 0 or 1. Setting to 0 assumes the 1st constraint as inequality. Setting to 1 assumes equality. Default to 0} 35 | 36 | \item{n.cores}{the number of cores to use for parallel processes. If no parallelization selected, no parallelization will be implemented. Only 1 core will be used} 37 | 38 | \item{save.to.path}{which directory would you like to save your file?} 39 | 40 | \item{save.to.filename}{prefix to the filename to save to. The final filename will be constructed inside of the function by tagging the following string _scale.csv for scaling and _non_scale.csv for not scaling.} 41 | 42 | \item{bulk.norm}{boolean value, either TRUE or FALSE. Would you like to normalize the reference?} 43 | 44 | \item{norm.sc}{boolean value, either TRUE or FALSE. Would you like to normalize the single-cell dataset?} 45 | 46 | \item{log.bulk}{boolean value, either TRUE or FALSE. Would you like to log the reference?} 47 | 48 | \item{log.sc}{boolean value, either TRUE or FALSE. Would you like to log the single-cell dataset?} 49 | } 50 | \description{ 51 | This function runs preprocessing, including log normalization, gene intersection and scaling. Further, run one round of quadratic programming. See function sc.quad.prog.run for detailed description 52 | } 53 | \note{ 54 | This function calls the quadratic programming referenced to Treutlein et. al. 55 | } 56 | \examples{ 57 | single.round.QP.analysis(ref = ref.transcriptome, sc.data = sc.transcriptome, force.eq = 1, save.to.path = "~/Desktop/", save.to.filename = "my_favorite") 58 | } 59 | \keyword{programming,} 60 | \keyword{quadratic} 61 | \keyword{scRNA-seq,} 62 | -------------------------------------------------------------------------------- /man/top.genes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SingleCellQuadraticProgrammingAux.R 3 | \name{top.genes} 4 | \alias{top.genes} 5 | \title{Top Variable Gene Identificaiton} 6 | \usage{ 7 | top.genes(input.dir, output.dir, top.number.count = 500) 8 | } 9 | \arguments{ 10 | \item{input.dir}{The path to where the dataset is saved. Dataset should be saved as a tab-delimited table with row = gene, column = cell.} 11 | 12 | \item{output.dir}{The path to where the output should be saved. The output will be saved as a tab-delimited table with NO row names or column names.} 13 | 14 | \item{top.number.count}{The number of top variable genes to extract. Default to 500 genes} 15 | } 16 | \description{ 17 | This function identifies top n variable genes using method described in KeyGenes algorithm. In brief, this algorithm use cross validation based on LASSO regression. For detailed explanation, please refer to the paper listed in the note. 18 | } 19 | \note{ 20 | Code reference from Roost et. al., KeyGenes, a Tool to Probe Tissue Differentiation Using a Human Fetal Transcriptional Atlas, Cell Stem Cell Reports, 2015 21 | } 22 | \examples{ 23 | top.genes("~/Desktop/sample_data.txt", "~/Desktop/sample_gene_list_output.txt", top.number.count = 1000) 24 | } 25 | \keyword{extraction} 26 | \keyword{gene} 27 | \keyword{top} 28 | \keyword{variable} 29 | -------------------------------------------------------------------------------- /man/transition.score.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/TransitionScoreCalculation.R 3 | \name{transition.score} 4 | \alias{transition.score} 5 | \title{Transition Score Calculation} 6 | \usage{ 7 | transition.score(multi.id.meta) 8 | } 9 | \arguments{ 10 | \item{multi.id.meta}{A data frame that contains cells with multiple identities. Column 1 - cell barcode, Column 2 - cell type call, Column 3 - counts, Column 4 - corresponding QP scores.} 11 | } 12 | \value{ 13 | A data frame that contains the calculated transition scores for each identity 14 | } 15 | \description{ 16 | This function calculates the transition scores for each cell state that is connected by cells with multiple identities. 17 | } 18 | \examples{ 19 | transition.score(multi.id.meta) 20 | } 21 | \keyword{Transition} 22 | \keyword{metric} 23 | --------------------------------------------------------------------------------