├── .DS_Store
├── .Rbuildignore
├── .Rhistory
├── .gitignore
├── Capybara.Rproj
├── DESCRIPTION
├── NAMESPACE
├── R
├── .DS_Store
├── Binarization.R
├── Classification.R
├── EmpiricalPValueCalculation.R
├── Helper.R
├── HighResolutionReferenceConstruction.R
├── MultiIDFilterQP.R
├── SingleCellQuadraticProgramming.R
├── SingleCellQuadraticProgrammingAux.R
└── TransitionScoreCalculation.R
├── README.md
├── examples
├── .DS_Store
├── Monocle_hat_colin.png
├── Step 1 - ARCHS4 Mining.R
├── Step 2 - bulk cleaning.R
├── bulk class mca pancreatic.png
├── bulk_composition_pancreatic.pdf
├── cardiac_TS_plots.png
├── cardiac_bulk_v2.png
├── features.tsv
├── gene_info.tsv
├── pancreatic dot plot.pdf
└── pancreatic dot plot.png
├── inst
├── .DS_Store
└── extdata
│ ├── .DS_Store
│ ├── Bulk Reference RPKM.Rds
│ ├── Bulk Reference Raw.Rds
│ ├── MCA Adult Background.Rds
│ ├── MCA Embryonic Background.Rds
│ ├── MCA_CellAssignments.csv
│ ├── baron_dataset.zip
│ └── features.tsv
└── man
├── binarization.mann.whitney.Rd
├── binary.to.classification.Rd
├── calc.scale.ratio.Rd
├── construct.high.res.reference.Rd
├── gene.intersect.sub.Rd
├── get.least.connected.Rd
├── get.mid.connected.Rd
├── get.most.connected.Rd
├── multi.id.curate.qp.Rd
├── normalize.dt.Rd
├── perc.calc.aux.Rd
├── percentage.calc.Rd
├── ref.construction.Rd
├── sample.func.Rd
├── sc.quad.prog.run.Rd
├── single.round.QP.analysis.Rd
├── top.genes.Rd
└── transition.score.Rd
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/.DS_Store
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^Capybara\.Rproj$
2 | ^\.Rproj\.user$
3 |
--------------------------------------------------------------------------------
/.Rhistory:
--------------------------------------------------------------------------------
1 | baron.expr <- data.frame()
2 | baron.meta <- data.frame()
3 | count <- 1
4 | for (f in baron.fns) {
5 | curr.df <- read.csv(f, row.names = 1, stringsAsFactors = F, check.names = F)
6 | curr.bc <- paste0(rownames(curr.df), "_", curr.df$barcode, "_Sample_", count)
7 | rownames(curr.df) <- curr.bc
8 | if (nrow(baron.meta) <= 0) {
9 | baron.meta <- data.frame(row.names = curr.bc, cell.type = curr.df[curr.bc, "assigned_cluster"], barcode = curr.df[curr.bc, "barcode"],
10 | stringsAsFactors = F, check.names = F)
11 | } else {
12 | curr.meta <- data.frame(row.names = curr.bc, cell.type = curr.df[curr.bc, "assigned_cluster"], barcode = curr.df[curr.bc, "barcode"],
13 | stringsAsFactors = F, check.names = F)
14 | baron.meta <- rbind(baron.meta, curr.meta)
15 | }
16 | curr.expr <- curr.df[, c(3:ncol(curr.df))]
17 | if (nrow(baron.expr) <= 0) {
18 | baron.expr <- curr.expr
19 | } else {
20 | baron.expr <- rbind(baron.expr, curr.expr)
21 | }
22 | count <- count +1
23 | }
24 | count.mtx <- t(baron.expr)
25 | write.csv(count.mtx, "~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/baron_et_al_pancreatic.csv", quote = F)
26 | write.csv(baron.meta, "~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/baron_et_al_pancreatic_meta.csv", quote = F)
27 | single.round.QP.analysis(ref.df, count.mtx, n.cores = 4, save.to.path = "~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/", save.to.filename = "02_MCA_Based_scClassifier_reference_mix90_test_normalize_select", unix.par = TRUE)
28 | background.mtx <- read.csv("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/01_MCA_Based_scClassifier_reference_mix90_normalize_select_scale.csv", header = T, row.names = 1, stringsAsFactors = F)
29 | mtx.test <- read.csv("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/02_MCA_Based_scClassifier_reference_mix90_test_normalize_select_scale.csv", header = T, row.names = 1, stringsAsFactors = F)
30 | # reference Permutation
31 | col.sub <- ncol(background.mtx) - 2
32 | ref.perc.list <- percentage.calc(background.mtx[,c(1:col.sub)], background.mtx[,c(1:col.sub)])
33 | # Test Permutation
34 | perc.list <- percentage.calc(as.matrix(mtx.test[,c(1:col.sub)]), as.matrix(background.mtx[,c(1:col.sub)]))
35 | bin.count <- binarization.mann.whitney(mtx = mtx.test[,c(1:col.sub)], ref.perc.ls = ref.perc.list, ref.meta = ref.list[[2]], perc.ls = perc.list)
36 | classification <- binary.to.classification(bin.count[,c(1:col.sub)])
37 | rownames(classification) <- classification$barcode
38 | classification$actual <- baron.meta[rownames(classification), "cell.type"]
39 | table.freq <- table(classification$actual, classification$call)
40 | table.freq.perc <- apply(table.freq, 1, function(x) round(x * 100/sum(x), digits = 3))
41 | rownames(table.freq.perc)[14] <- "beta"
42 | table.freq.sub <- as.data.frame(table.freq.perc[c(1,2,4,5,6,7,8,11,12,14), c(1,2,5,8,3,4,6,7,9,10,13)])
43 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub)
44 | table.freq.melt <- melt(table.freq.sub)
45 | table.freq.melt$Capybara.Call <- factor(table.freq.melt$Capybara.Call,
46 | levels = c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
47 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"),
48 | ordered = T)
49 | table.freq.melt$variable <- factor(table.freq.melt$variable,
50 | levels = c("B_cell", "beta", "ductal", "endothelial",
51 | "macrophage", "T_cell", "immune_other", "activated_stellate", "alpha", "delta", "gamma"),
52 | ordered = T)
53 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
54 | geom_point(aes(colour = variable)) +
55 | scale_size_area(name = "value", max_size=12) +
56 | scale_color_viridis_c(option = "A", begin = 0.15, end = 0.85) +
57 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
58 | theme(legend.position = "right",
59 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
60 | axis.text.y = element_text(size = 12, face = "bold.italic"),
61 | axis.ticks = element_blank(),
62 | axis.title = element_blank(),
63 | title = element_text(face = "bold.italic", size = 14),
64 | panel.grid.major = element_blank(),
65 | panel.grid.minor = element_blank(),
66 | panel.background = element_blank(),
67 | axis.line = element_blank())
68 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
69 | geom_point(aes(colour = variable)) +
70 | scale_size_area(name = "value", max_size=12) +
71 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
72 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
73 | theme(legend.position = "right",
74 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
75 | axis.text.y = element_text(size = 12, face = "bold.italic"),
76 | axis.ticks = element_blank(),
77 | axis.title = element_blank(),
78 | title = element_text(face = "bold.italic", size = 14),
79 | panel.grid.major = element_blank(),
80 | panel.grid.minor = element_blank(),
81 | panel.background = element_blank(),
82 | axis.line = element_blank())
83 | View(table.freq.perc)
84 | rownames(table.freq.perc)[14] <- "beta"
85 | rownames(table.freq.perc)[16] <- "beta"
86 | View(table.freq.perc)
87 | table.freq.sub <- as.data.frame(table.freq.perc["B.cell", "beta", "Ductal.cell", "Endothelial.cell",
88 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell", c(1,2,5,8,3,4,6,7,9,10,13)])
89 | table.freq.sub <- as.data.frame(table.freq.perc["B.cell", "beta", "Ductal.cell", "Endothelial.cell",
90 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)])
91 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub)
92 | table.freq.sub <- as.data.frame(table.freq.perc[c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
93 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)])
94 | View(table.freq.perc)
95 | table.freq <- table(classification$actual, classification$call)
96 | table.freq.perc <- apply(table.freq, 1, function(x) round(x * 100/sum(x), digits = 3))
97 | rownames(table.freq.perc)[16] <- "beta"
98 | View(table.freq.perc)
99 | table.freq.sub <- as.data.frame(table.freq.perc[c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
100 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)])
101 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub)
102 | table.freq.melt <- melt(table.freq.sub)
103 | table.freq.melt$Capybara.Call <- factor(table.freq.melt$Capybara.Call,
104 | levels = c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
105 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"),
106 | ordered = T)
107 | table.freq.melt$variable <- factor(table.freq.melt$variable,
108 | levels = c("B_cell", "beta", "ductal", "endothelial",
109 | "macrophage", "T_cell", "immune_other", "activated_stellate", "alpha", "delta", "gamma"),
110 | ordered = T)
111 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
112 | geom_point(aes(colour = value)) +
113 | scale_size_area(name = "value", max_size=12) +
114 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
115 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
116 | theme(legend.position = "right",
117 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
118 | axis.text.y = element_text(size = 12, face = "bold.italic"),
119 | axis.ticks = element_blank(),
120 | axis.title = element_blank(),
121 | title = element_text(face = "bold.italic", size = 14),
122 | panel.grid.major = element_blank(),
123 | panel.grid.minor = element_blank(),
124 | panel.background = element_blank(),
125 | axis.line = element_blank())
126 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
127 | geom_point(aes(colour = value)) +
128 | scale_size_area(name = "value", max_size=12) +
129 | scale_color_viridis_c(option = "A", begin = 0.15, end = 0.85) +
130 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
131 | theme(legend.position = "right",
132 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
133 | axis.text.y = element_text(size = 12, face = "bold.italic"),
134 | axis.ticks = element_blank(),
135 | axis.title = element_blank(),
136 | title = element_text(face = "bold.italic", size = 14),
137 | panel.grid.major = element_blank(),
138 | panel.grid.minor = element_blank(),
139 | panel.background = element_blank(),
140 | axis.line = element_blank())
141 | table.freq.sub <- as.data.frame(table.freq.perc[c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
142 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)])
143 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub)
144 | table.freq.melt <- melt(table.freq.sub)
145 | table.freq.melt$Capybara.Call <- factor(table.freq.melt$Capybara.Call,
146 | levels = c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
147 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"),
148 | ordered = T)
149 | table.freq.melt$variable <- factor(table.freq.melt$variable,
150 | levels = c("B_cell", "beta", "ductal", "endothelial",
151 | "macrophage", "T_cell", "immune_other", "activated_stellate", "alpha", "delta", "gamma"),
152 | ordered = T)
153 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
154 | geom_point(aes(colour = variable)) +
155 | scale_size_area(name = "value", max_size=12) +
156 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
157 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
158 | theme(legend.position = "right",
159 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
160 | axis.text.y = element_text(size = 12, face = "bold.italic"),
161 | axis.ticks = element_blank(),
162 | axis.title = element_blank(),
163 | title = element_text(face = "bold.italic", size = 14),
164 | panel.grid.major = element_blank(),
165 | panel.grid.minor = element_blank(),
166 | panel.background = element_blank(),
167 | axis.line = element_blank())
168 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
169 | geom_point(aes(colour = variable)) +
170 | scale_size_area(name = "value", max_size=12) +
171 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
172 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
173 | theme(legend.position = "right",
174 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
175 | axis.text.y = element_text(size = 12, face = "bold.italic"),
176 | axis.ticks = element_blank(),
177 | axis.title = element_blank(),
178 | title = element_text(face = "bold.italic", size = 14),
179 | panel.grid.major = element_blank(),
180 | panel.grid.minor = element_blank(),
181 | panel.background = element_blank(),
182 | axis.line = element_line(colour = "black", size = 1))
183 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
184 | geom_point(aes(colour = variable)) +
185 | scale_size_area(name = "value", max_size=12) +
186 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
187 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
188 | guides(fill = FALSE, color = FALSE) +
189 | theme(legend.position = "right",
190 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
191 | axis.text.y = element_text(size = 12, face = "bold.italic"),
192 | axis.ticks = element_blank(),
193 | axis.title = element_blank(),
194 | title = element_text(face = "bold.italic", size = 14),
195 | panel.grid.major = element_blank(),
196 | panel.grid.minor = element_blank(),
197 | panel.background = element_blank(),
198 | axis.line = element_line(colour = "black", size = 1))
199 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
200 | geom_point(aes(colour = variable)) +
201 | scale_size_area(name = "value", max_size=12) +
202 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
203 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
204 | guides(fill = FALSE, color = FALSE) +
205 | theme(legend.position = "bottom",
206 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
207 | axis.text.y = element_text(size = 12, face = "bold.italic"),
208 | axis.ticks = element_blank(),
209 | axis.title = element_blank(),
210 | title = element_text(face = "bold.italic", size = 14),
211 | panel.grid.major = element_blank(),
212 | panel.grid.minor = element_blank(),
213 | panel.background = element_blank(),
214 | axis.line = element_line(colour = "black", size = 1))
215 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
216 | geom_point(aes(colour = variable)) +
217 | scale_size_area(name = "Percentage", max_size=12) +
218 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
219 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
220 | guides(fill = FALSE, color = FALSE) +
221 | theme(legend.position = "bottom",
222 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
223 | axis.text.y = element_text(size = 12, face = "bold.italic"),
224 | axis.ticks = element_blank(),
225 | axis.title = element_blank(),
226 | title = element_text(face = "bold.italic", size = 14),
227 | panel.grid.major = element_blank(),
228 | panel.grid.minor = element_blank(),
229 | panel.background = element_blank(),
230 | axis.line = element_line(colour = "black", size = 1))
231 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/dot plot.pdf", width = 6, height = 8, paper = "special")
232 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
233 | geom_point(aes(colour = variable)) +
234 | scale_size_area(name = "Percentage", max_size=12) +
235 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
236 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
237 | guides(fill = FALSE, color = FALSE) +
238 | theme(legend.position = "bottom",
239 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
240 | axis.text.y = element_text(size = 12, face = "bold.italic"),
241 | axis.ticks = element_blank(),
242 | axis.title = element_blank(),
243 | title = element_text(face = "bold.italic", size = 14),
244 | panel.grid.major = element_blank(),
245 | panel.grid.minor = element_blank(),
246 | panel.background = element_blank(),
247 | axis.line = element_line(colour = "black", size = 1))
248 | dev.off()
249 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/dot plot.pdf", width = 7, height = 9, paper = "special")
250 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
251 | geom_point(aes(colour = variable)) +
252 | scale_size_area(name = "Percentage", max_size=12) +
253 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
254 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
255 | guides(fill = FALSE, color = FALSE) +
256 | theme(legend.position = "bottom",
257 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
258 | axis.text.y = element_text(size = 12, face = "bold.italic"),
259 | axis.ticks = element_blank(),
260 | axis.title = element_blank(),
261 | title = element_text(face = "bold.italic", size = 14),
262 | panel.grid.major = element_blank(),
263 | panel.grid.minor = element_blank(),
264 | panel.background = element_blank(),
265 | axis.line = element_line(colour = "black", size = 1))
266 | dev.off()
267 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/dot plot.pdf", width = 8, height = 9, paper = "special")
268 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
269 | geom_point(aes(colour = variable)) +
270 | scale_size_area(name = "Percentage", max_size=12) +
271 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
272 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
273 | guides(fill = FALSE, color = FALSE) +
274 | theme(legend.position = "bottom",
275 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
276 | axis.text.y = element_text(size = 12, face = "bold.italic"),
277 | axis.ticks = element_blank(),
278 | axis.title = element_blank(),
279 | title = element_text(face = "bold.italic", size = 14),
280 | panel.grid.major = element_blank(),
281 | panel.grid.minor = element_blank(),
282 | panel.background = element_blank(),
283 | axis.line = element_line(colour = "black", size = 1))
284 | dev.off()
285 | baron.expr <- read.csv("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/baron_et_al_pancreatic.csv", header = T, row.names = 1, stringsAsFactors = F)
286 | View(baron.meta)
287 | View(mca)
288 | View(mca.meta)
289 | # Background cells
290 | mca <- read.csv("~/Box/Morris Lab/Classifier Analysis/Reference datasets/MCA/MCA_CellAssignments.csv",
291 | row.names = 1, header = T, stringsAsFactors = F)
292 | View(mca)
293 | # Fetal bulk testing
294 | MCA.fetal.tissue <- c("Embryonic-Mesenchyme", "Embryonic-Stem-Cell", "Fetal_Brain",
295 | "Fetal_Intestine", "Fetal-Liver", "Fetal_Lung", "Fetal_Stomache",
296 | "Neonatal_Brain", "Neonatal-Calvaria", "Neonatal-Heart", "Neonatal-Muscle",
297 | "Neonatal-Rib", "Neonatal-Skin", "Trophoblast-Stem-Cell", "NeonatalPancreas")
298 | library(devtools)
299 | library(roxygen2)
300 | setwd("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Capybara/")
301 | document()
302 | install(".")
303 | library(Capybara)
304 | syste
305 | system.file(system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara"))
306 | system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara")
307 | read.csv(system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara"))
308 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
309 | geom_bar(stat = "identity") +
310 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
311 | scale_fill_manual(
312 | name = "Mapped MCA Tissue",
313 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
314 | RColorBrewer::brewer.pal(8, "Set2"),
315 | RColorBrewer::brewer.pal(8, "Set3"))
316 | ) +
317 | labs(y = "Percentage of Cells") +
318 | ggtitle("Bulk Classification to MCA Tissues") +
319 | theme(legend.position = "none",
320 | axis.ticks = element_blank(),
321 | axis.title.x = element_blank(),
322 | axis.text = element_text(face = "bold.italic", size = 12),
323 | title = element_text(face = "bold.italic", size = 14),
324 | panel.grid.major = element_blank(),
325 | panel.grid.minor = element_blank(),
326 | panel.background = element_blank(),
327 | axis.line.x = element_blank(),
328 | axis.line.y = element_line(color = "black", size =1))
329 | comp.raw <- count.in.cat
330 | comp.raw <- comp.raw[order(-comp.raw$perc), ]
331 | comp.raw$Var1 <- rownames(comp.raw)
332 | comp.raw$Var1 <- factor(comp.raw$Var1, comp.raw$Var1, ordered = T)
333 | comp.raw$label <- "Mouse Pancreatic Cells"
334 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
335 | geom_bar(stat = "identity") +
336 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
337 | scale_fill_manual(
338 | name = "Mapped MCA Tissue",
339 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
340 | RColorBrewer::brewer.pal(8, "Set2"),
341 | RColorBrewer::brewer.pal(8, "Set3"))
342 | ) +
343 | labs(y = "Percentage of Cells") +
344 | ggtitle("Bulk Classification to MCA Tissues") +
345 | theme(legend.position = "none",
346 | axis.ticks = element_blank(),
347 | axis.title.x = element_blank(),
348 | axis.text = element_text(face = "bold.italic", size = 12),
349 | title = element_text(face = "bold.italic", size = 14),
350 | panel.grid.major = element_blank(),
351 | panel.grid.minor = element_blank(),
352 | panel.background = element_blank(),
353 | axis.line.x = element_blank(),
354 | axis.line.y = element_line(color = "black", size =1))
355 | background.mca <- readRDS("~/Box/Morris Lab/Classifier Analysis/ARCHS4 Reference/MCA ARCHS4/06_qp_top_90_each_category_backgroung.Rds")
356 | background.mtx <- background.mca
357 | mtx.test <- qp.paga.rslt.sub[,colnames(background.mtx)]
358 | ### cor test
359 | ref.test <- t(background.mtx)
360 | mtx.test.cor <- t(qp.paga.rslt.sub)
361 | corr.mtx <- WGCNA::cor(ref.test, mtx.test.cor)
362 | correlation.cutoff <- quantile(corr.mtx, 0.90)
363 | new.corr.bin <- corr.mtx
364 | new.corr.bin[which(new.corr.bin >= correlation.cutoff)] <- 1
365 | new.corr.bin[which(new.corr.bin < correlation.cutoff)] <- 0
366 | new.corr.bin <- as.data.frame(new.corr.bin)
367 | new.corr.bin$cell.bc.ref <- rownames(new.corr.bin)
368 | new.corr.bin.melt <- reshape2::melt(new.corr.bin)
369 | new.corr.bin.melt.sub <- new.corr.bin.melt[which(new.corr.bin.melt$value > 0),]
370 | new.corr.bin.melt.sub$cell.type <- unlist(lapply(strsplit(new.corr.bin.melt.sub$cell.bc.ref, "_"), function(x) x[1]))
371 | count.in.cat <- c()
372 | unique.cat <- unique(unlist(lapply(strsplit(rownames(new.corr.bin), "_"), function(x) x[1])))
373 | for (uc in unique.cat) {
374 | curr.subset <- new.corr.bin[which(startsWith(rownames(new.corr.bin), uc)), c(1:1886)]
375 | count.in.cat[uc] <- sum(colSums(curr.subset) >= nrow(curr.subset) * 0.7)
376 | }
377 | count.in.cat <- as.data.frame(count.in.cat)
378 | count.in.cat$perc <- round(count.in.cat$count.in.cat *100/sum(count.in.cat$count.in.cat), digits = 3)
379 | final.cell.types.adult <- rownames(count.in.cat)[which(count.in.cat$count.in.cat > 100)]
380 | comp.raw <- count.in.cat
381 | comp.raw <- comp.raw[order(-comp.raw$perc), ]
382 | comp.raw$Var1 <- rownames(comp.raw)
383 | comp.raw$Var1 <- factor(comp.raw$Var1, comp.raw$Var1, ordered = T)
384 | comp.raw$label <- "Mouse Pancreatic Cells"
385 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
386 | geom_bar(stat = "identity") +
387 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
388 | scale_fill_manual(
389 | name = "Mapped MCA Tissue",
390 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
391 | RColorBrewer::brewer.pal(8, "Set2"),
392 | RColorBrewer::brewer.pal(8, "Set3"))
393 | ) +
394 | labs(y = "Percentage of Cells") +
395 | ggtitle("Bulk Classification to MCA Tissues") +
396 | theme(legend.position = "none",
397 | axis.ticks = element_blank(),
398 | axis.title.x = element_blank(),
399 | axis.text = element_text(face = "bold.italic", size = 12),
400 | title = element_text(face = "bold.italic", size = 14),
401 | panel.grid.major = element_blank(),
402 | panel.grid.minor = element_blank(),
403 | panel.background = element_blank(),
404 | axis.line.x = element_blank(),
405 | axis.line.y = element_line(color = "black", size =1))
406 | View(comp.raw)
407 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/bulk_composition_pancreatic.pdf", width = 3, height = 8, paper = "special")
408 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
409 | geom_bar(stat = "identity") +
410 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
411 | scale_fill_manual(
412 | name = "Mapped MCA Tissue",
413 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
414 | RColorBrewer::brewer.pal(8, "Set2"),
415 | RColorBrewer::brewer.pal(8, "Set3"))
416 | ) +
417 | labs(y = "Percentage of Cells") +
418 | ggtitle("Bulk Classification to \n MCA Tissues") +
419 | theme(legend.position = "none",
420 | axis.ticks = element_blank(),
421 | axis.title.x = element_blank(),
422 | axis.text = element_text(face = "bold.italic", size = 12),
423 | title = element_text(face = "bold.italic", size = 14),
424 | panel.grid.major = element_blank(),
425 | panel.grid.minor = element_blank(),
426 | panel.background = element_blank(),
427 | axis.line.x = element_blank(),
428 | axis.line.y = element_line(color = "black", size =1))
429 | dev.off()
430 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/bulk_composition_pancreatic.pdf", width = 4, height = 8, paper = "special")
431 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
432 | geom_bar(stat = "identity") +
433 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
434 | scale_fill_manual(
435 | name = "Mapped MCA Tissue",
436 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
437 | RColorBrewer::brewer.pal(8, "Set2"),
438 | RColorBrewer::brewer.pal(8, "Set3"))
439 | ) +
440 | labs(y = "Percentage of Cells") +
441 | ggtitle("Bulk Classification to \n MCA Tissues") +
442 | theme(legend.position = "none",
443 | axis.ticks = element_blank(),
444 | axis.title.x = element_blank(),
445 | axis.text = element_text(face = "bold.italic", size = 12),
446 | title = element_text(face = "bold.italic", size = 14),
447 | panel.grid.major = element_blank(),
448 | panel.grid.minor = element_blank(),
449 | panel.background = element_blank(),
450 | axis.line.x = element_blank(),
451 | axis.line.y = element_line(color = "black", size =1))
452 | dev.off()
453 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
454 | geom_bar(stat = "identity") +
455 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
456 | scale_fill_manual(
457 | name = "Mapped MCA Tissue",
458 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
459 | RColorBrewer::brewer.pal(8, "Set2"),
460 | RColorBrewer::brewer.pal(8, "Set3"))
461 | ) +
462 | labs(y = "Percentage of Cells") +
463 | ggtitle("Bulk Classification to \nMCA Tissues") +
464 | theme(legend.position = "none",
465 | axis.ticks = element_blank(),
466 | axis.title.x = element_blank(),
467 | axis.text = element_text(face = "bold.italic", size = 12),
468 | title = element_text(face = "bold.italic", size = 14),
469 | panel.grid.major = element_blank(),
470 | panel.grid.minor = element_blank(),
471 | panel.background = element_blank(),
472 | axis.line.x = element_blank(),
473 | axis.line.y = element_line(color = "black", size =1))
474 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/bulk_composition_pancreatic.pdf", width = 4, height = 8, paper = "special")
475 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
476 | geom_bar(stat = "identity") +
477 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
478 | scale_fill_manual(
479 | name = "Mapped MCA Tissue",
480 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
481 | RColorBrewer::brewer.pal(8, "Set2"),
482 | RColorBrewer::brewer.pal(8, "Set3"))
483 | ) +
484 | labs(y = "Percentage of Cells") +
485 | ggtitle("Bulk Classification to \nMCA Tissues") +
486 | theme(legend.position = "none",
487 | axis.ticks = element_blank(),
488 | axis.title.x = element_blank(),
489 | axis.text = element_text(face = "bold.italic", size = 12),
490 | title = element_text(face = "bold.italic", size = 14),
491 | panel.grid.major = element_blank(),
492 | panel.grid.minor = element_blank(),
493 | panel.background = element_blank(),
494 | axis.line.x = element_blank(),
495 | axis.line.y = element_line(color = "black", size =1))
496 | dev.off()
497 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA, value))) +
498 | geom_point(aes(colour = variable)) +
499 | scale_size_area(name = "Percentage", max_size=12) +
500 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
501 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
502 | guides(fill = FALSE, color = FALSE) +
503 | theme(legend.position = "bottom",
504 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
505 | axis.text.y = element_text(size = 12, face = "bold.italic"),
506 | axis.ticks = element_blank(),
507 | axis.title = element_blank(),
508 | title = element_text(face = "bold.italic", size = 14),
509 | panel.grid.major = element_blank(),
510 | panel.grid.minor = element_blank(),
511 | panel.background = element_blank(),
512 | axis.line = element_line(colour = "black", size = 1))
513 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 |
--------------------------------------------------------------------------------
/Capybara.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: No
4 | SaveWorkspace: No
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: Capybara
2 | Title: A tool to measure cell identity and fate transitions
3 | Version: 0.0.0.9000
4 | Authors@R:
5 | person(given = "Sam",
6 | family = "Morris",
7 | role = c("aut", "cre"),
8 | email = "s.morris@wustl.edu",
9 | comment = c(ORCID = "YOUR-ORCID-ID"))
10 | Description: What the package does (one paragraph).
11 | Depends: R (>= 3.5.0),
12 | pheatmap,
13 | viridis,
14 | tools,
15 | psych,
16 | MVA,
17 | data.table,
18 | pbmcapply,
19 | reshape,
20 | Matrix,
21 | tidyverse,
22 | snow,
23 | parallel,
24 | plyr,
25 | dplyr,
26 | ggplot2,
27 | quadprog,
28 | limma,
29 | WGCNA,
30 | glmnet
31 | License: What license it uses
32 | Encoding: UTF-8
33 | LazyData: true
34 | RoxygenNote: 7.1.2
35 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(binarization.mann.whitney)
4 | export(binary.to.classification)
5 | export(calc.scale.ratio)
6 | export(construct.high.res.reference)
7 | export(gene.intersect.sub)
8 | export(get.least.connected)
9 | export(get.mid.connected)
10 | export(get.most.connected)
11 | export(multi.id.curate.qp)
12 | export(normalize.dt)
13 | export(perc.calc.aux)
14 | export(percentage.calc)
15 | export(ref.construction)
16 | export(sample.func)
17 | export(sc.quad.prog.run)
18 | export(single.round.QP.analysis)
19 | export(top.genes)
20 | export(transition.score)
21 |
--------------------------------------------------------------------------------
/R/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/R/.DS_Store
--------------------------------------------------------------------------------
/R/Binarization.R:
--------------------------------------------------------------------------------
1 | #' Binarization and Identity Calling from Identity Score
2 | #'
3 | #' This function calls single identity or multiple identities of query cells from empirical p-values. Inferred cell types are marked by 1 in a binarized matrix.
4 | #' @param mtx The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types
5 | #' @param ref.perc.ls Emprical p-values for reference cells
6 | #' @param ref.meta The celltype meta information for reference cells
7 | #' @param perc.ls Emprical p-values for query cells
8 | #' @param bulk If the reference data type is bulk RNA-seq. The default is bulk = FALSE
9 | #' @param map.df bulk mapping. The default is bulk = FALSE
10 | #' @param init.class initial classification. The default is init.class = NULL
11 | #' @keywords binarization, identity calling
12 | #' @note
13 | #' @export
14 | #' @examples
15 | #'
16 | binarization.mann.whitney <- function(mtx, ref.perc.ls, ref.meta, perc.ls, bulk = FALSE, map.df = NULL, init.class = NULL) {
17 | p_vals <- matrix(nrow = nrow(mtx), ncol = (ncol(mtx) - 1))
18 | type_rank_in_order_mtx <- c()
19 | which_type <- c()
20 |
21 | all.indx <- seq(1,length(perc.ls))
22 |
23 | benchmark <- list()
24 | benchmark.final <- c()
25 |
26 | for (i in 1:length(ref.perc.ls)) {
27 | curr.cell <- names(ref.perc.ls[[i]])
28 | perc_mtx <- as.data.frame(ref.perc.ls[[i]][[1]])
29 | if (curr.cell %in% rownames(ref.meta)) {
30 | curr.ct <- ref.meta[curr.cell, "cell.type"]
31 | if (endsWith(curr.ct, " ")) curr.ct <- substr(curr.ct, start = 1, stop = (nchar(curr.ct) - 1))
32 | curr.cell.cell.type <- gsub(" ", ".", curr.ct)
33 | curr.cell.cell.type <- gsub("−", ".", curr.cell.cell.type)
34 | curr.cell.cell.type <- gsub("-", ".", curr.cell.cell.type)
35 | curr.cell.cell.type <- gsub("/", ".", curr.cell.cell.type)
36 | curr.cell.cell.type <- gsub("&", ".", curr.cell.cell.type)
37 | curr.cell.cell.type <- gsub("\\(", ".", curr.cell.cell.type)
38 | curr.cell.cell.type <- gsub("\\)", ".", curr.cell.cell.type)
39 | if (!bulk) {
40 | curr.bm <- paste0("frxn_cell.type_", curr.cell.cell.type)
41 | } else {
42 | if (is.null(map.df)) {
43 | stop("No bulk mapping found!")
44 | }
45 | curr.bm <- map.df[which(map.df$tissue == curr.ct), "corresponding"]
46 | }
47 | curr.celltype.perm <- perc_mtx[, curr.bm]
48 | if (length(benchmark) <= 0) {
49 | benchmark[[curr.bm]] <- curr.celltype.perm
50 | } else {
51 | benchmark[[curr.bm]] <- c(benchmark[[curr.bm]], curr.celltype.perm)
52 | }
53 | }
54 | }
55 |
56 | for (i in 1:length(benchmark)) {
57 | curr.name <- names(benchmark)[i]
58 | curr.benchmark <- benchmark[[i]]
59 | curr.benchmark <- curr.benchmark[which(curr.benchmark < 1)]
60 | curr.outliers <- boxplot.stats(curr.benchmark)$out
61 | higher.end.outliers <- curr.outliers[which(curr.outliers > getmode(curr.benchmark))]
62 | if (length(higher.end.outliers) > 0) benchmark[[i]] <- curr.benchmark[which(curr.benchmark <= min(higher.end.outliers))]
63 | benchmark.final[curr.name] <- max(benchmark[[i]][which(benchmark[[i]] < 1)])
64 | }
65 |
66 | if (!("Unknown" %in% names(table(init.class$init.class)))) {init.class = NULL}
67 |
68 | for (i in all.indx) {
69 | perc_mtx<-as.matrix((perc.ls[[i]][[1]]))
70 | vec<-as.vector(perc_mtx)
71 | rank<-rank(vec,ties.method = "average")
72 | type_rank<-as.matrix(t(colSums(matrix(rank, nrow=50))))
73 | colnames(type_rank)<-colnames(perc_mtx)
74 | type_rank_in_order<-colnames(type_rank)[order(type_rank,decreasing = F)]
75 | type_rank_in_order_mtx<-rbind(type_rank_in_order_mtx,type_rank_in_order)
76 |
77 | if (!is.null(init.class)) {
78 | if(init.class[names(perc.ls[[i]]), "init.class"] == "Unknown") {
79 | which_type <- c(which_type, 0)
80 | } else {
81 | if (init.class[names(perc.ls[[i]]), "init.class"] == "Unknown.Progenitor") {
82 | which_type <- c(which_type, -1)
83 | } else {
84 | for (j in 1:(length(colnames(type_rank))-1)) {
85 |
86 | a<-type_rank_in_order[1]
87 | b<-type_rank_in_order[j+1]
88 |
89 | test<-wilcox.test(x=perc_mtx[,a],y=perc_mtx[,b],alternative = "less")
90 | p_vals[i,j]<-test$p.value
91 | if(test$p.value<.05 & (init.class[names(perc.ls[[i]]), "init.class"] == "Single-ID")){
92 | which_type<-c(which_type,j)
93 | break
94 | } else {
95 | if (j == (ncol(type_rank_in_order_mtx) - 1) | (init.class[names(perc.ls[[i]]), "init.class"] == "Multi-ID")) {
96 | which_type <- c(which_type, j+1)
97 | }
98 | }
99 | }
100 | }
101 | }
102 | } else {
103 | if(mean(perc_mtx[,type_rank_in_order[1]]) >= (benchmark.final[type_rank_in_order[1]] + 0.01)) {
104 | which_type <- c(which_type, 0)
105 | } else {
106 | for (j in 1:(length(colnames(type_rank))-1)) {
107 |
108 | a<-type_rank_in_order[1]
109 | b<-type_rank_in_order[j+1]
110 |
111 | test<-wilcox.test(x=perc_mtx[,a],y=perc_mtx[,b],alternative = "less")
112 | p_vals[i,j]<-test$p.value
113 | if(test$p.value<.05){
114 | which_type<-c(which_type,j)
115 | break
116 | } else {
117 | if (j == (ncol(type_rank_in_order_mtx) - 1)) {
118 | which_type <- c(which_type, j+1)
119 | }
120 | }
121 | }
122 | }
123 | }
124 | }
125 |
126 | type_rank_in_order_mtx <- as.matrix(type_rank_in_order_mtx)
127 | binary.mtx<-matrix(0,nrow = nrow(mtx), ncol = ncol(mtx))
128 | colnames(binary.mtx)<-colnames(mtx)
129 | cellnames<-c()
130 | for (i in 1:length(which_type)) {
131 | cellnames <- c(cellnames, names(perc.ls[[i]]))
132 | if (which_type[i] > 0) {
133 | for (j in 1:which_type[i]) {
134 | binary.mtx[i,type_rank_in_order_mtx[i,j]] <- 1
135 | }
136 | }
137 | if (which_type[i] < 0) {
138 | for (j in 1:which_type[i]) {
139 | binary.mtx[i,type_rank_in_order_mtx[i,j]] <- -1
140 | }
141 | }
142 | }
143 | rownames(binary.mtx)<-cellnames
144 |
145 | return(binary.mtx)
146 | }
147 |
--------------------------------------------------------------------------------
/R/Classification.R:
--------------------------------------------------------------------------------
1 | #' Generate classification results from the binarized result matrix
2 | #'
3 | #' This function genrate classification table from the binarized result matrix
4 | #' @param bin.count.rslt The binarized classification result matrix
5 | #' @keywords calssification, result
6 | #' @note
7 | #' @export
8 | #' @examples
9 | #'
10 | binary.to.classification <- function(bin.count.rslt) {
11 | class.rslt <- data.frame()
12 | bin.count.rsums <- rowSums(bin.count.rslt)
13 | for (i in 1:nrow(bin.count.rslt)) {
14 | curr.cell.bc <- rownames(bin.count.rslt)[i]
15 | if (bin.count.rsums[i] > 1) {
16 | curr.df <- data.frame(barcode = curr.cell.bc, call = "Multi_ID", stringsAsFactors = F)
17 | } else {
18 | if (bin.count.rsums[i] == 0) {
19 | curr.df <- data.frame(barcode = curr.cell.bc, call = "Unknown", stringsAsFactors = F)
20 | }
21 | else {
22 | if (bin.count.rsums[i] < 0) {
23 | curr.df <- data.frame(barcode = curr.cell.bc, call = "Unknown.Progenitor", stringsAsFactors = F)
24 | } else {
25 | idx <- which(bin.count.rslt[i,] == 1)
26 | identity <- unlist(lapply(strsplit(colnames(bin.count.rslt), "frxn_cell.type_"), function(x) x[length(x)]))[idx]
27 | curr.df <- data.frame(barcode = curr.cell.bc, call = identity[length(identity)], stringsAsFactors = F)
28 | }
29 | }
30 | }
31 | if (nrow(class.rslt) <= 0) {
32 | class.rslt <- curr.df
33 | } else {
34 | class.rslt <- rbind(class.rslt, curr.df)
35 | }
36 | }
37 | return(class.rslt)
38 | }
39 |
--------------------------------------------------------------------------------
/R/EmpiricalPValueCalculation.R:
--------------------------------------------------------------------------------
1 | #' Empirical p-value of Identity Score Calculation
2 | #'
3 | #' This function resamples from a sample dataset and returns an empirical p-value of identity score
4 | #' @param dens.x The background dataset that we compare the identity score of our sample with
5 | #' @param curr.val The identity score of the sample dataset to be evaluated its significance
6 | #' @param n The number of times of resampling. The default is n=1000
7 | #' @keywords resampling, empirical p-value
8 | #' @note
9 | #' @export
10 | #' @examples
11 | #'
12 | sample.func <- function(dens.x, curr.val, prob, n = 1000) {
13 | samp <- sample(1:length(dens.x), n, replace = T, prob = prob)
14 | samp.val <- dens.x[samp]
15 | perc <- sum(samp.val > curr.val)/n
16 | return(perc)
17 | }
18 |
19 | #' Emprical p-values for query cells
20 | #'
21 | #' This function returns a list of emprical p-value matrices. Every matrix in the list contains emprical p-values of all possilbe celltypes for a single query cells under test.times resampled backgrounds.
22 | #' @param mtx The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types
23 | #' @param bkgd.mtx The matrix of identity score of reference cells. The row number is the total number of reference cells and the column number is the number of total number of possible cell types
24 | #' @keywords resampling, empirical p-value, all query cells
25 | #' @note
26 | #' @export
27 | #' @examples
28 | #'
29 | percentage.calc <- function(mtx, bkgd.mtx) {
30 | cnms <- colnames(mtx)
31 | cell.names <- rownames(mtx)
32 | mtx.cell.num <- nrow(mtx)
33 | mtx.ct.num <- ncol(mtx)
34 | test.times <- 50
35 |
36 | perc.ls <- pbmclapply(as.list(seq_len(mtx.cell.num)), perc.calc.aux, cell.names = cell.names, mtx.ct.num = mtx.ct.num,
37 | background.mtx = bkgd.mtx, mtx = mtx, cnms = cnms, mc.cores = 4)
38 |
39 | return(perc.ls)
40 | }
41 |
42 | #' Emprical p-values for a query cell
43 | #'
44 | #' This function returns emprical p-values of all possilbe celltypes for a query cells under test.times resampled backgrounds.
45 | #' @param l The index of the sample cell of query
46 | #' @param cell.names A vector of sample cells' names
47 | #' @param mtx.ct.num The number of all possible cell types
48 | #' @param background.mtx The matrix of identity score of reference cells. The row number is the total number of reference cells and the column number is the number of total number of possible cell types
49 | #' @param mtx The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types
50 | #' @param cnms The names of all possible cell types
51 | #' @param test.times The number of times resampling the background. The default is test.times=50
52 | #' @keywords resampling, empirical p-value, single query
53 | #' @note
54 | #' @export
55 | #' @examples
56 | #'
57 | perc.calc.aux <- function(l, cell.names, mtx.ct.num, background.mtx, mtx, cnms, test.times = 50) {
58 | curr.cell <- cell.names[l]
59 | perc.vec <- lapply(seq(1, mtx.ct.num),
60 | function(x) {
61 | dens <- density(background.mtx[,x], n = nrow(mtx))
62 | curr.num <- mtx[l, x]
63 | if (curr.num <= .Machine$double.eps * 2) {
64 | pc.v <- rep(1, test.times)
65 | } else {
66 | pc.v <- apply(as.data.frame(seq_len(test.times)), 1, function(x) sample.func(dens.x = dens$x, curr.val = curr.num, prob = dens$y, n = 1000))
67 | }
68 | return(list(cnms[x], pc.v))
69 | })
70 | comb.perc <- rbindlist(perc.vec)
71 | comb.perc$label <- rep(seq(1,test.times), length(unique(comb.perc$V1)))
72 |
73 | comb.df <- dcast(comb.perc, label~V1, value.var = "V2")
74 | comb.df <- comb.df[, -c(1)]
75 | curr.list <- list()
76 | curr.list[[curr.cell]] <- comb.df
77 | return(curr.list)
78 | }
79 |
80 |
81 |
82 |
83 |
--------------------------------------------------------------------------------
/R/Helper.R:
--------------------------------------------------------------------------------
1 | #' Get the Most Connected Cells
2 | #'
3 | #' This function returns index of cells that are the most connected within a cell type
4 | #' @param mtx The normalized reference count matrix
5 | #' @param n.sample The number of reference cells to be included within each pseudo-bulk
6 | #' @export
7 | #'
8 | get.most.connected <- function(mtx, n.sample) {
9 | corr.mtx <- WGCNA::cor(mtx)
10 | corr.mtx.upper <- corr.mtx * upper.tri(corr.mtx)
11 | corr.mtx.melt <- melt(corr.mtx.upper)
12 | corr.mtx.melt.pos <- corr.mtx.melt[which(corr.mtx.melt$value > 0), ]
13 | corr.mtx.melt.pos.sort <- corr.mtx.melt.pos[order(-corr.mtx.melt.pos$value), ]
14 | corr.mtx.melt.pos.sort$X1 <- as.character(corr.mtx.melt.pos.sort$X1)
15 | corr.mtx.melt.pos.sort$X2 <- as.character(corr.mtx.melt.pos.sort$X2)
16 |
17 | sample.list <- c()
18 | count.line <- 1
19 | while(length(sample.list) < n.sample) {
20 | sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort$X1[count.line], corr.mtx.melt.pos.sort$X2[count.line]))))
21 | count.line <- count.line + 1
22 | }
23 |
24 | return(sample.list[1:n.sample])
25 | }
26 |
27 |
28 | #' Get the Least Connected Cells
29 | #'
30 | #' This function returns index of cells that are the least connected within a cell type
31 | #' @param mtx The normalized reference count matrix
32 | #' @param n.sample The number of reference cells to be included within each pseudo-bulk
33 | #' @export
34 | #'
35 | get.least.connected <- function(mtx, n.sample) {
36 | corr.mtx <- WGCNA::cor(mtx)
37 | corr.mtx.upper <- corr.mtx * upper.tri(corr.mtx)
38 | corr.mtx.melt <- melt(corr.mtx.upper)
39 | corr.mtx.melt.pos <- corr.mtx.melt[which(corr.mtx.melt$value > 0), ]
40 | corr.mtx.melt.pos.sort <- corr.mtx.melt.pos[order(corr.mtx.melt.pos$value), ]
41 | corr.mtx.melt.pos.sort$X1 <- as.character(corr.mtx.melt.pos.sort$X1)
42 | corr.mtx.melt.pos.sort$X2 <- as.character(corr.mtx.melt.pos.sort$X2)
43 |
44 | sample.list <- c()
45 | count.line <- 1
46 | while(length(sample.list) < n.sample) {
47 | sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort$X1[count.line], corr.mtx.melt.pos.sort$X2[count.line]))))
48 | count.line <- count.line + 1
49 | }
50 |
51 | return(sample.list[1:n.sample])
52 | }
53 |
54 | #' Get the Medium Connected Cells
55 | #'
56 | #' This function returns index of cells that are the most connected within a cell type
57 | #' @param mtx The normalized reference count matrix
58 | #' @param n.sample The number of reference cells to be included within each pseudo-bulk
59 | #' @export
60 | #'
61 | get.mid.connected <- function(mtx, n.sample) {
62 | corr.mtx <- WGCNA::cor(mtx)
63 | corr.mtx.upper <- corr.mtx * upper.tri(corr.mtx)
64 | corr.mtx.melt <- melt(corr.mtx.upper)
65 | corr.mtx.melt.pos <- corr.mtx.melt[which(corr.mtx.melt$value > 0), ]
66 | corr.mtx.melt.pos.sort <- corr.mtx.melt.pos[order(corr.mtx.melt.pos$value), ]
67 | corr.mtx.melt.pos.sort$X1 <- as.character(corr.mtx.melt.pos.sort$X1)
68 | corr.mtx.melt.pos.sort$X2 <- as.character(corr.mtx.melt.pos.sort$X2)
69 |
70 | corr.mtx.melt.pos.sort.most <- corr.mtx.melt.pos[order(-corr.mtx.melt.pos$value), ]
71 | corr.mtx.melt.pos.sort.most$X1 <- as.character(corr.mtx.melt.pos.sort.most$X1)
72 | corr.mtx.melt.pos.sort.most$X2 <- as.character(corr.mtx.melt.pos.sort.most$X2)
73 |
74 | sample.list <- c()
75 | count.line <- 1
76 | while(length(sample.list) < n.sample/2) {
77 | sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort$X1[count.line], corr.mtx.melt.pos.sort$X2[count.line]))))
78 | count.line <- count.line + 1
79 | }
80 | count.line <- 1
81 | while(length(sample.list) < n.sample) {
82 | sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort.most$X1[count.line], corr.mtx.melt.pos.sort.most$X2[count.line]))))
83 | count.line <- count.line + 1
84 | }
85 |
86 | return(sample.list[1:n.sample])
87 | }
88 |
89 |
90 | # Create the function.
91 | getmode <- function(v) {
92 | uniqv <- unique(v)
93 | uniqv[which.max(tabulate(match(v, uniqv)))]
94 | }
95 |
96 |
97 |
98 |
--------------------------------------------------------------------------------
/R/HighResolutionReferenceConstruction.R:
--------------------------------------------------------------------------------
1 | #' Systematic construction of a high-resolution reference
2 | #'
3 | #' Create a pseudo-bulk reference by sampling 90-cells from each cell type to maintain cellular resolution while increasing transcriptional resolution
4 | #' @param ref.mtx The single-cell reference dataset
5 | #' @param coldata.df The metadata (cell type information) for cells in the high-resolution reference
6 | #' @param cell.num.for.ref The number of cell numbers used to build the reference for each cell type. The default is cell.num.for.ref = 90
7 | #' @keywords pseudo-bulk reference
8 | #' @note
9 | #' @export
10 | #' @examples
11 | #'
12 | construct.high.res.reference <- function(ref.mtx, coldata.df, criteria,cell.num.for.ref = 90) {
13 |
14 | ct.freq <- as.data.frame(table(coldata.df[,criteria]), stringsAsFactors = F)
15 | ref.meta <- data.frame()
16 | ref.sc <- data.frame()
17 | mca.counts.all.involved.sub<-ref.mtx
18 | for (j in 1:nrow(ct.freq)) {
19 | curr.ct.at.test <- as.character(ct.freq[j,1])
20 | curr.cell.involve <- rownames(coldata.df)[which(coldata.df[,criteria] == curr.ct.at.test)]
21 | curr.cell.index <- which(colnames(mca.counts.all.involved.sub) %in% curr.cell.involve)
22 | curr.mtx.sub <- as.matrix(mca.counts.all.involved.sub[, curr.cell.index])
23 | if (ct.freq$Freq[j] >= cell.num.for.ref) {
24 | sample.ref.cell <- c(get.most.connected(log1p(normalize.dt(curr.mtx.sub)), cell.num.for.ref/2),
25 | get.least.connected(log1p(normalize.dt(curr.mtx.sub)), cell.num.for.ref/2))
26 | } else {
27 | sample.ref.cell <- sample(curr.cell.involve, size = cell.num.for.ref, replace = T)
28 | }
29 | new.bc <- paste0("Cell_", seq(((j-1) * cell.num.for.ref + 1), j * cell.num.for.ref))
30 | curr.meta <- data.frame(row.names = new.bc, cell.type = curr.ct.at.test, cell.bc = sample.ref.cell, stringsAsFactors = F)
31 | curr.sc <- curr.mtx.sub[, sample.ref.cell]
32 | colnames(curr.sc) <- new.bc
33 | if (ncol(ref.sc) <= 0) {
34 | ref.meta <- curr.meta
35 | ref.sc <- curr.sc
36 | } else {
37 | ref.meta <- rbind(ref.meta, curr.meta)
38 | ref.sc <- cbind(ref.sc, curr.sc)
39 | }
40 | }
41 | ref.df <- ref.construction(ref.sc, ref.meta, "cell.type")
42 | return(list(ref.sc, ref.meta, ref.df))
43 | }
44 |
45 | #' Reference Construction
46 | #'
47 | #' This function constructs reference from single-cell resolution reference data to be used for quadratic programming calculation
48 | #' @param sc The single-cell resolution dataset
49 | #' @param sc.aux The auxiliary data frame that annotate the single-cell resolutiond dataset
50 | #' @param criteria The column name to use for construction of the reference
51 | #' @export
52 | #' @examples
53 | #' ref.construction(single.ref.mtx, single.aux.df, "cell.type")
54 | ref.construction <- function(sc, sc.aux, criteria) {
55 | criteria.col <- sc.aux[, criteria]
56 | ref.df <- data.frame()
57 | uniq.crit <- unique(criteria.col)
58 |
59 | for (i in 1:length(uniq.crit)) {
60 | curr.crit <- uniq.crit[i]
61 | sc.bc <- colnames(sc)[which(sc.aux[, criteria] == curr.crit)]
62 | curr.sc.sub <- sc[, sc.bc]
63 | curr.df <- as.data.frame(rowSums(curr.sc.sub))
64 | if (ncol(ref.df) <= 0) {
65 | ref.df <- curr.df
66 | } else {
67 | ref.df <- cbind(ref.df, curr.df)
68 | }
69 | colnames(ref.df)[i] <- paste0(criteria, "_", curr.crit)
70 | }
71 | return(ref.df)
72 | }
73 |
74 |
--------------------------------------------------------------------------------
/R/MultiIDFilterQP.R:
--------------------------------------------------------------------------------
1 | #' Multi-ID Score-Based Filter
2 | #'
3 | #' This function filters the multiple identity cells based on their QP scores, where we assume that a low QP score (less than 10E-3) are not a true identity to consider
4 | #' @param binary.counts The binary count matrix, which is the output from binarization with Mann Whitney.
5 | #' @param classification The classification result, which is the output from binary to classification.
6 | #' @param qp.matrix The matrix that contains QP scores calculated for the sample cells
7 | #' @param qp.threshold The threshold to cut off for the QP scores in the multiple identity listed cells
8 | #' @return A list contain 2 elements, the first is the curated and filtered multiple identity data frame and the second is the new classification data frame.
9 | #' @keywords Multiple identities
10 | #' @export
11 | #' @examples
12 | #' multi.id.curate.qp(multi.id.meta)
13 | multi.id.curate.qp <- function(binary.counts, classification, qp.matrix, qp.threshold = 10^-3) {
14 |
15 | bin.count.std.multi <- as.data.frame(binary.counts[classification$barcode[which(classification$call == "Multi_ID")], ])
16 | bin.count.std.multi$cell.bc <- rownames(bin.count.std.multi)
17 |
18 | bin.count.std.multi.melt <- reshape2::melt(bin.count.std.multi)
19 | bin.count.std.multi.melt <- bin.count.std.multi.melt[which(bin.count.std.multi.melt$value > 0), ]
20 |
21 | classification.new <- classification
22 |
23 | qp.column <- c()
24 | for (i in 1:nrow(bin.count.std.multi.melt)) {
25 | curr.cell <- as.character(bin.count.std.multi.melt$cell.bc[i])
26 | curr.cell.type <- as.character(bin.count.std.multi.melt$variable[i])
27 | qp.column <- c(qp.column, qp.matrix[curr.cell, curr.cell.type])
28 | }
29 | bin.count.std.multi.melt$qp.score <- qp.column
30 | bin.count.std.multi.melt.remain <- bin.count.std.multi.melt[which(bin.count.std.multi.melt$qp.score >= qp.threshold), ]
31 |
32 | cell.occurrence <- as.data.frame(table(bin.count.std.multi.melt.remain$cell.bc))
33 | single.id.cell <- as.character(cell.occurrence$Var1[which(cell.occurrence$Freq == 1)])
34 |
35 | actual.multi <- bin.count.std.multi.melt.remain
36 | if (length(single.id.cell) > 0) {
37 | single.id.cell.bin <- bin.count.std.multi.melt.remain[which(bin.count.std.multi.melt.remain$cell.bc %in% single.id.cell), ]
38 | rownames(single.id.cell.bin) <- single.id.cell.bin$cell.bc
39 | classification.new[rownames(single.id.cell.bin), "call"] <- unlist(lapply(strsplit(as.character(single.id.cell.bin$variable), "frxn_cell.type_"), function(x) x[2]))
40 | actual.multi <- bin.count.std.multi.melt.remain[-which(bin.count.std.multi.melt.remain$cell.bc %in% single.id.cell), ]
41 | }
42 |
43 | return(list(actual.multi, classification.new))
44 | }
45 |
46 |
--------------------------------------------------------------------------------
/R/SingleCellQuadraticProgramming.R:
--------------------------------------------------------------------------------
1 | #' Single Round of Quadratic Programming
2 | #'
3 | #' This function runs preprocessing, including log normalization, gene intersection and scaling. Further, run one round of quadratic programming. See function sc.quad.prog.run for detailed description
4 | #' @param ref The reference transcriptome taht contains the transcriptome of each potential cell type
5 | #' @param sc.data the transcriptome profile of cells from single-cell RNA-sequencing
6 | #' @param scale.bulk.sc either scale or non-scale. Scaling is recommended to make the reference comparable to the single-cell. Default to scale.
7 | #' @param force.eq either 0 or 1. Setting to 0 assumes the 1st constraint as inequality. Setting to 1 assumes equality. Default to 0
8 | #' @param unix.par boolean value, either TRUE or FALSE. If using unix/linux based systems, this command can be set to TRUE to parallelize use parallel package. Default to FALSE
9 | #' @param windows.par boolean value, either TRUE or FALSE. If using Windows based systems, this command can be set to TRUE to parallelize use snow package. Default to FALSE
10 | #' @param n.cores the number of cores to use for parallel processes. If no parallelization selected, no parallelization will be implemented. Only 1 core will be used
11 | #' @param save.to.path which directory would you like to save your file?
12 | #' @param save.to.filename prefix to the filename to save to. The final filename will be constructed inside of the function by tagging the following string _scale.csv for scaling and _non_scale.csv for not scaling.
13 | #' @param bulk.norm boolean value, either TRUE or FALSE. Would you like to normalize the reference?
14 | #' @param norm.sc boolean value, either TRUE or FALSE. Would you like to normalize the single-cell dataset?
15 | #' @param log.bulk boolean value, either TRUE or FALSE. Would you like to log the reference?
16 | #' @param log.sc boolean value, either TRUE or FALSE. Would you like to log the single-cell dataset?
17 | #' @keywords quadratic programming, scRNA-seq,
18 | #' @note This function calls the quadratic programming referenced to Treutlein et. al.
19 | #' @export
20 | #' @examples
21 | #' single.round.QP.analysis(ref = ref.transcriptome, sc.data = sc.transcriptome, force.eq = 1, save.to.path = "~/Desktop/", save.to.filename = "my_favorite")
22 | single.round.QP.analysis <- function(ref, sc.data, scale.bulk.sc = "scale", unix.par = FALSE, windows.par = FALSE, force.eq = 0,
23 | n.cores = 1, save.to.path, save.to.filename, bulk.norm = T, norm.sc = T, log.bulk = T, log.sc = T) {
24 | norm.bulk <- as.matrix(ref)
25 | norm.sc.mtx <- as.matrix(sc.data)
26 |
27 | # Normalized the bulk and single-cell data
28 | if (bulk.norm) norm.bulk <- normalize.dt(ref)
29 | if (norm.sc) norm.sc.mtx <- normalize.dt(sc.data)
30 |
31 | # Calculate the scaling ratio
32 | scale.ratio.sc <- calc.scale.ratio(ref, sc.data)
33 | scale.norm.sc <- norm.bulk/scale.ratio.sc
34 |
35 | # Calculate gene intersections
36 | norm.ls.sc <- gene.intersect.sub(norm.bulk, norm.sc.mtx)
37 | scale.ls.sc <- gene.intersect.sub(scale.norm.sc, norm.sc.mtx)
38 |
39 | log.norm.ls.sc <- norm.ls.sc
40 | log.scale.ls.sc <- scale.ls.sc
41 | # Finish the log-normalization
42 | if (log.bulk & log.sc) {
43 | log.norm.ls.sc <- lapply(norm.ls.sc, log1p)
44 | log.scale.ls.sc <- lapply(scale.ls.sc, log1p)
45 | } else {
46 | if (log.bulk) {
47 | log.norm.ls.sc[[1]] <- log1p(log.norm.ls.sc[[1]])
48 | log.scale.ls.sc[[1]] <- log1p(log.scale.ls.sc[[1]])
49 | }
50 | if (log.sc) {
51 | log.norm.ls.sc[[2]] <- log1p(log.norm.ls.sc[[2]])
52 | log.scale.ls.sc[[2]] <- log1p(log.scale.ls.sc[[2]])
53 | }
54 | }
55 |
56 | # Run QP and save to file
57 | if (scale.bulk.sc == "scale") {
58 | qp.rslt <- sc.quad.prog.run(as.matrix(log.scale.ls.sc[[1]]),
59 | single.cell.transcriptome = log.scale.ls.sc[[2]],
60 | unix.parallel = unix.par, windows.parallel = windows.par,
61 | parallel.cores = n.cores, force.eq = force.eq)
62 | write.csv(qp.rslt, paste0(save.to.path, save.to.filename, "_scale.csv"), quote = F, row.names = F)
63 | } else {
64 | if (scale.bulk.sc == "non-scale") {
65 | qp.rslt <- sc.quad.prog.run(as.matrix(log.norm.ls.sc[[1]]),
66 | single.cell.transcriptome = log.norm.ls.sc[[2]],
67 | unix.parallel = unix.par, windows.parallel = windows.par,
68 | parallel.cores = n.cores, force.eq = force.eq)
69 | write.csv(qp.rslt, paste0(save.to.path, save.to.filename, "_non_scale.csv"), quote = F, row.names = F)
70 | } else {
71 | qp.rslt.scl <- sc.quad.prog.run(as.matrix(log.scale.ls.sc[[1]]),
72 | single.cell.transcriptome = log.scale.ls.sc[[2]],
73 | unix.parallel = unix.par, windows.parallel = windows.par,
74 | parallel.cores = n.cores, force.eq = force.eq)
75 | qp.rslt.non.scl <- sc.quad.prog.run(as.matrix(log.norm.ls.sc[[1]]),
76 | single.cell.transcriptome = log.norm.ls.sc[[2]],
77 | unix.parallel = unix.par, windows.parallel = windows.par,
78 | parallel.cores = n.cores, force.eq = force.eq)
79 | write.csv(qp.rslt.scl, paste0(save.to.path, save.to.filename, "_scale.csv"), quote = F, row.names = F)
80 | write.csv(qp.rslt.non.scl, paste0(save.to.path, save.to.filename, "_non_scale.csv"), quote = F, row.names = F)
81 | }
82 | }
83 | }
84 |
85 | #' Single-Cell Quadratic Programming Calculation
86 | #'
87 | #' This function runs quadratic programming to identify the probability of the cells in single-cell RNA-seq belonging to cell types in the reference transcriptome
88 | #' @param bulk.transcriptome The reference transcriptome taht contains the transcriptome of each potential cell type
89 | #' @param single.cell.transcriptome the transcriptome profile of cells from single-cell RNA-sequencing
90 | #' @param force.eq either 0 or 1. Setting to 0 assumes the 1st constraint as inequality. Setting to 1 assumes equality. Default to 0
91 | #' @param unix.parallel boolean value, either TRUE or FALSE. If using unix/linux based systems, this command can be set to TRUE to parallelize use parallel package. Default to FALSE
92 | #' @param windows.parallel boolean value, either TRUE or FALSE. If using Windows based systems, this command can be set to TRUE to parallelize use snow package. Default to FALSE
93 | #' @param parallel.cores the number of cores to use for parallel processes. If no parallelization selected, no parallelization will be implemented. Only 1 core will be used
94 | #' @keywords quadratic programming, scRNA-seq
95 | #' @note Code reference from Treutlein et. al., Dissecting direct reprogramming from fibroblast to neuron using single-cell RNA-seq
96 | #' @export
97 | #' @examples
98 | #' sc.quad.prog.run(ref.transcriptome, sc.transcriptome, force.eq = 1)
99 | sc.quad.prog.run <- function(bulk.transcriptome, single.cell.transcriptome, force.eq = 0,
100 | unix.parallel = FALSE, windows.parallel = FALSE, parallel.cores = 4) {
101 | # Check if the sizes of the matrices are comfortable with each other
102 | if (nrow(bulk.transcriptome) != nrow(single.cell.transcriptome)) {
103 | print("The number of genes included in the bulk expression is not the same as the single cell transcriptome")
104 | return()
105 | }
106 |
107 | # Initialize a data frame to hold the fraction identity matrix
108 | identity.matx <- data.frame()
109 | # Initialize the cell types given
110 | given.cell.typs <- colnames(bulk.transcriptome)
111 |
112 | # If using unix/linux based system, can use this process to parallel the the program
113 | # Set unix.parallel parameter to be TRUE, but set windows.parallel to be FALSE
114 | if (unix.parallel) {
115 | identy.mx.ls <- mclapply(as.list(seq_len(ncol(single.cell.transcriptome))), quad.prog.calc,
116 | bulk.transcriptome = bulk.transcriptome,
117 | single.cell.transcriptome = single.cell.transcriptome,
118 | force.eq = force.eq, mc.cores = parallel.cores)
119 | identity.ls <- lapply(identy.mx.ls, function(identity) c(identity[[2]], identity[[1]]$solution,
120 | identity[[1]]$Lagrangian[1], identity[[3]]))
121 |
122 | identity.matx <- matrix(unlist(identity.ls), byrow = TRUE,
123 | nrow = length(identity.ls), ncol = length(identity.ls[[1]]))
124 | } else {
125 | # If using Windows based system, can use this process to parallel the the program
126 | # Set windows.parallel parameter to be TRUE, but set unix.parallel to be FALSE
127 | # NOTE: This might be slower than applying parallelization under Unix/Linux based system
128 | if (windows.parallel) {
129 | cl<-makeCluster(parallel.cores)
130 | identy.mx.ls <- clusterApply(cl, seq_len(ncol(single.cell.transcriptome)), quad.prog.calc,
131 | bulk.transcriptome = bulk.transcriptome,
132 | single.cell.transcriptome = single.cell.transcriptome,
133 | force.eq = force.eq)
134 | stopCluster(cl)
135 | identity.ls <- lapply(identy.mx.ls, function(identity) c(identity[[2]], identity[[1]]$solution,
136 | identity[[1]]$Lagrangian[1], identity[[3]]))
137 |
138 | identity.matx <- matrix(unlist(identity.ls), byrow = TRUE,
139 | nrow = length(identity.ls), ncol = length(identity.ls[[1]]))
140 |
141 | # If not using parallel for this case, just run the following process in a loop for serial
142 | # All parallel parameters option should be set to FALSE
143 | } else {
144 | for (i in 1:ncol(single.cell.transcriptome)){
145 | identity<-c()
146 | quad.rslt <- quad.prog.calc(i, bulk.transcriptome, single.cell.transcriptome, force.eq)
147 | QP <- quad.rslt[[1]]
148 | Error <- quad.rslt[[3]]
149 |
150 | identity <- c(colnames(single.cell.transcriptome)[i], QP$solution, QP$Lagrangian[1],Error)
151 |
152 | if (nrow(identity.matx) == 0) {
153 | identity.matx <- as.data.frame(t(as.matrix(identity)))
154 | } else {
155 | identity.matx <- rbind(identity.matx, as.data.frame(t(as.matrix(identity))))
156 | }
157 | }
158 | }
159 | }
160 |
161 | # Set the column names of the returning data frame
162 | col.frx.names <- paste("frxn_", given.cell.typs, sep = "")
163 | colnames(identity.matx)<-c("cell_name", col.frx.names,"Lagrangian","Error")
164 | return(identity.matx)
165 | }
166 |
167 | quad.prog.calc <- function(col.num, bulk.transcriptome, single.cell.transcriptome, force.eq) {
168 | library(quadprog)
169 | Y <- as.matrix(single.cell.transcriptome[, col.num])
170 | Rinv <- solve(chol(t(bulk.transcriptome) %*% bulk.transcriptome));
171 | C <- cbind(rep(-1, ncol(bulk.transcriptome)), diag(ncol(bulk.transcriptome)))
172 | n <- nrow(Rinv)
173 | q <- ncol(bulk.transcriptome)
174 | b <- c(-1, rep(0, q))
175 | d <- t(Y) %*% bulk.transcriptome
176 |
177 | QP<-solve.QP(Dmat = Rinv, factorized = TRUE, dvec = d, bvec = b, Amat = C, meq = force.eq)
178 | Error<-sum(abs(Y- bulk.transcriptome %*% QP$solution))
179 |
180 | return(list(QP, colnames(single.cell.transcriptome)[col.num], Error))
181 | }
182 |
183 |
--------------------------------------------------------------------------------
/R/SingleCellQuadraticProgrammingAux.R:
--------------------------------------------------------------------------------
1 | #' Top Variable Gene Identificaiton
2 | #'
3 | #' This function identifies top n variable genes using method described in KeyGenes algorithm. In brief, this algorithm use cross validation based on LASSO regression. For detailed explanation, please refer to the paper listed in the note.
4 | #' @param input.dir The path to where the dataset is saved. Dataset should be saved as a tab-delimited table with row = gene, column = cell.
5 | #' @param output.dir The path to where the output should be saved. The output will be saved as a tab-delimited table with NO row names or column names.
6 | #' @param top.number.count The number of top variable genes to extract. Default to 500 genes
7 | #' @keywords top variable gene extraction
8 | #' @note Code reference from Roost et. al., KeyGenes, a Tool to Probe Tissue Differentiation Using a Human Fetal Transcriptional Atlas, Cell Stem Cell Reports, 2015
9 | #' @export
10 | #' @examples
11 | #' top.genes("~/Desktop/sample_data.txt", "~/Desktop/sample_gene_list_output.txt", top.number.count = 1000)
12 | top.genes <- function(input.dir, output.dir, top.number.count = 500) {
13 | dataset <- input.dir
14 | top.dir <- output.dir
15 |
16 | DS <- read.table(dataset, sep="\t", header=TRUE, row.names=1, check.names=FALSE)
17 | X <- voom(as.matrix(DS), normalize.method="none")$E
18 | vars <- apply(X, 1, var)
19 | top <- sort.list(-vars)[1:top.number.count]
20 | X <- X[top,]
21 |
22 | topl <- as.data.frame(rownames(X))
23 | colnames(topl) <- c(paste("Top", top.number.count, sep = "", collapse = ""))
24 | write.table(topl, top.dir, sep="\t", quote=F, row.names = FALSE, col.names = TRUE)
25 | return()
26 | }
27 |
28 | #' Normalization of Single-Cell RNA-Seq Data
29 | #'
30 | #' This function normalizes single-cell RNA-seq data using its raw counts. The normalization is performed to remove variation due to difference in read depth and coverage.
31 | #' @param dt.st The dataset to normalize
32 | #' @keywords normalization
33 | #' @export
34 | #' @examples
35 | #' normalize.dt(single.cell.mtx)
36 | normalize.dt <- function(dt.st) {
37 | # Calculate column sums
38 | csums <- Matrix::colSums(dt.st)
39 | # Calculate averages of the sums
40 | cavg <- mean(csums)
41 | # Calculate normalized bulk
42 | norm.dt.st <- dt.st
43 | for (i in 1:length(csums)) {norm.dt.st[,i] <- (norm.dt.st[,i]/csums[i]) * cavg}
44 | return(norm.dt.st)
45 | }
46 |
47 | #' Scale Ratio Calculation
48 | #'
49 | #' This function calculate the scale ratio between the single-cell transcriptome and reference dataset to minimize the Lagrangian multiplier. In the other word, reduce the restriction.
50 | #' @param bulk The reference dataset
51 | #' @param sc The single-cell dataset
52 | #' @keywords Scale ratio calculation
53 | #' @export
54 | #' @examples
55 | #' calc.scale.ratio(reference.dt, single.cell.mtx)
56 | calc.scale.ratio <- function(bulk, sc) {
57 | # Calculate column sums
58 | csums.bulk <- colSums(bulk)
59 | csums.sc <- Matrix::colSums(sc)
60 | # Calculate averages of the sums
61 | cavg.bulk <- mean(csums.bulk)
62 | cavg.sc <- mean(csums.sc)
63 | # Calculate scaling ratio for each tissue
64 | scale.ratio.sc <- cavg.bulk/cavg.sc
65 | return(scale.ratio.sc)
66 | }
67 |
68 | #' Find intersection genes
69 | #'
70 | #' This function identifies the intersection genes between reference and single-cell data
71 | #' @param bulk The reference dataset
72 | #' @param sc The single-cell dataset
73 | #' @export
74 | #' @examples
75 | #' gene.intersect.sub(reference.dt, single.cell.mtx)
76 | gene.intersect.sub <- function(bulk, sc) {
77 | # Find genes with all zero-expression within sc/bulk
78 | rsums.bulk <- Matrix::rowSums(bulk)
79 | rsums.sc <- Matrix::rowSums(sc)
80 | # Identify blank genes
81 | bulk.blank.genes <- which(rsums.bulk == 0)
82 | sc.blank.genes <- which(rsums.sc == 0)
83 | # Remove those genes
84 | norm.bulk.sub <- bulk
85 | norm.sc.sub <- sc
86 | if (length(bulk.blank.genes) > 0) {norm.bulk.sub <- bulk[-(bulk.blank.genes),]}
87 | if (length(sc.blank.genes) > 0) {norm.sc.sub <- sc[-(sc.blank.genes),]}
88 | # Find intersection genes
89 | genes.inter <- intersect(rownames(norm.bulk.sub), rownames(norm.sc.sub))
90 | # Find the intersected sub dataset of bulk (normalized bulk)
91 | norm.bulk.sub.sub <- norm.bulk.sub[genes.inter, ]
92 | norm.sc.sub.sub <- norm.sc.sub[genes.inter, ]
93 | return(list(norm.bulk.sub.sub, norm.sc.sub.sub))
94 | }
95 |
96 |
97 |
--------------------------------------------------------------------------------
/R/TransitionScoreCalculation.R:
--------------------------------------------------------------------------------
1 | #' Transition Score Calculation
2 | #'
3 | #' This function calculates the transition scores for each cell state that is connected by cells with multiple identities.
4 | #' @param multi.id.meta A data frame that contains cells with multiple identities. Column 1 - cell barcode, Column 2 - cell type call, Column 3 - counts, Column 4 - corresponding QP scores.
5 | #' @return A data frame that contains the calculated transition scores for each identity
6 | #' @keywords Transition metric
7 | #' @export
8 | #' @examples
9 | #' transition.score(multi.id.meta)
10 | transition.score <- function(multi.id.meta) {
11 |
12 | unique.ct <- unique(as.character(multi.id.meta[,2]))
13 | ct.entropy <- data.frame()
14 | for (i in 1:length(unique.ct)) {
15 | curr.cell <- unique.ct[i]
16 | actual.sub <- multi.id.meta[which(as.character(multi.id.meta[,2]) == curr.cell), ]
17 | curr.entro <- sum(-actual.sub$qp.score*log(actual.sub$qp.score, base = max(2, nrow(actual.sub))))
18 | curr.df <- data.frame(cell.ct = curr.cell, entropy = curr.entro, stringsAsFactors = F)
19 | if (nrow(ct.entropy) <= 0) {
20 | ct.entropy <- curr.df
21 | } else {
22 | ct.entropy <- rbind(ct.entropy, curr.df)
23 | }
24 | }
25 |
26 | rownames(ct.entropy) <- unlist(lapply(strsplit(ct.entropy$cell.ct, "frxn_cell.type_"), function(x) x[2]))
27 |
28 | return(ct.entropy)
29 | }
30 |
31 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # R Package - Capybara
2 |
3 |
4 | Capybara is a tool to measure cell identity and fate transitions. This approach is designed to measure cell identity as a continuum, at a single-cell resolution. Capybara enables classification of discrete identities as well as cells with multiple identities. This package has a dependency on R version (R >= 3.5.0). In addition to the following tutorial, we provide the notebooks for production of figures in the morris-lab/Capybara_reproducibility (https://github.com/morris-lab/Capybara_reproducibility) repository. There could exist some variations due to difference in threshold selections. For details regarding the methods, usage and application, please refer to the following paper: Kong et al., Cell Stem Cell, 2022 (https://www.sciencedirect.com/science/article/pii/S1934590922000996?dgcid=coauthor)
5 |
6 | ## Installation
7 | ### Dependencies
8 |
9 | Most dependencies can be installed along with Capybara through CRAN. The following dependencies may need to be installed manually through BioConductor (Instructions can also be found here: https://bioconductor.org/).
10 |
11 | Install BiocManager
12 | ```r
13 | install.packages("BiocManager")
14 | ```
15 | Install Bioconductor dependency packages
16 | ```r
17 | BiocManager::install(c("limma","impute", "AnnotationDbi","GO.db", "preprocessCore"))
18 | ```
19 |
20 | ### Install the package
21 | Install devtools
22 | ```r
23 | install.packages("devtools")
24 | ```
25 | Install the package from GitHub.
26 | ```r
27 | library("devtools")
28 | devtools::install_github("morris-lab/Capybara")
29 | ```
30 | Load the package
31 | ```r
32 | library("Capybara")
33 | ```
34 |
35 | ## Step 1: Tissue-Level Classification
36 | ### Application of quadratic programming on reference and sample single-cell dataset using a bulk reference
37 |
38 | Bulk transcriptome profiles of all tissues are mined from ARCHS4, a platform that contains most published RNA-seq and ChiP-seq datasets (Lachmann et al., 2018). ARCHS4 obtains raw datasets from the Gene Expression Omnibus (GEO), realigned and processed through a uniform pipeline. We filtered to contain only poly-A and total RNA-seq data from C57BL/6 mice. With further filtering and preprocessing (details can be found in the method section of the paper), we landed with a reference of a total of 30 tissues. We provide our mined bulk references, including a matrix in raw counts and a matrix in reads per kilobase per million (RPKM), as a part of the Capybara package. Selection of your preferred normalization method can be applied to raw counts. Here, we will demonstrate the usage of the bulk raw counts in the pipeline.
39 |
40 | **1. Load the bulk reference**
41 | ```r
42 | # File path
43 | bulk.raw.path <- system.file("extdata", "Bulk Reference Raw.Rds", package = "Capybara")
44 | bulk.rpkm.path <- system.file("extdata", "Bulk Reference RPKM.Rds", package = "Capybara")
45 | # Read the matrices
46 | bulk.raw <- readRDS(bulk.raw.path)
47 | bulk.rpkm <- readRDS(bulk.rpkm.path)
48 | ```
49 |
50 | With the bulk reference, we next load the single-cell reference, such as a cell atlas, and the single-cell sample to be used. The datasets to be used should be in a matrix form with each row representing a gene and each column representing a cell. Here, we use the Mouse Cell Atlas (MCA) as background and single-cell RNA-seq data of mouse pancreatic cells (Baron et al., 2016) as examples for demonstration. MCA can be obtained from https://figshare.com/articles/MCA_DGE_Data/5435866. We included the mouse pancreatic dataset in the package.
51 |
52 | **2.Load the single-cell sample dataset and the corresponding meta data**
53 |
54 | *Note: The meta data of this file contains 2 columns, where the first column represents cell.type and the second column represents barcode.*
55 |
56 | ```r
57 | # Read in the pancreatic data file that come with the package
58 | fpath <- system.file("extdata", "baron_dataset.zip", package = "Capybara")
59 | extract.dir <- "."
60 | # Extract the dataset
61 | unzip(fpath, overwrite = FALSE, exdir = ".")
62 | # Identify the full path
63 | full.fpath.meta <- paste0(extract.dir, "/", "baron_et_al_pancreatic_meta.csv")
64 | full.fpath.raw <- paste0(extract.dir, "/", "baron_et_al_pancreatic.csv")
65 | # Load the count matrix and the meta data
66 | baron.expr <- read.csv(full.fpath.raw, header = T, row.names = 1, stringsAsFactors = F)
67 | baron.meta <- read.csv(full.fpath.meta, header = T, row.names = 1, stringsAsFactors = F)
68 | ```
69 |
70 | **3. Application of QP on the sample single-cell data**
71 |
72 | Notice: For Windows users, please set unix.par=F and n.cores=1
73 |
74 | ```r
75 | single.round.QP.analysis(bulk.raw, baron.expr, scale.bulk.sc = "scale", unix.par = TRUE,
76 | force.eq = 1, n.cores = 4, save.to.path = "./",
77 | save.to.filename = "baron_bulk_classification_qp")
78 | ```
79 |
80 | **4. Load the single-cell reference meta data**
81 |
82 | *Note: The meta data of Mouse Cell Atlas contains 6 columns, including Cell.name, ClusterID, Tissue, Batch, Cell.Barcode, and Annotation. The annotation is what we used for high-resolution reference construction. We've included the version of meta data we used along with the package.*
83 |
84 | ```r
85 | # Read the meta data
86 | mca.meta.fpath <- system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara")
87 | mca <- read.csv(mca.meta.fpath, row.names = 1, header = T, stringsAsFactors = F)
88 | # Clean up the meta data
89 | mca.meta <- data.frame(row.names = mca$Cell.name,
90 | tissue = mca$Tissue,
91 | cell.type = mca$Annotation,
92 | stringsAsFactors = F)
93 | ```
94 |
95 | **5. Load the single-cell reference atlas and apply QP tissue-by-tissue**
96 |
97 | *Due to the large size of MCA count data, we did* ***NOT*** *include the counts along with the package. We further separated MCA into fetal/neonatal/embryonic and adult categories. The counts data were organized in the following manner.*
98 |
99 | > Folder: MCA Counts
100 |
101 | >> Tissue_1 Folder
102 |
103 | >>> count.csv
104 |
105 | >> Tissue_2 Folder
106 |
107 | >>> count.csv
108 |
109 | >> ...
110 |
111 | ```r
112 | # List all possible files and tissues in the Mouse Cell Atlas
113 | file.ls <- list.files("./MCA_Counts/", full.names = T)
114 | base.nms <- basename(file.ls)
115 |
116 | # Identify the tissues
117 | unq.tissue <- unique(base.nms)
118 |
119 | # Set a path to save all QP files for all tissues
120 | general.path.to.save <- "./MCA_All_Tissue_QP/"
121 | for (k in 1:length(unq.tissue)) {
122 | curr.tissue <- unq.tissue[k]
123 | curr.filename <- paste0("0", k, "_", curr.tissue, "_Bulk_ARCHS4")
124 |
125 | file.base.name <- base.nms[which(startsWith(base.nms, curr.tissue))][1]
126 | file.full <- file.ls[which(startsWith(base.nms, curr.tissue))][1]
127 |
128 | print(curr.tissue)
129 |
130 | sc.data <- read.csv(paste0(file.full, "/count.csv"), header = T, row.names = 1, stringsAsFactors = F)
131 |
132 | if (all(is.na(sc.data))) {
133 | print("There is no data in this counting matrix!")
134 | } else {
135 | single.round.QP.analysis(bulk.raw, sc.data, scale.bulk.sc = "scale", unix.par = TRUE,
136 | force.eq = 1, n.cores = 4, save.to.path = general.path.to.save,
137 | save.to.filename = curr.filename)
138 | }
139 | }
140 | ```
141 |
142 | **6. Selection of 90 cells from each tissue to construct a QP background**
143 |
144 | With all QP scores calculated on the bulk transcriptome profiles of all tissues, we select 90 most relevant cells of each tissue in the MCA (90 highest scored cells in the MCA to each bulk tissue) as a QP background. We use this QP background further to map our sample single-cell data. Assuming that each cell in each cell type of the MCA takes a unique combination of QP scores to each tissue in ARCHS4, cells in the sample that share similar combination to those in MCA are marked to relate to the corresponding tissue in the MCA. Here, we demonstrate how we constructed the backgrounds as following. We included the background matrices along with the packages such that it can be directly used for convenience.
145 |
146 | **(a) Get QP scores for all annotated cells**
147 | ```r
148 | # Read the QP files from the directory
149 | qp.files.to.read.clean <- list.files("./MCA_All_Tissue_QP/", full.names = T)
150 |
151 | full.qp.mtx.known.annotation <- data.frame()
152 | full.qp.mtx.unknown.annotation <- data.frame()
153 | for (i in 1:length(qp.files.to.read.clean)) {
154 | curr.file <- qp.files.to.read.clean[i]
155 | curr.qp.rslt <- read.csv(curr.file, header = T, row.names = 1, stringsAsFactors = F)
156 |
157 | cells.to.keep <- intersect(rownames(mca.meta), rownames(curr.qp.rslt))
158 | cells.unlabel <- setdiff(rownames(curr.qp.rslt), cells.to.keep)
159 |
160 | curr.sub.mtx.to.keep <- curr.qp.rslt[cells.to.keep, ]
161 | curr.sub.mtx.unlabel <- curr.qp.rslt[cells.unlabel, ]
162 |
163 | if (nrow(full.qp.mtx.known.annotation) <= 0) {
164 | full.qp.mtx.known.annotation <- curr.sub.mtx.to.keep
165 | full.qp.mtx.unknown.annotation <- curr.sub.mtx.unlabel
166 | } else {
167 | full.qp.mtx.known.annotation <- rbind(full.qp.mtx.known.annotation, curr.sub.mtx.to.keep)
168 | full.qp.mtx.unknown.annotation <- rbind(full.qp.mtx.unknown.annotation, curr.sub.mtx.unlabel)
169 | }
170 | }
171 |
172 | full.qp.mtx.known.annotation.qp.score.only <- full.qp.mtx.known.annotation[,c(1:(ncol(full.qp.mtx.known.annotation) - 2))]
173 | ```
174 |
175 | **(b) Selection of 90 cells**
176 | ```r
177 | # Create a map between MCA and ARCHS4
178 | map.df <- data.frame(mca.tissue = c("Embryonic-Mesenchyme", "Embryonic-Stem-Cell", "Trophoblast-Stem-Cell", "Fetal_Brain",
179 | "Neonatal-Calvaria","Fetal_Intestine", "Fetal-Liver", "Fetal_Lung", "Fetal_Stomache",
180 | "Neonatal-Heart", "Neonatal-Muscle",
181 | "Neonatal-Rib", "Neonatal-Skin", "NeonatalPancreas"),
182 | corresponding = c("frxn_embryo", "frxn_embryo", "frxn_embryo", "frxn_brain","frxn_brain",
183 | "frxn_small.intestine", "frxn_liver",
184 | "frxn_lung", "frxn_stomach", "frxn_heart", "frxn_muscle", "frxn_muscle",
185 | "frxn_skin", "frxn_pancreas"),
186 | stringsAsFactors = F)
187 |
188 | # Identify top 90 cells for each tissue
189 | tm.tissue <- unique(map.df$tm.tissue)
190 | cell.selector <- c()
191 | n.sample <- 90
192 | for (i in 1:length(tm.tissue)) {
193 | curr.tissue <- tm.tissue[i]
194 | cell.names <- rownames(mca.meta)[which(mca.meta$tissue == curr.tissue)]
195 | curr.qp.subset <- full.qp.mtx.known.annotation.qp.score.only[cell.names, ]
196 | curr.map <- map.df$corresponding[which(map.df$tm.tissue == curr.tissue)]
197 | if (length(curr.map) <= 1){
198 | curr.qp.subset.sub <- data.frame(score = curr.qp.subset[,curr.map], cell.name = cell.names, stringsAsFactors = F)
199 | } else {
200 | curr.qp.subset.sub <- data.frame(score = rowSums(curr.qp.subset[,curr.map]), cell.name = cell.names, stringsAsFactors = F)
201 | }
202 | curr.qp.subset.sub.sort <- curr.qp.subset.sub[order(-curr.qp.subset.sub$score), ]
203 | cells.to.incl <- curr.qp.subset.sub.sort$cell.name[1:n.sample]
204 |
205 | cell.selector <- c(cell.selector, cells.to.incl)
206 | }
207 | saveRDS(full.qp.mtx.known.annotation.qp.score.only[cell.selector, ], "./MCA_embryonic_background.RDS")
208 | ```
209 |
210 | *Note: This constructed QP background can be saved and reused and does not need to be reconstructed every time.*
211 |
212 | ### Identification of tissue correlate in the reference to the sample single-cell dataset
213 |
214 | To find the correlated tissue in the reference to the sample single-cell dataset, we use a correlation based method. In brief, we calculate Pearson's correlation of the QP scores in a pairwise manner between each cell in the sample and each cell in the reference. Recall the assumption that cells in the sample that share similar combination of QP scores to those in MCA are marked to relate to the corresponding tissue in the MCA. If there is a significant percentage of reference cells of a tissue (over 70%) mapped to a cell, we record the tissue label. Then the frequency of each tissue label is calculated. Tissues with a frequency at least 0.5% (for cell number > 10,000) or at least 100 cells will be selected for further analysis.
215 |
216 | **1. Load the QP background matrix**
217 | ```r
218 | background.qp.fpath <- system.file("extdata", "MCA Adult Background.Rds", package = "Capybara")
219 | background.mtx <- readRDS(background.qp.fpath)
220 | ```
221 |
222 | **2. Load the QP scores of the sample**
223 |
224 | ```r
225 | ## Load QP results
226 | qp.rslt <- read.csv("./baron_bulk_classification_qp_scale.csv", row.names = 1, header = T, stringsAsFactors = F)
227 |
228 | ## Reshape the data
229 | qp.rslt.sub <- qp.rslt[,c(1:(ncol(qp.rslt) - 2))]
230 | ```
231 |
232 | **3. Correlation calculation**
233 |
234 | *Note: we use WGCNA to calculate the correlation*
235 |
236 | ```r
237 | mtx.test <- t(qp.rslt.sub[, colnames(background.mtx)])
238 | ref.test <- t(background.mtx)
239 |
240 | # Pearson's Correlation Calculation
241 | corr.mtx <- WGCNA::cor(ref.test, mtx.test)
242 | ```
243 |
244 | **4. Binarization based on correlation**
245 |
246 | We perform binarization based on the correlation estimates.
247 |
248 | ```r
249 | # Setup a correlation cutoff to the 90th quantile of the correlation matrix
250 | correlation.cutoff <- quantile(corr.mtx, 0.90)
251 |
252 | # Binarization based on the correlation
253 | new.corr.bin <- corr.mtx
254 | new.corr.bin[which(new.corr.bin >= correlation.cutoff)] <- 1
255 | new.corr.bin[which(new.corr.bin < correlation.cutoff)] <- 0
256 | new.corr.bin <- as.data.frame(new.corr.bin)
257 | ```
258 |
259 | **5. Counting the tissues and select the final tissue types**
260 |
261 | Count the frequency of occurrence of each tissue in the tissue list.
262 |
263 | ```r
264 | # Count
265 | count.in.cat <- c()
266 | unique.cat <- unique(unlist(lapply(strsplit(rownames(new.corr.bin), "_"), function(x) x[1])))
267 | for (uc in unique.cat) {
268 | curr.subset <- new.corr.bin[which(startsWith(rownames(new.corr.bin), uc)), c(1:1886)]
269 | count.in.cat[uc] <- sum(colSums(curr.subset) >= nrow(curr.subset) * 0.7)
270 | }
271 |
272 | count.in.cat <- as.data.frame(count.in.cat)
273 | count.in.cat$perc <- round(count.in.cat$count.in.cat *100/sum(count.in.cat$count.in.cat), digits = 3)
274 |
275 | # Check frequency
276 | final.cell.types.adult <- rownames(count.in.cat)[which(count.in.cat$count.in.cat > 100)]
277 | ```
278 |
279 | Below is a composition example for this pancreatic dataset, where we identify 3 major tissues, including stomach, pancreas, and small instestine.
280 |
281 |
282 |
493 |
494 |
655 |
656 |
764 |
765 |