├── .DS_Store
├── .Rbuildignore
├── .Rhistory
├── .gitignore
├── Capybara.Rproj
├── DESCRIPTION
├── NAMESPACE
├── R
    ├── .DS_Store
    ├── Binarization.R
    ├── Classification.R
    ├── EmpiricalPValueCalculation.R
    ├── Helper.R
    ├── HighResolutionReferenceConstruction.R
    ├── MultiIDFilterQP.R
    ├── SingleCellQuadraticProgramming.R
    ├── SingleCellQuadraticProgrammingAux.R
    └── TransitionScoreCalculation.R
├── README.md
├── examples
    ├── .DS_Store
    ├── Monocle_hat_colin.png
    ├── Step 1 - ARCHS4 Mining.R
    ├── Step 2 - bulk cleaning.R
    ├── bulk class mca pancreatic.png
    ├── bulk_composition_pancreatic.pdf
    ├── cardiac_TS_plots.png
    ├── cardiac_bulk_v2.png
    ├── features.tsv
    ├── gene_info.tsv
    ├── pancreatic dot plot.pdf
    └── pancreatic dot plot.png
├── inst
    ├── .DS_Store
    └── extdata
    │   ├── .DS_Store
    │   ├── Bulk Reference RPKM.Rds
    │   ├── Bulk Reference Raw.Rds
    │   ├── MCA Adult Background.Rds
    │   ├── MCA Embryonic Background.Rds
    │   ├── MCA_CellAssignments.csv
    │   ├── baron_dataset.zip
    │   └── features.tsv
└── man
    ├── binarization.mann.whitney.Rd
    ├── binary.to.classification.Rd
    ├── calc.scale.ratio.Rd
    ├── construct.high.res.reference.Rd
    ├── gene.intersect.sub.Rd
    ├── get.least.connected.Rd
    ├── get.mid.connected.Rd
    ├── get.most.connected.Rd
    ├── multi.id.curate.qp.Rd
    ├── normalize.dt.Rd
    ├── perc.calc.aux.Rd
    ├── percentage.calc.Rd
    ├── ref.construction.Rd
    ├── sample.func.Rd
    ├── sc.quad.prog.run.Rd
    ├── single.round.QP.analysis.Rd
    ├── top.genes.Rd
    └── transition.score.Rd


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/.DS_Store


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^Capybara\.Rproj$
2 | ^\.Rproj\.user$
3 | 


--------------------------------------------------------------------------------
/.Rhistory:
--------------------------------------------------------------------------------
  1 | baron.expr <- data.frame()
  2 | baron.meta <- data.frame()
  3 | count <- 1
  4 | for (f in baron.fns) {
  5 | curr.df <- read.csv(f, row.names = 1, stringsAsFactors = F, check.names = F)
  6 | curr.bc <- paste0(rownames(curr.df), "_", curr.df$barcode, "_Sample_", count)
  7 | rownames(curr.df) <- curr.bc
  8 | if (nrow(baron.meta) <= 0) {
  9 | baron.meta <- data.frame(row.names = curr.bc, cell.type = curr.df[curr.bc, "assigned_cluster"], barcode = curr.df[curr.bc, "barcode"],
 10 | stringsAsFactors = F, check.names = F)
 11 | } else {
 12 | curr.meta <- data.frame(row.names = curr.bc, cell.type = curr.df[curr.bc, "assigned_cluster"], barcode = curr.df[curr.bc, "barcode"],
 13 | stringsAsFactors = F, check.names = F)
 14 | baron.meta <- rbind(baron.meta, curr.meta)
 15 | }
 16 | curr.expr <- curr.df[, c(3:ncol(curr.df))]
 17 | if (nrow(baron.expr) <= 0) {
 18 | baron.expr <- curr.expr
 19 | } else  {
 20 | baron.expr <- rbind(baron.expr, curr.expr)
 21 | }
 22 | count <- count +1
 23 | }
 24 | count.mtx <- t(baron.expr)
 25 | write.csv(count.mtx, "~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/baron_et_al_pancreatic.csv", quote = F)
 26 | write.csv(baron.meta, "~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/baron_et_al_pancreatic_meta.csv", quote = F)
 27 | single.round.QP.analysis(ref.df, count.mtx, n.cores = 4, save.to.path = "~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/", save.to.filename = "02_MCA_Based_scClassifier_reference_mix90_test_normalize_select", unix.par = TRUE)
 28 | background.mtx <- read.csv("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/01_MCA_Based_scClassifier_reference_mix90_normalize_select_scale.csv", header = T, row.names = 1, stringsAsFactors = F)
 29 | mtx.test <- read.csv("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/02_MCA_Based_scClassifier_reference_mix90_test_normalize_select_scale.csv", header = T, row.names = 1, stringsAsFactors = F)
 30 | # reference Permutation
 31 | col.sub <- ncol(background.mtx) - 2
 32 | ref.perc.list <- percentage.calc(background.mtx[,c(1:col.sub)], background.mtx[,c(1:col.sub)])
 33 | # Test Permutation
 34 | perc.list <- percentage.calc(as.matrix(mtx.test[,c(1:col.sub)]), as.matrix(background.mtx[,c(1:col.sub)]))
 35 | bin.count <- binarization.mann.whitney(mtx = mtx.test[,c(1:col.sub)], ref.perc.ls = ref.perc.list, ref.meta = ref.list[[2]], perc.ls = perc.list)
 36 | classification <- binary.to.classification(bin.count[,c(1:col.sub)])
 37 | rownames(classification) <- classification$barcode
 38 | classification$actual <- baron.meta[rownames(classification), "cell.type"]
 39 | table.freq <- table(classification$actual, classification$call)
 40 | table.freq.perc <- apply(table.freq, 1, function(x) round(x * 100/sum(x), digits = 3))
 41 | rownames(table.freq.perc)[14] <- "beta"
 42 | table.freq.sub <- as.data.frame(table.freq.perc[c(1,2,4,5,6,7,8,11,12,14), c(1,2,5,8,3,4,6,7,9,10,13)])
 43 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub)
 44 | table.freq.melt <- melt(table.freq.sub)
 45 | table.freq.melt$Capybara.Call <- factor(table.freq.melt$Capybara.Call,
 46 | levels = c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
 47 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"),
 48 | ordered = T)
 49 | table.freq.melt$variable <- factor(table.freq.melt$variable,
 50 | levels = c("B_cell", "beta", "ductal", "endothelial",
 51 | "macrophage", "T_cell", "immune_other", "activated_stellate", "alpha", "delta", "gamma"),
 52 | ordered = T)
 53 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
 54 | geom_point(aes(colour = variable)) +
 55 | scale_size_area(name = "value", max_size=12) +
 56 | scale_color_viridis_c(option = "A", begin = 0.15, end = 0.85) +
 57 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
 58 | theme(legend.position = "right",
 59 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
 60 | axis.text.y = element_text(size = 12, face = "bold.italic"),
 61 | axis.ticks = element_blank(),
 62 | axis.title = element_blank(),
 63 | title = element_text(face = "bold.italic", size = 14),
 64 | panel.grid.major = element_blank(),
 65 | panel.grid.minor = element_blank(),
 66 | panel.background = element_blank(),
 67 | axis.line = element_blank())
 68 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
 69 | geom_point(aes(colour = variable)) +
 70 | scale_size_area(name = "value", max_size=12) +
 71 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
 72 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
 73 | theme(legend.position = "right",
 74 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
 75 | axis.text.y = element_text(size = 12, face = "bold.italic"),
 76 | axis.ticks = element_blank(),
 77 | axis.title = element_blank(),
 78 | title = element_text(face = "bold.italic", size = 14),
 79 | panel.grid.major = element_blank(),
 80 | panel.grid.minor = element_blank(),
 81 | panel.background = element_blank(),
 82 | axis.line = element_blank())
 83 | View(table.freq.perc)
 84 | rownames(table.freq.perc)[14] <- "beta"
 85 | rownames(table.freq.perc)[16] <- "beta"
 86 | View(table.freq.perc)
 87 | table.freq.sub <- as.data.frame(table.freq.perc["B.cell", "beta", "Ductal.cell", "Endothelial.cell",
 88 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell", c(1,2,5,8,3,4,6,7,9,10,13)])
 89 | table.freq.sub <- as.data.frame(table.freq.perc["B.cell", "beta", "Ductal.cell", "Endothelial.cell",
 90 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)])
 91 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub)
 92 | table.freq.sub <- as.data.frame(table.freq.perc[c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
 93 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)])
 94 | View(table.freq.perc)
 95 | table.freq <- table(classification$actual, classification$call)
 96 | table.freq.perc <- apply(table.freq, 1, function(x) round(x * 100/sum(x), digits = 3))
 97 | rownames(table.freq.perc)[16] <- "beta"
 98 | View(table.freq.perc)
 99 | table.freq.sub <- as.data.frame(table.freq.perc[c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
100 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)])
101 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub)
102 | table.freq.melt <- melt(table.freq.sub)
103 | table.freq.melt$Capybara.Call <- factor(table.freq.melt$Capybara.Call,
104 | levels = c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
105 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"),
106 | ordered = T)
107 | table.freq.melt$variable <- factor(table.freq.melt$variable,
108 | levels = c("B_cell", "beta", "ductal", "endothelial",
109 | "macrophage", "T_cell", "immune_other", "activated_stellate", "alpha", "delta", "gamma"),
110 | ordered = T)
111 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
112 | geom_point(aes(colour = value)) +
113 | scale_size_area(name = "value", max_size=12) +
114 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
115 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
116 | theme(legend.position = "right",
117 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
118 | axis.text.y = element_text(size = 12, face = "bold.italic"),
119 | axis.ticks = element_blank(),
120 | axis.title = element_blank(),
121 | title = element_text(face = "bold.italic", size = 14),
122 | panel.grid.major = element_blank(),
123 | panel.grid.minor = element_blank(),
124 | panel.background = element_blank(),
125 | axis.line = element_blank())
126 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
127 | geom_point(aes(colour = value)) +
128 | scale_size_area(name = "value", max_size=12) +
129 | scale_color_viridis_c(option = "A", begin = 0.15, end = 0.85) +
130 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
131 | theme(legend.position = "right",
132 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
133 | axis.text.y = element_text(size = 12, face = "bold.italic"),
134 | axis.ticks = element_blank(),
135 | axis.title = element_blank(),
136 | title = element_text(face = "bold.italic", size = 14),
137 | panel.grid.major = element_blank(),
138 | panel.grid.minor = element_blank(),
139 | panel.background = element_blank(),
140 | axis.line = element_blank())
141 | table.freq.sub <- as.data.frame(table.freq.perc[c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
142 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Endocrine.cell"), c(1,2,5,8,3,4,6,7,9,10,13)])
143 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub)
144 | table.freq.melt <- melt(table.freq.sub)
145 | table.freq.melt$Capybara.Call <- factor(table.freq.melt$Capybara.Call,
146 | levels = c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
147 | "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", "Multi_ID", "Endocrine.cell"),
148 | ordered = T)
149 | table.freq.melt$variable <- factor(table.freq.melt$variable,
150 | levels = c("B_cell", "beta", "ductal", "endothelial",
151 | "macrophage", "T_cell", "immune_other", "activated_stellate", "alpha", "delta", "gamma"),
152 | ordered = T)
153 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
154 | geom_point(aes(colour = variable)) +
155 | scale_size_area(name = "value", max_size=12) +
156 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
157 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
158 | theme(legend.position = "right",
159 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
160 | axis.text.y = element_text(size = 12, face = "bold.italic"),
161 | axis.ticks = element_blank(),
162 | axis.title = element_blank(),
163 | title = element_text(face = "bold.italic", size = 14),
164 | panel.grid.major = element_blank(),
165 | panel.grid.minor = element_blank(),
166 | panel.background = element_blank(),
167 | axis.line = element_blank())
168 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
169 | geom_point(aes(colour = variable)) +
170 | scale_size_area(name = "value", max_size=12) +
171 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
172 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
173 | theme(legend.position = "right",
174 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
175 | axis.text.y = element_text(size = 12, face = "bold.italic"),
176 | axis.ticks = element_blank(),
177 | axis.title = element_blank(),
178 | title = element_text(face = "bold.italic", size = 14),
179 | panel.grid.major = element_blank(),
180 | panel.grid.minor = element_blank(),
181 | panel.background = element_blank(),
182 | axis.line = element_line(colour = "black", size = 1))
183 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
184 | geom_point(aes(colour = variable)) +
185 | scale_size_area(name = "value", max_size=12) +
186 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
187 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
188 | guides(fill = FALSE, color = FALSE) +
189 | theme(legend.position = "right",
190 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
191 | axis.text.y = element_text(size = 12, face = "bold.italic"),
192 | axis.ticks = element_blank(),
193 | axis.title = element_blank(),
194 | title = element_text(face = "bold.italic", size = 14),
195 | panel.grid.major = element_blank(),
196 | panel.grid.minor = element_blank(),
197 | panel.background = element_blank(),
198 | axis.line = element_line(colour = "black", size = 1))
199 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
200 | geom_point(aes(colour = variable)) +
201 | scale_size_area(name = "value", max_size=12) +
202 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
203 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
204 | guides(fill = FALSE, color = FALSE) +
205 | theme(legend.position = "bottom",
206 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
207 | axis.text.y = element_text(size = 12, face = "bold.italic"),
208 | axis.ticks = element_blank(),
209 | axis.title = element_blank(),
210 | title = element_text(face = "bold.italic", size = 14),
211 | panel.grid.major = element_blank(),
212 | panel.grid.minor = element_blank(),
213 | panel.background = element_blank(),
214 | axis.line = element_line(colour = "black", size = 1))
215 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
216 | geom_point(aes(colour = variable)) +
217 | scale_size_area(name = "Percentage", max_size=12) +
218 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
219 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
220 | guides(fill = FALSE, color = FALSE) +
221 | theme(legend.position = "bottom",
222 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
223 | axis.text.y = element_text(size = 12, face = "bold.italic"),
224 | axis.ticks = element_blank(),
225 | axis.title = element_blank(),
226 | title = element_text(face = "bold.italic", size = 14),
227 | panel.grid.major = element_blank(),
228 | panel.grid.minor = element_blank(),
229 | panel.background = element_blank(),
230 | axis.line = element_line(colour = "black", size = 1))
231 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/dot plot.pdf", width = 6, height = 8, paper = "special")
232 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
233 | geom_point(aes(colour = variable)) +
234 | scale_size_area(name = "Percentage", max_size=12) +
235 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
236 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
237 | guides(fill = FALSE, color = FALSE) +
238 | theme(legend.position = "bottom",
239 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
240 | axis.text.y = element_text(size = 12, face = "bold.italic"),
241 | axis.ticks = element_blank(),
242 | axis.title = element_blank(),
243 | title = element_text(face = "bold.italic", size = 14),
244 | panel.grid.major = element_blank(),
245 | panel.grid.minor = element_blank(),
246 | panel.background = element_blank(),
247 | axis.line = element_line(colour = "black", size = 1))
248 | dev.off()
249 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/dot plot.pdf", width = 7, height = 9, paper = "special")
250 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
251 | geom_point(aes(colour = variable)) +
252 | scale_size_area(name = "Percentage", max_size=12) +
253 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
254 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
255 | guides(fill = FALSE, color = FALSE) +
256 | theme(legend.position = "bottom",
257 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
258 | axis.text.y = element_text(size = 12, face = "bold.italic"),
259 | axis.ticks = element_blank(),
260 | axis.title = element_blank(),
261 | title = element_text(face = "bold.italic", size = 14),
262 | panel.grid.major = element_blank(),
263 | panel.grid.minor = element_blank(),
264 | panel.background = element_blank(),
265 | axis.line = element_line(colour = "black", size = 1))
266 | dev.off()
267 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/dot plot.pdf", width = 8, height = 9, paper = "special")
268 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
269 | geom_point(aes(colour = variable)) +
270 | scale_size_area(name = "Percentage", max_size=12) +
271 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
272 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
273 | guides(fill = FALSE, color = FALSE) +
274 | theme(legend.position = "bottom",
275 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
276 | axis.text.y = element_text(size = 12, face = "bold.italic"),
277 | axis.ticks = element_blank(),
278 | axis.title = element_blank(),
279 | title = element_text(face = "bold.italic", size = 14),
280 | panel.grid.major = element_blank(),
281 | panel.grid.minor = element_blank(),
282 | panel.background = element_blank(),
283 | axis.line = element_line(colour = "black", size = 1))
284 | dev.off()
285 | baron.expr <- read.csv("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/baron_et_al_pancreatic.csv", header = T, row.names = 1, stringsAsFactors = F)
286 | View(baron.meta)
287 | View(mca)
288 | View(mca.meta)
289 | # Background cells
290 | mca <- read.csv("~/Box/Morris Lab/Classifier Analysis/Reference datasets/MCA/MCA_CellAssignments.csv",
291 | row.names = 1, header = T, stringsAsFactors = F)
292 | View(mca)
293 | # Fetal bulk testing
294 | MCA.fetal.tissue <- c("Embryonic-Mesenchyme", "Embryonic-Stem-Cell", "Fetal_Brain",
295 | "Fetal_Intestine", "Fetal-Liver", "Fetal_Lung", "Fetal_Stomache",
296 | "Neonatal_Brain", "Neonatal-Calvaria", "Neonatal-Heart", "Neonatal-Muscle",
297 | "Neonatal-Rib", "Neonatal-Skin", "Trophoblast-Stem-Cell", "NeonatalPancreas")
298 | library(devtools)
299 | library(roxygen2)
300 | setwd("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Capybara/")
301 | document()
302 | install(".")
303 | library(Capybara)
304 | syste
305 | system.file(system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara"))
306 | system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara")
307 | read.csv(system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara"))
308 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
309 | geom_bar(stat = "identity") +
310 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
311 | scale_fill_manual(
312 | name = "Mapped MCA Tissue",
313 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
314 | RColorBrewer::brewer.pal(8, "Set2"),
315 | RColorBrewer::brewer.pal(8, "Set3"))
316 | ) +
317 | labs(y = "Percentage of Cells") +
318 | ggtitle("Bulk Classification to MCA Tissues") +
319 | theme(legend.position = "none",
320 | axis.ticks = element_blank(),
321 | axis.title.x = element_blank(),
322 | axis.text = element_text(face = "bold.italic", size = 12),
323 | title = element_text(face = "bold.italic", size = 14),
324 | panel.grid.major = element_blank(),
325 | panel.grid.minor = element_blank(),
326 | panel.background = element_blank(),
327 | axis.line.x = element_blank(),
328 | axis.line.y = element_line(color = "black", size =1))
329 | comp.raw <- count.in.cat
330 | comp.raw <- comp.raw[order(-comp.raw$perc), ]
331 | comp.raw$Var1 <- rownames(comp.raw)
332 | comp.raw$Var1 <- factor(comp.raw$Var1, comp.raw$Var1, ordered = T)
333 | comp.raw$label <- "Mouse Pancreatic Cells"
334 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
335 | geom_bar(stat = "identity") +
336 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
337 | scale_fill_manual(
338 | name = "Mapped MCA Tissue",
339 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
340 | RColorBrewer::brewer.pal(8, "Set2"),
341 | RColorBrewer::brewer.pal(8, "Set3"))
342 | ) +
343 | labs(y = "Percentage of Cells") +
344 | ggtitle("Bulk Classification to MCA Tissues") +
345 | theme(legend.position = "none",
346 | axis.ticks = element_blank(),
347 | axis.title.x = element_blank(),
348 | axis.text = element_text(face = "bold.italic", size = 12),
349 | title = element_text(face = "bold.italic", size = 14),
350 | panel.grid.major = element_blank(),
351 | panel.grid.minor = element_blank(),
352 | panel.background = element_blank(),
353 | axis.line.x = element_blank(),
354 | axis.line.y = element_line(color = "black", size =1))
355 | background.mca <- readRDS("~/Box/Morris Lab/Classifier Analysis/ARCHS4 Reference/MCA ARCHS4/06_qp_top_90_each_category_backgroung.Rds")
356 | background.mtx <- background.mca
357 | mtx.test <- qp.paga.rslt.sub[,colnames(background.mtx)]
358 | ### cor test
359 | ref.test <- t(background.mtx)
360 | mtx.test.cor <- t(qp.paga.rslt.sub)
361 | corr.mtx <- WGCNA::cor(ref.test, mtx.test.cor)
362 | correlation.cutoff <- quantile(corr.mtx, 0.90)
363 | new.corr.bin <- corr.mtx
364 | new.corr.bin[which(new.corr.bin >= correlation.cutoff)] <- 1
365 | new.corr.bin[which(new.corr.bin < correlation.cutoff)] <- 0
366 | new.corr.bin <- as.data.frame(new.corr.bin)
367 | new.corr.bin$cell.bc.ref <- rownames(new.corr.bin)
368 | new.corr.bin.melt <- reshape2::melt(new.corr.bin)
369 | new.corr.bin.melt.sub <- new.corr.bin.melt[which(new.corr.bin.melt$value > 0),]
370 | new.corr.bin.melt.sub$cell.type <- unlist(lapply(strsplit(new.corr.bin.melt.sub$cell.bc.ref, "_"), function(x) x[1]))
371 | count.in.cat <- c()
372 | unique.cat <- unique(unlist(lapply(strsplit(rownames(new.corr.bin), "_"), function(x) x[1])))
373 | for (uc in unique.cat) {
374 | curr.subset <- new.corr.bin[which(startsWith(rownames(new.corr.bin), uc)), c(1:1886)]
375 | count.in.cat[uc] <- sum(colSums(curr.subset) >= nrow(curr.subset) * 0.7)
376 | }
377 | count.in.cat <- as.data.frame(count.in.cat)
378 | count.in.cat$perc <- round(count.in.cat$count.in.cat *100/sum(count.in.cat$count.in.cat), digits = 3)
379 | final.cell.types.adult <- rownames(count.in.cat)[which(count.in.cat$count.in.cat > 100)]
380 | comp.raw <- count.in.cat
381 | comp.raw <- comp.raw[order(-comp.raw$perc), ]
382 | comp.raw$Var1 <- rownames(comp.raw)
383 | comp.raw$Var1 <- factor(comp.raw$Var1, comp.raw$Var1, ordered = T)
384 | comp.raw$label <- "Mouse Pancreatic Cells"
385 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
386 | geom_bar(stat = "identity") +
387 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
388 | scale_fill_manual(
389 | name = "Mapped MCA Tissue",
390 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
391 | RColorBrewer::brewer.pal(8, "Set2"),
392 | RColorBrewer::brewer.pal(8, "Set3"))
393 | ) +
394 | labs(y = "Percentage of Cells") +
395 | ggtitle("Bulk Classification to MCA Tissues") +
396 | theme(legend.position = "none",
397 | axis.ticks = element_blank(),
398 | axis.title.x = element_blank(),
399 | axis.text = element_text(face = "bold.italic", size = 12),
400 | title = element_text(face = "bold.italic", size = 14),
401 | panel.grid.major = element_blank(),
402 | panel.grid.minor = element_blank(),
403 | panel.background = element_blank(),
404 | axis.line.x = element_blank(),
405 | axis.line.y = element_line(color = "black", size =1))
406 | View(comp.raw)
407 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/bulk_composition_pancreatic.pdf", width = 3, height = 8, paper = "special")
408 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
409 | geom_bar(stat = "identity") +
410 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
411 | scale_fill_manual(
412 | name = "Mapped MCA Tissue",
413 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
414 | RColorBrewer::brewer.pal(8, "Set2"),
415 | RColorBrewer::brewer.pal(8, "Set3"))
416 | ) +
417 | labs(y = "Percentage of Cells") +
418 | ggtitle("Bulk Classification to \n MCA Tissues") +
419 | theme(legend.position = "none",
420 | axis.ticks = element_blank(),
421 | axis.title.x = element_blank(),
422 | axis.text = element_text(face = "bold.italic", size = 12),
423 | title = element_text(face = "bold.italic", size = 14),
424 | panel.grid.major = element_blank(),
425 | panel.grid.minor = element_blank(),
426 | panel.background = element_blank(),
427 | axis.line.x = element_blank(),
428 | axis.line.y = element_line(color = "black", size =1))
429 | dev.off()
430 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/bulk_composition_pancreatic.pdf", width = 4, height = 8, paper = "special")
431 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
432 | geom_bar(stat = "identity") +
433 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
434 | scale_fill_manual(
435 | name = "Mapped MCA Tissue",
436 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
437 | RColorBrewer::brewer.pal(8, "Set2"),
438 | RColorBrewer::brewer.pal(8, "Set3"))
439 | ) +
440 | labs(y = "Percentage of Cells") +
441 | ggtitle("Bulk Classification to \n MCA Tissues") +
442 | theme(legend.position = "none",
443 | axis.ticks = element_blank(),
444 | axis.title.x = element_blank(),
445 | axis.text = element_text(face = "bold.italic", size = 12),
446 | title = element_text(face = "bold.italic", size = 14),
447 | panel.grid.major = element_blank(),
448 | panel.grid.minor = element_blank(),
449 | panel.background = element_blank(),
450 | axis.line.x = element_blank(),
451 | axis.line.y = element_line(color = "black", size =1))
452 | dev.off()
453 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
454 | geom_bar(stat = "identity") +
455 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
456 | scale_fill_manual(
457 | name = "Mapped MCA Tissue",
458 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
459 | RColorBrewer::brewer.pal(8, "Set2"),
460 | RColorBrewer::brewer.pal(8, "Set3"))
461 | ) +
462 | labs(y = "Percentage of Cells") +
463 | ggtitle("Bulk Classification to \nMCA Tissues") +
464 | theme(legend.position = "none",
465 | axis.ticks = element_blank(),
466 | axis.title.x = element_blank(),
467 | axis.text = element_text(face = "bold.italic", size = 12),
468 | title = element_text(face = "bold.italic", size = 14),
469 | panel.grid.major = element_blank(),
470 | panel.grid.minor = element_blank(),
471 | panel.background = element_blank(),
472 | axis.line.x = element_blank(),
473 | axis.line.y = element_line(color = "black", size =1))
474 | pdf("~/Desktop/Morris Lab/Paper/Manuscript/scClassifier/Pancreatic Baron et al/bulk_composition_pancreatic.pdf", width = 4, height = 8, paper = "special")
475 | ggplot(comp.raw, aes(x = comp.raw$label, y = comp.raw$perc, fill = comp.raw$Var1, label = comp.raw$Var1)) +
476 | geom_bar(stat = "identity") +
477 | geom_text(position = position_stack(vjust = 0.5), fontface = "bold", aes(size = comp.raw$perc)) +
478 | scale_fill_manual(
479 | name = "Mapped MCA Tissue",
480 | values = c(RColorBrewer::brewer.pal(12, "Paired"),
481 | RColorBrewer::brewer.pal(8, "Set2"),
482 | RColorBrewer::brewer.pal(8, "Set3"))
483 | ) +
484 | labs(y = "Percentage of Cells") +
485 | ggtitle("Bulk Classification to \nMCA Tissues") +
486 | theme(legend.position = "none",
487 | axis.ticks = element_blank(),
488 | axis.title.x = element_blank(),
489 | axis.text = element_text(face = "bold.italic", size = 12),
490 | title = element_text(face = "bold.italic", size = 14),
491 | panel.grid.major = element_blank(),
492 | panel.grid.minor = element_blank(),
493 | panel.background = element_blank(),
494 | axis.line.x = element_blank(),
495 | axis.line.y = element_line(color = "black", size =1))
496 | dev.off()
497 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
498 | geom_point(aes(colour = variable)) +
499 | scale_size_area(name = "Percentage", max_size=12) +
500 | scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
501 | ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
502 | guides(fill = FALSE, color = FALSE) +
503 | theme(legend.position = "bottom",
504 | axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
505 | axis.text.y = element_text(size = 12, face = "bold.italic"),
506 | axis.ticks = element_blank(),
507 | axis.title = element_blank(),
508 | title = element_text(face = "bold.italic", size = 14),
509 | panel.grid.major = element_blank(),
510 | panel.grid.minor = element_blank(),
511 | panel.background = element_blank(),
512 | axis.line = element_line(colour = "black", size = 1))
513 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | 


--------------------------------------------------------------------------------
/Capybara.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: Capybara
 2 | Title: A tool to measure cell identity and fate transitions
 3 | Version: 0.0.0.9000
 4 | Authors@R: 
 5 |     person(given = "Sam",
 6 |            family = "Morris",
 7 |            role = c("aut", "cre"),
 8 |            email = "s.morris@wustl.edu",
 9 |            comment = c(ORCID = "YOUR-ORCID-ID"))
10 | Description: What the package does (one paragraph).
11 | Depends: R (>= 3.5.0),
12 |     pheatmap,
13 |     viridis,
14 |     tools,
15 |     psych,
16 |     MVA,
17 |     data.table,
18 |     pbmcapply,
19 |     reshape,
20 |     Matrix,
21 |     tidyverse,
22 |     snow,
23 |     parallel,
24 |     plyr,
25 |     dplyr,
26 |     ggplot2,
27 |     quadprog,
28 |     limma,
29 |     WGCNA,
30 |     glmnet
31 | License: What license it uses
32 | Encoding: UTF-8
33 | LazyData: true
34 | RoxygenNote: 7.1.2
35 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(binarization.mann.whitney)
 4 | export(binary.to.classification)
 5 | export(calc.scale.ratio)
 6 | export(construct.high.res.reference)
 7 | export(gene.intersect.sub)
 8 | export(get.least.connected)
 9 | export(get.mid.connected)
10 | export(get.most.connected)
11 | export(multi.id.curate.qp)
12 | export(normalize.dt)
13 | export(perc.calc.aux)
14 | export(percentage.calc)
15 | export(ref.construction)
16 | export(sample.func)
17 | export(sc.quad.prog.run)
18 | export(single.round.QP.analysis)
19 | export(top.genes)
20 | export(transition.score)
21 | 


--------------------------------------------------------------------------------
/R/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/R/.DS_Store


--------------------------------------------------------------------------------
/R/Binarization.R:
--------------------------------------------------------------------------------
  1 | #' Binarization and Identity Calling from Identity Score
  2 | #'
  3 | #' This function calls single identity or multiple identities of query cells from empirical p-values. Inferred cell types are marked by 1 in a binarized matrix.
  4 | #' @param mtx The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types
  5 | #' @param ref.perc.ls Emprical p-values for reference cells
  6 | #' @param ref.meta The celltype meta information for reference cells
  7 | #' @param perc.ls Emprical p-values for query cells
  8 | #' @param bulk If the reference data type is bulk RNA-seq. The default is bulk = FALSE
  9 | #' @param map.df bulk mapping. The default is bulk = FALSE
 10 | #' @param init.class initial classification. The default is init.class = NULL
 11 | #' @keywords binarization, identity calling
 12 | #' @note
 13 | #' @export
 14 | #' @examples
 15 | #'
 16 | binarization.mann.whitney <- function(mtx, ref.perc.ls, ref.meta, perc.ls, bulk = FALSE, map.df = NULL, init.class = NULL) {
 17 |   p_vals <- matrix(nrow = nrow(mtx), ncol = (ncol(mtx) - 1))
 18 |   type_rank_in_order_mtx <- c()
 19 |   which_type <- c()
 20 |   
 21 |   all.indx <- seq(1,length(perc.ls))
 22 |   
 23 |   benchmark <- list()
 24 |   benchmark.final <- c()
 25 |   
 26 |   for (i in 1:length(ref.perc.ls)) {
 27 |     curr.cell <- names(ref.perc.ls[[i]])
 28 |     perc_mtx <- as.data.frame(ref.perc.ls[[i]][[1]])
 29 |     if (curr.cell %in% rownames(ref.meta)) {
 30 |       curr.ct <- ref.meta[curr.cell, "cell.type"]
 31 |       if (endsWith(curr.ct, " ")) curr.ct <- substr(curr.ct, start = 1, stop = (nchar(curr.ct) - 1))
 32 |       curr.cell.cell.type <- gsub(" ", ".", curr.ct)
 33 |       curr.cell.cell.type <- gsub("−", ".", curr.cell.cell.type)
 34 |       curr.cell.cell.type <- gsub("-", ".", curr.cell.cell.type)
 35 |       curr.cell.cell.type <- gsub("/", ".", curr.cell.cell.type)
 36 |       curr.cell.cell.type <- gsub("&", ".", curr.cell.cell.type)
 37 |       curr.cell.cell.type <- gsub("\\(", ".", curr.cell.cell.type)
 38 |       curr.cell.cell.type <- gsub("\\)", ".", curr.cell.cell.type)
 39 |       if (!bulk) {
 40 |         curr.bm <- paste0("frxn_cell.type_", curr.cell.cell.type)
 41 |       } else {
 42 |         if (is.null(map.df)) {
 43 |           stop("No bulk mapping found!")
 44 |         }
 45 |         curr.bm <- map.df[which(map.df$tissue == curr.ct), "corresponding"]
 46 |       }
 47 |       curr.celltype.perm <- perc_mtx[, curr.bm]
 48 |       if (length(benchmark) <= 0) {
 49 |         benchmark[[curr.bm]] <- curr.celltype.perm
 50 |       } else {
 51 |         benchmark[[curr.bm]] <- c(benchmark[[curr.bm]], curr.celltype.perm)
 52 |       }
 53 |     }
 54 |   }
 55 |   
 56 |   for (i in 1:length(benchmark)) {
 57 |     curr.name <- names(benchmark)[i]
 58 |     curr.benchmark <- benchmark[[i]]
 59 |     curr.benchmark <- curr.benchmark[which(curr.benchmark < 1)]
 60 |     curr.outliers <- boxplot.stats(curr.benchmark)$out
 61 |     higher.end.outliers <- curr.outliers[which(curr.outliers > getmode(curr.benchmark))]
 62 |     if (length(higher.end.outliers) > 0) benchmark[[i]] <- curr.benchmark[which(curr.benchmark <= min(higher.end.outliers))]
 63 |     benchmark.final[curr.name] <- max(benchmark[[i]][which(benchmark[[i]] < 1)])
 64 |   }
 65 |   
 66 |   if (!("Unknown" %in% names(table(init.class$init.class)))) {init.class = NULL}
 67 |   
 68 |   for (i in all.indx) {
 69 |     perc_mtx<-as.matrix((perc.ls[[i]][[1]]))
 70 |     vec<-as.vector(perc_mtx)
 71 |     rank<-rank(vec,ties.method = "average")
 72 |     type_rank<-as.matrix(t(colSums(matrix(rank, nrow=50))))
 73 |     colnames(type_rank)<-colnames(perc_mtx)
 74 |     type_rank_in_order<-colnames(type_rank)[order(type_rank,decreasing = F)]
 75 |     type_rank_in_order_mtx<-rbind(type_rank_in_order_mtx,type_rank_in_order)
 76 |     
 77 |     if (!is.null(init.class)) {
 78 |       if(init.class[names(perc.ls[[i]]), "init.class"] == "Unknown") {
 79 |         which_type <- c(which_type, 0)
 80 |       } else {
 81 |         if (init.class[names(perc.ls[[i]]), "init.class"] == "Unknown.Progenitor") {
 82 |           which_type <- c(which_type, -1)
 83 |         } else {
 84 |           for (j in 1:(length(colnames(type_rank))-1)) {
 85 |             
 86 |             a<-type_rank_in_order[1]
 87 |             b<-type_rank_in_order[j+1]
 88 |             
 89 |             test<-wilcox.test(x=perc_mtx[,a],y=perc_mtx[,b],alternative = "less")
 90 |             p_vals[i,j]<-test$p.value
 91 |             if(test$p.value<.05 & (init.class[names(perc.ls[[i]]), "init.class"] == "Single-ID")){
 92 |               which_type<-c(which_type,j)
 93 |               break
 94 |             } else {
 95 |               if (j == (ncol(type_rank_in_order_mtx) - 1) | (init.class[names(perc.ls[[i]]), "init.class"] == "Multi-ID")) {
 96 |                 which_type <- c(which_type, j+1)
 97 |               }
 98 |             }
 99 |           }
100 |         }
101 |       }
102 |     } else {
103 |       if(mean(perc_mtx[,type_rank_in_order[1]]) >= (benchmark.final[type_rank_in_order[1]] + 0.01)) {
104 |         which_type <- c(which_type, 0)
105 |       } else {
106 |         for (j in 1:(length(colnames(type_rank))-1)) {
107 |           
108 |           a<-type_rank_in_order[1]
109 |           b<-type_rank_in_order[j+1]
110 |           
111 |           test<-wilcox.test(x=perc_mtx[,a],y=perc_mtx[,b],alternative = "less")
112 |           p_vals[i,j]<-test$p.value
113 |           if(test$p.value<.05){
114 |             which_type<-c(which_type,j)
115 |             break
116 |           } else {
117 |             if (j == (ncol(type_rank_in_order_mtx) - 1)) {
118 |               which_type <- c(which_type, j+1)
119 |             }
120 |           }
121 |         }
122 |       }
123 |     }
124 |   }
125 |   
126 |   type_rank_in_order_mtx <- as.matrix(type_rank_in_order_mtx)
127 |   binary.mtx<-matrix(0,nrow = nrow(mtx), ncol = ncol(mtx))
128 |   colnames(binary.mtx)<-colnames(mtx)
129 |   cellnames<-c()
130 |   for (i in 1:length(which_type)) {
131 |     cellnames <- c(cellnames, names(perc.ls[[i]]))
132 |     if (which_type[i] > 0) {
133 |       for (j in 1:which_type[i]) {
134 |         binary.mtx[i,type_rank_in_order_mtx[i,j]] <- 1
135 |       }
136 |     }
137 |     if (which_type[i] < 0) {
138 |       for (j in 1:which_type[i]) {
139 |         binary.mtx[i,type_rank_in_order_mtx[i,j]] <- -1
140 |       }
141 |     }
142 |   }
143 |   rownames(binary.mtx)<-cellnames
144 |   
145 |   return(binary.mtx)
146 | }
147 | 


--------------------------------------------------------------------------------
/R/Classification.R:
--------------------------------------------------------------------------------
 1 | #' Generate classification results from the binarized result matrix
 2 | #'
 3 | #' This function genrate classification table from the binarized result matrix
 4 | #' @param bin.count.rslt The binarized classification result matrix
 5 | #' @keywords calssification, result
 6 | #' @note 
 7 | #' @export
 8 | #' @examples
 9 | #' 
10 | binary.to.classification <- function(bin.count.rslt) {
11 |   class.rslt <- data.frame()
12 |   bin.count.rsums <- rowSums(bin.count.rslt)
13 |   for (i in 1:nrow(bin.count.rslt)) {
14 |     curr.cell.bc <- rownames(bin.count.rslt)[i]
15 |     if (bin.count.rsums[i] > 1) {
16 |       curr.df <- data.frame(barcode = curr.cell.bc, call = "Multi_ID", stringsAsFactors = F)
17 |     } else {
18 |       if (bin.count.rsums[i] == 0) {
19 |         curr.df <- data.frame(barcode = curr.cell.bc, call = "Unknown", stringsAsFactors = F)
20 |       } 
21 |       else {
22 |         if (bin.count.rsums[i] < 0) {
23 |           curr.df <- data.frame(barcode = curr.cell.bc, call = "Unknown.Progenitor", stringsAsFactors = F)
24 |         } else {
25 |         idx <- which(bin.count.rslt[i,] == 1)
26 |         identity <- unlist(lapply(strsplit(colnames(bin.count.rslt), "frxn_cell.type_"), function(x) x[length(x)]))[idx]
27 |         curr.df <- data.frame(barcode = curr.cell.bc, call = identity[length(identity)], stringsAsFactors = F)
28 |         }
29 |       }
30 |     }
31 |     if (nrow(class.rslt) <= 0) {
32 |       class.rslt <- curr.df
33 |     } else {
34 |       class.rslt <- rbind(class.rslt, curr.df)
35 |     }
36 |   }
37 |   return(class.rslt)
38 | }
39 | 


--------------------------------------------------------------------------------
/R/EmpiricalPValueCalculation.R:
--------------------------------------------------------------------------------
 1 | #' Empirical p-value of Identity Score Calculation
 2 | #'
 3 | #' This function resamples from a sample dataset and returns an empirical p-value of identity score
 4 | #' @param dens.x The background dataset that we compare the identity score of our sample with
 5 | #' @param curr.val The identity score of the sample dataset to be evaluated its significance 
 6 | #' @param n The number of times of resampling. The default is n=1000
 7 | #' @keywords resampling, empirical p-value
 8 | #' @note 
 9 | #' @export
10 | #' @examples
11 | #' 
12 | sample.func <- function(dens.x, curr.val, prob, n = 1000) {
13 |   samp <- sample(1:length(dens.x), n, replace = T, prob = prob)
14 |   samp.val <- dens.x[samp]
15 |   perc <- sum(samp.val > curr.val)/n
16 |   return(perc)
17 | }
18 | 
19 | #' Emprical p-values for query cells
20 | #'
21 | #' This function returns a list of emprical p-value matrices. Every matrix in the list contains emprical p-values of all possilbe celltypes for a single query cells under test.times resampled backgrounds.
22 | #' @param mtx The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types
23 | #' @param bkgd.mtx The matrix of identity score of reference cells. The row number is the total number of reference cells and the column number is the number of total number of possible cell types
24 | #' @keywords resampling, empirical p-value, all query cells
25 | #' @note 
26 | #' @export
27 | #' @examples
28 | #' 
29 | percentage.calc <- function(mtx, bkgd.mtx) {
30 |   cnms <- colnames(mtx)
31 |   cell.names <- rownames(mtx)
32 |   mtx.cell.num <- nrow(mtx)
33 |   mtx.ct.num <- ncol(mtx)
34 |   test.times <- 50
35 |   
36 |   perc.ls <- pbmclapply(as.list(seq_len(mtx.cell.num)), perc.calc.aux, cell.names = cell.names, mtx.ct.num = mtx.ct.num, 
37 |                         background.mtx = bkgd.mtx, mtx = mtx, cnms = cnms, mc.cores = 4)
38 |   
39 |   return(perc.ls)
40 | }
41 | 
42 | #' Emprical p-values for a query cell
43 | #'
44 | #' This function returns emprical p-values of all possilbe celltypes for a query cells under test.times resampled backgrounds.
45 | #' @param l The index of the sample cell of query 
46 | #' @param cell.names A vector of sample cells' names
47 | #' @param mtx.ct.num The number of all possible cell types
48 | #' @param background.mtx The matrix of identity score of reference cells. The row number is the total number of reference cells and the column number is the number of total number of possible cell types
49 | #' @param mtx The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types
50 | #' @param cnms The names of all possible cell types
51 | #' @param test.times The number of times resampling the background. The default is test.times=50
52 | #' @keywords resampling, empirical p-value, single query
53 | #' @note 
54 | #' @export
55 | #' @examples
56 | #' 
57 | perc.calc.aux <- function(l, cell.names, mtx.ct.num, background.mtx, mtx, cnms, test.times = 50) {
58 |   curr.cell <- cell.names[l]
59 |   perc.vec <- lapply(seq(1, mtx.ct.num),
60 |                      function(x) {
61 |                        dens <- density(background.mtx[,x], n = nrow(mtx))
62 |                        curr.num <- mtx[l, x]
63 |                        if (curr.num <= .Machine$double.eps * 2) {
64 |                          pc.v <- rep(1, test.times)
65 |                        } else {
66 |                          pc.v <- apply(as.data.frame(seq_len(test.times)), 1, function(x) sample.func(dens.x = dens$x, curr.val = curr.num, prob = dens$y, n = 1000))
67 |                        }
68 |                        return(list(cnms[x], pc.v))
69 |                      })
70 |   comb.perc <- rbindlist(perc.vec)
71 |   comb.perc$label <- rep(seq(1,test.times), length(unique(comb.perc$V1)))
72 |   
73 |   comb.df <- dcast(comb.perc, label~V1, value.var = "V2")
74 |   comb.df <- comb.df[, -c(1)]
75 |   curr.list <- list()
76 |   curr.list[[curr.cell]] <- comb.df
77 |   return(curr.list)
78 | }
79 | 
80 | 
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/R/Helper.R:
--------------------------------------------------------------------------------
 1 | #' Get the Most Connected Cells
 2 | #'
 3 | #' This function returns index of cells that are the most connected within a cell type
 4 | #' @param mtx The normalized reference count matrix
 5 | #' @param n.sample The number of reference cells to be included within each pseudo-bulk
 6 | #' @export
 7 | #'
 8 | get.most.connected <- function(mtx, n.sample) {
 9 |   corr.mtx <- WGCNA::cor(mtx)
10 |   corr.mtx.upper <- corr.mtx * upper.tri(corr.mtx)
11 |   corr.mtx.melt <- melt(corr.mtx.upper)
12 |   corr.mtx.melt.pos <- corr.mtx.melt[which(corr.mtx.melt$value > 0), ]
13 |   corr.mtx.melt.pos.sort <- corr.mtx.melt.pos[order(-corr.mtx.melt.pos$value), ]
14 |   corr.mtx.melt.pos.sort$X1 <- as.character(corr.mtx.melt.pos.sort$X1)
15 |   corr.mtx.melt.pos.sort$X2 <- as.character(corr.mtx.melt.pos.sort$X2)
16 | 
17 |   sample.list <- c()
18 |   count.line <- 1
19 |   while(length(sample.list) < n.sample) {
20 |     sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort$X1[count.line], corr.mtx.melt.pos.sort$X2[count.line]))))
21 |     count.line <- count.line + 1
22 |   }
23 | 
24 |   return(sample.list[1:n.sample])
25 | }
26 | 
27 | 
28 | #' Get the Least Connected Cells
29 | #'
30 | #' This function returns index of cells that are the least connected within a cell type
31 | #' @param mtx The normalized reference count matrix
32 | #' @param n.sample The number of reference cells to be included within each pseudo-bulk
33 | #' @export
34 | #'
35 | get.least.connected <- function(mtx, n.sample) {
36 |   corr.mtx <- WGCNA::cor(mtx)
37 |   corr.mtx.upper <- corr.mtx * upper.tri(corr.mtx)
38 |   corr.mtx.melt <- melt(corr.mtx.upper)
39 |   corr.mtx.melt.pos <- corr.mtx.melt[which(corr.mtx.melt$value > 0), ]
40 |   corr.mtx.melt.pos.sort <- corr.mtx.melt.pos[order(corr.mtx.melt.pos$value), ]
41 |   corr.mtx.melt.pos.sort$X1 <- as.character(corr.mtx.melt.pos.sort$X1)
42 |   corr.mtx.melt.pos.sort$X2 <- as.character(corr.mtx.melt.pos.sort$X2)
43 | 
44 |   sample.list <- c()
45 |   count.line <- 1
46 |   while(length(sample.list) < n.sample) {
47 |     sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort$X1[count.line], corr.mtx.melt.pos.sort$X2[count.line]))))
48 |     count.line <- count.line + 1
49 |   }
50 | 
51 |   return(sample.list[1:n.sample])
52 | }
53 | 
54 | #' Get the Medium Connected Cells
55 | #'
56 | #' This function returns index of cells that are the most connected within a cell type
57 | #' @param mtx The normalized reference count matrix
58 | #' @param n.sample The number of reference cells to be included within each pseudo-bulk
59 | #' @export
60 | #'
61 | get.mid.connected <- function(mtx, n.sample) {
62 |   corr.mtx <- WGCNA::cor(mtx)
63 |   corr.mtx.upper <- corr.mtx * upper.tri(corr.mtx)
64 |   corr.mtx.melt <- melt(corr.mtx.upper)
65 |   corr.mtx.melt.pos <- corr.mtx.melt[which(corr.mtx.melt$value > 0), ]
66 |   corr.mtx.melt.pos.sort <- corr.mtx.melt.pos[order(corr.mtx.melt.pos$value), ]
67 |   corr.mtx.melt.pos.sort$X1 <- as.character(corr.mtx.melt.pos.sort$X1)
68 |   corr.mtx.melt.pos.sort$X2 <- as.character(corr.mtx.melt.pos.sort$X2)
69 | 
70 |   corr.mtx.melt.pos.sort.most <- corr.mtx.melt.pos[order(-corr.mtx.melt.pos$value), ]
71 |   corr.mtx.melt.pos.sort.most$X1 <- as.character(corr.mtx.melt.pos.sort.most$X1)
72 |   corr.mtx.melt.pos.sort.most$X2 <- as.character(corr.mtx.melt.pos.sort.most$X2)
73 | 
74 |   sample.list <- c()
75 |   count.line <- 1
76 |   while(length(sample.list) < n.sample/2) {
77 |     sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort$X1[count.line], corr.mtx.melt.pos.sort$X2[count.line]))))
78 |     count.line <- count.line + 1
79 |   }
80 |   count.line <- 1
81 |   while(length(sample.list) < n.sample) {
82 |     sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort.most$X1[count.line], corr.mtx.melt.pos.sort.most$X2[count.line]))))
83 |     count.line <- count.line + 1
84 |   }
85 | 
86 |   return(sample.list[1:n.sample])
87 | }
88 | 
89 | 
90 | # Create the function.
91 | getmode <- function(v) {
92 |   uniqv <- unique(v)
93 |   uniqv[which.max(tabulate(match(v, uniqv)))]
94 | }
95 | 
96 | 
97 | 
98 | 


--------------------------------------------------------------------------------
/R/HighResolutionReferenceConstruction.R:
--------------------------------------------------------------------------------
 1 | #' Systematic construction of a high-resolution reference
 2 | #'
 3 | #' Create a pseudo-bulk reference by sampling 90-cells from each cell type to maintain cellular resolution while increasing transcriptional resolution
 4 | #' @param ref.mtx The single-cell reference dataset
 5 | #' @param coldata.df The metadata (cell type information) for cells in the high-resolution reference
 6 | #' @param cell.num.for.ref The number of cell numbers used to build the reference for each cell type. The default is cell.num.for.ref = 90
 7 | #' @keywords pseudo-bulk reference
 8 | #' @note
 9 | #' @export
10 | #' @examples
11 | #'
12 | construct.high.res.reference <- function(ref.mtx, coldata.df, criteria,cell.num.for.ref = 90) {
13 | 
14 |   ct.freq <- as.data.frame(table(coldata.df[,criteria]), stringsAsFactors = F)
15 |   ref.meta <- data.frame()
16 |   ref.sc <- data.frame()
17 |   mca.counts.all.involved.sub<-ref.mtx
18 |   for (j in 1:nrow(ct.freq)) {
19 |     curr.ct.at.test <- as.character(ct.freq[j,1])
20 |     curr.cell.involve <- rownames(coldata.df)[which(coldata.df[,criteria] == curr.ct.at.test)]
21 |     curr.cell.index <- which(colnames(mca.counts.all.involved.sub) %in% curr.cell.involve)
22 |     curr.mtx.sub <- as.matrix(mca.counts.all.involved.sub[, curr.cell.index])
23 |     if (ct.freq$Freq[j] >= cell.num.for.ref) {
24 |       sample.ref.cell <- c(get.most.connected(log1p(normalize.dt(curr.mtx.sub)), cell.num.for.ref/2),
25 |                            get.least.connected(log1p(normalize.dt(curr.mtx.sub)), cell.num.for.ref/2))
26 |     } else {
27 |       sample.ref.cell <- sample(curr.cell.involve, size = cell.num.for.ref, replace = T)
28 |     }
29 |     new.bc <- paste0("Cell_", seq(((j-1) * cell.num.for.ref + 1), j * cell.num.for.ref))
30 |     curr.meta <- data.frame(row.names = new.bc, cell.type = curr.ct.at.test, cell.bc = sample.ref.cell, stringsAsFactors = F)
31 |     curr.sc <- curr.mtx.sub[, sample.ref.cell]
32 |     colnames(curr.sc) <- new.bc
33 |     if (ncol(ref.sc) <= 0) {
34 |       ref.meta <- curr.meta
35 |       ref.sc <- curr.sc
36 |     } else {
37 |       ref.meta <- rbind(ref.meta, curr.meta)
38 |       ref.sc <- cbind(ref.sc, curr.sc)
39 |     }
40 |   }
41 |   ref.df <- ref.construction(ref.sc, ref.meta, "cell.type")
42 |   return(list(ref.sc, ref.meta, ref.df))
43 | }
44 | 
45 | #' Reference Construction
46 | #'
47 | #' This function constructs reference from single-cell resolution reference data to be used for quadratic programming calculation
48 | #' @param sc The single-cell resolution dataset
49 | #' @param sc.aux The auxiliary data frame that annotate the single-cell resolutiond dataset
50 | #' @param criteria The column name to use for construction of the reference
51 | #' @export
52 | #' @examples
53 | #' ref.construction(single.ref.mtx, single.aux.df, "cell.type")
54 | ref.construction <- function(sc, sc.aux, criteria) {
55 |   criteria.col <- sc.aux[, criteria]
56 |   ref.df <- data.frame()
57 |   uniq.crit <- unique(criteria.col)
58 | 
59 |   for (i in 1:length(uniq.crit)) {
60 |     curr.crit <- uniq.crit[i]
61 |     sc.bc <- colnames(sc)[which(sc.aux[, criteria] == curr.crit)]
62 |     curr.sc.sub <- sc[, sc.bc]
63 |     curr.df <- as.data.frame(rowSums(curr.sc.sub))
64 |     if (ncol(ref.df) <= 0) {
65 |       ref.df <- curr.df
66 |     } else {
67 |       ref.df <- cbind(ref.df, curr.df)
68 |     }
69 |     colnames(ref.df)[i] <- paste0(criteria, "_", curr.crit)
70 |   }
71 |   return(ref.df)
72 | }
73 | 
74 | 


--------------------------------------------------------------------------------
/R/MultiIDFilterQP.R:
--------------------------------------------------------------------------------
 1 | #' Multi-ID Score-Based Filter
 2 | #'
 3 | #' This function filters the multiple identity cells based on their QP scores, where we assume that a low QP score (less than 10E-3) are not a true identity to consider
 4 | #' @param binary.counts The binary count matrix, which is the output from binarization with Mann Whitney. 
 5 | #' @param classification The classification result, which is the output from binary to classification.
 6 | #' @param qp.matrix The matrix that contains QP scores calculated for the sample cells
 7 | #' @param qp.threshold The threshold to cut off for the QP scores in the multiple identity listed cells
 8 | #' @return A list contain 2 elements, the first is the curated and filtered multiple identity data frame and the second is the new classification data frame.
 9 | #' @keywords Multiple identities
10 | #' @export
11 | #' @examples
12 | #' multi.id.curate.qp(multi.id.meta)
13 | multi.id.curate.qp <- function(binary.counts, classification, qp.matrix, qp.threshold = 10^-3) {
14 |   
15 |   bin.count.std.multi <- as.data.frame(binary.counts[classification$barcode[which(classification$call == "Multi_ID")], ])
16 |   bin.count.std.multi$cell.bc <- rownames(bin.count.std.multi)
17 |   
18 |   bin.count.std.multi.melt <- reshape2::melt(bin.count.std.multi)
19 |   bin.count.std.multi.melt <- bin.count.std.multi.melt[which(bin.count.std.multi.melt$value > 0), ]
20 |   
21 |   classification.new <- classification
22 |   
23 |   qp.column <- c()
24 |   for (i in 1:nrow(bin.count.std.multi.melt)) {
25 |     curr.cell <- as.character(bin.count.std.multi.melt$cell.bc[i])
26 |     curr.cell.type <- as.character(bin.count.std.multi.melt$variable[i])
27 |     qp.column <- c(qp.column, qp.matrix[curr.cell, curr.cell.type])
28 |   }
29 |   bin.count.std.multi.melt$qp.score <- qp.column
30 |   bin.count.std.multi.melt.remain <- bin.count.std.multi.melt[which(bin.count.std.multi.melt$qp.score >= qp.threshold), ]
31 |   
32 |   cell.occurrence <- as.data.frame(table(bin.count.std.multi.melt.remain$cell.bc))
33 |   single.id.cell <- as.character(cell.occurrence$Var1[which(cell.occurrence$Freq == 1)])
34 |     
35 |   actual.multi <- bin.count.std.multi.melt.remain
36 |   if (length(single.id.cell) > 0) {
37 |     single.id.cell.bin <- bin.count.std.multi.melt.remain[which(bin.count.std.multi.melt.remain$cell.bc %in% single.id.cell), ]
38 |     rownames(single.id.cell.bin) <- single.id.cell.bin$cell.bc
39 |     classification.new[rownames(single.id.cell.bin), "call"] <- unlist(lapply(strsplit(as.character(single.id.cell.bin$variable), "frxn_cell.type_"), function(x) x[2]))
40 |     actual.multi <- bin.count.std.multi.melt.remain[-which(bin.count.std.multi.melt.remain$cell.bc %in% single.id.cell), ]
41 |   }
42 |   
43 |   return(list(actual.multi, classification.new))
44 | }
45 | 
46 | 


--------------------------------------------------------------------------------
/R/SingleCellQuadraticProgramming.R:
--------------------------------------------------------------------------------
  1 | #' Single Round of Quadratic Programming
  2 | #'
  3 | #' This function runs preprocessing, including log normalization, gene intersection and scaling. Further, run one round of quadratic programming. See function sc.quad.prog.run for detailed description
  4 | #' @param ref The reference transcriptome taht contains the transcriptome of each potential cell type
  5 | #' @param sc.data the transcriptome profile of cells from single-cell RNA-sequencing
  6 | #' @param scale.bulk.sc either scale or non-scale. Scaling is recommended to make the reference comparable to the single-cell. Default to scale.
  7 | #' @param force.eq either 0 or 1. Setting to 0 assumes the 1st constraint as inequality. Setting to 1 assumes equality. Default to 0
  8 | #' @param unix.par boolean value, either TRUE or FALSE. If using unix/linux based systems, this command can be set to TRUE to parallelize use parallel package. Default to FALSE
  9 | #' @param windows.par boolean value, either TRUE or FALSE. If using Windows based systems, this command can be set to TRUE to parallelize use snow package. Default to FALSE
 10 | #' @param n.cores the number of cores to use for parallel processes. If no parallelization selected, no parallelization will be implemented. Only 1 core will be used
 11 | #' @param save.to.path which directory would you like to save your file?
 12 | #' @param save.to.filename prefix to the filename to save to. The final filename will be constructed inside of the function by tagging the following string _scale.csv for scaling and _non_scale.csv for not scaling.
 13 | #' @param bulk.norm boolean value, either TRUE or FALSE. Would you like to normalize the reference?
 14 | #' @param norm.sc boolean value, either TRUE or FALSE. Would you like to normalize the single-cell dataset?
 15 | #' @param log.bulk boolean value, either TRUE or FALSE. Would you like to log the reference?
 16 | #' @param log.sc boolean value, either TRUE or FALSE. Would you like to log the single-cell dataset?
 17 | #' @keywords quadratic programming, scRNA-seq,
 18 | #' @note This function calls the quadratic programming referenced to Treutlein et. al.
 19 | #' @export
 20 | #' @examples
 21 | #' single.round.QP.analysis(ref = ref.transcriptome, sc.data = sc.transcriptome, force.eq = 1, save.to.path = "~/Desktop/", save.to.filename = "my_favorite")
 22 | single.round.QP.analysis <- function(ref, sc.data, scale.bulk.sc = "scale", unix.par = FALSE, windows.par = FALSE, force.eq = 0,
 23 |                                      n.cores = 1, save.to.path, save.to.filename, bulk.norm = T, norm.sc = T, log.bulk = T, log.sc = T) {
 24 |   norm.bulk <- as.matrix(ref)
 25 |   norm.sc.mtx <- as.matrix(sc.data)
 26 | 
 27 |   # Normalized the bulk and single-cell data
 28 |   if (bulk.norm) norm.bulk <- normalize.dt(ref)
 29 |   if (norm.sc) norm.sc.mtx <- normalize.dt(sc.data)
 30 | 
 31 |   # Calculate the scaling ratio
 32 |   scale.ratio.sc <- calc.scale.ratio(ref, sc.data)
 33 |   scale.norm.sc <- norm.bulk/scale.ratio.sc
 34 | 
 35 |   # Calculate gene intersections
 36 |   norm.ls.sc <- gene.intersect.sub(norm.bulk, norm.sc.mtx)
 37 |   scale.ls.sc <- gene.intersect.sub(scale.norm.sc, norm.sc.mtx)
 38 | 
 39 |   log.norm.ls.sc <- norm.ls.sc
 40 |   log.scale.ls.sc <- scale.ls.sc
 41 |   # Finish the log-normalization
 42 |   if (log.bulk & log.sc) {
 43 |     log.norm.ls.sc <- lapply(norm.ls.sc, log1p)
 44 |     log.scale.ls.sc <- lapply(scale.ls.sc, log1p)
 45 |   } else {
 46 |     if (log.bulk) {
 47 |       log.norm.ls.sc[[1]] <- log1p(log.norm.ls.sc[[1]])
 48 |       log.scale.ls.sc[[1]] <- log1p(log.scale.ls.sc[[1]])
 49 |     }
 50 |     if (log.sc) {
 51 |       log.norm.ls.sc[[2]] <- log1p(log.norm.ls.sc[[2]])
 52 |       log.scale.ls.sc[[2]] <- log1p(log.scale.ls.sc[[2]])
 53 |     }
 54 |   }
 55 | 
 56 |   # Run QP and save to file
 57 |   if (scale.bulk.sc == "scale") {
 58 |     qp.rslt <- sc.quad.prog.run(as.matrix(log.scale.ls.sc[[1]]),
 59 |                                 single.cell.transcriptome = log.scale.ls.sc[[2]],
 60 |                                 unix.parallel = unix.par, windows.parallel = windows.par,
 61 |                                 parallel.cores = n.cores, force.eq = force.eq)
 62 |     write.csv(qp.rslt, paste0(save.to.path, save.to.filename, "_scale.csv"), quote = F, row.names = F)
 63 |   } else {
 64 |     if (scale.bulk.sc == "non-scale") {
 65 |       qp.rslt <- sc.quad.prog.run(as.matrix(log.norm.ls.sc[[1]]),
 66 |                                   single.cell.transcriptome = log.norm.ls.sc[[2]],
 67 |                                   unix.parallel = unix.par, windows.parallel = windows.par,
 68 |                                   parallel.cores = n.cores, force.eq = force.eq)
 69 |       write.csv(qp.rslt, paste0(save.to.path, save.to.filename, "_non_scale.csv"), quote = F, row.names = F)
 70 |     } else {
 71 |       qp.rslt.scl <- sc.quad.prog.run(as.matrix(log.scale.ls.sc[[1]]),
 72 |                                       single.cell.transcriptome = log.scale.ls.sc[[2]],
 73 |                                       unix.parallel = unix.par, windows.parallel = windows.par,
 74 |                                       parallel.cores = n.cores, force.eq = force.eq)
 75 |       qp.rslt.non.scl <- sc.quad.prog.run(as.matrix(log.norm.ls.sc[[1]]),
 76 |                                           single.cell.transcriptome = log.norm.ls.sc[[2]],
 77 |                                           unix.parallel = unix.par, windows.parallel = windows.par,
 78 |                                           parallel.cores = n.cores, force.eq = force.eq)
 79 |       write.csv(qp.rslt.scl, paste0(save.to.path, save.to.filename, "_scale.csv"), quote = F, row.names = F)
 80 |       write.csv(qp.rslt.non.scl, paste0(save.to.path, save.to.filename, "_non_scale.csv"), quote = F, row.names = F)
 81 |     }
 82 |   }
 83 | }
 84 | 
 85 | #' Single-Cell Quadratic Programming Calculation
 86 | #'
 87 | #' This function runs quadratic programming to identify the probability of the cells in single-cell RNA-seq belonging to cell types in the reference transcriptome
 88 | #' @param bulk.transcriptome The reference transcriptome taht contains the transcriptome of each potential cell type
 89 | #' @param single.cell.transcriptome the transcriptome profile of cells from single-cell RNA-sequencing
 90 | #' @param force.eq either 0 or 1. Setting to 0 assumes the 1st constraint as inequality. Setting to 1 assumes equality. Default to 0
 91 | #' @param unix.parallel boolean value, either TRUE or FALSE. If using unix/linux based systems, this command can be set to TRUE to parallelize use parallel package. Default to FALSE
 92 | #' @param windows.parallel boolean value, either TRUE or FALSE. If using Windows based systems, this command can be set to TRUE to parallelize use snow package. Default to FALSE
 93 | #' @param parallel.cores the number of cores to use for parallel processes. If no parallelization selected, no parallelization will be implemented. Only 1 core will be used
 94 | #' @keywords quadratic programming, scRNA-seq
 95 | #' @note Code reference from Treutlein et. al., Dissecting direct reprogramming from fibroblast to neuron using single-cell RNA-seq
 96 | #' @export
 97 | #' @examples
 98 | #' sc.quad.prog.run(ref.transcriptome, sc.transcriptome, force.eq = 1)
 99 | sc.quad.prog.run <- function(bulk.transcriptome, single.cell.transcriptome, force.eq = 0,
100 |                              unix.parallel = FALSE, windows.parallel = FALSE, parallel.cores = 4) {
101 |   # Check if the sizes of the matrices are comfortable with each other
102 |   if (nrow(bulk.transcriptome) != nrow(single.cell.transcriptome)) {
103 |     print("The number of genes included in the bulk expression is not the same as the single cell transcriptome")
104 |     return()
105 |   }
106 | 
107 |   # Initialize a data frame to hold the fraction identity matrix
108 |   identity.matx <- data.frame()
109 |   # Initialize the cell types given
110 |   given.cell.typs <- colnames(bulk.transcriptome)
111 | 
112 |   # If using unix/linux based system, can use this process to parallel the the program
113 |   # Set unix.parallel parameter to be TRUE, but set windows.parallel to be FALSE
114 |   if (unix.parallel) {
115 |     identy.mx.ls <- mclapply(as.list(seq_len(ncol(single.cell.transcriptome))), quad.prog.calc,
116 |                              bulk.transcriptome = bulk.transcriptome,
117 |                              single.cell.transcriptome = single.cell.transcriptome,
118 |                              force.eq = force.eq, mc.cores = parallel.cores)
119 |     identity.ls <- lapply(identy.mx.ls, function(identity) c(identity[[2]], identity[[1]]$solution,
120 |                                                              identity[[1]]$Lagrangian[1], identity[[3]]))
121 | 
122 |     identity.matx <- matrix(unlist(identity.ls), byrow = TRUE,
123 |                             nrow = length(identity.ls), ncol = length(identity.ls[[1]]))
124 |   } else {
125 |     # If using Windows based system, can use this process to parallel the the program
126 |     # Set windows.parallel parameter to be TRUE, but set unix.parallel to be FALSE
127 |     # NOTE: This might be slower than applying parallelization under Unix/Linux based system
128 |     if (windows.parallel) {
129 |       cl<-makeCluster(parallel.cores)
130 |       identy.mx.ls <- clusterApply(cl, seq_len(ncol(single.cell.transcriptome)), quad.prog.calc,
131 |                                    bulk.transcriptome = bulk.transcriptome,
132 |                                    single.cell.transcriptome = single.cell.transcriptome,
133 |                                    force.eq = force.eq)
134 |       stopCluster(cl)
135 |       identity.ls <- lapply(identy.mx.ls, function(identity) c(identity[[2]], identity[[1]]$solution,
136 |                                                                identity[[1]]$Lagrangian[1], identity[[3]]))
137 | 
138 |       identity.matx <- matrix(unlist(identity.ls), byrow = TRUE,
139 |                               nrow = length(identity.ls), ncol = length(identity.ls[[1]]))
140 | 
141 |       # If not using parallel for this case, just run the following process in a loop for serial
142 |       # All parallel parameters option should be set to FALSE
143 |     } else {
144 |       for (i in 1:ncol(single.cell.transcriptome)){
145 |         identity<-c()
146 |         quad.rslt <- quad.prog.calc(i, bulk.transcriptome, single.cell.transcriptome, force.eq)
147 |         QP <- quad.rslt[[1]]
148 |         Error <- quad.rslt[[3]]
149 | 
150 |         identity <- c(colnames(single.cell.transcriptome)[i], QP$solution, QP$Lagrangian[1],Error)
151 | 
152 |         if (nrow(identity.matx) == 0) {
153 |           identity.matx <- as.data.frame(t(as.matrix(identity)))
154 |         } else {
155 |           identity.matx <- rbind(identity.matx, as.data.frame(t(as.matrix(identity))))
156 |         }
157 |       }
158 |     }
159 |   }
160 | 
161 |   # Set the column names of the returning data frame
162 |   col.frx.names <- paste("frxn_", given.cell.typs, sep = "")
163 |   colnames(identity.matx)<-c("cell_name", col.frx.names,"Lagrangian","Error")
164 |   return(identity.matx)
165 | }
166 | 
167 | quad.prog.calc <- function(col.num, bulk.transcriptome, single.cell.transcriptome, force.eq) {
168 |   library(quadprog)
169 |   Y <- as.matrix(single.cell.transcriptome[, col.num])
170 |   Rinv <- solve(chol(t(bulk.transcriptome) %*% bulk.transcriptome));
171 |   C <- cbind(rep(-1, ncol(bulk.transcriptome)), diag(ncol(bulk.transcriptome)))
172 |   n <- nrow(Rinv)
173 |   q <- ncol(bulk.transcriptome)
174 |   b <- c(-1, rep(0, q))
175 |   d <- t(Y) %*% bulk.transcriptome
176 | 
177 |   QP<-solve.QP(Dmat = Rinv, factorized = TRUE, dvec = d, bvec = b, Amat = C, meq = force.eq)
178 |   Error<-sum(abs(Y- bulk.transcriptome %*% QP$solution))
179 | 
180 |   return(list(QP, colnames(single.cell.transcriptome)[col.num], Error))
181 | }
182 | 
183 | 


--------------------------------------------------------------------------------
/R/SingleCellQuadraticProgrammingAux.R:
--------------------------------------------------------------------------------
 1 | #' Top Variable Gene Identificaiton
 2 | #'
 3 | #' This function identifies top n variable genes using method described in KeyGenes algorithm. In brief, this algorithm use cross validation based on LASSO regression. For detailed explanation, please refer to the paper listed in the note.
 4 | #' @param input.dir The path to where the dataset is saved. Dataset should be saved as a tab-delimited table with row = gene, column = cell.
 5 | #' @param output.dir The path to where the output should be saved. The output will be saved as a tab-delimited table with NO row names or column names.
 6 | #' @param top.number.count The number of top variable genes to extract. Default to 500 genes
 7 | #' @keywords top variable gene extraction
 8 | #' @note Code reference from Roost et. al., KeyGenes, a Tool to Probe Tissue Differentiation Using a Human Fetal Transcriptional Atlas, Cell Stem Cell Reports, 2015
 9 | #' @export
10 | #' @examples
11 | #' top.genes("~/Desktop/sample_data.txt", "~/Desktop/sample_gene_list_output.txt", top.number.count = 1000)
12 | top.genes <- function(input.dir, output.dir, top.number.count = 500) {
13 |   dataset <- input.dir
14 |   top.dir <- output.dir
15 | 
16 |   DS <- read.table(dataset, sep="\t", header=TRUE, row.names=1, check.names=FALSE)
17 |   X <- voom(as.matrix(DS), normalize.method="none")$E
18 |   vars <- apply(X, 1, var)
19 |   top <- sort.list(-vars)[1:top.number.count]
20 |   X <- X[top,]
21 | 
22 |   topl <- as.data.frame(rownames(X))
23 |   colnames(topl) <- c(paste("Top", top.number.count, sep = "", collapse = ""))
24 |   write.table(topl, top.dir, sep="\t", quote=F, row.names = FALSE, col.names = TRUE)
25 |   return()
26 | }
27 | 
28 | #' Normalization of Single-Cell RNA-Seq Data
29 | #'
30 | #' This function normalizes single-cell RNA-seq data using its raw counts. The normalization is performed to remove variation due to difference in read depth and coverage.
31 | #' @param dt.st The dataset to normalize
32 | #' @keywords normalization
33 | #' @export
34 | #' @examples
35 | #' normalize.dt(single.cell.mtx)
36 | normalize.dt <- function(dt.st) {
37 |   # Calculate column sums
38 |   csums <- Matrix::colSums(dt.st)
39 |   # Calculate averages of the sums
40 |   cavg <- mean(csums)
41 |   # Calculate normalized bulk
42 |   norm.dt.st <- dt.st
43 |   for (i in 1:length(csums)) {norm.dt.st[,i] <- (norm.dt.st[,i]/csums[i]) * cavg}
44 |   return(norm.dt.st)
45 | }
46 | 
47 | #' Scale Ratio Calculation
48 | #'
49 | #' This function calculate the scale ratio between the single-cell transcriptome and reference dataset to minimize the Lagrangian multiplier. In the other word, reduce the restriction.
50 | #' @param bulk The reference dataset
51 | #' @param sc The single-cell dataset
52 | #' @keywords Scale ratio calculation
53 | #' @export
54 | #' @examples
55 | #' calc.scale.ratio(reference.dt, single.cell.mtx)
56 | calc.scale.ratio <- function(bulk, sc) {
57 |   # Calculate column sums
58 |   csums.bulk <- colSums(bulk)
59 |   csums.sc <- Matrix::colSums(sc)
60 |   # Calculate averages of the sums
61 |   cavg.bulk <- mean(csums.bulk)
62 |   cavg.sc <- mean(csums.sc)
63 |   # Calculate scaling ratio for each tissue
64 |   scale.ratio.sc <- cavg.bulk/cavg.sc
65 |   return(scale.ratio.sc)
66 | }
67 | 
68 | #' Find intersection genes
69 | #'
70 | #' This function identifies the intersection genes between reference and single-cell data
71 | #' @param bulk The reference dataset
72 | #' @param sc The single-cell dataset
73 | #' @export
74 | #' @examples
75 | #' gene.intersect.sub(reference.dt, single.cell.mtx)
76 | gene.intersect.sub <- function(bulk, sc) {
77 |   # Find genes with all zero-expression within sc/bulk
78 |   rsums.bulk <- Matrix::rowSums(bulk)
79 |   rsums.sc <- Matrix::rowSums(sc)
80 |   # Identify blank genes
81 |   bulk.blank.genes <- which(rsums.bulk == 0)
82 |   sc.blank.genes <- which(rsums.sc == 0)
83 |   # Remove those genes
84 |   norm.bulk.sub <- bulk
85 |   norm.sc.sub <- sc
86 |   if (length(bulk.blank.genes) > 0) {norm.bulk.sub <- bulk[-(bulk.blank.genes),]}
87 |   if (length(sc.blank.genes) > 0) {norm.sc.sub <- sc[-(sc.blank.genes),]}
88 |   # Find intersection genes
89 |   genes.inter <- intersect(rownames(norm.bulk.sub), rownames(norm.sc.sub))
90 |   # Find the intersected sub dataset of bulk (normalized bulk)
91 |   norm.bulk.sub.sub <- norm.bulk.sub[genes.inter, ]
92 |   norm.sc.sub.sub <- norm.sc.sub[genes.inter, ]
93 |   return(list(norm.bulk.sub.sub, norm.sc.sub.sub))
94 | }
95 | 
96 | 
97 | 


--------------------------------------------------------------------------------
/R/TransitionScoreCalculation.R:
--------------------------------------------------------------------------------
 1 | #' Transition Score Calculation
 2 | #'
 3 | #' This function calculates the transition scores for each cell state that is connected by cells with multiple identities.
 4 | #' @param multi.id.meta A data frame that contains cells with multiple identities. Column 1 - cell barcode, Column 2 - cell type call, Column 3 - counts, Column 4 - corresponding QP scores.
 5 | #' @return A data frame that contains the calculated transition scores for each identity
 6 | #' @keywords Transition metric
 7 | #' @export
 8 | #' @examples
 9 | #' transition.score(multi.id.meta)
10 | transition.score <- function(multi.id.meta) {
11 |   
12 |   unique.ct <- unique(as.character(multi.id.meta[,2]))
13 |   ct.entropy <- data.frame()
14 |   for (i in 1:length(unique.ct)) {
15 |     curr.cell <- unique.ct[i]
16 |     actual.sub <- multi.id.meta[which(as.character(multi.id.meta[,2]) == curr.cell), ]
17 |     curr.entro <- sum(-actual.sub$qp.score*log(actual.sub$qp.score, base = max(2, nrow(actual.sub))))
18 |     curr.df <- data.frame(cell.ct = curr.cell, entropy = curr.entro, stringsAsFactors = F)
19 |     if (nrow(ct.entropy) <= 0) {
20 |       ct.entropy <- curr.df
21 |     } else {
22 |       ct.entropy <- rbind(ct.entropy, curr.df)
23 |     }
24 |   }
25 |   
26 |   rownames(ct.entropy) <- unlist(lapply(strsplit(ct.entropy$cell.ct, "frxn_cell.type_"), function(x) x[2]))
27 |   
28 |   return(ct.entropy)
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # R Package - Capybara <img src="/examples/Monocle_hat_colin.png" height="30" width="25">
  2 | 
  3 | 
  4 | Capybara is a tool to measure cell identity and fate transitions. This approach is designed to measure cell identity as a continuum, at a single-cell resolution. Capybara enables classification of discrete identities as well as cells with multiple identities. This package has a dependency on R version (R >= 3.5.0). In addition to the following tutorial, we provide the notebooks for production of figures in the morris-lab/Capybara_reproducibility (https://github.com/morris-lab/Capybara_reproducibility) repository. There could exist some variations due to difference in threshold selections. For details regarding the methods, usage and application, please refer to the following paper: Kong et al., Cell Stem Cell, 2022 (https://www.sciencedirect.com/science/article/pii/S1934590922000996?dgcid=coauthor)
  5 | 
  6 | ## Installation
  7 | ### Dependencies
  8 | 
  9 | Most dependencies can be installed along with Capybara through CRAN. The following dependencies may need to be installed manually through BioConductor (Instructions can also be found here: https://bioconductor.org/).
 10 | 
 11 | Install BiocManager
 12 | ```r
 13 | install.packages("BiocManager")
 14 | ```
 15 | Install Bioconductor dependency packages
 16 | ```r
 17 | BiocManager::install(c("limma","impute", "AnnotationDbi","GO.db", "preprocessCore"))
 18 | ```
 19 | 
 20 | ### Install the package
 21 | Install devtools
 22 | ```r
 23 | install.packages("devtools")
 24 | ```
 25 | Install the package from GitHub.
 26 | ```r
 27 | library("devtools")
 28 | devtools::install_github("morris-lab/Capybara")
 29 | ```
 30 | Load the package
 31 | ```r
 32 | library("Capybara")
 33 | ```
 34 | 
 35 | ## Step 1: Tissue-Level Classification
 36 | ### Application of quadratic programming on reference and sample single-cell dataset using a bulk reference
 37 | 
 38 | Bulk transcriptome profiles of all tissues are mined from ARCHS4, a platform that contains most published RNA-seq and ChiP-seq datasets (Lachmann et al.,  2018). ARCHS4 obtains raw datasets from the Gene Expression Omnibus (GEO), realigned and processed through a uniform pipeline. We filtered to contain only poly-A and total RNA-seq data from C57BL/6 mice. With further filtering and preprocessing (details can be found in the method section of the paper), we landed with a reference of a total of 30 tissues. We provide our mined bulk references, including a matrix in raw counts and a matrix in reads per kilobase per million (RPKM), as a part of the Capybara package. Selection of your preferred normalization method can be applied to raw counts. Here, we will demonstrate the usage of the bulk raw counts in the pipeline.
 39 | 
 40 | **1. Load the bulk reference**
 41 | ```r
 42 | # File path
 43 | bulk.raw.path <- system.file("extdata", "Bulk Reference Raw.Rds", package = "Capybara")
 44 | bulk.rpkm.path <- system.file("extdata", "Bulk Reference RPKM.Rds", package = "Capybara")
 45 | # Read the matrices
 46 | bulk.raw <- readRDS(bulk.raw.path)
 47 | bulk.rpkm <- readRDS(bulk.rpkm.path)
 48 | ```
 49 | 
 50 | With the bulk reference, we next load the single-cell reference, such as a cell atlas, and the single-cell sample to be used. The datasets to be used should be in a matrix form with each row representing a gene and each column representing a cell. Here, we use the Mouse Cell Atlas (MCA) as background and single-cell RNA-seq data of mouse pancreatic cells (Baron et al., 2016) as examples for demonstration. MCA can be obtained from https://figshare.com/articles/MCA_DGE_Data/5435866. We included the mouse pancreatic dataset in the package.
 51 | 
 52 | **2.Load the single-cell sample dataset and the corresponding meta data**
 53 | 
 54 | *Note: The meta data of this file contains 2 columns, where the first column represents cell.type and the second column represents barcode.*
 55 | 
 56 | ```r
 57 | # Read in the pancreatic data file that come with the package
 58 | fpath <- system.file("extdata", "baron_dataset.zip", package = "Capybara")
 59 | extract.dir <- "."
 60 | # Extract the dataset
 61 | unzip(fpath, overwrite = FALSE, exdir = ".")
 62 | # Identify the full path
 63 | full.fpath.meta <- paste0(extract.dir, "/", "baron_et_al_pancreatic_meta.csv")
 64 | full.fpath.raw <- paste0(extract.dir, "/", "baron_et_al_pancreatic.csv")
 65 | # Load the count matrix and the meta data
 66 | baron.expr <- read.csv(full.fpath.raw, header = T, row.names = 1, stringsAsFactors = F)
 67 | baron.meta <- read.csv(full.fpath.meta, header = T, row.names = 1, stringsAsFactors = F)
 68 | ```
 69 | 
 70 | **3. Application of QP on the sample single-cell data**
 71 | 
 72 | Notice: For Windows users, please set unix.par=F and n.cores=1
 73 | 
 74 | ```r
 75 | single.round.QP.analysis(bulk.raw, baron.expr, scale.bulk.sc = "scale", unix.par = TRUE, 
 76 |                          force.eq = 1, n.cores = 4, save.to.path = "./", 
 77 |                          save.to.filename = "baron_bulk_classification_qp")
 78 | ```
 79 | 
 80 | **4. Load the single-cell reference meta data**
 81 | 
 82 | *Note: The meta data of Mouse Cell Atlas contains 6 columns, including Cell.name, ClusterID, Tissue, Batch, Cell.Barcode, and Annotation. The annotation is what we used for high-resolution reference construction. We've included the version of meta data we used along with the package.*
 83 | 
 84 | ```r
 85 | # Read the meta data
 86 | mca.meta.fpath <- system.file("extdata", "MCA_CellAssignments.csv", package = "Capybara")
 87 | mca <- read.csv(mca.meta.fpath, row.names = 1, header = T, stringsAsFactors = F)
 88 | # Clean up the meta data
 89 | mca.meta <- data.frame(row.names = mca$Cell.name, 
 90 |                        tissue = mca$Tissue,
 91 |                        cell.type = mca$Annotation,
 92 |                        stringsAsFactors = F)
 93 | ```
 94 | 
 95 | **5. Load the single-cell reference atlas and apply QP tissue-by-tissue**
 96 | 
 97 | *Due to the large size of MCA count data, we did* ***NOT*** *include the counts along with the package. We further separated MCA into fetal/neonatal/embryonic and adult categories. The counts data were organized in the following manner.*
 98 | 
 99 | > Folder: MCA Counts
100 | 
101 | >> Tissue_1 Folder
102 | 
103 | >>> count.csv
104 | 
105 | >> Tissue_2 Folder
106 | 
107 | >>> count.csv
108 | 
109 | >> ...
110 | 
111 | ```r
112 | # List all possible files and tissues in the Mouse Cell Atlas
113 | file.ls <- list.files("./MCA_Counts/", full.names = T)
114 | base.nms <- basename(file.ls)
115 | 
116 | # Identify the tissues
117 | unq.tissue <- unique(base.nms)
118 | 
119 | # Set a path to save all QP files for all tissues
120 | general.path.to.save <- "./MCA_All_Tissue_QP/"
121 | for (k in 1:length(unq.tissue)) {
122 |   curr.tissue <- unq.tissue[k]
123 |   curr.filename <- paste0("0", k, "_", curr.tissue, "_Bulk_ARCHS4")
124 |   
125 |   file.base.name <- base.nms[which(startsWith(base.nms, curr.tissue))][1]
126 |   file.full <- file.ls[which(startsWith(base.nms, curr.tissue))][1]
127 |   
128 |   print(curr.tissue)
129 |   
130 |   sc.data <- read.csv(paste0(file.full, "/count.csv"), header = T, row.names = 1, stringsAsFactors = F)
131 |   
132 |   if (all(is.na(sc.data))) {
133 |     print("There is no data in this counting matrix!")
134 |   } else {
135 |     single.round.QP.analysis(bulk.raw, sc.data, scale.bulk.sc = "scale", unix.par = TRUE, 
136 |                              force.eq = 1, n.cores = 4, save.to.path = general.path.to.save, 
137 |                              save.to.filename = curr.filename)
138 |   }
139 | }
140 | ```
141 | 
142 | **6. Selection of 90 cells from each tissue to construct a QP background**
143 | 
144 | With all QP scores calculated on the bulk transcriptome profiles of all tissues, we select 90 most relevant cells of each tissue in the MCA (90 highest scored cells in the MCA to each bulk tissue) as a QP background. We use this QP background further to map our sample single-cell data. Assuming that each cell in each cell type of the MCA takes a unique combination of QP scores to each tissue in ARCHS4, cells in the sample that share similar combination to those in MCA are marked to relate to the corresponding tissue in the MCA. Here, we demonstrate how we constructed the backgrounds as following. We included the background matrices along with the packages such that it can be directly used for convenience.
145 | 
146 | **(a) Get QP scores for all annotated cells**
147 | ```r
148 | # Read the QP files from the directory
149 | qp.files.to.read.clean <- list.files("./MCA_All_Tissue_QP/", full.names = T)
150 | 
151 | full.qp.mtx.known.annotation <- data.frame()
152 | full.qp.mtx.unknown.annotation <- data.frame()
153 | for (i in 1:length(qp.files.to.read.clean)) {
154 |   curr.file <- qp.files.to.read.clean[i]
155 |   curr.qp.rslt <- read.csv(curr.file, header = T, row.names = 1, stringsAsFactors = F)
156 |   
157 |   cells.to.keep <- intersect(rownames(mca.meta), rownames(curr.qp.rslt))
158 |   cells.unlabel <- setdiff(rownames(curr.qp.rslt), cells.to.keep)
159 |   
160 |   curr.sub.mtx.to.keep <- curr.qp.rslt[cells.to.keep, ]
161 |   curr.sub.mtx.unlabel <- curr.qp.rslt[cells.unlabel, ]
162 |   
163 |   if (nrow(full.qp.mtx.known.annotation) <= 0) {
164 |     full.qp.mtx.known.annotation <- curr.sub.mtx.to.keep
165 |     full.qp.mtx.unknown.annotation <- curr.sub.mtx.unlabel
166 |   } else {
167 |     full.qp.mtx.known.annotation <- rbind(full.qp.mtx.known.annotation, curr.sub.mtx.to.keep)
168 |     full.qp.mtx.unknown.annotation <- rbind(full.qp.mtx.unknown.annotation, curr.sub.mtx.unlabel)
169 |   }
170 | }
171 | 
172 | full.qp.mtx.known.annotation.qp.score.only <- full.qp.mtx.known.annotation[,c(1:(ncol(full.qp.mtx.known.annotation) - 2))]
173 | ```
174 | 
175 | **(b) Selection of 90 cells**
176 | ```r
177 | # Create a map between MCA and ARCHS4
178 | map.df <- data.frame(mca.tissue = c("Embryonic-Mesenchyme", "Embryonic-Stem-Cell", "Trophoblast-Stem-Cell", "Fetal_Brain",
179 |                                    "Neonatal-Calvaria","Fetal_Intestine", "Fetal-Liver", "Fetal_Lung", "Fetal_Stomache",
180 |                                    "Neonatal-Heart", "Neonatal-Muscle",
181 |                                    "Neonatal-Rib", "Neonatal-Skin",  "NeonatalPancreas"),
182 |                      corresponding = c("frxn_embryo", "frxn_embryo", "frxn_embryo", "frxn_brain","frxn_brain",
183 |                                        "frxn_small.intestine", "frxn_liver", 
184 |                                        "frxn_lung", "frxn_stomach",  "frxn_heart", "frxn_muscle", "frxn_muscle", 
185 |                                        "frxn_skin", "frxn_pancreas"),
186 |                      stringsAsFactors = F)
187 | 
188 | # Identify top 90 cells for each tissue
189 | tm.tissue <- unique(map.df$tm.tissue)
190 | cell.selector <- c()
191 | n.sample <- 90
192 | for (i in 1:length(tm.tissue)) {
193 |   curr.tissue <- tm.tissue[i]
194 |   cell.names <- rownames(mca.meta)[which(mca.meta$tissue == curr.tissue)]
195 |   curr.qp.subset <- full.qp.mtx.known.annotation.qp.score.only[cell.names, ]
196 |   curr.map <- map.df$corresponding[which(map.df$tm.tissue == curr.tissue)]
197 |   if (length(curr.map) <= 1){
198 |     curr.qp.subset.sub <- data.frame(score = curr.qp.subset[,curr.map], cell.name = cell.names, stringsAsFactors = F)
199 |   } else {
200 |     curr.qp.subset.sub <- data.frame(score = rowSums(curr.qp.subset[,curr.map]), cell.name = cell.names, stringsAsFactors = F)
201 |   }
202 |   curr.qp.subset.sub.sort <- curr.qp.subset.sub[order(-curr.qp.subset.sub$score), ]
203 |   cells.to.incl <- curr.qp.subset.sub.sort$cell.name[1:n.sample]
204 |   
205 |   cell.selector <- c(cell.selector, cells.to.incl)
206 | }
207 | saveRDS(full.qp.mtx.known.annotation.qp.score.only[cell.selector, ], "./MCA_embryonic_background.RDS")
208 | ```
209 | 
210 | *Note: This constructed QP background can be saved and reused and does not need to be reconstructed every time.*
211 | 
212 | ### Identification of tissue correlate in the reference to the sample single-cell dataset
213 | 
214 | To find the correlated tissue in the reference to the sample single-cell dataset, we use a correlation based method. In brief, we calculate Pearson's correlation of the QP scores in a pairwise manner between each cell in the sample and each cell in the reference. Recall the assumption that cells in the sample that share similar combination of QP scores to those in MCA are marked to relate to the corresponding tissue in the MCA. If there is a significant percentage of reference cells of a tissue (over 70%) mapped to a cell, we record the tissue label. Then the frequency of each tissue label is calculated. Tissues with a frequency at least 0.5% (for cell number > 10,000) or at least 100 cells will be selected for further analysis. 
215 | 
216 | **1. Load the QP background matrix**
217 | ```r
218 | background.qp.fpath <- system.file("extdata", "MCA Adult Background.Rds", package = "Capybara")
219 | background.mtx <- readRDS(background.qp.fpath)
220 | ```
221 | 
222 | **2. Load the QP scores of the sample**
223 | 
224 | ```r
225 | ## Load QP results
226 | qp.rslt <- read.csv("./baron_bulk_classification_qp_scale.csv", row.names = 1, header = T, stringsAsFactors = F)
227 | 
228 | ## Reshape the data
229 | qp.rslt.sub <- qp.rslt[,c(1:(ncol(qp.rslt) - 2))]
230 | ```
231 | 
232 | **3. Correlation calculation**
233 | 
234 | *Note: we use WGCNA to calculate the correlation*
235 | 
236 | ```r
237 | mtx.test <- t(qp.rslt.sub[, colnames(background.mtx)])
238 | ref.test <- t(background.mtx)
239 | 
240 | # Pearson's Correlation Calculation
241 | corr.mtx <- WGCNA::cor(ref.test, mtx.test)
242 | ```
243 | 
244 | **4. Binarization based on correlation**
245 | 
246 | We perform binarization based on the correlation estimates.
247 | 
248 | ```r
249 | # Setup a correlation cutoff to the 90th quantile of the correlation matrix
250 | correlation.cutoff <- quantile(corr.mtx, 0.90)
251 | 
252 | # Binarization based on the correlation
253 | new.corr.bin <- corr.mtx
254 | new.corr.bin[which(new.corr.bin >= correlation.cutoff)] <- 1
255 | new.corr.bin[which(new.corr.bin < correlation.cutoff)] <- 0
256 | new.corr.bin <- as.data.frame(new.corr.bin)
257 | ```
258 | 
259 | **5. Counting the tissues and select the final tissue types**
260 | 
261 | Count the frequency of occurrence of each tissue in the tissue list.
262 | 
263 | ```r
264 | # Count
265 | count.in.cat <- c()
266 | unique.cat <- unique(unlist(lapply(strsplit(rownames(new.corr.bin), "_"), function(x) x[1])))
267 | for (uc in unique.cat) {
268 |   curr.subset <- new.corr.bin[which(startsWith(rownames(new.corr.bin), uc)), c(1:1886)]
269 |   count.in.cat[uc] <- sum(colSums(curr.subset) >= nrow(curr.subset) * 0.7)
270 | }
271 | 
272 | count.in.cat <- as.data.frame(count.in.cat)
273 | count.in.cat$perc <- round(count.in.cat$count.in.cat *100/sum(count.in.cat$count.in.cat), digits = 3)
274 | 
275 | # Check frequency
276 | final.cell.types.adult <- rownames(count.in.cat)[which(count.in.cat$count.in.cat > 100)]
277 | ```
278 | 
279 | Below is a composition example for this pancreatic dataset, where we identify 3 major tissues, including stomach, pancreas, and small instestine.
280 | <p align="center">
281 |     <img src="/examples/bulk class mca pancreatic.png" height="800" width="400">
282 | </p>
283 | 
284 | 
285 | ## Step 2: Generation of a High-Resolution Custom Reference, and Continuous Identity Measurement
286 | 
287 | After tissue-level classification, relevant cell types are selected from cell atlas and built as a single cell reference dataset. As an alternative, users could also use their own single-cell reference dataset to benchmark their samples.
288 | 
289 | ### Systematic construction of a high-resolution reference
290 | 
291 | To alleviate the effect of technical variations, we construct pseudo-bulk references for each reference cell type. By default, 90 cells of each cell type would be used to build the reference. The construct.high.res.reference function returns a list containing expression matrix and meta data of cells used to build the reference, as well as the constructed pseudo-bulk reference.
292 | 
293 | **Get the counts of the cell types involved in the tissues selected**
294 | 
295 | To obtain the exact same structure of the MCA data, please download the data here (https://wustl.box.com/s/z46vm5yq7r1lw3353o8ttz1xo8pu3dvw).
296 | 
297 | ```r
298 | mca.meta$cell.bc.tissue <- unlist(lapply(strsplit(rownames(mca.meta), "_"), function(x) x[1]))
299 | pancreatic.all.meta <- mca.meta[which(mca.meta$cell.bc.tissue %in% final.cell.types.adult), ]
300 | 
301 | mca.counts.all.involved <- NULL
302 | tissues.to.read <- unique(pancreatic.all.meta$tissue)
303 | general.path <- "~/Box/Morris Lab/Classifier Analysis/Reference datasets/MCA/MCA_Counts/"
304 | for (i in 1:length(tissues.to.read)) {
305 |   curr.t <- tissues.to.read[i]
306 |   curr.path.to.read <- paste0(general.path, curr.t, "/count.csv")
307 |   curr.count <- read.csv(curr.path.to.read, header = T, row.names = 1, stringsAsFactors = F)
308 |   if (is.null(mca.counts.all.involved)) {
309 |     mca.counts.all.involved <- curr.count
310 |   } else {
311 |     mca.counts.all.involved <- cbind(mca.counts.all.involved, curr.count)
312 |   }
313 | }
314 | 
315 | ## Meta data filtering
316 | pancreatic.all.meta$cell.type <- gsub("Dendrtic cell", "Dendritic cell", pancreatic.all.meta$cell.type)
317 | pancreatic.all.meta$cell.type.1 <- gsub("\\([^)]*\\)", "", pancreatic.all.meta$cell.type)
318 | pancreatic.all.meta$cell.type.alone <- unlist(lapply(strsplit(pancreatic.all.meta$cell.type.1, "_"), function(x) x[1]))
319 | 
320 | ## Filter out cell types with less than 30 cells
321 | cell.type.alone.freq <- as.data.frame(table(pancreatic.all.meta$cell.type.alone))
322 | cell.type.over.30 <- cell.type.alone.freq$Var1[which(cell.type.alone.freq$Freq >= 30)]
323 | pancreatic.sub.meta <- pancreatic.all.meta[which(pancreatic.all.meta$cell.type.alone %in% as.character(cell.type.over.30)),]
324 | coldata.df <- pancreatic.sub.meta
325 | ```
326 | 
327 | If the data is obtained from MCA website (https://figshare.com/articles/MCA_DGE_Data/5435866), please download the compressed file with rmbatch_dge. The extraction of the compressed file will provide a folder contains space-delimited text files, containing the single-cell matrices for each tissue in the MCA from different animals. Please follow the next few lines for the processing of such files. Special thanks to Danyi_ZHENG for sharing the tutorial with these MCA files (Detailed tutorial can be found here - https://github.com/Danyi-ZHENG/Capybara_MCA_tutorial/blob/main/Capybara_tutorial_MCA_220415.R)
328 | 
329 | ```r
330 | mca.meta$cell.bc.tissue <- unlist(lapply(strsplit(rownames(mca.meta), "_"), function(x) x[1]))
331 | pancreatic.all.meta <- mca.meta[which(mca.meta$cell.bc.tissue %in% final.cell.types.adult), ]
332 | 
333 | mca.counts.all.involved <- NULL
334 | tissues.to.read <- unique(pancreatic.all.meta$tissue)
335 | 
336 | curr.dir <- "../rmbatch_dge/"
337 | curr.dir.files <- list.files(curr.dir)
338 | curr.dir.files.sub <- unlist(lapply(strsplit(curr.dir.files, "_"), function(x) x[1]))
339 | 
340 | for (i in 1:length(tissues.to.read)) {
341 | 
342 |   print(i)
343 | 
344 |   curr.t <- tissues.to.read[i]
345 |   acceptable.files.starts <- c(curr.t, paste0(curr.t, seq(1,10)))
346 |   file.index <- which(curr.dir.files.sub %in% acceptable.files.starts)
347 | 
348 |   curr.files.to.read <- curr.dir.files[file.index]
349 |   curr.path.to.read <- paste0(curr.dir, "/", curr.files.to.read)
350 | 
351 |   curr.tissue.count.mtx <- NULL
352 | 
353 |   for (curr.f in curr.path.to.read) {
354 |   	curr.count <- read.table(curr.f, header = T, row.names = 1, stringsAsFactors = F)
355 |   	if (is.null(curr.tissue.count.mtx)) {
356 |   		curr.tissue.count.mtx <- curr.count
357 |   	} else {
358 |   		gene.intersect <- intersect(rownames(curr.tissue.count.mtx), rownames(curr.count))
359 |   		curr.tissue.count.mtx <- cbind(curr.tissue.count.mtx[gene.intersect,], curr.count[gene.intersect,])
360 |   	}
361 |   }
362 | 
363 |   
364 |   if (is.null(mca.counts.all.involved)) {
365 |     mca.counts.all.involved <- curr.tissue.count.mtx
366 |   } else {
367 |     gene.intersect <- intersect(rownames(mca.counts.all.involved), rownames(curr.tissue.count.mtx))
368 |     mca.counts.all.involved <- cbind(mca.counts.all.involved[gene.intersect,], curr.tissue.count.mtx[gene.intersect,])
369 |   }
370 | }
371 | 
372 | ## Meta data filtering
373 | pancreatic.all.meta$cell.type <- gsub("Dendrtic cell", "Dendritic cell", pancreatic.all.meta$cell.type)
374 | pancreatic.all.meta$cell.type.1 <- gsub("\\([^)]*\\)", "", pancreatic.all.meta$cell.type)
375 | pancreatic.all.meta$cell.type.alone <- unlist(lapply(strsplit(pancreatic.all.meta$cell.type.1, "_"), function(x) x[1]))
376 | 
377 | ## Filter out cell types with less than 30 cells
378 | cell.type.alone.freq <- as.data.frame(table(pancreatic.all.meta$cell.type.alone))
379 | cell.type.over.30 <- cell.type.alone.freq$Var1[which(cell.type.alone.freq$Freq >= 30)]
380 | pancreatic.sub.meta <- pancreatic.all.meta[which(pancreatic.all.meta$cell.type.alone %in% as.character(cell.type.over.30)),]
381 | coldata.df <- pancreatic.sub.meta
382 | ```
383 | 
384 | **Construction**
385 | 
386 | ``` r
387 | # Construction of a high-resolution reference
388 | ref.list <- construct.high.res.reference(mca.counts.all.involved, coldata.df = coldata.df, criteria = "cell.type.alone")
389 | # Get expression matrix and meta data of cells used to build the reference, as well as the constructed pseudo-bulk reference
390 | ref.df <- ref.list[[3]]
391 | ref.meta <- ref.list[[2]]
392 | ref.sc <- ref.list[[1]]
393 | ```
394 | 
395 | ### Application of quadratic programming on the self-established reference with the sample
396 | 
397 | ``` r
398 | # Measure cell identity in the reference dataset as a background 
399 | single.round.QP.analysis(ref.df, ref.list[[1]], n.cores = 4, save.to.path = "./", save.to.filename = "01_MCA_Based_scClassifier_reference_mix90_normalize_select", unix.par = TRUE)
400 | 
401 | # Measure cell identity in the query dataset 
402 | single.round.QP.analysis(ref.df, baron.expr, n.cores = 4, save.to.path = "./", save.to.filename = "02_MCA_Based_scClassifier_reference_mix90_test_normalize_select", unix.par = TRUE)
403 | ```
404 | 
405 | ## Step 3: Discrete Cell Type Classification and Multiple Identity Scoring
406 | 
407 | ### Empirical p-value calculation
408 | With the constructed single-cell reference, we apply QP to both the sample and reference single-cell datasets to generate continuous measurements of cell identity. The result of this step includes two lists of p-value matrices: one for the reference and the other for the sample. For each cell, each column of the p-value matrix denotes a cell type, while each row describes each round of 50 (default).
409 | ``` r
410 | # Read in background and testing identity scores
411 | background.mtx <- read.csv("./01_MCA_Based_scClassifier_reference_mix90_normalize_select_scale.csv", header = T, row.names = 1, stringsAsFactors = F)
412 | mtx.test <- read.csv("./02_MCA_Based_scClassifier_reference_mix90_test_normalize_select_scale.csv", header = T, row.names = 1, stringsAsFactors = F)
413 | 
414 | col.sub <- ncol(background.mtx) - 2
415 | 
416 | # Conduct reference randomization to get empirical p-value matrix
417 | ref.perc.list <- percentage.calc(background.mtx[,c(1:col.sub)], background.mtx[,c(1:col.sub)])
418 | 
419 | # Conduct test randomization to get empirical p-value matrix
420 | perc.list <- percentage.calc(as.matrix(mtx.test[,c(1:col.sub)]), as.matrix(background.mtx[,c(1:col.sub)]))
421 | ```
422 | 
423 | ### Binarization with Mann-Whitney
424 | A randomized test is performed using the background distributions as null to compute the occurrence probability or empirical p-values of each identity score. This test shapes the likelihood identity score occurrence as a continuous distribution, in which the cell type with the lowest likelihood rank is the classified identity. Capybara is also able to identify cells that harbor multiple identities, potentially representing cells transitioning between defined cell identities. To capture multiple cell identities, we use a Mann-Whitney (MW) test to compare the occurrence probabilities of the cell type with the lowest likelihood rank to that of other cell types, following the order from the second-lowest to the highest rank-sum. From this test, we calculate a p-value to determine whether two identities are equally likely to represent the identity of a specific cell. We stop our comparison when we identify the first cell type that is significantly (p-value < 0.05) less likely to represent one of the cell identities. A binarized matrix will be returned with each row representing a query cell and each column representing a possible cell type. 1 means inferred cell type in the matrix. 
425 | 
426 | ``` r
427 | # Binarization of inference results
428 | bin.count <- binarization.mann.whitney(mtx = mtx.test[,c(1:col.sub)], ref.perc.ls = ref.perc.list, ref.meta = ref.list[[2]], perc.ls = perc.list)
429 | ```
430 | 
431 | ### Classification
432 | 
433 | Finally, we return a classification table of each query cell and its inferred cell type. Cells with multiple inferred identities are marked as "Multi_ID". Cells with no significant inferred identity are marked as "unassigned".
434 | 
435 | ``` r
436 | classification <- binary.to.classification(bin.count[,c(1:col.sub)])
437 | rownames(classification) <- classification$barcode
438 | ```
439 | 
440 | ### Check the Classification Result
441 | 
442 | We check the classification results by comparing the labels that are shared between the reference and manual annotation of Baron et al., 2016. Further, we visualize the agreement using ggplot2.
443 | 
444 | ```r
445 | rownames(baron.meta) <- gsub("-", ".",rownames(baron.meta))
446 | classification$actual <- baron.meta[rownames(classification), "cell.type"]
447 | 
448 | table.freq <- table(classification$actual, classification$call)
449 | table.freq.perc <- apply(table.freq, 1, function(x) round(x * 100/sum(x), digits = 3))
450 | 
451 | rownames(table.freq.perc)[nrow(table.freq.perc] <- "beta"
452 | 
453 | table.freq.sub <- as.data.frame(table.freq.perc[c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
454 |                                                   "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", 
455 |                                                   "Multi_ID", "Endocrine.cell"),
456 |                                                 c("B_cell", "beta", "ductal", "endothelial",
457 |                                                   "macrophage", "T_cell", "immune_other", "activated_stellate", 
458 |                                                   "alpha", "delta", "gamma")])
459 | table.freq.sub$Capybara.Call <- rownames(table.freq.sub)
460 | table.freq.melt <- melt(table.freq.sub)
461 | 
462 | table.freq.melt$Capybara.Call <- factor(table.freq.melt$Capybara.Call,
463 |                                         levels = c("B.cell", "beta", "Ductal.cell", "Endothelial.cell",
464 |                                                    "Macrophage", "T.cell", "Dendritic.cell", "Stromal.cell", 
465 |                                                    "Multi_ID", "Endocrine.cell"),
466 |                                         ordered = T)
467 | table.freq.melt$variable <- factor(table.freq.melt$variable,
468 |                                         levels = c("B_cell", "beta", "ductal", "endothelial",
469 |                                                    "macrophage", "T_cell", "immune_other", "activated_stellate", 
470 |                                                    "alpha", "delta", "gamma"),
471 |                                         ordered = T)
472 | 
473 | ggplot(table.freq.melt, aes(x = Capybara.Call, y = variable, size=ifelse(value==0, NA,  value))) +
474 |   geom_point(aes(colour = variable)) +
475 |   scale_size_area(name = "Percentage", max_size=12) +
476 |   scale_color_viridis_d(option = "A", begin = 0.15, end = 0.85) +
477 |   ggtitle("Mouse Pancreatic Dataset (Baron et al., 2016)") +
478 |   guides(fill = FALSE, color = FALSE) +
479 |   theme(legend.position = "bottom",
480 |         axis.text.x = element_text(size = 12, face = "bold.italic", angle = 90),
481 |         axis.text.y = element_text(size = 12, face = "bold.italic"),
482 |         axis.ticks = element_blank(), 
483 |         axis.title = element_blank(), 
484 |         title = element_text(face = "bold.italic", size = 14),
485 |         panel.grid.major = element_blank(), 
486 |         panel.grid.minor = element_blank(),
487 |         panel.background = element_blank(), 
488 |         axis.line = element_line(colour = "black", size = 1))
489 | ```
490 | 
491 | Below is a dot plot example for this pancreatic dataset to show agreement.
492 | <p align="center">
493 |     <img src="/examples/pancreatic dot plot.png" height="450" width="400">
494 | </p>
495 | 
496 | ## Analysis of Cells with Multiple Identities
497 | 
498 | A unique aspect of Capybara is the classificaiton of cells with multiple identities, which are key to characterize cell fate transitions in a continuous process. Cells with multiple identities label transition harbors, while the discrete cell identities that connect these cells mark potential pivotal states/hallmarks during the continuous processes. In Capybara, we further develop a 'transition metric', a transition score, to measure the flux through the mixed cell identities. It is worth noting that the intention of this score is not to measure potential of each identity but to measure the dynamics going through each discrete state. For details, please refer to the paper. Here, we use an example of cardiomyocyte reprogramming (Stone et al., 2019) to demonstrate the preprocessing of data, classification, analysis of cells with multiple identities and calculation of transition scores.
499 | 
500 | ### 1. Download the data
501 | 
502 | The dataset for the cardiomyocyte reprogramming can be found here under GEO: GSE131328. This dataset contains 6 timepoints of this reprogramming process, Day -1, 1, 2, 3, 7, and 14, where Day -1 marks the day of transduction of three transcription factors and Day 14 cells were sorted using a-MHC reporter (Stone et al., 2019). The data can be downloaded in terminal as well as in R.
503 | 
504 | ```
505 | wget https://www.ncbi.nlm.nih.gov/geo/download/acc=GSE133452&format=file&file=GSE133452%5Fm1%5F1%5F2%5F3%5F7%5F14P%5Fpaper%2Ecsv%2Egz
506 | ```
507 | 
508 | or
509 | 
510 | ```r
511 | download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE133452&format=file&file=GSE133452%5Fm1%5F1%5F2%5F3%5F7%5F14P%5Fpaper%2Ecsv%2Egz", "./cardiomyocyte_reprogramming_m1_14p.csv.gz")
512 | unzip("./cardiomyocyte_reprogramming_m1_14p.csv.gz", overwrite = FALSE, exdir = ".")
513 | ```
514 | 
515 | ### 2. Preprocessing of the data with Seurat
516 | 
517 | In this step, we preprocess the data with Seurat to filter the data and obtain a UMAP embedding of the data. For details of Seurat processing, please refer to the instructions or vignettes here - https://satijalab.org/seurat/vignettes.html.
518 | 
519 | ```r
520 | # Read in the file path for all features and genes
521 | feature.file.path <- system.file("extdata", "features.tsv", package = "Capybara")
522 | 
523 | # Load the data
524 | stone.et.al <- read.csv("./cardiomyocyte_reprogramming_m1_14p.csv", row.names = 1, header = T, stringsAsFactors = F)
525 | feature.df <- read.table(feature.file.path, header = F, row.names = 1, stringsAsFactors = F)
526 | 
527 | # Map the gene names fr
528 | gene.name.subset <- feature.df[intersect(stone.et.al$X, rownames(feature.df)), ]
529 | stone.et.al.subset <- stone.et.al[which(stone.et.al$X %in% rownames(feature.df)), ]
530 | stone.et.al.subset$gene.name <- gene.name.subset[stone.et.al.subset$X, "V2"]
531 | stone.et.al.subset <- stone.et.al.subset[-which(duplicated(stone.et.al.subset$gene.name)), ]
532 | rnm <- stone.et.al.subset$gene.name
533 | stone.et.al.final <- stone.et.al.subset[, -c(1,ncol(stone.et.al.subset))]
534 | rownames(stone.et.al.final) <- rnm
535 | 
536 | # Create Seurat object
537 | sc.data.stone <- CreateSeuratObject(counts = stone.et.al.final, project = "cardiac.reprog", min.cells = 3, min.features = 200)
538 | 
539 | # Calculate mitochondria content
540 | sc.data.stone[["percent.mt"]] <- PercentageFeatureSet(sc.data.stone, pattern = "mt-")
541 | 
542 | # Visualize QC metrics as a violin plot and scatter plot
543 | VlnPlot(sc.data.stone, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
544 | 
545 | plot1 <- FeatureScatter(sc.data.stone, feature1 = "nCount_RNA", feature2 = "percent.mt")
546 | plot2 <- FeatureScatter(sc.data.stone, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
547 | CombinePlots(plots = list(plot1, plot2))
548 | 
549 | # Filter the dataset based on number of features
550 | sc.data.stone <- subset(sc.data.stone, subset = nFeature_RNA > 200 & nFeature_RNA < 5500)
551 | 
552 | # Log normalize the data
553 | sc.data.stone <- NormalizeData(sc.data.stone, normalization.method = "LogNormalize", scale.factor = 10000)
554 | 
555 | # Variable gene identification
556 | sc.data.stone <- FindVariableFeatures(sc.data.stone, selection.method = "vst", nfeatures = 2000)
557 | 
558 | # Scale the data
559 | all.genes <- rownames(sc.data.stone)
560 | sc.data.stone <- ScaleData(sc.data.stone, features = all.genes)
561 | 
562 | # PCA
563 | sc.data.stone <- RunPCA(sc.data.stone, features = VariableFeatures(object = sc.data.stone))
564 | 
565 | # JackStraw procedure and Elbow plot to select number of PCs
566 | sc.data.stone <- JackStraw(sc.data.stone, num.replicate = 100)
567 | sc.data.stone <- ScoreJackStraw(sc.data.stone, dims = 1:20)
568 | 
569 | JackStrawPlot(sc.data.stone, dims = 1:20)
570 | ElbowPlot(sc.data.stone)
571 | 
572 | # Identify neighbors and clusters
573 | sc.data.stone <- FindNeighbors(sc.data.stone, dims = 1:18)
574 | sc.data.stone <- FindClusters(sc.data.stone, resolution = 0.8)
575 | 
576 | # UMAP embedding
577 | sc.data.stone <- RunUMAP(sc.data.stone, dims = 1:18)
578 | ```
579 | 
580 | ### 3. Classification
581 | 
582 | Here, we perform the same classification pipeline as described above in the first section.
583 | 
584 | #### Step 1. Tissue Classification 
585 | 
586 | ***Load the bulk data***
587 | 
588 | ```r
589 | # File path
590 | bulk.raw.path <- system.file("extdata", "Bulk Reference Raw.Rds", package = "Capybara")
591 | bulk.rpkm.path <- system.file("extdata", "Bulk Reference RPKM.Rds", package = "Capybara")
592 | # Read the matrices
593 | bulk.raw <- readRDS(bulk.raw.path)
594 | bulk.rpkm <- readRDS(bulk.rpkm.path)
595 | ```
596 | 
597 | ***Application of Quadratic Programming using Bulk***
598 | 
599 | ```r
600 | single.round.QP.analysis(bulk.raw, stone.et.al, scale.bulk.sc = "scale", unix.par = TRUE, 
601 |                          force.eq = 1, n.cores = 4, save.to.path = "./", 
602 |                          save.to.filename = "stone_bulk_classification_qp")
603 | ```
604 | 
605 | ***Correlation Analysist***
606 | 
607 | ```r
608 | ## Load QP results
609 | qp.rslt <- read.csv("./stone_bulk_classification_qp_scale.csv", row.names = 1, header = T, stringsAsFactors = F)
610 | 
611 | ## Reshape the data
612 | qp.rslt.sub <- qp.rslt[,c(1:(ncol(qp.rslt) - 2))]
613 | 
614 | ## Background matrix
615 | background.qp.fpath <- system.file("extdata", "MCA Embryonic Background.Rds", package = "Capybara")
616 | background.mca <- readRDS(background.qp.fpath)
617 | background.mtx <- background.mca[[2]]
618 | 
619 | ## Correlation Analysis
620 | mtx.test <- t(qp.rslt.sub[, colnames(background.mtx)])
621 | ref.test <- t(background.mtx)
622 | 
623 | ## Pearson's Correlation Calculation
624 | corr.mtx <- WGCNA::cor(ref.test, mtx.test)
625 | 
626 | ## Setup a correlation cutoff to the 90th quantile of the correlation matrix
627 | correlation.cutoff <- quantile(corr.mtx, 0.90)
628 | 
629 | ## Binarization based on the correlation
630 | new.corr.bin <- corr.mtx
631 | new.corr.bin[which(new.corr.bin >= correlation.cutoff)] <- 1
632 | new.corr.bin[which(new.corr.bin < correlation.cutoff)] <- 0
633 | new.corr.bin <- as.data.frame(new.corr.bin)
634 | ```
635 | 
636 | ***Mapping to Tissues in Mouse Cell Atlas (MCA)***
637 | 
638 | ```r
639 | # Count
640 | count.in.cat <- c()
641 | unique.cat <- unique(unlist(lapply(strsplit(rownames(new.corr.bin), "_"), function(x) x[1])))
642 | for (uc in unique.cat) {
643 |   curr.subset <- new.corr.bin[which(startsWith(rownames(new.corr.bin), uc)), c(1:30729)]
644 |   count.in.cat[uc] <- sum(colSums(curr.subset) >= nrow(curr.subset) * 0.80)
645 | }
646 | 
647 | count.in.cat <- as.data.frame(count.in.cat)
648 | count.in.cat$perc <- round(count.in.cat$count.in.cat *100/sum(count.in.cat$count.in.cat), digits = 3)
649 | 
650 | final.cell.types.fetal <- rownames(count.in.cat)[which(count.in.cat$count.in.cat > 100)]
651 | ```
652 | 
653 | Below is the composition for this cardiac reprogramming dataset, where we identify 4 major tissues.
654 | <p align="center">
655 |     <img src="/examples/cardiac_bulk_v2.png" height="800" width="400">
656 | </p>
657 | 
658 | #### Step 2. Construction of Reference at High-Resolution and Continuous Identity Measurements
659 | 
660 | ***Get the counts of cell types in the selected tissues from MCA***
661 | 
662 | ```r
663 | # Background cells
664 | mca <- read.csv("~/Box/Morris Lab/Classifier Analysis/Reference datasets/MCA/MCA_CellAssignments.csv",
665 |                 row.names = 1, header = T, stringsAsFactors = F)
666 | mca.meta <- data.frame(row.names = mca$Cell.name, 
667 |                        tissue = mca$Tissue,
668 |                        cell.bc.tissue = unlist(lapply(strsplit(mca$Cell.name, "_"), function(x) x[1])),
669 |                        cell.type = mca$Annotation,
670 |                        stringsAsFactors = F)
671 | 
672 | cardiac.rp.all.meta <- mca.meta[which(mca.meta$cell.bc.tissue %in% final.cell.types.fetal), ]
673 | 
674 | mca.counts.all.involved <- NULL
675 | tissues.to.read <- unique(cardiac.rp.all.meta$tissue)
676 | general.path <- "~/Box/Morris Lab/Classifier Analysis/Reference datasets/MCA/MCA_Counts/"
677 | for (i in 1:length(tissues.to.read)) {
678 |   curr.t <- tissues.to.read[i]
679 |   curr.path.to.read <- paste0(general.path, curr.t, "/count.csv")
680 |   curr.count <- read.csv(curr.path.to.read, header = T, row.names = 1, stringsAsFactors = F)
681 |   if (is.null(mca.counts.all.involved)) {
682 |     mca.counts.all.involved <- curr.count
683 |   } else {
684 |     mca.counts.all.involved <- cbind(mca.counts.all.involved, curr.count)
685 |   }
686 | }
687 | 
688 | ## meta data cleaning
689 | cardiac.rp.all.meta$cell.type.1 <- gsub("\\([^)]*\\)", "", cardiac.rp.all.meta$cell.type)
690 | cardiac.rp.all.meta$cell.type.alone <- unlist(lapply(strsplit(cardiac.rp.all.meta$cell.type.1, "_"), function(x) x[1]))
691 | 
692 | cardiac.rp.all.meta$cell.type.1 <- tolower(cardiac.rp.all.meta$cell.type.1)
693 | coldata.df <- cardiac.rp.all.meta
694 | ```
695 | 
696 | ***Construction***
697 | ```r
698 | # Construction of a high-resolution reference
699 | ref.list <- construct.high.res.reference(mca.counts.all.involved, coldata.df = coldata.df, criteria = "cell.type.1")
700 | # Get expression matrix and meta data of cells used to build the reference, as well as the constructed pseudo-bulk reference
701 | ref.df <- ref.construction(ref.list[[1]], ref.list[[2]], "cell.type")
702 | ```
703 | 
704 | ***Application of Quadratic Programming***
705 | ```r
706 | single.round.QP.analysis(ref.df, ref.list[[1]], n.cores = 4, save.to.path = "./", save.to.filename = "stone_et_al_reference_MCA", unix.par = TRUE)
707 | single.round.QP.analysis(ref.df, stone.et.al, n.cores = 4, save.to.path = "./", save.to.filename = "stone_et_al_test_MCA", unix.par = TRUE)
708 | ```
709 | 
710 | #### Step 3. Discrete Cell Type Classification and Multiple Identity scoring
711 | 
712 | ***Empirical p-value Calculation***
713 | ```r
714 | # Read in background and testing identity scores
715 | background.mtx <- read.csv("./stone_et_al_reference_MCA_scale.csv", header = T, row.names = 1, stringsAsFactors = F)
716 | mtx.test <- read.csv("./stone_et_al_test_MCA_scale.csv", header = T, row.names = 1, stringsAsFactors = F)
717 | 
718 | col.sub <- ncol(background.mtx) - 2
719 | 
720 | # Conduct reference randomization to get empirical p-value matrix
721 | ref.perc.list <- percentage.calc(background.mtx[,c(1:col.sub)], background.mtx[,c(1:col.sub)])
722 | 
723 | # Conduct test randomization to get empirical p-value matrix
724 | perc.list <- percentage.calc(as.matrix(mtx.test[,c(1:col.sub)]), as.matrix(background.mtx[,c(1:col.sub)]))
725 | ```
726 | 
727 | ***Binarization and Classificationn***
728 | ```r
729 | # Binarization of inference results
730 | bin.count <- binarization.mann.whitney(mtx = mtx.test[,c(1:col.sub)], ref.perc.ls = ref.perc.list, ref.meta = ref.list[[2]], perc.ls = perc.list)
731 | # Classificationn
732 | classification <- binary.to.classification(bin.count[,c(1:col.sub)])
733 | rownames(classification) <- classification$barcode
734 | ```
735 | 
736 | ### 4. Filter Cells with Multiple Identities based on the QP scores
737 | 
738 | Different interpretations can be taken on the cells with multiple identities. Here, we interpret these cells as in transition. However, some cells with multiple identities may be incorrectly labelled, which is represented by close to zero (we defined as <10E-3, this threshold can be modified as an input parameter of the function) QP score for one of the labelled identities. Hence, we first filter the cells with multiple identities such that each cell receives relative significant QP scores for each shared identity. This function will return a list where \[\[1\]\] represents a data frame of actual multiple identity cells and \[\[2\]\] represents an updated classification data frame. In the data frame of actual multiple identity cells, the QP measurements for each identity are also included for calculation of transition scores.
739 | 
740 | ```r
741 | multi.classification.list <- multi.id.curate.qp(binary.counts = bin.count, classification = classification, qp.matrix = mtx.test)
742 | # Reassign variables
743 | actual.multi <- multi.classification.list[[1]]
744 | new.classification <- multi.classification.list[[2]]
745 | ```
746 | 
747 | ### 5. Calculate Transition Scores
748 | 
749 | ***Calculation***
750 | 
751 | Cells with multiple identities label critical transition states in different trajectories. Building on this concept, we also measure the strength and frequency of connections to the "hub" identities, i.e. discrete identities where the multiple identity cells are connected to. This provides a metric that we define as a "transition score". The calculation of transition scores only involves cells with multiple identities. In brief, we interpret QP scores as probabilities of the cell transitioning to each discrete cell identity, which  we further use as a measure of transition probability. Using this measurement of transition probability, we calculate the amount of information that the terminal cell state has received based on information theory. For detailed methods, please refer to the paper. Here, we demonstrate the calculation of transition scores for discrete cell identity states that are involved in cells with multiple identities in this cardiomyocyte reprogramming process. This function takes an input of the multiple identity data frame calculated from above and outputs a data frame with each cell state to their scores.
752 | 
753 | 
754 | ```r
755 | score.df <- transition.score(actual.multi)
756 | ```
757 | 
758 | ***Visualization***
759 | 
760 | Here, we visualize the transition scores on the UMAP, comparing to their time points of collection. Additionally, we compared the scores between time points using box plots. Other visualization methods, such as violin plots, can also be adapted.
761 | 
762 | Below is visualization results of cardiac reprogramming process.
763 | <p align="center">
764 |     <img src="/examples/cardiac_TS_plots.png" height="365" width="905">
765 | </p>
766 | 
767 | 
768 | *Note: this will be continuously updating*
769 | 


--------------------------------------------------------------------------------
/examples/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/.DS_Store


--------------------------------------------------------------------------------
/examples/Monocle_hat_colin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/Monocle_hat_colin.png


--------------------------------------------------------------------------------
/examples/Step 1 - ARCHS4 Mining.R:
--------------------------------------------------------------------------------
  1 | ###############################################
  2 | # Step 1: ARCHS4 Mining
  3 | ###############################################
  4 | 
  5 | library(rhdf5)
  6 | 
  7 | destination_file = "mouse_matrix_download.h5"
  8 | url = "https://s3.amazonaws.com/mssm-seq-matrix/mouse_matrix_download.h5"
  9 | 
 10 | # Check if gene expression file was already downloaded, if not in current directory download file form repository
 11 | if(!file.exists(destination_file)){
 12 |   print("Downloading compressed gene expression matrix.")
 13 |   download.file(url, destination_file, quiet = FALSE, mode = 'wb')
 14 | }
 15 | 
 16 | # Get the meta data from the h5 file
 17 | samples = h5read("mouse_matrix_download.h5", "meta")
 18 | 
 19 | sample_meta <- data.frame(channel_count = samples$Sample_channel_count, 
 20 |                           accession = samples$Sample_geo_accession, 
 21 |                           characteristics = samples$Sample_characteristics_ch1, 
 22 |                           organism = samples$Sample_organism_ch1, 
 23 |                           source = samples$Sample_source_name_ch1, 
 24 |                           molecule_info = samples$Sample_molecule_ch1, 
 25 |                           series_id = samples$Sample_series_id, stringsAsFactors=F)
 26 | 
 27 | # Use only total RNA and polyA RNA-seq data
 28 | total.RNA.indx <- which(sample_meta$molecule_info == "total RNA")
 29 | poly.A.indx <- which(sample_meta$molecule_info == "polyA RNA")
 30 | sample_meta_pa_total <- sample_meta[c(total.RNA.indx, poly.A.indx),]
 31 | 
 32 | ### Clean some of the labels
 33 | sample_meta_pa_total$source[which(tolower(sample_meta_pa_total$source) %in% c("stomachs", "stomach wt", "stomach tissue"))] <- "stomach"
 34 | sample_meta_pa_total$source[which(tolower(sample_meta_pa_total$source) %in% c("mus musculus strain c57bl/6j urinary bladder tissue adult (8 weeks)", "bladder"))] <- "urinary bladder"
 35 | sample_meta_pa_total$source[which(tolower(sample_meta_pa_total$source) %in% c("bone lining cells", "wild type_calvarial frontal bone", "wild type_calvarial parietal bone",  "wt mandibular bone"))] <- "bone"
 36 | sample_meta_pa_total$source[which(tolower(sample_meta_pa_total$source) %in% c("primary esophagus"))] <- "esophagus"
 37 | 
 38 | # Limit to these tissue labels
 39 | tissue.to.choose <- c("aorta", "bone", "bone marrow", "marrow", "brain", "colon", 
 40 |                       "large intestine", "diaphragm", "embryo", "esophagus", "brown fat", "white fat", 
 41 |                       "adipose", "gall bladder", "heart", "kidney", "lung", "ovary", 
 42 |                       "pancreas", "prostate", "skin", "small intestine", "spleen", "muscle", 
 43 |                       "stomach", "testis", "thymus", "tongue", "trachea", 
 44 |                       "urinary bladder", "uterus", "liver", "mammary gland")
 45 | indx <- which(tolower(sample_meta_pa_total$source) %in% tissue.to.choose)
 46 | 
 47 | sample_tissue_sub <- sample_meta_pa_total[indx,]
 48 | sample_tissue_sub$source <- tolower(sample_tissue_sub$source)
 49 | 
 50 | # Get the strain infomation in place
 51 | strains <- c()
 52 | treatment <- c()
 53 | genotype <- c()
 54 | for (i in 1:nrow(sample_tissue_sub)) {
 55 |   curr_characteristics <- sample_tissue_sub$characteristics[i]
 56 |   curr_parts <- strsplit(curr_characteristics, "Xx-xX")[[1]]
 57 |   strain.parts <- which(startsWith(tolower(curr_parts), "strain"))
 58 |   treatment.parts <- which(startsWith(tolower(curr_parts), "treatment"))
 59 |   genotype.parts <- which(startsWith(tolower(curr_parts), "genotype"))
 60 |   
 61 |   curr_strain <- " "
 62 |   curr_treatment <- " "
 63 |   curr_genotype <- " "
 64 |   if (length(strain.parts) > 0) {
 65 |     curr_strain <- strsplit(tolower(curr_parts[strain.parts]), "strain: ")[[1]][2]
 66 |   } 
 67 |   if (length(treatment.parts) > 0) {
 68 |     curr_treatment <- strsplit(tolower(curr_parts[treatment.parts]), "treatment: ")[[1]][2]
 69 |   } 
 70 |   if (length(genotype.parts) > 0) {
 71 |     curr_genotype <- strsplit(tolower(curr_parts[genotype.parts]), "genotype: ")[[1]][2]
 72 |   } 
 73 |   strains <- c(strains, curr_strain)
 74 |   treatment <- c(treatment, curr_treatment)
 75 |   genotype <- c(genotype, curr_genotype)
 76 | }
 77 | 
 78 | # subset to only include c57bl6
 79 | sample_tissue_sub$strain <- strains
 80 | sample_tissue_sub$treatment <- treatment
 81 | sample_tissue_sub$genotype <- genotype
 82 | 
 83 | sample_tissue_strain_sub <- sample_tissue_sub[which(tolower(sample_tissue_sub$strain) %in% c("c57bl6", "c57bl6j", "c57bl6n")),]
 84 | 
 85 | saveRDS(sample_tissue_sub, "01_full_accession_information_data_frame.Rds")
 86 | 
 87 | # Retrieve information from compressed data
 88 | samples_geo = samples$Sample_geo_accession
 89 | genes <- samples$genes
 90 | 
 91 | sample_locations = which(samples_geo %in% sample_tissue_strain_sub$accession)
 92 | 
 93 | # Read the expression data
 94 | expression = h5read(destination_file, "data/expression", index=list(1:length(genes), sample_locations))
 95 | H5close()
 96 | 
 97 | rownames(expression) = genes
 98 | colnames(expression) = samples_geo[sample_locations]
 99 | 
100 | # Use Ensembl to extract gene length information based on Ensembl ID
101 | library(biomaRt)
102 | gene.info <- read.table("gene_info.tsv", header=T, stringsAsFactors=F)
103 | 
104 | mart <- useMart("ensembl",dataset="mmusculus_gene_ensembl")
105 | ensembl.gene <- gene.info$gene.ensembl[which(!is.na(gene.info$gene.ensembl))]
106 | ensembl.map.rslt <- getBM(attributes = c("mgi_symbol", "start_position", "end_position", "ensembl_gene_id_version", "external_gene_name", "ensembl_gene_id"), 
107 |                           filters = "ensembl_gene_id", values = ensembl.gene, mart = mart)
108 | ensembl.map.rslt <- unique(ensembl.map.rslt[,c(2:6)])
109 | rownames(ensembl.map.rslt) <- ensembl.map.rslt$ensembl_gene_id  
110 | # Calculate length of the gene    
111 | ensembl.map.rslt$gene.length <- abs(ensembl.map.rslt$end_position - ensembl.map.rslt$start_position)/1000
112 | 
113 | gene.info$gene.length <- ensembl.map.rslt[gene.info$gene.ensembl, "gene.length"]
114 | 
115 | gene.info.sub <- gene.info[which(!is.na(gene.info$gene.length)), ]
116 | rownames(gene.info.sub) <- gene.info.sub$gene.sym
117 | 
118 | # RPKM Calculation
119 | raw.count.sub <- expression[which(rownames(expression) %in% gene.info.sub$gene.sym),]
120 | raw.count.sub <- as.data.frame(raw.count.sub)
121 | raw.count.sub$gene.length <- gene.info.sub[rownames(raw.count.sub), "gene.length"]
122 | rpkm.count <- raw.count.sub[,c(1:(ncol(raw.count.sub) - 1))]
123 | # Calculate per million size factor and divide each sample by its size factor
124 | size.factors <- colSums(raw.count.sub[,c(1:(ncol(raw.count.sub) - 1))])/1000000
125 | for (i in 1:(ncol(rpkm.count))) {
126 |    rpkm.count[,i] <- rpkm.count[,i]/size.factors[colnames(rpkm.count)[i]]
127 |    rpkm.count[,i] <- rpkm.count[,i]/raw.count.sub$gene.length
128 | }
129 | 
130 | saveRDS(rpkm.count, "03_rpkm_count_Tissue_bulk.Rds")
131 | saveRDS(raw.count.sub[,c(1:(ncol(raw.count.sub) - 1))], "03_raw_count_Tissue_bulk.Rds")
132 | 
133 | 
134 | 


--------------------------------------------------------------------------------
/examples/Step 2 - bulk cleaning.R:
--------------------------------------------------------------------------------
  1 | ###############################################
  2 | # Step 2: ARCHS4 Bulk Cleaning
  3 | ###############################################
  4 | library(viridis)
  5 | library(pheatmap)
  6 | library(ggplot2)
  7 | library(reshape2)
  8 | 
  9 | ### function to get the most similar n samples
 10 | get.most.connected <- function(mtx, n.sample) {
 11 |   corr.mtx <- WGCNA::cor(mtx)
 12 |   corr.mtx.upper <- corr.mtx * upper.tri(corr.mtx)
 13 |   corr.mtx.melt <- melt(corr.mtx.upper)
 14 |   corr.mtx.melt.pos <- corr.mtx.melt[which(corr.mtx.melt$value > 0), ]
 15 |   corr.mtx.melt.pos.sort <- corr.mtx.melt.pos[order(-corr.mtx.melt.pos$value), ]
 16 |   corr.mtx.melt.pos.sort$Var1 <- as.character(corr.mtx.melt.pos.sort$Var1)
 17 |   corr.mtx.melt.pos.sort$Var2 <- as.character(corr.mtx.melt.pos.sort$Var2)
 18 |   
 19 |   sample.list <- c()
 20 |   count.line <- 1
 21 |   while(length(sample.list) < n.sample) {
 22 |     sample.list <- unique(c(sample.list, unique(c(corr.mtx.melt.pos.sort$Var1[count.line], corr.mtx.melt.pos.sort$Var2[count.line]))))
 23 |     count.line <- count.line + 1
 24 |   }
 25 |   
 26 |   return(sample.list[1:90])
 27 | }
 28 | 
 29 | ### Load the raw data/rpkm data from the step 1
 30 | raw.counts <- readRDS("03_raw_count_Tissue_bulk.Rds")
 31 | rpkm.count <- readRDS("03_rpkm_count_Tissue_bulk.Rds")
 32 | accession.full <- readRDS("01_full_accession_information_data_frame.Rds")
 33 | rownames(accession.full) <- accession.full$accession
 34 | 
 35 | ### Subset the accession 
 36 | accession.sub <- accession.full[colnames(raw.counts), ]
 37 | accession.sub$source <- tolower(accession.sub$source)
 38 | accession.freq <- as.data.frame(table(accession.sub$source, accession.sub$series_id))
 39 | 
 40 | accession.sub$merge <- paste0(accession.sub$source, "_", accession.sub$series_id)
 41 | accession.sub.freq <- as.data.frame(table(accession.sub$merge))
 42 | accession.sub.freq$Var1 <- as.character(accession.sub.freq$Var1)
 43 | accession.sub.freq$tissue <- unlist(lapply(strsplit(accession.sub.freq$Var1, "_"), function(x) x[1]))
 44 | 
 45 | ### Select for the most associated 90 samples across different GEO RNA-seq data
 46 | tissue.uniq <- unique(accession.sub$source)
 47 | tissue.uniq <- tissue.uniq[which(!is.na(tissue.uniq))]
 48 | series.to.keep <- c()
 49 | sample.accession.to.keep <- c()
 50 | new.accession <- data.frame()
 51 | new.sample.accession <- data.frame()
 52 | sample.count <- data.frame()
 53 | sample.rpkm <- data.frame()
 54 | n.sample <- 90
 55 | for (tis in tissue.uniq) {
 56 |   curr.freq.sub <- accession.sub.freq[which(accession.sub.freq$tissue == tis), ]
 57 |   curr.freq.sub.sort <- curr.freq.sub[order(-curr.freq.sub$Freq), ]
 58 |   curr.series.to.keep <- strsplit(curr.freq.sub.sort$Var1[1], "_")[[1]][2]
 59 |   curr.geo.to.keep <- accession.sub[which(accession.sub$series_id == curr.series.to.keep & accession.sub$source == tis), ]
 60 |   curr.geo.to.keep$tissue <- tis
 61 |   print(tis)
 62 |   print(nrow(curr.geo.to.keep))
 63 |   series.to.keep <- c(series.to.keep, curr.series.to.keep)
 64 |   #  sample.accession.to.keep <- c(sample.accession.to.keep, curr.geo.to.keep)
 65 |   
 66 |   if (nrow(curr.geo.to.keep) >= n.sample) {
 67 |     geo.sample <- get.most.connected(log2(rpkm.count[,curr.geo.to.keep$accession] + 1), n.sample = n.sample)
 68 |   } else {
 69 |     geo.sample <- sample(curr.geo.to.keep$accession, n.sample, replace = T)
 70 |   }
 71 |   
 72 |   curr.count.sample <- raw.counts[, geo.sample]
 73 |   curr.rpkm.sample <- rpkm.count[, geo.sample]
 74 |   
 75 |   colnames(curr.count.sample) <- paste0(tis, "_", seq(1,n.sample))
 76 |   colnames(curr.rpkm.sample) <- paste0(tis, "_", seq(1,n.sample))
 77 |   
 78 |   if (nrow(new.accession) <= 0) {
 79 |     new.accession <- curr.geo.to.keep
 80 |     new.sample.accession <- curr.geo.to.keep[geo.sample,]
 81 |     sample.count <- curr.count.sample
 82 |     sample.rpkm <- curr.rpkm.sample
 83 |   } else {
 84 |     new.accession <- rbind(new.accession, curr.geo.to.keep)
 85 |     new.sample.accession <- rbind(new.sample.accession, curr.geo.to.keep[geo.sample, ])
 86 |     sample.count <- cbind(sample.count, curr.count.sample)
 87 |     sample.rpkm <- cbind(sample.rpkm, curr.rpkm.sample)
 88 |   }
 89 | } 
 90 | 
 91 | accession.new.freq <- as.data.frame(table(new.accession$source))
 92 | 
 93 | ### Look at the correlation between samples
 94 | corr.sample.rpkm <- WGCNA::cor(sample.rpkm)
 95 | corr.sample.rpkm.log <- WGCNA::cor(log2(sample.rpkm + 1))
 96 | 
 97 | my.col <- data.frame(row.names = rownames(corr.sample.rpkm), 
 98 |                      tissue = unlist(lapply(strsplit(rownames(corr.sample.rpkm), "_"), function(x) x[1])), 
 99 |                      stringsAsFactors = F)
100 | 
101 | ### Heatmap plot to see if the samples selected are well correlated as well as well distinguished from other tissues
102 | pheatmap::pheatmap(corr.sample.rpkm, color = viridis(20, option = "A"), show_colnames = F, show_rownames = F, cluster_cols = F, cluster_rows = F, clustering_method = "ward.D2", 
103 |                    annotation_col = my.col, cellheight = 0.2, cellwidth = 0.2, file = "~/Desktop/test.pdf")
104 | pheatmap::pheatmap(corr.sample.rpkm.log, color = viridis(20, option = "A"), show_colnames = F, show_rownames = F, cluster_cols = F, cluster_rows = F, clustering_method = "ward.D2", 
105 |                    annotation_col = my.col, cellheight = 0.2, cellwidth = 0.2, file = "~/Desktop/test_log.pdf")
106 | 
107 | saveRDS(sample.count, "~/Box/Morris Lab/Classifier Analysis/ARCHS4 Reference/05_90_sampled_raw_counts.Rds")
108 | saveRDS(sample.rpkm, "~/Box/Morris Lab/Classifier Analysis/ARCHS4 Reference/05_90_sampled_raw_rpkm.Rds")
109 | 
110 | ### Compute the final averaged bulk dataset
111 | final.rpkm.tissue <- data.frame()
112 | final.raw.count.tissue <- data.frame()
113 | uniq.tissue <- unique(my.col$tissue)
114 | 
115 | for (tis in uniq.tissue) {
116 |   curr.rpkm.tis <- sample.rpkm[, rownames(my.col)[which(my.col$tissue == tis)]]
117 |   curr.rpkm.mean <- as.data.frame(rowMeans(curr.rpkm.tis))
118 |   
119 |   curr.count.tis <- sample.count[, rownames(my.col)[which(my.col$tissue == tis)]]
120 |   curr.count.mean <- as.data.frame(rowMeans(curr.count.tis))
121 |   
122 |   colnames(curr.rpkm.mean) <- tis
123 |   colnames(curr.count.mean) <- tis
124 |   
125 |   if (ncol(final.rpkm.tissue) <= 0) {
126 |     final.rpkm.tissue <- curr.rpkm.mean
127 |     final.raw.count.tissue <- curr.count.mean
128 |   } else {
129 |     final.rpkm.tissue <- cbind(final.rpkm.tissue, curr.rpkm.mean)
130 |     final.raw.count.tissue <- cbind(final.raw.count.tissue, curr.count.mean)
131 |   }
132 | }
133 | 
134 | ### Check if the averaged bulk dataset has tissues that are distinguishable from each other
135 | rpkm.tissue.cor <- WGCNA::cor(log2(final.rpkm.tissue + 1))
136 | pheatmap::pheatmap(rpkm.tissue.cor, color = viridis(20, option = "A"), show_colnames = T, show_rownames = T, cluster_cols = F, cluster_rows = F, clustering_method = "ward.D2", 
137 |                    cellheight = 15, cellwidth = 15, file = "~/Desktop/test_log.pdf")
138 | 
139 | saveRDS(final.rpkm.tissue, "~/Box/Morris Lab/Classifier Analysis/ARCHS4 Reference/05_90_sample_final_rpkm_tissue_collapsed.Rds")
140 | saveRDS(final.raw.count.tissue, "~/Box/Morris Lab/Classifier Analysis/ARCHS4 Reference/05_90_sample_final_raw_count_tissue_collapsed.Rds")
141 | 
142 | 


--------------------------------------------------------------------------------
/examples/bulk class mca pancreatic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/bulk class mca pancreatic.png


--------------------------------------------------------------------------------
/examples/bulk_composition_pancreatic.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/bulk_composition_pancreatic.pdf


--------------------------------------------------------------------------------
/examples/cardiac_TS_plots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/cardiac_TS_plots.png


--------------------------------------------------------------------------------
/examples/cardiac_bulk_v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/cardiac_bulk_v2.png


--------------------------------------------------------------------------------
/examples/pancreatic dot plot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/pancreatic dot plot.pdf


--------------------------------------------------------------------------------
/examples/pancreatic dot plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/examples/pancreatic dot plot.png


--------------------------------------------------------------------------------
/inst/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/.DS_Store


--------------------------------------------------------------------------------
/inst/extdata/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/.DS_Store


--------------------------------------------------------------------------------
/inst/extdata/Bulk Reference RPKM.Rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/Bulk Reference RPKM.Rds


--------------------------------------------------------------------------------
/inst/extdata/Bulk Reference Raw.Rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/Bulk Reference Raw.Rds


--------------------------------------------------------------------------------
/inst/extdata/MCA Adult Background.Rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/MCA Adult Background.Rds


--------------------------------------------------------------------------------
/inst/extdata/MCA Embryonic Background.Rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/MCA Embryonic Background.Rds


--------------------------------------------------------------------------------
/inst/extdata/baron_dataset.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/morris-lab/Capybara/668e2aabb502082b21655e9951c17a08ba2ccb8a/inst/extdata/baron_dataset.zip


--------------------------------------------------------------------------------
/man/binarization.mann.whitney.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Binarization.R
 3 | \name{binarization.mann.whitney}
 4 | \alias{binarization.mann.whitney}
 5 | \title{Binarization and Identity Calling from Identity Score}
 6 | \usage{
 7 | binarization.mann.whitney(
 8 |   mtx,
 9 |   ref.perc.ls,
10 |   ref.meta,
11 |   perc.ls,
12 |   bulk = FALSE,
13 |   map.df = NULL,
14 |   init.class = NULL
15 | )
16 | }
17 | \arguments{
18 | \item{mtx}{The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types}
19 | 
20 | \item{ref.perc.ls}{Emprical p-values for reference cells}
21 | 
22 | \item{ref.meta}{The celltype meta information for reference cells}
23 | 
24 | \item{perc.ls}{Emprical p-values for query cells}
25 | 
26 | \item{bulk}{If the reference data type is bulk RNA-seq. The default is bulk = FALSE}
27 | 
28 | \item{map.df}{bulk mapping. The default is bulk = FALSE}
29 | 
30 | \item{init.class}{initial classification. The default is init.class = NULL}
31 | }
32 | \description{
33 | This function calls single identity or multiple identities of query cells from empirical p-values. Inferred cell types are marked by 1 in a binarized matrix.
34 | }
35 | \note{
36 | 
37 | }
38 | \examples{
39 | 
40 | }
41 | \keyword{binarization,}
42 | \keyword{calling}
43 | \keyword{identity}
44 | 


--------------------------------------------------------------------------------
/man/binary.to.classification.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Classification.R
 3 | \name{binary.to.classification}
 4 | \alias{binary.to.classification}
 5 | \title{Generate classification results from the binarized result matrix}
 6 | \usage{
 7 | binary.to.classification(bin.count.rslt)
 8 | }
 9 | \arguments{
10 | \item{bin.count.rslt}{The binarized classification result matrix}
11 | }
12 | \description{
13 | This function genrate classification table from the binarized result matrix
14 | }
15 | \note{
16 | 
17 | }
18 | \examples{
19 | 
20 | }
21 | \keyword{calssification,}
22 | \keyword{result}
23 | 


--------------------------------------------------------------------------------
/man/calc.scale.ratio.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SingleCellQuadraticProgrammingAux.R
 3 | \name{calc.scale.ratio}
 4 | \alias{calc.scale.ratio}
 5 | \title{Scale Ratio Calculation}
 6 | \usage{
 7 | calc.scale.ratio(bulk, sc)
 8 | }
 9 | \arguments{
10 | \item{bulk}{The reference dataset}
11 | 
12 | \item{sc}{The single-cell dataset}
13 | }
14 | \description{
15 | This function calculate the scale ratio between the single-cell transcriptome and reference dataset to minimize the Lagrangian multiplier. In the other word, reduce the restriction.
16 | }
17 | \examples{
18 | calc.scale.ratio(reference.dt, single.cell.mtx)
19 | }
20 | \keyword{Scale}
21 | \keyword{calculation}
22 | \keyword{ratio}
23 | 


--------------------------------------------------------------------------------
/man/construct.high.res.reference.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/HighResolutionReferenceConstruction.R
 3 | \name{construct.high.res.reference}
 4 | \alias{construct.high.res.reference}
 5 | \title{Systematic construction of a high-resolution reference}
 6 | \usage{
 7 | construct.high.res.reference(
 8 |   ref.mtx,
 9 |   coldata.df,
10 |   criteria,
11 |   cell.num.for.ref = 90
12 | )
13 | }
14 | \arguments{
15 | \item{ref.mtx}{The single-cell reference dataset}
16 | 
17 | \item{coldata.df}{The metadata (cell type information) for cells in the high-resolution reference}
18 | 
19 | \item{cell.num.for.ref}{The number of cell numbers used to build the reference for each cell type. The default is cell.num.for.ref = 90}
20 | }
21 | \description{
22 | Create a pseudo-bulk reference by sampling 90-cells from each cell type to maintain cellular resolution while increasing transcriptional resolution
23 | }
24 | \note{
25 | 
26 | }
27 | \examples{
28 | 
29 | }
30 | \keyword{pseudo-bulk}
31 | \keyword{reference}
32 | 


--------------------------------------------------------------------------------
/man/gene.intersect.sub.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SingleCellQuadraticProgrammingAux.R
 3 | \name{gene.intersect.sub}
 4 | \alias{gene.intersect.sub}
 5 | \title{Find intersection genes}
 6 | \usage{
 7 | gene.intersect.sub(bulk, sc)
 8 | }
 9 | \arguments{
10 | \item{bulk}{The reference dataset}
11 | 
12 | \item{sc}{The single-cell dataset}
13 | }
14 | \description{
15 | This function identifies the intersection genes between reference and single-cell data
16 | }
17 | \examples{
18 | gene.intersect.sub(reference.dt, single.cell.mtx)
19 | }
20 | 


--------------------------------------------------------------------------------
/man/get.least.connected.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Helper.R
 3 | \name{get.least.connected}
 4 | \alias{get.least.connected}
 5 | \title{Get the Least Connected Cells}
 6 | \usage{
 7 | get.least.connected(mtx, n.sample)
 8 | }
 9 | \arguments{
10 | \item{mtx}{The normalized reference count matrix}
11 | 
12 | \item{n.sample}{The number of reference cells to be included within each pseudo-bulk}
13 | }
14 | \description{
15 | This function returns index of cells that are the least connected within a cell type
16 | }
17 | 


--------------------------------------------------------------------------------
/man/get.mid.connected.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Helper.R
 3 | \name{get.mid.connected}
 4 | \alias{get.mid.connected}
 5 | \title{Get the Medium Connected Cells}
 6 | \usage{
 7 | get.mid.connected(mtx, n.sample)
 8 | }
 9 | \arguments{
10 | \item{mtx}{The normalized reference count matrix}
11 | 
12 | \item{n.sample}{The number of reference cells to be included within each pseudo-bulk}
13 | }
14 | \description{
15 | This function returns index of cells that are the most connected within a cell type
16 | }
17 | 


--------------------------------------------------------------------------------
/man/get.most.connected.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Helper.R
 3 | \name{get.most.connected}
 4 | \alias{get.most.connected}
 5 | \title{Get the Most Connected Cells}
 6 | \usage{
 7 | get.most.connected(mtx, n.sample)
 8 | }
 9 | \arguments{
10 | \item{mtx}{The normalized reference count matrix}
11 | 
12 | \item{n.sample}{The number of reference cells to be included within each pseudo-bulk}
13 | }
14 | \description{
15 | This function returns index of cells that are the most connected within a cell type
16 | }
17 | 


--------------------------------------------------------------------------------
/man/multi.id.curate.qp.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/MultiIDFilterQP.R
 3 | \name{multi.id.curate.qp}
 4 | \alias{multi.id.curate.qp}
 5 | \title{Multi-ID Score-Based Filter}
 6 | \usage{
 7 | multi.id.curate.qp(
 8 |   binary.counts,
 9 |   classification,
10 |   qp.matrix,
11 |   qp.threshold = 10^-3
12 | )
13 | }
14 | \arguments{
15 | \item{binary.counts}{The binary count matrix, which is the output from binarization with Mann Whitney.}
16 | 
17 | \item{classification}{The classification result, which is the output from binary to classification.}
18 | 
19 | \item{qp.matrix}{The matrix that contains QP scores calculated for the sample cells}
20 | 
21 | \item{qp.threshold}{The threshold to cut off for the QP scores in the multiple identity listed cells}
22 | }
23 | \value{
24 | A list contain 2 elements, the first is the curated and filtered multiple identity data frame and the second is the new classification data frame.
25 | }
26 | \description{
27 | This function filters the multiple identity cells based on their QP scores, where we assume that a low QP score (less than 10E-3) are not a true identity to consider
28 | }
29 | \examples{
30 | multi.id.curate.qp(multi.id.meta)
31 | }
32 | \keyword{Multiple}
33 | \keyword{identities}
34 | 


--------------------------------------------------------------------------------
/man/normalize.dt.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SingleCellQuadraticProgrammingAux.R
 3 | \name{normalize.dt}
 4 | \alias{normalize.dt}
 5 | \title{Normalization of Single-Cell RNA-Seq Data}
 6 | \usage{
 7 | normalize.dt(dt.st)
 8 | }
 9 | \arguments{
10 | \item{dt.st}{The dataset to normalize}
11 | }
12 | \description{
13 | This function normalizes single-cell RNA-seq data using its raw counts. The normalization is performed to remove variation due to difference in read depth and coverage.
14 | }
15 | \examples{
16 | normalize.dt(single.cell.mtx)
17 | }
18 | \keyword{normalization}
19 | 


--------------------------------------------------------------------------------
/man/perc.calc.aux.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/EmpiricalPValueCalculation.R
 3 | \name{perc.calc.aux}
 4 | \alias{perc.calc.aux}
 5 | \title{Emprical p-values for a query cell}
 6 | \usage{
 7 | perc.calc.aux(
 8 |   l,
 9 |   cell.names,
10 |   mtx.ct.num,
11 |   background.mtx,
12 |   mtx,
13 |   cnms,
14 |   test.times = 50
15 | )
16 | }
17 | \arguments{
18 | \item{l}{The index of the sample cell of query}
19 | 
20 | \item{cell.names}{A vector of sample cells' names}
21 | 
22 | \item{mtx.ct.num}{The number of all possible cell types}
23 | 
24 | \item{background.mtx}{The matrix of identity score of reference cells. The row number is the total number of reference cells and the column number is the number of total number of possible cell types}
25 | 
26 | \item{mtx}{The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types}
27 | 
28 | \item{cnms}{The names of all possible cell types}
29 | 
30 | \item{test.times}{The number of times resampling the background. The default is test.times=50}
31 | }
32 | \description{
33 | This function returns emprical p-values of all possilbe celltypes for a query cells under test.times resampled backgrounds.
34 | }
35 | \note{
36 | 
37 | }
38 | \examples{
39 | 
40 | }
41 | \keyword{empirical}
42 | \keyword{p-value,}
43 | \keyword{query}
44 | \keyword{resampling,}
45 | \keyword{single}
46 | 


--------------------------------------------------------------------------------
/man/percentage.calc.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/EmpiricalPValueCalculation.R
 3 | \name{percentage.calc}
 4 | \alias{percentage.calc}
 5 | \title{Emprical p-values for query cells}
 6 | \usage{
 7 | percentage.calc(mtx, bkgd.mtx)
 8 | }
 9 | \arguments{
10 | \item{mtx}{The matrix of identity scores of query cells. The row number is the total number of query cells and the column number is the number of total number of possible cell types}
11 | 
12 | \item{bkgd.mtx}{The matrix of identity score of reference cells. The row number is the total number of reference cells and the column number is the number of total number of possible cell types}
13 | }
14 | \description{
15 | This function returns a list of emprical p-value matrices. Every matrix in the list contains emprical p-values of all possilbe celltypes for a single query cells under test.times resampled backgrounds.
16 | }
17 | \note{
18 | 
19 | }
20 | \examples{
21 | 
22 | }
23 | \keyword{all}
24 | \keyword{cells}
25 | \keyword{empirical}
26 | \keyword{p-value,}
27 | \keyword{query}
28 | \keyword{resampling,}
29 | 


--------------------------------------------------------------------------------
/man/ref.construction.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/HighResolutionReferenceConstruction.R
 3 | \name{ref.construction}
 4 | \alias{ref.construction}
 5 | \title{Reference Construction}
 6 | \usage{
 7 | ref.construction(sc, sc.aux, criteria)
 8 | }
 9 | \arguments{
10 | \item{sc}{The single-cell resolution dataset}
11 | 
12 | \item{sc.aux}{The auxiliary data frame that annotate the single-cell resolutiond dataset}
13 | 
14 | \item{criteria}{The column name to use for construction of the reference}
15 | }
16 | \description{
17 | This function constructs reference from single-cell resolution reference data to be used for quadratic programming calculation
18 | }
19 | \examples{
20 | ref.construction(single.ref.mtx, single.aux.df, "cell.type")
21 | }
22 | 


--------------------------------------------------------------------------------
/man/sample.func.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/EmpiricalPValueCalculation.R
 3 | \name{sample.func}
 4 | \alias{sample.func}
 5 | \title{Empirical p-value of Identity Score Calculation}
 6 | \usage{
 7 | sample.func(dens.x, curr.val, prob, n = 1000)
 8 | }
 9 | \arguments{
10 | \item{dens.x}{The background dataset that we compare the identity score of our sample with}
11 | 
12 | \item{curr.val}{The identity score of the sample dataset to be evaluated its significance}
13 | 
14 | \item{n}{The number of times of resampling. The default is n=1000}
15 | }
16 | \description{
17 | This function resamples from a sample dataset and returns an empirical p-value of identity score
18 | }
19 | \note{
20 | 
21 | }
22 | \examples{
23 | 
24 | }
25 | \keyword{empirical}
26 | \keyword{p-value}
27 | \keyword{resampling,}
28 | 


--------------------------------------------------------------------------------
/man/sc.quad.prog.run.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SingleCellQuadraticProgramming.R
 3 | \name{sc.quad.prog.run}
 4 | \alias{sc.quad.prog.run}
 5 | \title{Single-Cell Quadratic Programming Calculation}
 6 | \usage{
 7 | sc.quad.prog.run(
 8 |   bulk.transcriptome,
 9 |   single.cell.transcriptome,
10 |   force.eq = 0,
11 |   unix.parallel = FALSE,
12 |   windows.parallel = FALSE,
13 |   parallel.cores = 4
14 | )
15 | }
16 | \arguments{
17 | \item{bulk.transcriptome}{The reference transcriptome taht contains the transcriptome of each potential cell type}
18 | 
19 | \item{single.cell.transcriptome}{the transcriptome profile of cells from single-cell RNA-sequencing}
20 | 
21 | \item{force.eq}{either 0 or 1. Setting to 0 assumes the 1st constraint as inequality. Setting to 1 assumes equality. Default to 0}
22 | 
23 | \item{unix.parallel}{boolean value, either TRUE or FALSE. If using unix/linux based systems, this command can be set to TRUE to parallelize use parallel package. Default to FALSE}
24 | 
25 | \item{windows.parallel}{boolean value, either TRUE or FALSE. If using Windows based systems, this command can be set to TRUE to parallelize use snow package. Default to FALSE}
26 | 
27 | \item{parallel.cores}{the number of cores to use for parallel processes. If no parallelization selected, no parallelization will be implemented. Only 1 core will be used}
28 | }
29 | \description{
30 | This function runs quadratic programming to identify the probability of the cells in single-cell RNA-seq belonging to cell types in the reference transcriptome
31 | }
32 | \note{
33 | Code reference from Treutlein et. al., Dissecting direct reprogramming from fibroblast to neuron using single-cell RNA-seq
34 | }
35 | \examples{
36 | sc.quad.prog.run(ref.transcriptome, sc.transcriptome, force.eq = 1)
37 | }
38 | \keyword{programming,}
39 | \keyword{quadratic}
40 | \keyword{scRNA-seq}
41 | 


--------------------------------------------------------------------------------
/man/single.round.QP.analysis.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SingleCellQuadraticProgramming.R
 3 | \name{single.round.QP.analysis}
 4 | \alias{single.round.QP.analysis}
 5 | \title{Single Round of Quadratic Programming}
 6 | \usage{
 7 | single.round.QP.analysis(
 8 |   ref,
 9 |   sc.data,
10 |   scale.bulk.sc = "scale",
11 |   unix.par = FALSE,
12 |   windows.par = FALSE,
13 |   force.eq = 0,
14 |   n.cores = 1,
15 |   save.to.path,
16 |   save.to.filename,
17 |   bulk.norm = T,
18 |   norm.sc = T,
19 |   log.bulk = T,
20 |   log.sc = T
21 | )
22 | }
23 | \arguments{
24 | \item{ref}{The reference transcriptome taht contains the transcriptome of each potential cell type}
25 | 
26 | \item{sc.data}{the transcriptome profile of cells from single-cell RNA-sequencing}
27 | 
28 | \item{scale.bulk.sc}{either scale or non-scale. Scaling is recommended to make the reference comparable to the single-cell. Default to scale.}
29 | 
30 | \item{unix.par}{boolean value, either TRUE or FALSE. If using unix/linux based systems, this command can be set to TRUE to parallelize use parallel package. Default to FALSE}
31 | 
32 | \item{windows.par}{boolean value, either TRUE or FALSE. If using Windows based systems, this command can be set to TRUE to parallelize use snow package. Default to FALSE}
33 | 
34 | \item{force.eq}{either 0 or 1. Setting to 0 assumes the 1st constraint as inequality. Setting to 1 assumes equality. Default to 0}
35 | 
36 | \item{n.cores}{the number of cores to use for parallel processes. If no parallelization selected, no parallelization will be implemented. Only 1 core will be used}
37 | 
38 | \item{save.to.path}{which directory would you like to save your file?}
39 | 
40 | \item{save.to.filename}{prefix to the filename to save to. The final filename will be constructed inside of the function by tagging the following string _scale.csv for scaling and _non_scale.csv for not scaling.}
41 | 
42 | \item{bulk.norm}{boolean value, either TRUE or FALSE. Would you like to normalize the reference?}
43 | 
44 | \item{norm.sc}{boolean value, either TRUE or FALSE. Would you like to normalize the single-cell dataset?}
45 | 
46 | \item{log.bulk}{boolean value, either TRUE or FALSE. Would you like to log the reference?}
47 | 
48 | \item{log.sc}{boolean value, either TRUE or FALSE. Would you like to log the single-cell dataset?}
49 | }
50 | \description{
51 | This function runs preprocessing, including log normalization, gene intersection and scaling. Further, run one round of quadratic programming. See function sc.quad.prog.run for detailed description
52 | }
53 | \note{
54 | This function calls the quadratic programming referenced to Treutlein et. al.
55 | }
56 | \examples{
57 | single.round.QP.analysis(ref = ref.transcriptome, sc.data = sc.transcriptome, force.eq = 1, save.to.path = "~/Desktop/", save.to.filename = "my_favorite")
58 | }
59 | \keyword{programming,}
60 | \keyword{quadratic}
61 | \keyword{scRNA-seq,}
62 | 


--------------------------------------------------------------------------------
/man/top.genes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SingleCellQuadraticProgrammingAux.R
 3 | \name{top.genes}
 4 | \alias{top.genes}
 5 | \title{Top Variable Gene Identificaiton}
 6 | \usage{
 7 | top.genes(input.dir, output.dir, top.number.count = 500)
 8 | }
 9 | \arguments{
10 | \item{input.dir}{The path to where the dataset is saved. Dataset should be saved as a tab-delimited table with row = gene, column = cell.}
11 | 
12 | \item{output.dir}{The path to where the output should be saved. The output will be saved as a tab-delimited table with NO row names or column names.}
13 | 
14 | \item{top.number.count}{The number of top variable genes to extract. Default to 500 genes}
15 | }
16 | \description{
17 | This function identifies top n variable genes using method described in KeyGenes algorithm. In brief, this algorithm use cross validation based on LASSO regression. For detailed explanation, please refer to the paper listed in the note.
18 | }
19 | \note{
20 | Code reference from Roost et. al., KeyGenes, a Tool to Probe Tissue Differentiation Using a Human Fetal Transcriptional Atlas, Cell Stem Cell Reports, 2015
21 | }
22 | \examples{
23 | top.genes("~/Desktop/sample_data.txt", "~/Desktop/sample_gene_list_output.txt", top.number.count = 1000)
24 | }
25 | \keyword{extraction}
26 | \keyword{gene}
27 | \keyword{top}
28 | \keyword{variable}
29 | 


--------------------------------------------------------------------------------
/man/transition.score.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TransitionScoreCalculation.R
 3 | \name{transition.score}
 4 | \alias{transition.score}
 5 | \title{Transition Score Calculation}
 6 | \usage{
 7 | transition.score(multi.id.meta)
 8 | }
 9 | \arguments{
10 | \item{multi.id.meta}{A data frame that contains cells with multiple identities. Column 1 - cell barcode, Column 2 - cell type call, Column 3 - counts, Column 4 - corresponding QP scores.}
11 | }
12 | \value{
13 | A data frame that contains the calculated transition scores for each identity
14 | }
15 | \description{
16 | This function calculates the transition scores for each cell state that is connected by cells with multiple identities.
17 | }
18 | \examples{
19 | transition.score(multi.id.meta)
20 | }
21 | \keyword{Transition}
22 | \keyword{metric}
23 | 


--------------------------------------------------------------------------------