├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── .gitignore ├── KNNfunctions.R ├── data.R ├── ensemble.R ├── featureSelection.R ├── global.R ├── learningCurve.R ├── predict_scClassify.R ├── runHOPACH.R ├── sampleSizeCal.R ├── scClassify.R ├── scClassifyTrainClass.R ├── train_scClassify.R └── utils_scClassify.R ├── README.md ├── _pkgdown.yml ├── data ├── scClassify_example.rda ├── trainClassExample_wang.rda └── trainClassExample_xin.rda ├── inst └── CITATION ├── man ├── .gitignore ├── cellTypeTrain.Rd ├── cellTypeTree.Rd ├── dot-scClassifyTrainModel.Rd ├── features.Rd ├── figures │ └── scClassifySticker.png ├── getN.Rd ├── learningCurve.Rd ├── model.Rd ├── modelweights.Rd ├── name.Rd ├── plotCellTypeTree.Rd ├── predict_scClassify.Rd ├── predict_scClassifyJoint.Rd ├── runHOPACH.Rd ├── runSampleCal.Rd ├── scClassify.Rd ├── scClassifyTrainModel-class.Rd ├── scClassifyTrainModelList-class.Rd ├── scClassifyTrainModelList.Rd ├── scClassify_example.Rd ├── trainClassExample_wang.Rd ├── trainClassExample_xin.Rd └── train_scClassify.Rd └── vignettes ├── .gitignore ├── pretrainedModel.Rmd └── scClassify.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^docs$ 4 | ^_pkgdown\.yml$ 5 | ^vignettes/webOnly/ 6 | .travis.yml 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .Rproj 6 | inst/doc 7 | ^vignettes/webOnly/ 8 | scClassify.Rproj 9 | .DS_Store 10 | docs/* -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: R 2 | cache: packages 3 | R: bioc-devel 4 | dist: trusty 5 | 6 | os: 7 | - linux 8 | 9 | env: 10 | - 
R_BIOC_VERSION: "3.12" 11 | 12 | before_cache: Rscript -e 'remotes::install_cran("pkgdown")' 13 | 14 | deploy: 15 | provider: script 16 | script: Rscript -e 'pkgdown::deploy_site_github()' 17 | skip_cleanup: true 18 | 19 | 20 | #after_success: 21 | # - Rscript -e 'pkgdown::build_site()' 22 | # 23 | #deploy: 24 | # provider: pages 25 | # skip-cleanup: true 26 | # github-token: $GITHUB_PAT 27 | # keep-history: true 28 | # local-dir: docs 29 | # on: 30 | # branch: master -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: scClassify 2 | Type: Package 3 | Title: scClassify: single-cell Hierarchical Classification 4 | Version: 1.5.1 5 | Author: Yingxin Lin 6 | Maintainer: Yingxin Lin 7 | Description: scClassify is a multiscale classification framework for single-cell RNA-seq data based on ensemble learning and cell type hierarchies, enabling sample size estimation required for accurate cell type classification and joint classification of cells using multiple references. 
8 | License: GPL-3 9 | Encoding: UTF-8 10 | LazyData: false 11 | Depends: R (>= 4.0) 12 | Imports: 13 | S4Vectors, 14 | limma, 15 | ggraph, 16 | igraph, 17 | methods, 18 | cluster, 19 | minpack.lm, 20 | mixtools, 21 | BiocParallel, 22 | proxy, 23 | proxyC, 24 | Matrix, 25 | ggplot2, 26 | hopach, 27 | diptest, 28 | mgcv, 29 | stats, 30 | graphics, 31 | statmod, 32 | Cepo 33 | RoxygenNote: 7.1.1 34 | Suggests: 35 | knitr, 36 | rmarkdown, 37 | BiocStyle, 38 | pkgdown 39 | VignetteBuilder: knitr 40 | biocViews: SingleCell, GeneExpression, Classification 41 | BugReports: https://github.com/SydneyBioX/scClassify/issues 42 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(cellTypeTrain) 4 | export(cellTypeTree) 5 | export(features) 6 | export(getN) 7 | export(learningCurve) 8 | export(model) 9 | export(modelweights) 10 | export(name) 11 | export(plotCellTypeTree) 12 | export(predict_scClassify) 13 | export(predict_scClassifyJoint) 14 | export(runHOPACH) 15 | export(runSampleCal) 16 | export(scClassify) 17 | export(scClassifyTrainModelList) 18 | export(train_scClassify) 19 | exportClasses(scClassifyTrainModel) 20 | exportClasses(scClassifyTrainModelList) 21 | import(ggplot2) 22 | import(igraph) 23 | import(limma) 24 | import(statmod) 25 | importClassesFrom(S4Vectors,DataFrame) 26 | importClassesFrom(S4Vectors,SimpleList) 27 | importClassesFrom(S4Vectors,character_OR_NULL) 28 | importFrom(BiocParallel,SerialParam) 29 | importFrom(BiocParallel,bplapply) 30 | importFrom(Cepo,Cepo) 31 | importFrom(Cepo,topGenes) 32 | importFrom(S4Vectors,DataFrame) 33 | importFrom(S4Vectors,SimpleList) 34 | importFrom(cluster,pam) 35 | importFrom(ggraph,geom_edge_diagonal) 36 | importFrom(ggraph,geom_node_point) 37 | importFrom(ggraph,geom_node_text) 38 | importFrom(ggraph,ggraph) 39 | 
importFrom(graphics,abline) 40 | importFrom(graphics,plot) 41 | importFrom(graphics,text) 42 | importFrom(hopach,as.hdist) 43 | importFrom(hopach,distancevector) 44 | importFrom(hopach,hdist) 45 | importFrom(hopach,improveordering) 46 | importFrom(hopach,is.hdist) 47 | importFrom(hopach,vectmatrix) 48 | importFrom(limma,eBayes) 49 | importFrom(limma,lmFit) 50 | importFrom(methods,as) 51 | importFrom(methods,is) 52 | importFrom(methods,new) 53 | importFrom(methods,slot) 54 | importFrom(mgcv,gam) 55 | importFrom(minpack.lm,nlsLM) 56 | importFrom(mixtools,normalmixEM) 57 | importFrom(proxy,as.dist) 58 | importFrom(proxy,dist) 59 | importFrom(proxyC,simil) 60 | importFrom(stats,coef) 61 | importFrom(stats,cutree) 62 | importFrom(stats,dnorm) 63 | importFrom(stats,lm) 64 | importFrom(stats,median) 65 | importFrom(stats,na.omit) 66 | importFrom(stats,predict) 67 | importFrom(stats,qnorm) 68 | importFrom(stats,quantile) 69 | importFrom(stats,uniroot) 70 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | ## scClassify 0.99.3 2 | 3 | * Improve coding format 4 | * sapply changed to vapply 5 | * Add examples 6 | * Improve vignettes 7 | -------------------------------------------------------------------------------- /R/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /R/KNNfunctions.R: -------------------------------------------------------------------------------- 1 | #' @importFrom graphics plot abline 2 | #' @importFrom stats uniroot qnorm dnorm coef 3 | #' @importFrom mixtools normalmixEM 4 | #' @importFrom methods slot 5 | 6 | KNNcor <- function(corMat, 7 | subLevelModel, 8 | cutoff_method = c("dynamic", "static"), 9 | k = 10, 10 | prob_threshold = 0.8, 11 | cor_threshold_static = 0.5, 12 | 
cor_threshold_high = 0.7, 13 | topLevel = FALSE, 14 | verbose = TRUE) { 15 | 16 | 17 | cutoff_method <- match.arg(cutoff_method, c("dynamic", "static"), 18 | several.ok = FALSE) 19 | 20 | 21 | # If we use static correlation cutoff 22 | if (cutoff_method == "static") { 23 | if (verbose) { 24 | print("Using static correlation cutoff...") 25 | } 26 | 27 | # get the KNN labels 28 | topKNN <- apply(corMat, 2, function(x) 29 | subLevelModel$y[order(x, decreasing = TRUE)][seq_len(k)]) 30 | 31 | # get the KNN correlation 32 | topKNN_cor <- apply(corMat, 2, function(x) 33 | x[order(x, decreasing = TRUE)][seq_len(k)]) 34 | 35 | # If the KNN labels have correlation less than the threshold, 36 | # set the labels as -1 37 | topKNN <- ifelse(topKNN_cor >= cor_threshold_static, topKNN, -1) 38 | 39 | 40 | # Get the predicted results 41 | predRes <- apply(topKNN, 2, function(x) { 42 | tab <- table(x)/length(x) 43 | if (max(tab, na.rm = TRUE) < prob_threshold) { 44 | 0 45 | }else{ 46 | if (names(tab)[which(tab == max(tab, na.rm = TRUE))] == "-1") { 47 | 0 48 | }else{ 49 | names(tab)[which(tab == max(tab, na.rm = TRUE))] 50 | } 51 | } 52 | }) 53 | 54 | } 55 | 56 | 57 | # If we use dynamic correlation cutoff based on mixture model 58 | if (cutoff_method == "dynamic") { 59 | 60 | if (verbose) { 61 | print("Using dynamic correlation cutoff...") 62 | } 63 | 64 | 65 | # If this is the top level 66 | 67 | 68 | unique_y <- levels(subLevelModel$y) 69 | cor_threshold_tmp <- c() 70 | 71 | for (l in seq_len(length(unique_y))) { 72 | 73 | 74 | if ("dgCMatrix" %in% is(corMat)) { 75 | corMat_vec <- methods::slot(corMat, "x") 76 | } else { 77 | corMat_vec <- c(as.matrix(corMat[subLevelModel$y == 78 | unique_y[l],])) 79 | } 80 | 81 | # Fitting the mixture model if it is not unimodal distribution.
82 | mix_res <- mix_threshold(corMat_vec = corMat_vec, 83 | length_unique_y = length(unique_y), 84 | verbose = verbose) 85 | dip_test <- mix_res$dip_test 86 | mixmdl <- mix_res$mixmdl 87 | 88 | # Calculate the threshold for this branch 89 | if (dip_test$p.value > 0.01) { 90 | cor_threshold_tmp = c(cor_threshold_tmp, 0) 91 | }else if ("try-error" %in% is(mixmdl)) { 92 | cor_threshold_tmp = c(cor_threshold_tmp, 0) 93 | }else{ 94 | # plot(mixmdl,which = 2) 95 | t_G2 <- getThreshold(mixmdl, verbose = verbose) 96 | cor_threshold_tmp = c(cor_threshold_tmp, t_G2) 97 | } 98 | 99 | 100 | } 101 | names(cor_threshold_tmp) <- unique_y 102 | 103 | topKNN <- apply(corMat, 2, function(x) 104 | subLevelModel$y[order(x, decreasing = TRUE)][seq_len(k)]) 105 | topKNN_cor <- apply(corMat, 2, function(x) 106 | x[order(x, decreasing = TRUE)][seq_len(k)]) 107 | 108 | # get the threshold 109 | # cor_threshold_tmp is ordered based on the levels of the factor, 110 | # so there is no need to convert the character 111 | topKNN_threshold <- apply(corMat, 2, function(x) { 112 | num_l <- subLevelModel$y[order(x, decreasing = TRUE)][seq_len(k)] 113 | num_l <- as.numeric(num_l) 114 | cor_threshold_tmp[num_l] 115 | }) 116 | 117 | topKNN <- ifelse(topKNN_cor >= topKNN_threshold, topKNN, -1) 118 | 119 | 120 | predRes <- apply(topKNN, 2, function(x){ 121 | # x <- stats::na.omit(x) 122 | tab <- table(x)/length(x) 123 | if (max(tab, na.rm = TRUE) < prob_threshold) { 124 | 0 125 | }else{ 126 | if (names(tab)[which(tab == max(tab, na.rm = TRUE))] == "-1") { 127 | 0 128 | }else{ 129 | names(tab)[which(tab == max(tab, na.rm = TRUE))] 130 | } 131 | 132 | } 133 | }) 134 | 135 | 136 | 137 | 138 | } 139 | 140 | return(list(predRes = predRes)) 141 | } 142 | 143 | #' @importFrom methods slot 144 | WKNNcor <- function(corMat, 145 | subLevelModel, 146 | cutoff_method = c("dynamic", "static"), 147 | k = 10, 148 | prob_threshold = 0.8, 149 | cor_threshold_static = 0.5, 150 | cor_threshold_high = 0.7, 151 | topLevel = FALSE, 152 | verbose
= TRUE){ 153 | 154 | cutoff_method <- match.arg(cutoff_method, c("dynamic", "static"), 155 | several.ok = FALSE) 156 | 157 | if (cutoff_method == "static") { 158 | 159 | if (verbose) { 160 | print("Using static correlation cutoff...") 161 | } 162 | 163 | 164 | topKNN <- apply(corMat, 2, function(x) 165 | subLevelModel$y[order(x, decreasing = TRUE)][seq_len(k)]) 166 | topKNN_cor <- apply(corMat, 2, function(x) 167 | x[order(x, decreasing = TRUE)][seq_len(k)]) 168 | topKNN <- ifelse(topKNN_cor >= cor_threshold_static, topKNN, -1) 169 | 170 | topKNN_weight <- apply(topKNN_cor, 2, function(x){ 171 | x <- stats::na.omit(x) # add in 20190613 172 | if (x[1] == x[length(x)]) { 173 | rep(1, length(x)) 174 | } else{ 175 | (x - x[length(x)])/(x[1] - x[length(x)]) 176 | } 177 | }) 178 | 179 | 180 | predRes <- getNumericPredRes(corMat, topKNN_weight, 181 | topKNN, prob_threshold) 182 | 183 | } 184 | 185 | if (cutoff_method == "dynamic") { 186 | 187 | if (verbose) { 188 | print("Using dynamic correlation cutoff...") 189 | } 190 | 191 | 192 | unique_y <- levels(subLevelModel$y) 193 | cor_threshold_tmp <- c() 194 | for (l in seq_len(length(unique_y))) { 195 | 196 | if ("dgCMatrix" %in% is(corMat)) { 197 | corMat_vec <- methods::slot(corMat, "x") 198 | } else { 199 | corMat_vec <- c(as.matrix(corMat[subLevelModel$y == 200 | unique_y[l],])) 201 | } 202 | 203 | 204 | # Fitting the mixture model if it is not unimodal distribution. 
205 | mix_res <- mix_threshold(corMat_vec = corMat_vec, 206 | length_unique_y = length(unique_y), 207 | verbose = verbose) 208 | dip_test <- mix_res$dip_test 209 | mixmdl <- mix_res$mixmdl 210 | 211 | # Calculate the threshold for this branch 212 | if (dip_test$p.value > 0.01) { 213 | cor_threshold_tmp = c(cor_threshold_tmp, 0) 214 | }else if ("try-error" %in% is(mixmdl)) { 215 | cor_threshold_tmp = c(cor_threshold_tmp, 0) 216 | }else{ 217 | # plot(mixmdl,which = 2) 218 | t_G2 <- getThreshold(mixmdl, verbose = verbose) 219 | cor_threshold_tmp = c(cor_threshold_tmp, t_G2) 220 | } 221 | 222 | } 223 | names(cor_threshold_tmp) <- unique_y 224 | 225 | 226 | ### calculate the weighted KNN 227 | topKNN <- apply(corMat, 2, function(x) 228 | subLevelModel$y[order(x, decreasing = TRUE)][seq_len(k)]) 229 | topKNN_cor <- apply(corMat, 2, function(x) 230 | x[order(x, decreasing = TRUE)][seq_len(k)]) 231 | topKNN_threshold <- apply(corMat, 2, function(x) { 232 | num_l <- subLevelModel$y[order(x, decreasing = TRUE)][seq_len(k)] 233 | num_l <- as.numeric(num_l) 234 | cor_threshold_tmp[num_l] 235 | }) 236 | 237 | 238 | 239 | topKNN <- ifelse(topKNN_cor >= topKNN_threshold, topKNN, -1) 240 | 241 | topKNN_weight <- apply(topKNN_cor, 2, function(x){ 242 | #x <- stats::na.omit(x) # add 20190613 243 | x[is.na(x)] <- 0 244 | if (x[1] == x[length(x)]) { 245 | rep(1, length(x)) 246 | }else{ 247 | (x - x[length(x)])/(x[1] - x[length(x)]) 248 | } 249 | }) 250 | 251 | predRes <- getNumericPredRes(corMat, topKNN_weight, 252 | topKNN, prob_threshold) 253 | } 254 | 255 | return(list(predRes = predRes)) 256 | } 257 | 258 | 259 | 260 | 261 | 262 | #' @importFrom methods slot 263 | DWKNNcor <- function(corMat, 264 | subLevelModel, 265 | cutoff_method = c("dynamic", "static"), 266 | k = 10, 267 | prob_threshold = 0.8, 268 | cor_threshold_static = 0.5, 269 | cor_threshold_high = 0.7, 270 | topLevel = FALSE, 271 | verbose = TRUE){ 272 | 273 | cutoff_method <- match.arg(cutoff_method, c("dynamic",
"static"), 274 | several.ok = FALSE) 275 | 276 | if (cutoff_method == "static") { 277 | 278 | if (verbose) { 279 | print("Using static correlation cutoff...") 280 | } 281 | 282 | 283 | topKNN <- apply(corMat, 2, function(x) 284 | subLevelModel$y[order(x, decreasing = TRUE)][seq_len(k)]) 285 | topKNN_cor <- apply(corMat, 2, function(x) 286 | x[order(x, decreasing = TRUE)][seq_len(k)]) 287 | topKNN <- ifelse(topKNN_cor >= cor_threshold_static, topKNN, -1) 288 | 289 | 290 | 291 | topKNN_weight <- apply(topKNN_cor, 2, function(x){ 292 | #x <- stats::na.omit(x) # add in 20190613 293 | x[is.na(x)] <- 0 294 | if (x[1] == x[length(x)]) { 295 | rep(1, length(x)) 296 | }else{ 297 | (x - x[length(x)]) * 298 | (2 - x[length(x)] - x[1])/((x[1] - x[length(x)]) * 299 | (2 - x[length(x)] - x)) 300 | } 301 | }) 302 | 303 | predRes <- getNumericPredRes(corMat, topKNN_weight, 304 | topKNN, prob_threshold) 305 | 306 | } 307 | 308 | if (cutoff_method == "dynamic") { 309 | 310 | if (verbose) { 311 | print("Using dynamic correlation cutoff...") 312 | } 313 | 314 | unique_y <- levels(subLevelModel$y) 315 | cor_threshold_tmp <- c() 316 | for (l in seq_len(length(unique_y))) { 317 | 318 | if ("dgCMatrix" %in% is(corMat)) { 319 | corMat_vec <- methods::slot(corMat, "x") 320 | } else { 321 | corMat_vec <- c(as.matrix(corMat[subLevelModel$y == 322 | unique_y[l],])) 323 | } 324 | 325 | 326 | 327 | # Fitting the mixture model if it is not unimodal distribution. 
328 | mix_res <- mix_threshold(corMat_vec = corMat_vec, 329 | length_unique_y = length(unique_y), 330 | verbose = verbose) 331 | dip_test <- mix_res$dip_test 332 | mixmdl <- mix_res$mixmdl 333 | 334 | # Calculate the threshold for this branch 335 | if (dip_test$p.value > 0.01) { 336 | cor_threshold_tmp = c(cor_threshold_tmp, 0) 337 | }else if ("try-error" %in% is(mixmdl)) { 338 | cor_threshold_tmp = c(cor_threshold_tmp, 0) 339 | }else{ 340 | # plot(mixmdl,which = 2) 341 | t_G2 <- getThreshold(mixmdl, verbose = verbose) 342 | cor_threshold_tmp = c(cor_threshold_tmp, t_G2) 343 | } 344 | 345 | 346 | } 347 | names(cor_threshold_tmp) <- unique_y 348 | 349 | ### calculate the weighted KNN 350 | topKNN <- apply(corMat, 2, function(x) 351 | subLevelModel$y[order(x, decreasing = TRUE)][seq_len(k)]) 352 | topKNN_cor <- apply(corMat, 2, function(x) 353 | x[order(x, decreasing = TRUE)][seq_len(k)]) 354 | topKNN_threshold <- apply(corMat, 2, function(x) { 355 | num_l <- subLevelModel$y[order(x, decreasing = TRUE)][seq_len(k)] 356 | num_l <- as.numeric(num_l) 357 | cor_threshold_tmp[num_l] 358 | }) 359 | 360 | topKNN <- ifelse(topKNN_cor >= topKNN_threshold, topKNN, -1) 361 | 362 | topKNN_weight <- apply(topKNN_cor, 2, function(x){ 363 | #x <- stats::na.omit(x) # add in 20190613 364 | x[is.na(x)] <- 0 365 | if (x[1] == x[length(x)]) { 366 | rep(1, length(x)) 367 | }else{ 368 | (x - x[length(x)]) * (2 - x[length(x)] - x[1]) / 369 | ((x[1] - x[length(x)]) * (2 - x[length(x)] - x)) 370 | } 371 | }) 372 | 373 | 374 | 375 | 376 | 377 | predRes <- getNumericPredRes(corMat, topKNN_weight, 378 | topKNN, prob_threshold) 379 | 380 | 381 | } 382 | 383 | return(list(predRes = predRes)) 384 | } 385 | 386 | # function to fit a mixture model 387 | 388 | mix_threshold <- function(corMat_vec, length_unique_y, verbose = TRUE) { 389 | 390 | corMat_vec <- thin_cor(corMat_vec, min(10000, length(corMat_vec))) 391 | corMat_vec <- corMat_vec[corMat_vec != min(corMat_vec)] 392 |
suppressMessages(dip_test <- diptest::dip.test(corMat_vec, B = 10000)) 393 | if (dip_test$p.value <= 0.01) { 394 | quiet(mixmdl <- try(mixtools::normalmixEM(corMat_vec, 395 | fast = TRUE, 396 | maxrestarts = 100, 397 | k = length_unique_y, 398 | maxit = 1000, 399 | mu = c(-0.5, rep(0.5, length_unique_y - 2), 1), 400 | lambda = c(1/length_unique_y), 401 | sigma = rep(0.2, 402 | length_unique_y), 403 | ECM = TRUE, 404 | verb = verbose), 405 | silent = TRUE)) 406 | 407 | if (!"try-error" %in% is(mixmdl)) { 408 | if (suppressWarnings(min(unique(mixmdl$rho)) == 0)) { 409 | quiet(mixmdl <- try(mixtools::normalmixEM(corMat_vec, 410 | fast = TRUE, 411 | maxrestarts = 100, 412 | k = length_unique_y, 413 | maxit = 2000, 414 | ECM = TRUE, 415 | verb = verbose), 416 | silent = TRUE)) 417 | } 418 | }else{ 419 | quiet(mixmdl <- try(mixtools::normalmixEM(corMat_vec, fast = TRUE, 420 | maxrestarts = 100, 421 | k = length_unique_y, 422 | maxit = 2000, 423 | ECM = TRUE, 424 | verb = verbose), 425 | silent = TRUE)) 426 | } 427 | } else { 428 | mixmdl <- NULL 429 | } 430 | 431 | return(list(mixmdl = mixmdl, dip_test = dip_test)) 432 | } 433 | 434 | 435 | 436 | # Function to generate the mixture model 437 | # based on the mixtools normalmixEM results 438 | 439 | funMixModel <- function(x, mu1, mu2, sd1, sd2, rho1, rho2) { 440 | 441 | dnorm(x, mean = mu1, sd = sd1) * rho1 - 442 | dnorm(x, mean = mu2, sd = sd2) * rho2 443 | 444 | 445 | } 446 | 447 | 448 | 449 | 450 | # Function to get the threshold for correlation based on mixture model 451 | 452 | getThreshold <- function(mixmdl, verbose = FALSE){ 453 | 454 | # if (verbose) { 455 | # plot(mixmdl, which = 2) 456 | # } 457 | 458 | membership <- apply(mixmdl$posterior, 1, which.max) 459 | m_list <- sort(unique(membership)) 460 | 461 | mu_list <- mixmdl$mu 462 | names(mu_list) <- seq_len(length(mu_list)) 463 | mu_list <- mu_list[m_list] 464 | 465 | if (length(mu_list) > 1) { 466 | idx1 <- as.numeric(names(mu_list)[order(mu_list)][1]) 467 | 
idx2 <- as.numeric(names(mu_list)[order(mu_list)][2]) 468 | 469 | root <- try(uniroot(funMixModel, interval = c(mixmdl$mu[idx1] - 470 | mixmdl$sigma[idx1], 471 | mixmdl$mu[idx2] + 472 | mixmdl$sigma[idx2]), 473 | mu1 = mixmdl$mu[idx1], mu2 = mixmdl$mu[idx2], 474 | sd1 = mixmdl$sigma[idx1], sd2 = mixmdl$sigma[idx2], 475 | rho1 = mixmdl$lambda[idx1], 476 | rho2 = mixmdl$lambda[idx2]), 477 | silent = TRUE) 478 | 479 | 480 | if (!"try-error" %in% is(root)) { 481 | t <- root$root 482 | }else{ 483 | t <- 0 484 | } 485 | 486 | }else{ 487 | t <- 0 488 | } 489 | 490 | return(t) 491 | } 492 | 493 | 494 | 495 | thin_cor <- function(cor_vector, n = 10000) { 496 | 497 | N <- length(cor_vector) 498 | 499 | if (n == N) { 500 | return(cor_vector) 501 | } 502 | 503 | new_n <- floor(N/n) 504 | 505 | index <- seq(from = 1, to = N, by = new_n) 506 | 507 | cor_vector <- cor_vector[index] 508 | 509 | return(cor_vector) 510 | } 511 | 512 | 513 | ## 514 | 515 | getNumericPredRes <- function(corMat, topKNN_weight, topKNN, 516 | prob_threshold) { 517 | # 518 | # predRes <- sapply(seq_len(ncol(corMat)), 519 | # function(i){ 520 | # vote <- stats::aggregate(topKNN_weight[,i], 521 | # by = list(topKNN[,i]), 522 | # sum) 523 | # maxIdx <- which.max(vote$x) 524 | # if (max(vote$x/sum(vote$x)) < prob_threshold) { 525 | # 0 526 | # }else{ 527 | # if (vote$Group.1[maxIdx] == "-1") { 528 | # 0 529 | # }else{ 530 | # vote$Group.1[maxIdx] 531 | # } 532 | # } 533 | # }) 534 | 535 | predRes <- vapply(seq_len(ncol(corMat)), 536 | function(i){ 537 | vote <- stats::aggregate(topKNN_weight[,i], 538 | by = list(topKNN[,i]), 539 | sum) 540 | maxIdx <- which.max(vote$x) 541 | if (max(vote$x/sum(vote$x)) < prob_threshold) { 542 | "0" 543 | }else{ 544 | if (vote$Group.1[maxIdx] == "-1") { 545 | "0" 546 | }else{ 547 | vote$Group.1[maxIdx] 548 | } 549 | } 550 | }, character(1L)) 551 | 552 | names(predRes) <- colnames(corMat) 553 | 554 | 555 | return(predRes) 556 | } 557 | 
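The three classifiers above (KNNcor, WKNNcor and DWKNNcor) all funnel into the same weighted-vote rule implemented in getNumericPredRes(). A minimal standalone sketch of that rule for a single query cell, using base R only (`weighted_vote` is a hypothetical helper written for illustration, not part of the package):

```r
# Illustrative sketch of the weighted-vote rule in getNumericPredRes(),
# applied to one query cell.
# labels:  cell type labels of the k nearest neighbours ("-1" marks
#          neighbours whose correlation fell below the cutoff)
# weights: the corresponding KNN weights
weighted_vote <- function(labels, weights, prob_threshold = 0.8) {
  # sum the weights per candidate label
  vote <- stats::aggregate(weights, by = list(label = labels), FUN = sum)
  maxIdx <- which.max(vote$x)
  # Leave the cell unassigned ("0") when the winning label's share of the
  # total weight is below prob_threshold, or the winner is the "-1" placeholder
  if (max(vote$x / sum(vote$x)) < prob_threshold) return("0")
  if (vote$label[maxIdx] == "-1") return("0")
  as.character(vote$label[maxIdx])
}

weighted_vote(c("alpha", "alpha", "beta"), c(1, 0.9, 0.2))  # "alpha"
weighted_vote(c("alpha", "beta", "-1"), c(0.5, 0.4, 0.6))   # "0"
```

Requiring the winner to carry at least `prob_threshold` of the total neighbour weight is what allows low-confidence cells to remain unassigned at a node of the cell type tree instead of being forced into a leaf label.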
-------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' @title Example data used in scClassify package 2 | #' @description A list including the expression matrix and cell type labels of subsets of 3 | #' the Wang et al. and Xin et al. datasets. 4 | #' 5 | #' @usage data(scClassify_example, package = 'scClassify') 6 | #' @source 7 | #' Wang YJ, Schug J, Won K-J, Liu C, Naji A, Avrahami D, Golson ML & 8 | #' Kaestner KH (2016) Single cell transcriptomics of the human endocrine 9 | #' pancreas. Diabetes: db160405 10 | #' 11 | #' Xin Y, Kim J, Okamoto H, Ni M, Wei Y, Adler C, Murphy AJ, Yancopoulos GD, 12 | #' Lin C & Gromada J (2016) RNA sequencing of single human islet cells reveals 13 | #' type 2 diabetes genes. Cell Metab. 24: 608–615 14 | "scClassify_example" 15 | 16 | 17 | #' @title Subset of pretrained model of Xin et al. 18 | #' @description An object of scClassifyTrainModel for Xin et al. 19 | #' @usage data(trainClassExample_xin, package = 'scClassify') 20 | #' @source Xin Y, Kim J, Okamoto H, Ni M, Wei Y, Adler C, Murphy AJ, Yancopoulos GD, 21 | #' Lin C & Gromada J (2016) RNA sequencing of single human islet cells reveals 22 | #' type 2 diabetes genes. Cell Metab. 24: 608–615 23 | "trainClassExample_xin" 24 | 25 | #' @title Subset of pretrained model of Wang et al. 26 | #' @description An object of scClassifyTrainModel for Wang et al. 27 | #' @usage data(trainClassExample_wang, package = 'scClassify') 28 | #' 29 | #' @source Wang YJ, Schug J, Won K-J, Liu C, Naji A, Avrahami D, Golson ML & 30 | #' Kaestner KH (2016) Single cell transcriptomics of the human endocrine 31 | #' pancreas.
Diabetes: db160405 32 | #' 33 | "trainClassExample_wang" 34 | -------------------------------------------------------------------------------- /R/ensemble.R: -------------------------------------------------------------------------------- 1 | # functions get ensemble results 2 | 3 | 4 | 5 | # function to transfer the error 6 | alpha <- function(e) { 7 | e[e < 0.001] <- 0.001 8 | e[e > 0.999] <- 0.999 9 | log((1 - e)/e) 10 | } 11 | 12 | 13 | 14 | getTrainWeights <- function(trainRes) { 15 | errClass_train <- do.call(rbind, lapply(trainRes, function(x) 16 | table(x$classifyRes)/length(x$classifyRes))) 17 | weighted_train <- alpha(1 - (errClass_train[, 1] + errClass_train[, 2])) 18 | return(weighted_train) 19 | } 20 | 21 | 22 | # function to get the ensemble res 23 | getEnsembleRes <- function(testRes, weighted_train, 24 | exclude = NULL, weighted_ensemble = TRUE) { 25 | # 26 | # trainRes <- testRes$train 27 | # Get the weighted 28 | if (weighted_ensemble) { 29 | if (!is.null(exclude)) { 30 | keep_method <- !grepl(paste(exclude, collapse = "|"), 31 | names(weighted_train)) 32 | weighted_train <- weighted_train[keep_method] 33 | } 34 | 35 | } else { 36 | weighted_train <- NULL 37 | } 38 | 39 | ensembleRes <- calEnsembleRes(testRes, 40 | exclude = exclude, 41 | weight = weighted_train) 42 | return(ensembleRes) 43 | 44 | } 45 | 46 | # Function to combine the ensemble res 47 | 48 | calEnsembleRes <- function(res, exclude = NULL, weight = NULL){ 49 | 50 | 51 | if (!is.null(exclude)) { 52 | keep_method <- !grepl(paste(exclude,collapse = "|"), names(res)) 53 | res <- res[keep_method] 54 | weight <- weight[keep_method] 55 | } 56 | 57 | 58 | ensembleResMat <- do.call(cbind, lapply(res, function(x) x$predRes)) 59 | 60 | if (is.null(weight)) { 61 | weight <- rep(1, ncol(ensembleResMat)) 62 | } 63 | 64 | ensembleRes <- apply(ensembleResMat, 1, function(x) { 65 | names(x) <- colnames(ensembleResMat) 66 | keep <- rep(TRUE, length(x)) 67 | if (sum(keep) == 0) { 68 | 
data.frame(cellTypes = "unassigned", scores = 0) 69 | }else{ 70 | getResByWeights(x[keep], weight[keep]) 71 | } 72 | 73 | }) 74 | 75 | ensembleRes <- do.call(rbind, ensembleRes) 76 | return(ensembleRes) 77 | } 78 | 79 | 80 | 81 | 82 | # function to calculate the weight for each base classifier 83 | 84 | getResByWeights <- function(res, weight) { 85 | resType <- unique(res) 86 | if (length(resType) == 1) { 87 | final <- resType 88 | scores <- 1 89 | } else { 90 | 91 | 92 | mat <- vapply(seq_len(length(resType)), function(i){ 93 | ifelse(res %in% resType[i], 1, 0) * weight 94 | }, numeric(length(res))) 95 | 96 | colnames(mat) <- resType 97 | rownames(mat) <- names(res) 98 | mat_colMeans <- Matrix::colMeans(mat) 99 | final <- names(mat_colMeans)[which.max(mat_colMeans)] 100 | scores <- max(mat_colMeans)/sum(mat_colMeans) 101 | } 102 | 103 | return(data.frame(cellTypes = final, scores = scores)) 104 | } 105 | 106 | 107 | 108 | 109 | # Function to summarise joint classification results 110 | 111 | getJointRes <- function(predictResList, trainRes) { 112 | 113 | predictJointResList <- lapply(seq_along(predictResList), function(l) { 114 | 115 | if ("scClassifyTrainModel" %in% is(trainRes[[l]])) { 116 | weights_train <- modelweights(trainRes[[l]]) 117 | } else { 118 | weights_train <- trainRes[[l]]$modelweights 119 | } 120 | 121 | getEnsembleRes(predictResList[[l]], 122 | weights_train, 123 | exclude = NULL, 124 | weighted_ensemble = TRUE) 125 | }) 126 | 127 | names(predictJointResList) <- names(predictResList) 128 | 129 | 130 | jointCellType <- do.call(cbind, lapply(predictJointResList, function(x) 131 | x$cellTypes)) 132 | colnames(jointCellType) <- names(predictJointResList) 133 | 134 | 135 | jointCellType <- apply(jointCellType, 2, function(x) { 136 | len <- unlist(lapply(strsplit(x, "_"), length)) 137 | x[len > 2] <- "intermediate" 138 | x 139 | }) 140 | 141 | jointWeights <- do.call(cbind, lapply(predictJointResList, function(x) 142 | x$scores)) 143 | 
colnames(jointWeights) <- names(predictJointResList) 144 | 145 | 146 | 147 | jointRes <- lapply(seq_len(nrow(jointCellType)), function(idx) { 148 | cell_res <- jointCellType[idx, ] 149 | names(cell_res) <- colnames(jointCellType) 150 | if (length(cell_res) == 2 & sum(cell_res == "unassigned") == 1) { 151 | data.frame(cellTypes = cell_res[cell_res != "unassigned"], 152 | scores = 0.5) 153 | } else { 154 | getResByWeights(cell_res, jointWeights[idx, ]) 155 | } 156 | }) 157 | 158 | 159 | jointRes <- do.call(rbind, jointRes) 160 | 161 | jointRes <- data.frame(jointRes) 162 | 163 | return(jointRes) 164 | } 165 | 166 | -------------------------------------------------------------------------------- /R/featureSelection.R: -------------------------------------------------------------------------------- 1 | #' @importFrom methods new 2 | #' @importFrom Cepo Cepo topGenes 3 | #' @import limma 4 | 5 | 6 | featureSelection <- function(exprsMat, 7 | trainClass, 8 | feature = c("limma", "DV", "DD", "chisq", "BI", 9 | "Cepo"), 10 | topN = 50, 11 | pSig = 0.001 12 | ){ 13 | 14 | feature <- match.arg(feature, c("limma", "DV", "DD", "chisq", "BI", "Cepo"), 15 | several.ok = FALSE) 16 | 17 | 18 | if (feature == "DV") { 19 | tt <- doDV(exprsMat, trainClass) 20 | tt <- lapply(tt, function(x)sort(x)) 21 | res <- Reduce(union, lapply(tt, function(t) 22 | names(t)[seq_len(min(topN, sum(t < pSig)))])) 23 | } else if (feature == "DD") { 24 | tt <- doDD(exprsMat, trainClass) 25 | tt <- lapply(tt, function(x)sort(x)) 26 | res <- Reduce(union, lapply(tt, function(t) 27 | names(t)[seq_len(min(topN, sum(t < pSig)))])) 28 | } else if (feature == "chisq") { 29 | tt <- doChisSquared(exprsMat, trainClass) 30 | tt <- lapply(tt, function(x)sort(x)) 31 | res <- Reduce(union, lapply(tt, function(t) 32 | names(t)[seq_len(min(topN, sum(t < pSig)))])) 33 | # 34 | } else if (feature == "BI") { 35 | tt <- doBI(exprsMat, trainClass) 36 | tt <- lapply(tt, function(x)x) 37 | res <- Reduce(union, lapply(tt, 
function(t) 38 | names(t)[seq_len(topN)])) 39 | } else if (feature == "Cepo") { 40 | tt <- Cepo::Cepo(as.matrix(exprsMat), trainClass, exprsPct = 0.05) 41 | res <- Reduce(union, Cepo::topGenes(tt, n = topN)) 42 | } else{ 43 | tt <- doLimma(exprsMat, trainClass) 44 | res <- Reduce(union, lapply(tt, function(t) 45 | rownames(t[t$logFC > 0 & (t$meanPct.2 - t$meanPct.1) > 0.05 & 46 | t$adj.P.Val < pSig,])[seq_len(topN)])) 47 | 48 | } 49 | 50 | return(res) 51 | } 52 | 53 | 54 | #' @importFrom limma eBayes lmFit 55 | #' @importFrom methods new 56 | 57 | doLimma <- function(exprsMat, cellTypes, exprs_pct = 0.05){ 58 | 59 | cellTypes <- droplevels(as.factor(cellTypes)) 60 | tt <- list() 61 | for (i in seq_len(nlevels(cellTypes))) { 62 | tmp_celltype <- (ifelse(cellTypes == levels(cellTypes)[i], 1, 0)) 63 | design <- stats::model.matrix(~tmp_celltype) 64 | 65 | 66 | meanExprs <- do.call(cbind, lapply(c(0,1), function(i){ 67 | Matrix::rowMeans(exprsMat[, tmp_celltype == i, drop = FALSE]) 68 | })) 69 | 70 | meanPct <- do.call(cbind, lapply(c(0,1), function(i){ 71 | Matrix::rowSums(exprsMat[, tmp_celltype == i, 72 | drop = FALSE] > 0)/sum(tmp_celltype == i) 73 | })) 74 | 75 | keep <- meanPct[,2] > exprs_pct 76 | 77 | y <- methods::new("EList") 78 | y$E <- exprsMat[keep, ] 79 | fit <- limma::lmFit(y, design = design) 80 | fit <- limma::eBayes(fit, trend = TRUE, robust = TRUE) 81 | tt[[i]] <- limma::topTable(fit, n = Inf, adjust.method = "BH", coef = 2) 82 | 83 | 84 | 85 | if (!is.null(tt[[i]]$ID)) { 86 | tt[[i]] <- tt[[i]][!duplicated(tt[[i]]$ID),] 87 | rownames(tt[[i]]) <- tt[[i]]$ID 88 | } 89 | 90 | tt[[i]]$meanExprs.1 <- meanExprs[rownames(tt[[i]]), 1] 91 | tt[[i]]$meanExprs.2 <- meanExprs[rownames(tt[[i]]), 2] 92 | tt[[i]]$meanPct.1 <- meanPct[rownames(tt[[i]]), 1] 93 | tt[[i]]$meanPct.2 <- meanPct[rownames(tt[[i]]), 2] 94 | } 95 | 96 | 97 | 98 | return(tt) 99 | 100 | 101 | } 102 | 103 | 104 | 105 | 106 | doDV <- function(exprsMat, cellTypes){ 107 | 108 | 109 | cellTypes <- 
droplevels(as.factor(cellTypes)) 110 | tt <- list() 111 | for (i in seq_len(nlevels(cellTypes))) { 112 | tmp_celltype <- (ifelse(cellTypes == levels(cellTypes)[i], 1, 0)) 113 | 114 | meanPct <- do.call(cbind, lapply(c(0,1), function(i){ 115 | Matrix::rowSums(exprsMat[, 116 | tmp_celltype == i, 117 | drop = FALSE] > 0)/sum(tmp_celltype == i) 118 | })) 119 | 120 | 121 | posNeg <- (meanPct[,2] - meanPct[,1]) > 0.05 122 | # print(sum(posNeg)) 123 | exprsMat_filt <- exprsMat[posNeg,] 124 | tt[[i]] <- apply(exprsMat_filt, 1, function(x) { 125 | df <- data.frame(gene = x, cellTypes = as.factor(tmp_celltype)) 126 | stats::bartlett.test(gene~cellTypes, df)$p.value 127 | }) 128 | 129 | tt[[i]] <- stats::p.adjust(tt[[i]], method = "BH") 130 | } 131 | 132 | 133 | 134 | return(tt) 135 | 136 | 137 | } 138 | 139 | doDD <- function(exprsMat, cellTypes){ 140 | 141 | cellTypes <- droplevels(as.factor(cellTypes)) 142 | tt <- list() 143 | for (i in seq_len(nlevels(cellTypes))) { 144 | tmp_celltype <- ifelse(cellTypes == levels(cellTypes)[i], 1, 0) 145 | 146 | 147 | meanPct <- do.call(cbind, lapply(c(0,1), function(i){ 148 | Matrix::rowSums(exprsMat[, 149 | tmp_celltype == i, 150 | drop = FALSE] > 0)/sum(tmp_celltype == i) 151 | })) 152 | 153 | posNeg <- (meanPct[,2] - meanPct[,1]) > 0.05 154 | # print(sum(posNeg)) 155 | exprsMat_filt <- exprsMat[posNeg,] 156 | tt[[i]] <- apply(exprsMat_filt, 1, function(x) { 157 | x1 <- x[tmp_celltype == 0] 158 | x2 <- x[tmp_celltype == 1] 159 | stats::ks.test(x1, x2, alternative = "greater")$p.value 160 | }) 161 | 162 | 163 | 164 | tt[[i]] <- stats::p.adjust(tt[[i]], method = "BH") 165 | } 166 | 167 | 168 | 169 | return(tt) 170 | 171 | 172 | } 173 | 174 | 175 | 176 | doChisSquared <- function(exprsMat, cellTypes, threshold = 1){ 177 | 178 | 179 | cellTypes <- droplevels(as.factor(cellTypes)) 180 | tt <- list() 181 | for (i in seq_len(nlevels(cellTypes))) { 182 | tmp_celltype <- (ifelse(cellTypes == levels(cellTypes)[i], 1, 0)) 183 | 184 | 185 | 
zerosMat <- ifelse(exprsMat > threshold, 1, 0) 186 | 187 | tt[[i]] <- apply(zerosMat,1, function(x){ 188 | tab <- c() 189 | for (i in c(0,1)) { 190 | tmp <- factor(x[tmp_celltype == i], levels = c(0, 1)) 191 | tab <- rbind(tab, table(tmp)) 192 | } 193 | 194 | 195 | suppressWarnings(stats::chisq.test(tab)$p.value) 196 | 197 | 198 | }) 199 | 200 | 201 | 202 | 203 | tt[[i]] <- stats::p.adjust(tt[[i]], method = "BH") 204 | } 205 | 206 | 207 | 208 | return(tt) 209 | 210 | 211 | } 212 | 213 | 214 | 215 | 216 | 217 | 218 | doBI <- function(exprsMat, cellTypes){ 219 | # Select genes by bimodal index 220 | 221 | cellTypes <- droplevels(as.factor(cellTypes)) 222 | tt <- list() 223 | for (i in seq_len(nlevels(cellTypes))) { 224 | tmp_celltype <- (ifelse(cellTypes == levels(cellTypes)[i], 1, 0)) 225 | 226 | pi <- table(tmp_celltype)/length(tmp_celltype) 227 | 228 | agg_mean <- do.call(cbind, lapply(c(0,1), function(i){ 229 | Matrix::rowMeans(exprsMat[, tmp_celltype == i, drop = FALSE]) 230 | })) 231 | 232 | agg_sd2 <- do.call(cbind, lapply(c(0,1), function(i){ 233 | apply(exprsMat[, tmp_celltype == i, drop = FALSE], 1, stats::var) 234 | })) 235 | 236 | bi <- abs(agg_mean[,2] - agg_mean[,1])/sqrt(pi[1]*agg_sd2[,1] + 237 | pi[2]*agg_sd2[,2]) 238 | 239 | bi <- unlist(bi) 240 | names(bi) <- rownames(exprsMat) 241 | bi <- bi[order(bi, decreasing = TRUE)] 242 | tt[[i]] <- bi 243 | } 244 | 245 | return(tt) 246 | 247 | 248 | } 249 | -------------------------------------------------------------------------------- /R/global.R: -------------------------------------------------------------------------------- 1 | utils::globalVariables(c("y", "quantile_25", "quantile_75", 2 | "quantile_05", "quantile_95", "name", "leaf")) 3 | -------------------------------------------------------------------------------- /R/learningCurve.R: -------------------------------------------------------------------------------- 1 | 2 | #' Fit learning curve for accuracy matrix 3 | #' @param accMat Matrix of 
accuracy rates, where each column indicates 4 | #' a different sample size 5 | #' @param n Vector indicating the sample sizes 6 | #' @param auto_initial Whether to automatically initialise the parameters 7 | #' @param a Starting point for parameter a 8 | #' @param b Starting point for parameter b 9 | #' @param c Starting point for parameter c 10 | #' @param d_list Range of values for parameter d 11 | #' @param fitmodel Model to fit: one of "nls", "nls_mix" and "gam" 12 | #' @param plot Indicates whether to plot the fitted curve 13 | #' @param verbose Indicates whether to print intermediate output 14 | #' @return list of results 15 | #' @examples 16 | #' set.seed(2019) 17 | #' n <- seq(20, 10000, 100) 18 | #' accMat <- do.call(cbind, lapply(1:length(n), function(i){ 19 | #' tmp_n <- rep(n[i], 50) 20 | #' y <- -2/(tmp_n^0.8) + 0.95 + rnorm(length(tmp_n), 0, 0.02) 21 | #' })) 22 | #' res <- learningCurve(accMat = accMat, n) 23 | #' N <- getN(res, acc = 0.9) 24 | #' 25 | #' @author Yingxin Lin 26 | #' @import ggplot2 27 | #' @importFrom minpack.lm nlsLM 28 | #' @importFrom stats quantile coef predict 29 | #' @importFrom mgcv gam 30 | #' @export 31 | 32 | learningCurve <- function(accMat, n, auto_initial = TRUE, 33 | a = NULL, b = NULL, c = NULL, d_list = NULL, 34 | fitmodel = c("nls", "nls_mix", "gam"), 35 | plot = TRUE, verbose = TRUE){ 36 | 37 | ## TODO: allow several.ok = TRUE 38 | fitmodel <- match.arg(fitmodel, c("nls", "nls_mix", "gam"), 39 | several.ok = FALSE) 40 | 41 | 42 | 43 | if (any(c("matrix", "data.frame") %in% is(accMat))) { 44 | if (ncol(accMat) != length(n)) { 45 | stop("Number of columns of accMat doesn't match the length of n") 46 | } 47 | dat <- data.frame(y = colMeans(accMat), 48 | y_25 = apply(accMat, 2, 49 | function(x) quantile(x, 0.25)), 50 | y_75 = apply(accMat, 2, 51 | function(x) quantile(x, 0.75)) 52 | ) 53 | } 54 | 55 | if ( "list" %in% is(accMat)) { 56 | if (length(accMat) != length(n)) { 57 | stop("Length of accMat doesn't match the length of n") 58 | } 59 | dat <- data.frame(y = unlist(lapply(accMat, mean)), 60 | y_25 = 
unlist(lapply(accMat, 61 | function(x) quantile(x, 0.25))), 62 | y_75 = unlist(lapply(accMat, 63 | function(x) quantile(x, 0.75)))) 64 | } 65 | 66 | 67 | 68 | # Fitting model 69 | model <- list() 70 | for (i in seq(ncol(dat))) { 71 | if (fitmodel == "nls_mix") { 72 | model[[i]] <- fitLC_mixture(dat[,i], n, 73 | b = b, d_list = d_list, 74 | verbose = verbose) 75 | 76 | } else if (fitmodel == "gam") { 77 | model[[i]] <- mgcv::gam(acc ~ s(n, bs = "cr"), 78 | data = data.frame(acc = dat[,i], n = n), 79 | method = "REML") 80 | } 81 | 82 | else{ 83 | model[[i]] <- fitLC(dat[,i], n, auto_initial = auto_initial, 84 | a = a, b = b, c = c, 85 | verbose = verbose, b_max = 1) 86 | if (i == 1) { 87 | b_max <- coef(model[[i]])["b"] 88 | } 89 | } 90 | 91 | } 92 | 93 | 94 | # Get fitted value 95 | dat$n <- n 96 | new_n <- data.frame(n = seq(min(n), max(n), by = 0.1)) 97 | fit <- lapply(model, function(x) predict(x, newdata = new_n)) 98 | 99 | names(model) <- names(fit) <- c("mean", "quantile_25", "quantile_75") 100 | fit[["n"]] <- new_n$n 101 | dat_fit <- data.frame(do.call(cbind, fit)) 102 | 103 | if (plot) { 104 | 105 | 106 | cols <- c("Mean" = "#c8133b","Quantile25/75" = "#ea8783") 107 | g <- ggplot2::ggplot(dat, ggplot2::aes(x = n, y = y)) + 108 | xlab("N") + ylab("Accuracy Rate") + 109 | ggplot2::geom_point(alpha = 0.7) + 110 | ggplot2::geom_line(data = dat_fit, aes(x = n, 111 | y = mean, 112 | color = "Mean"), 113 | linetype = "solid", size = 1) + 114 | ggplot2::geom_line(data = dat_fit, aes(x = n, y = quantile_25, 115 | color = "Quantile25/75"), 116 | linetype = "dashed") + 117 | ggplot2::geom_line(data = dat_fit, aes(x = n, y = quantile_75, 118 | color = "Quantile25/75"), 119 | linetype = "dashed") + 120 | ggplot2::scale_color_manual(values = cols) + 121 | ggplot2::theme_bw() + 122 | ggplot2::theme(legend.position = "bottom") 123 | 124 | return(list(model = model, fit = fit, plot = g)) 125 | 126 | }else{ 127 | return(list(model = model, fit = fit)) 128 | } 129 | } 130 | 
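The inverse power function that fitLC() fits, acc(n) = a / n^c + b, can be inverted in closed form to recover the sample size needed for a target accuracy, which is what getN() does further down in this file. The following is a minimal Python sketch of that relationship (illustrative only, not part of the package; the parameter values are made up to mirror the simulated roxygen example above, not taken from a real fit):

```python
import math

# Hypothetical parameters for the inverse power-law curve acc(n) = a/n^c + b.
# The names a, c, b mirror the coefficients fitted by fitLC(); the values are
# chosen to match the simulated example in the roxygen docs, not a real fit.
a, c, b = -2.0, 0.8, 0.95

def acc(n):
    """Predicted accuracy at sample size n under the fitted curve."""
    return a / n ** c + b

def get_n(target_acc):
    """Invert the curve, mirroring getN(): n = exp((1/c) * log(a / (acc - b)))."""
    if target_acc > b:
        # b is the accuracy plateau; no sample size can exceed it
        raise ValueError("Required accuracy is too high to achieve")
    return round(math.exp(1 / c * math.log(a / (target_acc - b))))

n_req = get_n(0.9)  # sample size needed to reach 90% accuracy
```

With these parameters an accuracy of 0.9 needs roughly a hundred cells, and asking for an accuracy above the plateau b raises an error, matching the guard in getN().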
131 | 132 | init_fit <- function(acc, n){ 133 | fit <- lm(log(n) ~ log(max(acc) + 0.001 - acc)) 134 | 135 | c <- -coef(fit)[2] 136 | a <- -exp(coef(fit)[1]) 137 | return(list(a = a, c = c)) 138 | } 139 | 140 | # Function to fit the learning curve by inverse power function 141 | 142 | fitLC <- function(acc, n, auto_initial = TRUE, 143 | a = NULL, b = NULL, c = NULL, verbose = TRUE, 144 | b_max = NULL){ 145 | dat_train <- data.frame(n = n, acc = acc) 146 | 147 | if (is.null(b)) { 148 | b = 0.9 149 | } 150 | 151 | if (is.null(b_max)) { 152 | b_max = 1 153 | } 154 | 155 | if (auto_initial) { 156 | init <- init_fit(acc, n) 157 | 158 | lower_bound <- c(-max(max(dat_train$acc) + 0.01, 1)*(10^init$c), 159 | -Inf, 0) 160 | 161 | learning_curve <- try({minpack.lm::nlsLM(acc ~ I(1 / n^(c) * a) + b, 162 | data = dat_train, 163 | start = list(a = init$a, 164 | c = init$c, 165 | b = b), 166 | upper = c(10, Inf, b_max), 167 | lower = lower_bound 168 | ) 169 | }, silent = TRUE) 170 | 171 | 172 | }else if (!is.null(a) & !is.null(b) & !is.null(c)) { 173 | # For the case that user supplies starting point 174 | learning_curve <- stats::nls(acc ~ I(1 / n^(c) * a) + b, 175 | data = dat_train, 176 | start = list(a = a, c = c, b = b)) 177 | }else{ 178 | # this starting point may not work all the time 179 | learning_curve <- stats::nls(acc ~ I(1 / n^(c) * a) + b, 180 | data = dat_train, 181 | start = list(a = -10, c = 1, b = 1)) 182 | } 183 | 184 | para <- coef(learning_curve) 185 | names(para) <- c("a1", "c1", "b1") 186 | 187 | if (verbose) { 188 | print(summary(learning_curve)) 189 | } 190 | 191 | learning_curve$para <- para 192 | learning_curve$fit_model <- "nls" 193 | 194 | return(learning_curve) 195 | } 196 | 197 | 198 | 199 | 200 | fitLC_mixture <- function(acc, n, b = NULL, 201 | d_list = NULL, 202 | verbose = TRUE){ 203 | dat_train <- data.frame(n = n, acc = acc) 204 | 205 | if (is.null(b)) { 206 | b = 0.9 207 | } 208 | 209 | if (is.null(d_list)) { 210 | d_list <- seq(min(n) + 10, 
round(max(n)/2), 10) 211 | } 212 | 213 | init <- init_fit(acc, n) 214 | 215 | rss <- c() 216 | for (i in seq_len(length(d_list))) { 217 | d <- d_list[i] 218 | learning_curve <- try({minpack.lm::nlsLM(acc ~ H1(d - n) * 219 | I(1 / n^(c) * a) + 220 | H(n - d) * 221 | I(1 / n^(c1) * 222 | a*d^(c1 - c)) + b, 223 | data = dat_train, 224 | start = list(a = init$a, 225 | c = init$c, 226 | b = b, 227 | c1 = init$c), 228 | upper = c(Inf, Inf, 1, Inf), 229 | lower = c(-Inf, -Inf, 0, 230 | -Inf), 231 | control = list(maxiter = 232 | 1000))}, 233 | silent = TRUE) 234 | 235 | if (!"try-error" %in% is(learning_curve)) { 236 | rss <- c(rss, summary(learning_curve)$sigma) 237 | }else{ 238 | rss <- c(rss, Inf) 239 | } 240 | 241 | } 242 | 243 | 244 | d_opt <- d_list[which.min(rss)] 245 | 246 | learning_curve <- minpack.lm::nlsLM(acc ~ H1(d_opt - n) * 247 | I(1 / n^(c) * a) + 248 | H(n - d_opt) * 249 | I(1 / n^(c1) * a*d_opt^(c1-c)) + 250 | b, 251 | data = dat_train, 252 | start = list(a = init$a, c = init$c, 253 | b = 0.95, 254 | c1 = init$c), 255 | upper = c(Inf, Inf, 1, Inf), 256 | lower = c(-Inf, -Inf, 0, -Inf), 257 | control = list(maxiter = 1000) 258 | 259 | ) 260 | 261 | para <- coef(learning_curve) 262 | para <- c(para, para[1]*d_opt^(para[4] - para[2]), d_opt) 263 | names(para) <- c("a1", "c1", "b1", "c2", "a2", "d") 264 | 265 | if (verbose) { 266 | print(summary(learning_curve)) 267 | } 268 | learning_curve$para <- para 269 | learning_curve$fit_model <- "nls_mix" 270 | return(learning_curve) 271 | } 272 | 273 | 274 | #' Function to get the required N given the target accuracy and 275 | #' the fitted learning curve model 276 | #' @param res model results returned by the \code{learningCurve} function 277 | #' @param acc the accuracy that is required 278 | #' @return the sample size required to reach the given accuracy 279 | #' 280 | #' @examples 281 | #' set.seed(2019) 282 | #' n <- seq(20, 10000, 100) 283 | #' accMat <- do.call(cbind, lapply(1:length(n), function(i){ 284 | #' tmp_n <- rep(n[i], 50) 285 | #' y <- 
-2/(tmp_n^0.8) + 0.95 + rnorm(length(tmp_n), 0, 0.02) 286 | #' })) 287 | #' res <- learningCurve(accMat = accMat, n) 288 | #' N <- getN(res, acc = 0.9) 289 | #' 290 | #' @export 291 | 292 | getN <- function(res, acc = 0.9) { 293 | para <- coef(res$model$mean) 294 | names(para) <- c("a", "c", "b") 295 | if (acc > para["b"]) { 296 | stop("Required accuracy is too high to achieve :(") 297 | } 298 | N <- round(exp(1/para["c"]*log(para["a"]/(acc - para["b"])))) 299 | names(N) <- NULL 300 | 301 | return(N) 302 | } 303 | 304 | H <- function(x) as.numeric(x > 0) 305 | H1 <- function(x) as.numeric(x >= 0) 306 | 307 | -------------------------------------------------------------------------------- /R/predict_scClassify.R: -------------------------------------------------------------------------------- 1 | #' Testing scClassify model 2 | #' 3 | #' @param exprsMat_test A list or a matrix indicating the log-transformed 4 | #' expression matrices of the query datasets 5 | #' @param trainRes A `scClassifyTrainModel` or a `list` indicating the 6 | #' scClassify trained model 7 | #' @param cellTypes_test A list or a vector indicating cell types 8 | #' of the query datasets (Optional). 9 | #' @param k An integer indicating the number of neighbours 10 | #' @param prob_threshold A numeric indicating the probability threshold 11 | #' for KNN/WKNN/DWKNN. 12 | #' @param cor_threshold_static A numeric indicating the static 13 | #' correlation threshold. 14 | #' @param cor_threshold_high A numeric indicating the highest 15 | #' correlation threshold 16 | #' @param features A vector indicating the gene selection method, 17 | #' set as "limma" by default. 18 | #' This should be one or more of "limma", "DV", "DD", "chisq", "BI". 19 | #' @param algorithm A vector indicating the KNN method to be used, 20 | #' set as "WKNN" by default. 21 | #' This should be one or more of "WKNN", "KNN", "DWKNN". 
22 | #' @param similarity A vector indicating the similarity measures to be used, 23 | #' set as "pearson" by default. 24 | #' This should be one or more of "pearson", "spearman", "cosine", 25 | #' "jaccard", "kendall", "binomial", "weighted_rank","manhattan" 26 | #' @param cutoff_method A vector indicating the method used to cut off the 27 | #' correlation distribution. Set as "dynamic" by default. 28 | #' @param weighted_ensemble A logical input indicating whether, in ensemble 29 | #' learning, the results are combined by a 30 | #' weighted score for each base classifier. 31 | #' @param weights A vector indicating the weights for the ensemble 32 | #' @param parallel A logical input indicating whether to run in parallel 33 | #' @param BPPARAM A \code{BiocParallelParam} class object 34 | #' from the \code{BiocParallel} package is used. Default is SerialParam(). 35 | #' @param verbose A logical input indicating whether the intermediate steps 36 | #' will be printed 37 | #' @return list of results 38 | #' 39 | #' @examples 40 | #' data("scClassify_example") 41 | #' wang_cellTypes <- scClassify_example$wang_cellTypes 42 | #' exprsMat_wang_subset <- scClassify_example$exprsMat_wang_subset 43 | #' data("trainClassExample_xin") 44 | #' 45 | #' pred_res <- predict_scClassify(exprsMat_test = exprsMat_wang_subset, 46 | #' trainRes = trainClassExample_xin, 47 | #' cellTypes_test = wang_cellTypes, 48 | #' algorithm = "WKNN", 49 | #' features = c("limma"), 50 | #' similarity = c("pearson"), 51 | #' prob_threshold = 0.7, 52 | #' verbose = TRUE) 53 | #' 54 | #' @author Yingxin Lin 55 | #' 56 | #' @importFrom methods is 57 | #' @importFrom BiocParallel SerialParam bplapply 58 | #' 59 | #' @export 60 | 61 | # function to predict multiple feature + distance combinations 62 | 63 | predict_scClassify <- function(exprsMat_test, 64 | trainRes, 65 | cellTypes_test = NULL, 66 | k = 10, 67 | prob_threshold = 0.7, 68 | cor_threshold_static = 0.5, 69 | cor_threshold_high = 0.7, 70 | features = 
"limma", 71 | algorithm = "WKNN", 72 | similarity = "pearson", 73 | cutoff_method = c("dynamic", "static"), 74 | weighted_ensemble = FALSE, 75 | weights = NULL, 76 | parallel = FALSE, 77 | BPPARAM = BiocParallel::SerialParam(), 78 | verbose = FALSE){ 79 | 80 | # checking input 81 | 82 | algorithm <- match.arg(algorithm, c("WKNN", "KNN", "DWKNN"), 83 | several.ok = TRUE) 84 | similarity <- match.arg(similarity, c("pearson", "spearman", 85 | "cosine", "jaccard", "kendall", 86 | "weighted_rank","manhattan"), 87 | several.ok = TRUE) 88 | cutoff_method <- match.arg(cutoff_method, c("dynamic", "static"), 89 | several.ok = TRUE) 90 | 91 | 92 | if ("scClassifyTrainModelList" %in% is(trainRes)) { 93 | stop("For a list of training model, 94 | please use predict_scClassifyJoint() 95 | instead to get joint training results.") 96 | } 97 | 98 | if (!any(c("scClassifyTrainModel", "list") %in% is(trainRes))) { 99 | stop("Wrong trainRes input. 100 | Need to be either scClassifyTrainModel or list") 101 | } 102 | 103 | if ((length(features) > 1 | length(algorithm) > 1 | 104 | length(similarity) > 1) ) { 105 | if (weighted_ensemble) { 106 | weighted_ensemble <- TRUE 107 | ensemble <- TRUE 108 | 109 | if (verbose) { 110 | cat("Performing weighted ensemble learning... \n") 111 | } 112 | 113 | } else { 114 | weighted_ensemble <- FALSE 115 | ensemble <- TRUE 116 | 117 | if (verbose) { 118 | cat("Performing unweighted ensemble learning... \n") 119 | } 120 | 121 | } 122 | } else { 123 | weighted_ensemble <- FALSE 124 | ensemble <- FALSE 125 | 126 | if (verbose) { 127 | cat("Ensemble learning is disabled... 
\n") 128 | } 129 | 130 | } 131 | 132 | 133 | 134 | ensemble_methods <- as.matrix(expand.grid(similarity = similarity, 135 | algorithm = algorithm, 136 | features = features)) 137 | 138 | if (!is.null(weights)) { 139 | if (length(weights) != nrow(ensemble_methods)) { 140 | stop("The length of weights is not equal to 141 | the number of combination of ensemble methods") 142 | } 143 | } 144 | 145 | 146 | 147 | if (parallel) { 148 | 149 | predictRes <- BiocParallel::bplapply(seq_len(nrow(ensemble_methods)), 150 | function(em){ 151 | predict_scClassifySingle(exprsMat_test = exprsMat_test, 152 | trainRes = trainRes, 153 | cellTypes_test = cellTypes_test, 154 | k = k, 155 | prob_threshold = prob_threshold, 156 | cor_threshold_static = cor_threshold_static, 157 | cor_threshold_high = cor_threshold_high, 158 | features = ensemble_methods[em, 3], 159 | algorithm = ensemble_methods[em, 2], 160 | similarity = ensemble_methods[em, 1], 161 | cutoff_method = cutoff_method, 162 | verbose = verbose) 163 | }, BPPARAM = BPPARAM) 164 | 165 | 166 | }else{ 167 | predictRes <- list() 168 | for (em in seq(nrow(ensemble_methods))) { 169 | 170 | if (verbose) { 171 | cat("Using parameters: \n") 172 | print(ensemble_methods[em, ]) 173 | } 174 | 175 | predictRes[[em]] <- predict_scClassifySingle(exprsMat_test = exprsMat_test, 176 | trainRes = trainRes, 177 | cellTypes_test = cellTypes_test, 178 | k = k, 179 | prob_threshold = prob_threshold, 180 | cor_threshold_static = cor_threshold_static, 181 | cor_threshold_high = cor_threshold_high, 182 | features = ensemble_methods[em, 3], 183 | algorithm = ensemble_methods[em, 2], 184 | similarity = ensemble_methods[em, 1], 185 | cutoff_method = cutoff_method, 186 | verbose = verbose) 187 | } 188 | 189 | 190 | } 191 | names(predictRes) <- paste(ensemble_methods[, 1], 192 | ensemble_methods[, 2], 193 | ensemble_methods[, 3], 194 | sep = "_") 195 | 196 | if (is.null(weights)) { 197 | if ("list" %in% is(trainRes)) { 198 | weights <- trainRes$modelweights 
199 | weights <- weights[names(predictRes)] 200 | } 201 | 202 | if ("scClassifyTrainModel" %in% is(trainRes)) { 203 | weights <- modelweights(trainRes) 204 | weights <- weights[names(predictRes)] 205 | } 206 | 207 | } 208 | 209 | 210 | if (verbose) { 211 | cat("weights for each base method: \n") 212 | print(weights) 213 | } 214 | 215 | 216 | 217 | if (ensemble) { 218 | ensembleRes <- getEnsembleRes(predictRes, 219 | weights, 220 | exclude = NULL, 221 | weighted_ensemble = weighted_ensemble) 222 | 223 | 224 | predictRes$ensembleRes <- ensembleRes 225 | } 226 | 227 | 228 | return(predictRes) 229 | } 230 | 231 | 232 | #' Testing scClassify model (joint training) 233 | #' 234 | #' @param exprsMat_test A list or a matrix indicates the expression matrices of the testing datasets 235 | #' @param trainRes A `scClassifyTrainModel` or a `list` indicates scClassify training model 236 | #' @param cellTypes_test A list or a vector indicates cell types of the testing datasets (Optional). 237 | #' @param k An integer indicates the number of neighbour 238 | #' @param prob_threshold A numeric indicates the probability threshold for KNN/WKNN/DWKNN. 239 | #' @param cor_threshold_static A numeric indicates the static correlation threshold. 240 | #' @param cor_threshold_high A numeric indicates the highest correlation threshold 241 | #' @param features A vector indicates the method to select features, set as "limma" by default. 242 | #' This should be one or more of "limma", "DV", "DD", "chisq", "BI". 243 | #' @param algorithm A vector indicates the KNN method that are used, set as "WKNN" by default. 244 | #' This should be one or more of "WKNN", "KNN", "DWKNN". 245 | #' @param similarity A vector indicates the similarity measure that are used, 246 | #' set as "pearson" by default. 
247 | #' This should be one or more of "pearson", "spearman", "cosine", "jaccard", "kendall", 248 | #' "binomial", "weighted_rank","manhattan" 249 | #' @param cutoff_method A vector indicating the method used to cut off the correlation distribution. 250 | #' Set as "dynamic" by default. 251 | #' @param parallel A logical input indicating whether to run in parallel 252 | #' @param BPPARAM A \code{BiocParallelParam} class object 253 | #' from the \code{BiocParallel} package is used. Default is SerialParam(). 254 | #' @param verbose A logical input indicating whether the intermediate steps will be printed 255 | #' @return list of results 256 | #' @author Yingxin Lin 257 | #' 258 | #' @examples 259 | #' data("scClassify_example") 260 | #' wang_cellTypes <- scClassify_example$wang_cellTypes 261 | #' exprsMat_wang_subset <- scClassify_example$exprsMat_wang_subset 262 | #' data("trainClassExample_xin") 263 | #' data("trainClassExample_wang") 264 | #' 265 | #' trainClassExampleJoint <- scClassifyTrainModelList(trainClassExample_wang, 266 | #' trainClassExample_xin) 267 | #' 268 | #' pred_res_joint <- predict_scClassifyJoint(exprsMat_test = exprsMat_wang_subset, 269 | #' trainRes = trainClassExampleJoint, 270 | #' cellTypes_test = wang_cellTypes, 271 | #' algorithm = "WKNN", 272 | #' features = c("limma"), 273 | #' similarity = c("pearson"), 274 | #' prob_threshold = 0.7, 275 | #' verbose = FALSE) 276 | #' 277 | #' table(pred_res_joint$jointRes$cellTypes, wang_cellTypes) 278 | #' 279 | #' @importFrom methods is slot 280 | #' @importFrom BiocParallel SerialParam 281 | #' @export 282 | 283 | predict_scClassifyJoint <- function(exprsMat_test, 284 | trainRes, 285 | cellTypes_test = NULL, 286 | k = 10, 287 | prob_threshold = 0.7, 288 | cor_threshold_static = 0.5, 289 | cor_threshold_high = 0.7, 290 | features = "limma", 291 | algorithm = "WKNN", 292 | similarity = "pearson", 293 | cutoff_method = c("dynamic", "static"), 294 | parallel = FALSE, 295 | BPPARAM = 
BiocParallel::SerialParam(), 296 | verbose = FALSE){ 297 | 298 | # checking input 299 | 300 | algorithm <- match.arg(algorithm, c("WKNN", "KNN", "DWKNN"), 301 | several.ok = TRUE) 302 | similarity <- match.arg(similarity, c("pearson", "spearman", 303 | "cosine", "jaccard", "kendall", 304 | "weighted_rank","manhattan"), 305 | several.ok = TRUE) 306 | cutoff_method <- match.arg(cutoff_method, c("dynamic", "static"), 307 | several.ok = TRUE) 308 | 309 | 310 | if (!"scClassifyTrainModelList" %in% is(trainRes)) { 311 | stop("trainRes needs to be a scClassifyTrainModelList object") 312 | } 313 | 314 | 315 | 316 | predictResList <- list() 317 | 318 | 319 | for (trainListIdx in seq_len(length(methods::slot(trainRes, "listData")))) { 320 | 321 | if (verbose) { 322 | cat("Training using ") 323 | cat(names(methods::slot(trainRes, "listData"))[trainListIdx], "\n") 324 | } 325 | 326 | predictResList[[trainListIdx]] <- predict_scClassify(exprsMat_test = exprsMat_test, 327 | trainRes = trainRes[[trainListIdx]], 328 | cellTypes_test = cellTypes_test, 329 | k = k, 330 | prob_threshold = prob_threshold, 331 | cor_threshold_static = cor_threshold_static, 332 | cor_threshold_high = cor_threshold_high, 333 | features = features, 334 | algorithm = algorithm, 335 | similarity = similarity, 336 | cutoff_method = cutoff_method, 337 | parallel = parallel, 338 | BPPARAM = BPPARAM, 339 | verbose = verbose) 340 | } 341 | 342 | 343 | names(predictResList) <- paste("Trained_by", 344 | names(methods::slot(trainRes, "listData")), 345 | sep = "_") 346 | 347 | jointRes <- getJointRes(predictResList, trainRes) 348 | 349 | rownames(jointRes) <- colnames(exprsMat_test) 350 | 351 | 352 | predictResList$jointRes <- jointRes 353 | 354 | 355 | return(predictResList) 356 | } 357 | 358 | 359 | predict_scClassifySingle <- function(exprsMat_test, 360 | trainRes, 361 | cellTypes_test, 362 | k = 10, 363 | prob_threshold = 0.7, 364 | cor_threshold_static = 0.5, 365 | cor_threshold_high = 0.7, 366 | features = 
"limma", 367 | algorithm = c("WKNN", "KNN", "DWKNN"), 368 | similarity = c("pearson", "spearman", 369 | "cosine", "jaccard", 370 | "kendall", "weighted_rank", 371 | "manhattan"), 372 | cutoff_method = c("dynamic", "static"), 373 | verbose = TRUE){ 374 | 375 | 376 | if (!is.null(cellTypes_test)) { 377 | if (length(cellTypes_test) != ncol(exprsMat_test)) { 378 | stop("Length of testing cell types does not match 379 | with number of column of testing expression matrix") 380 | } 381 | } 382 | 383 | if (is.null(rownames(exprsMat_test))) { 384 | stop("rownames of exprsMat_test is NULL") 385 | } 386 | 387 | if (is.null(colnames(exprsMat_test)) | 388 | sum(duplicated(colnames(exprsMat_test))) != 0) { 389 | stop("colnames of exprsMat_test is NULL or not unique") 390 | } 391 | 392 | if (all(exprsMat_test %% 1 == 0)) { 393 | warning("exprsMat_test looks like a count matrix 394 | (scClassify requires a log-transformed normalised input)") 395 | } 396 | 397 | # check input 398 | algorithm <- match.arg(algorithm, c("WKNN", "KNN", "DWKNN")) 399 | similarity <- match.arg(similarity, c("pearson", "spearman", 400 | "cosine", "jaccard", "kendall", 401 | "weighted_rank","manhattan")) 402 | cutoff_method <- match.arg(cutoff_method, c("dynamic", "static")) 403 | 404 | 405 | if (ncol(exprsMat_test) < 100 & cutoff_method == "dynamic") { 406 | warning("Number of cells in test data is small. 407 | The cutoff method is changed to static.") 408 | cutoff_method <- "static" 409 | } 410 | 411 | 412 | if ("scClassifyTrainModelList" %in% is(trainRes)) { 413 | stop("For a list of training model, 414 | please use predict_scClassifyJoint instead to 415 | get joint training results.") 416 | } 417 | 418 | if (!any(c("scClassifyTrainModel", "list") %in% is(trainRes))) { 419 | stop("wrong trainRes input. 
420 | Need to be either scClassifyTrainModel or list") 421 | } 422 | 423 | if ("scClassifyTrainModel" %in% is(trainRes)) { 424 | 425 | if (!features %in% features(trainRes)) { 426 | stop("The selected features are not trained in the provided model!") 427 | } 428 | 429 | 430 | # get the input from train model 431 | 432 | levelModel <- model(trainRes)[[features]]$model 433 | levelHVG <- model(trainRes)[[features]]$hvg 434 | cutree_list <- cellTypeTree(trainRes) 435 | cellTypes_train <- cellTypeTrain(trainRes) 436 | } else { 437 | 438 | if (!features %in% names(trainRes$hierarchyKNNRes)) { 439 | stop("The selected features are not trained in the provided model!") 440 | } 441 | 442 | 443 | # get the input from train model 444 | 445 | levelModel <- trainRes$hierarchyKNNRes[[features]]$model 446 | levelHVG <- trainRes$hierarchyKNNRes[[features]]$hvg 447 | cutree_list <- trainRes$cutree_list 448 | cellTypes_train <- trainRes$cellTypes_train 449 | 450 | } 451 | 452 | pred <- list() 453 | 454 | # For each level 455 | for (i in seq_len(length(levelModel))) { 456 | # If this level is not NULL (Not Level1) 457 | if (!is.null(levelModel[[i]])) { 458 | 459 | 460 | #If not all the cells are not unassigned in the parent nodes 461 | if (sum(pred[[i - 1]] != 0) != 0) { 462 | pred_level <- list() 463 | 464 | for (j in seq_len(length(levelModel[[i]]))) { 465 | 466 | # If the model of level i-1, cells labeled as j is NOT "no model" 467 | if (!"character" %in% is(levelModel[[i]][[j]])) { 468 | 469 | # Select the cells that are going to classified 470 | # (according to what they are classified in last level) 471 | predIdx <- which(pred[[i - 1]] == j) 472 | 473 | if (length(predIdx) != 0) { 474 | 475 | # features that are in the test dataset 476 | common_HVG <- intersect(rownames(exprsMat_test), levelHVG[[i]][[j]]) 477 | exprsMat_toTest <- exprsMat_test[common_HVG, predIdx, drop = FALSE] 478 | exprsMat_toTest <- exprsMat_toTest[Matrix::rowSums(exprsMat_toTest) != 0, , 479 | drop = FALSE] 
480 | 481 | if (nrow(exprsMat_toTest) < 5) { 482 | 483 | message(paste("There are only", nrow(exprsMat_toTest), 484 | "selected genes in reference data expressed in query data")) 485 | pred_level[[j]] <- rep(0, length(predIdx)) 486 | }else{ 487 | 488 | # Calculate the similarity 489 | corMat <- calculateSimilarity(exprsMat_train = Matrix::t(levelModel[[i]][[j]]$train[ ,rownames(exprsMat_toTest)]), 490 | exprsMat_test = exprsMat_toTest, 491 | similarity = similarity) 492 | 493 | # Different algorithm 494 | if (algorithm == "KNN") { 495 | predRes <- KNNcor(corMat = corMat, 496 | subLevelModel = levelModel[[i]][[j]], 497 | cutoff_method = cutoff_method, 498 | k = k, 499 | prob_threshold = prob_threshold, 500 | cor_threshold_static = cor_threshold_static, 501 | cor_threshold_high = cor_threshold_high, 502 | topLevel = FALSE, 503 | verbose = verbose) 504 | } 505 | if (algorithm == "WKNN") { 506 | predRes <- WKNNcor(corMat = corMat, 507 | subLevelModel = levelModel[[i]][[j]], 508 | cutoff_method = cutoff_method, 509 | k = k, 510 | prob_threshold = prob_threshold, 511 | cor_threshold_static = cor_threshold_static, 512 | cor_threshold_high = cor_threshold_high, 513 | topLevel = FALSE, 514 | verbose = verbose) 515 | } 516 | 517 | if (algorithm == "DWKNN") { 518 | predRes <- DWKNNcor(corMat = corMat, 519 | subLevelModel = levelModel[[i]][[j]], 520 | cutoff_method = cutoff_method, 521 | k = k, 522 | prob_threshold = prob_threshold, 523 | cor_threshold_static = cor_threshold_static, 524 | cor_threshold_high = cor_threshold_high, 525 | topLevel = FALSE, 526 | verbose = verbose) 527 | } 528 | 529 | pred_level[[j]] <- predRes$predRes 530 | } 531 | 532 | } 533 | 534 | } 535 | 536 | # Else, the model of (level i-1, cells labeled as j) IS "no model" 537 | # maintain the same class. 
538 | else { 539 | predIdx <- which(pred[[i - 1]] == j) 540 | # check the label of current level based on the label of last level 541 | pred_level[[j]] <- as.factor(rep(unique(cutree_list[[i]][cutree_list[[i - 1]] == j]), 542 | length(predIdx))) 543 | names(pred_level[[j]]) <- colnames(exprsMat_test)[predIdx] 544 | } 545 | } 546 | 547 | 548 | 549 | # Get the predict results for level i, change it to the numeric 550 | 551 | pred[[i]] <- unlist(lapply(pred_level, function(x){ 552 | cellNames <- names(x) 553 | x <- as.numeric(as.character(x)) 554 | names(x) <- cellNames 555 | x 556 | })) 557 | 558 | # reorder the prediction results to consistent 559 | #with the exprsMat_test 560 | pred[[i]] <- pred[[i]][colnames(exprsMat_test)] 561 | 562 | names(pred[[i]]) <- colnames(exprsMat_test) 563 | pred[[i]] <- as.numeric(as.character(pred[[i]])) 564 | # there will be NA since they are unassigned 565 | #from the last level, 566 | #and therefore are not predicted in this level 567 | pred[[i]][is.na(pred[[i]])] <- 0 568 | names(pred[[i]]) <- colnames(exprsMat_test) 569 | 570 | 571 | } 572 | else{ 573 | 574 | # else, if all the cells in paraent nodes are unassigned 575 | pred[[i]] <- as.factor(rep(0, ncol(exprsMat_test))) 576 | names(pred[[i]]) <- colnames(exprsMat_test) 577 | } 578 | 579 | }else{ 580 | # If this level is NULL (Level 1) 581 | pred[[i]] <- as.factor(rep(1, ncol(exprsMat_test))) 582 | names(pred[[i]]) <- colnames(exprsMat_test) 583 | } 584 | 585 | 586 | } 587 | 588 | predMat <- do.call(cbind, pred) 589 | predMat <- vapply(seq_len(ncol(predMat)), 590 | function(x) getPredRes(predMat, cutree_list, x), 591 | character(nrow(predMat))) 592 | 593 | predRes <- apply(predMat, 1, function(x){ 594 | unAssIdx <- which(x == "unassigned") 595 | if (length(unAssIdx) != 0) { 596 | if (min(unAssIdx) >= 3) { 597 | x[min(unAssIdx) - 1] 598 | }else{ 599 | "unassigned" 600 | } 601 | }else{ 602 | x[length(x)] 603 | } 604 | }) 605 | 606 | if (is.null(cellTypes_test)) { 607 | 
return(list(predRes = predRes, predLabelMat = predMat)) 608 | }else{ 609 | 610 | classify_res <- ClassifyError(cellTypes_pred = predRes, 611 | cellTypes_test = cellTypes_test, 612 | cellTypes_train = cellTypes_train) 613 | 614 | if (verbose) { 615 | print(table(classify_res)/length(classify_res)) 616 | } 617 | 618 | return(list(predRes = predRes, predLabelMat = predMat, 619 | classifyRes = classify_res)) 620 | 621 | } 622 | 623 | } 624 | 625 | 626 | 627 | # Function to calculate similarity 628 | #' @importFrom proxyC simil 629 | #' @importFrom proxy as.dist 630 | #' @importFrom proxy dist 631 | #' @importFrom methods as 632 | 633 | 634 | calculateSimilarity <- function(exprsMat_train, 635 | exprsMat_test, 636 | similarity = c("pearson", "spearman", 637 | "cosine", "jaccard", "kendall", 638 | "weighted_rank","manhattan")) { 639 | 640 | 641 | similarity <- match.arg(similarity, c("pearson", "spearman", 642 | "cosine", "jaccard", "kendall", 643 | "weighted_rank","manhattan")) 644 | 645 | if ("dgCMatrix" %in% is(exprsMat_test) & 646 | !"dgCMatrix" %in% is(exprsMat_train)) { 647 | exprsMat_train <- methods::as(exprsMat_train, "dgCMatrix") 648 | } 649 | 650 | if (!"dgCMatrix" %in% is(exprsMat_test) & 651 | "dgCMatrix" %in% is(exprsMat_train)) { 652 | exprsMat_test <- methods::as(exprsMat_test, "dgCMatrix") 653 | } 654 | 655 | 656 | if (similarity == "cosine") { 657 | 658 | if ("dgCMatrix" %in% is(exprsMat_test) & 659 | "dgCMatrix" %in% is(exprsMat_train)) { 660 | corMat <- proxyC::simil(Matrix::t(exprsMat_train), 661 | Matrix::t(exprsMat_test), 662 | method = "cosine") 663 | } else { 664 | corMat <- 1 - as.matrix(proxy::dist(t(as.matrix(exprsMat_train)), 665 | t(as.matrix(exprsMat_test)), 666 | method = "cosine")) 667 | } 668 | 669 | corMat[is.na(corMat) | is.infinite(corMat)] <- min(corMat) 670 | 671 | } else if (similarity == "kendall") { 672 | 673 | 674 | 675 | corMat <- stats::cor(as.matrix(exprsMat_train), 676 | as.matrix(exprsMat_test), method = "kendall") 677 | 
corMat[is.na(corMat) | is.infinite(corMat)] <- -1 678 | 679 | } else if (similarity == "jaccard") { 680 | 681 | if ("dgCMatrix" %in% is(exprsMat_test) & 682 | "dgCMatrix" %in% is(exprsMat_train)) { 683 | corMat <- proxyC::simil(Matrix::t(exprsMat_train), 684 | Matrix::t(exprsMat_test), 685 | method = "jaccard") 686 | } else { 687 | corMat <- 1 - 688 | as.matrix(proxy::dist(t(as.matrix(exprsMat_train > 0)), 689 | t(as.matrix(exprsMat_test > 0)), 690 | method = "Jaccard")) 691 | 692 | } 693 | corMat[is.na(corMat) | is.infinite(corMat)] <- min(corMat) 694 | }else if (similarity == "weighted_rank") { 695 | 696 | corMat <- wtd_rank2(as.matrix(exprsMat_train), 697 | as.matrix(exprsMat_test), method = "pearson") 698 | corMat[is.na(corMat) | is.infinite(corMat)] <- -1 699 | 700 | }else if (similarity == "manhattan") { 701 | if ("dgCMatrix" %in% is(exprsMat_test) & 702 | "dgCMatrix" %in% is(exprsMat_train)) { 703 | corMat <- 1 - as.matrix(proxy::dist(t(as.matrix(exprsMat_train)), 704 | t(as.matrix(exprsMat_test)), 705 | method = "Manhattan")) 706 | } else { 707 | corMat <- 1 - as.matrix(proxy::dist(t(as.matrix(exprsMat_train)), 708 | t(as.matrix(exprsMat_test)), 709 | method = "Manhattan")) 710 | } 711 | corMat[is.na(corMat) | is.infinite(corMat)] <- min(corMat) 712 | 713 | }else if (similarity == "spearman") { 714 | 715 | corMat <- stats::cor(as.matrix(exprsMat_train), 716 | as.matrix(exprsMat_test), method = "spearman") 717 | corMat[is.na(corMat) | is.infinite(corMat)] <- -1 718 | 719 | }else if (similarity == "pearson") { 720 | 721 | if ("dgCMatrix" %in% is(exprsMat_test) & 722 | "dgCMatrix" %in% is(exprsMat_train)) { 723 | corMat <- proxyC::simil(Matrix::t(exprsMat_train), 724 | Matrix::t(exprsMat_test), 725 | method = "correlation") 726 | } else { 727 | corMat <- stats::cor(as.matrix(exprsMat_train), 728 | as.matrix(exprsMat_test), method = "pearson") 729 | } 730 | corMat[is.na(corMat) | is.infinite(corMat)] <- -1 731 | 732 | }else{ 733 | 734 | if ("dgCMatrix" %in% 
is(exprsMat_test) & 735 | "dgCMatrix" %in% is(exprsMat_train)) { 736 | corMat <- proxyC::simil(Matrix::t(exprsMat_train), 737 | Matrix::t(exprsMat_test), 738 | method = "correlation") 739 | } else { 740 | corMat <- stats::cor(as.matrix(exprsMat_train), 741 | as.matrix(exprsMat_test), method = "pearson") 742 | } 743 | corMat[is.na(corMat) | is.infinite(corMat)] <- -1 744 | } 745 | 746 | return(corMat) 747 | } 748 | 749 | 750 | # Function to calculate weighted rank correlation 751 | # code modified from the wtd_rank function in the dismay package 752 | 753 | wtd_rank2 <- function(mat1, mat2 = NULL, method = "pearson") { 754 | 755 | method <- match.arg(method, c("pearson", "spearman")) 756 | 757 | ranks1 <- apply(mat1, 2, function(x) rank(-x, ties.method = "average")) 758 | # weight the ranks 759 | # calculate the savage scores 760 | n1 <- nrow(mat1) 761 | reciprocals1 <- 1 / seq_len(n1) 762 | savage1 <- vapply(seq_len(n1), 763 | function(i) sum(reciprocals1[i:n1]), 764 | numeric(1L)) 765 | # replace each rank with the savage score 766 | savages1 <- ranks1 767 | savages1[] <- savage1[ranks1] 768 | 769 | if (!is.null(mat2)) { 770 | ranks2 <- apply(mat2, 2, function(x) rank(-x, ties.method = "average")) 771 | # weight the ranks 772 | # calculate the savage scores 773 | n2 <- nrow(mat2) 774 | reciprocals2 <- 1 / seq_len(n2) 775 | savage2 <- vapply(seq_len(n2), 776 | function(i) sum(reciprocals2[i:n2]), 777 | numeric(1L)) 778 | # replace each rank with the savage score 779 | savages2 <- ranks2 780 | savages2[] <- savage2[ranks2] 781 | 782 | cor <- stats::cor(savages1, savages2, method = method) 783 | } else { 784 | cor <- stats::cor(savages1, method = method) 785 | } 786 | 787 | 788 | # return the correlation of the savage scores under the chosen method 789 | 790 | return(cor) 791 | } 792 | 793 | # Function to get the prediction labels according to the tree 794 | getPredRes <- function(predMat, cutree_list, level){ 795 | res <- predMat[,level] 796 | for (i in seq_len(length(predMat[,level]))) { 797 | if 
(predMat[i,level] == 0) { 798 | res[i] <- "unassigned" 799 | }else{ 800 | res[i] <- paste(names(cutree_list[[level]])[cutree_list[[level]] 801 | %in% predMat[i,level]], 802 | collapse = "_") 803 | } 804 | 805 | } 806 | return(res) 807 | } 808 | 809 | 810 | # Function to classify the error type of the predicted cell types 811 | 812 | ClassifyError <- function(cellTypes_pred, cellTypes_test, cellTypes_train){ 813 | 814 | errClass <- c("correct", "correctly unassigned", 815 | "intermediate", "incorrectly unassigned", 816 | "error assigned", "misclassified") 817 | 818 | 819 | if (length(cellTypes_pred) != length(cellTypes_test)) { 820 | stop("Length of cellTypes_pred must match length of cellTypes_test") 821 | } 822 | train_ref <- unique(cellTypes_train) 823 | res <- vapply(seq_len(length(cellTypes_pred)), function(i){ 824 | if (cellTypes_test[i] %in% train_ref) { 825 | if (cellTypes_pred[i] %in% c("unassigned", "Unassigned")) { 826 | "incorrectly unassigned" 827 | } else if (cellTypes_pred[i] == "intermediate") { 828 | "intermediate" 829 | }else{ 830 | if (cellTypes_test[i] == cellTypes_pred[i]) { 831 | "correct" 832 | }else if (grepl(cellTypes_test[i], cellTypes_pred[i])) { 833 | "intermediate" 834 | } 835 | else{ 836 | "misclassified" 837 | } 838 | } 839 | }else{ 840 | if (cellTypes_pred[i] %in% c("unassigned","Unassigned")) { 841 | "correctly unassigned" 842 | }else{ 843 | "error assigned" 844 | } 845 | } 846 | }, character(1L)) 847 | 848 | res <- factor(res, levels = errClass) 849 | return(res) 850 | } 851 | 852 | 853 | 854 | -------------------------------------------------------------------------------- /R/sampleSizeCal.R: -------------------------------------------------------------------------------- 1 | #' @title Run sample size calculation for pilot data for reference dataset 2 | #' 3 | #' @param exprsMat An expression matrix of the pilot dataset 4 | #' (log-transformed, or normalised) 5 | #' @param cellTypes A vector of cell types of the pilot dataset 6 | #' @param n_list A vector of integers indicating the 
sample sizes to run. 7 | #' @param num_repeat An integer indicating the number of times 8 | #' each sample size will be repeated. 9 | #' @param BPPARAM A \code{BiocParallelParam} class object 10 | #' from the \code{BiocParallel} package. Default is SerialParam(). 11 | #' @param subset_test A logical input indicating whether a subset of the data 12 | #' (a fixed number for each sample size) 13 | #' is used for testing instead of all remaining data. By default, it is FALSE. 14 | #' @param num_test An integer indicating the size of the test data. 15 | #' @param cellType_tree A list indicating the cell type tree (optional); 16 | #' if it is NULL, the accuracy rate is calculated 17 | #' based on the provided cellTypes. 18 | #' @param level An integer indicating that the accuracy rate is calculated 19 | #' based on the n-th level from the top of the cell type tree. 20 | #' If it is NULL (by default), the bottom of the cell type tree is used. 21 | #' It cannot be larger than the total number of levels of the tree. 22 | #' @param ... Other parameters passed to scClassify 23 | #' 24 | #' @return An accuracy matrix, where columns correspond to different 25 | #' sample sizes and rows correspond to the repetitions. 
26 | #' 27 | #' @examples 28 | #' data("scClassify_example") 29 | #' xin_cellTypes <- scClassify_example$xin_cellTypes 30 | #' exprsMat_xin_subset <- scClassify_example$exprsMat_xin_subset 31 | #' 32 | #' exprsMat_xin_subset <- as(exprsMat_xin_subset, "dgCMatrix") 33 | #' set.seed(2019) 34 | #' accMat <- runSampleCal(exprsMat_xin_subset, 35 | #' xin_cellTypes, 36 | #' n_list = seq(20, 100, 20), 37 | #' num_repeat = 5, BPPARAM = BiocParallel::SerialParam()) 38 | #' 39 | #' @importFrom BiocParallel bplapply SerialParam 40 | #' @export 41 | 42 | 43 | runSampleCal <- function(exprsMat, 44 | cellTypes, 45 | n_list = c(20, 40, 60, 80, 100, seq(200, 500, 100)), 46 | num_repeat = 20, 47 | level = NULL, 48 | cellType_tree = NULL, 49 | BPPARAM = BiocParallel::SerialParam(), 50 | subset_test = FALSE, 51 | num_test = NULL, 52 | ...) { 53 | 54 | 55 | 56 | 57 | if (length(n_list) < 5) { 58 | stop("The n_list provided is too short... 59 | there won't be enough points to fit the learning curve") 60 | } 61 | 62 | 63 | 64 | 65 | # If a cell type tree is provided, 66 | # relabel the cell types based on the input level 67 | 68 | if (!is.null(cellType_tree)) { 69 | if (is.null(level)) { 70 | level <- length(cellType_tree) 71 | } else { 72 | if (level <= 0 | level > length(cellType_tree)) { 73 | stop("The input of level is invalid") 74 | } 75 | 76 | level <- level + 1 77 | } 78 | 79 | cellTypes_ind <- cellType_tree[[level]][as.character(cellTypes)] 80 | cellTypes_relabelled <- vapply(seq_len(length(cellTypes_ind)), 81 | function(i) { 82 | paste(names(cellType_tree[[level]])[cellType_tree[[level]] %in% 83 | cellTypes_ind[i]], 84 | collapse = "_") 85 | }, character(1L)) 86 | 87 | cellTypes <- cellTypes_relabelled 88 | cellType_tree_relabelled <- reLevelCellTypeTree(cellType_tree, 89 | level = level) 90 | 91 | } else { 92 | cellType_tree_relabelled <- NULL 93 | } 94 | 95 | 96 | exprsMat <- as(exprsMat, "dgCMatrix") 97 | # n_list <- c(20, 40, 60, 80, 100, seq(200, 500, 100)) 98 | 99 | 
res_sub <- list() 100 | 101 | for (i in seq_len(length(n_list))) { 102 | print(paste("n=",n_list[i])) 103 | res_sub[[i]] <- BiocParallel::bplapply(seq_len(num_repeat), 104 | function(x) { 105 | tryCatch({ 106 | l <- runSubSampling(exprsMat, cellTypes, n = n_list[[i]], 107 | subset_test = subset_test, 108 | num_test = num_test, 109 | cellType_tree = cellType_tree_relabelled, 110 | cutoff_method = "dynamic", 111 | prob_threshold = 0.6 112 | ) 113 | 114 | table(l)/length(l)}, 115 | error = function(e){NULL}) 116 | }, BPPARAM = BPPARAM) 117 | print(do.call(cbind, res_sub[[i]])) 118 | gc(reset = TRUE) 119 | } 120 | 121 | names(res_sub) <- n_list 122 | 123 | res_sub <- lapply(res_sub, function(x) unlist(lapply(x, "[[", "correct"))) 124 | res_sub <- res_sub[!vapply(res_sub, is.null, logical(1L))] 125 | accuracy_mat <- do.call(cbind, res_sub) 126 | return(accuracy_mat) 127 | } 128 | 129 | 130 | 131 | 132 | # Function to run subsampling 133 | 134 | 135 | runSubSampling <- function(exprsMat, 136 | cellTypes, 137 | n = 50, 138 | subset_test = FALSE, 139 | num_test = 2000, 140 | ...){ 141 | 142 | 143 | # make sure the smallest training type has at least 3 cells. 
144 | n_subset <- round(table(cellTypes)/length(cellTypes)*n) 145 | n_subset <- ifelse(n_subset < 3, 3, n_subset) 146 | 147 | trainIdx <- unlist(lapply(seq_len(length(n_subset)), function(x) 148 | sample(which(cellTypes == names(n_subset)[x]), n_subset[x]))) 149 | 150 | exprsMat_train <- exprsMat[,trainIdx] 151 | cellTypes_train <- cellTypes[trainIdx] 152 | 153 | print(table(cellTypes_train)) 154 | 155 | if (subset_test) { 156 | testIdx <- sample(seq_len(ncol(exprsMat))[-trainIdx], num_test) 157 | exprsMat_test <- exprsMat[, testIdx] 158 | cellTypes_test <- cellTypes[testIdx] 159 | }else{ 160 | exprsMat_test <- exprsMat[, -trainIdx] 161 | cellTypes_test <- cellTypes[-trainIdx] 162 | } 163 | 164 | 165 | trainRes <- scClassify(exprsMat_train = exprsMat_train, 166 | cellTypes_train = cellTypes_train, 167 | exprsMat_test = list( 168 | test = exprsMat_test), 169 | cellTypes_test = list( 170 | test = cellTypes_test), 171 | ...) 172 | 173 | acc_cls <- trainRes$testRes$test$pearson_WKNN_limma$classifyRes 174 | return(acc_cls) 175 | } 176 | 177 | 178 | 179 | reLevelCellTypeTree <- function(cellType_tree, level) { 180 | 181 | cellTypes_newTree <- list() 182 | 183 | newCellTypeNames <- vapply(seq_len(max(cellType_tree[[level]])), 184 | function(i) { 185 | paste(names(cellType_tree[[level]])[cellType_tree[[level]] == i], 186 | collapse = "_") 187 | }, character(1)) 188 | 189 | cellTypes_newTree[[level]] <- seq_len(max(cellType_tree[[level]])) 190 | names(cellTypes_newTree[[level]]) <- newCellTypeNames 191 | 192 | for (i in seq_len((level - 1))) { 193 | 194 | cellTypes_newTree[[i]] <- vapply(seq_len(length(newCellTypeNames)), 195 | function(j) 196 | unique(cellType_tree[[i]][names(cellType_tree[[i]]) %in% 197 | unlist(strsplit(newCellTypeNames[j], 198 | "_"))]), 199 | numeric(1) 200 | ) 201 | names(cellTypes_newTree[[i]]) <- newCellTypeNames 202 | 203 | } 204 | 205 | return(cellTypes_newTree) 206 | } 207 | 
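The stratified allocation at the top of `runSubSampling()` can be sketched as a standalone snippet: training cells are drawn per cell type in proportion to its frequency in the pilot data, with a floor of three cells per type. The helper name `allocate_train_n` and the demo labels are illustrative only, not part of the package; `pmax()` stands in for the `ifelse()` floor used in the function.

```r
# Sketch of the stratified allocation in runSubSampling():
# each cell type receives a share of the n training cells proportional
# to its frequency in the pilot data, and never fewer than 3 cells.
allocate_train_n <- function(cellTypes, n) {
  n_subset <- round(table(cellTypes) / length(cellTypes) * n)
  pmax(n_subset, 3)  # equivalent to ifelse(n_subset < 3, 3, n_subset)
}

cellTypes_demo <- rep(c("alpha", "beta", "delta"), c(70, 25, 5))
allocate_train_n(cellTypes_demo, n = 20)
# alpha: 14, beta: 5, delta: 3 (delta is lifted to the floor of 3)
```

Note that because of the per-type floor, the total number of training cells can slightly exceed the requested `n` when rare cell types are present.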
-------------------------------------------------------------------------------- /R/scClassify.R: -------------------------------------------------------------------------------- 1 | #' @title Train and test scClassify model 2 | #' 3 | #' @param exprsMat_train A log-transformed expression matrix of the reference dataset 4 | #' @param cellTypes_train A vector of cell types of the reference dataset 5 | #' @param exprsMat_test A list or a matrix indicating the expression matrices of the query datasets 6 | #' @param cellTypes_test A list or a vector indicating cell types of the query datasets (optional). 7 | #' @param tree A vector indicating the method to build the hierarchical tree, set as "HOPACH" by default. 8 | #' This should be one of "HOPACH" and "HC" (using hclust). 9 | #' @param selectFeatures A vector indicating the gene selection method, set as "limma" by default. 10 | #' This should be one or more of "limma", "DV", "DD", "chisq", "BI" and "Cepo". 11 | #' @param algorithm A vector indicating the KNN methods that are used, set as 12 | #' "WKNN" by default. This should be one or more of "WKNN", "KNN", "DWKNN". 13 | #' @param similarity A vector indicating the similarity measures that are used, 14 | #' set as "pearson" by default. This should be one or more of "pearson", 15 | #' "spearman", "cosine", "jaccard", "kendall", "weighted_rank" and "manhattan". 16 | #' @param cutoff_method A vector indicating the method to cut off the correlation distribution. 17 | #' Set as "dynamic" by default. 18 | #' @param weighted_ensemble A logical input indicating whether, in ensemble learning, 19 | #' the results are combined by a weighted score for each base classifier. 20 | #' @param weights A vector indicating the weights for the ensemble 21 | #' @param weighted_jointClassification A logical input indicating whether, in joint classification 22 | #' using multiple training datasets, 23 | #' the results are combined by a weighted score for each training model. 
24 | #' @param cellType_tree A list indicating the cell type tree provided by the user. 25 | #' (By default, it is NULL) (Only for one training data input) 26 | #' @param k An integer indicating the number of neighbours 27 | #' @param topN An integer indicating the top number of features that are selected 28 | #' @param hopach_kmax An integer between 1 and 9 specifying the maximum number of 29 | #' children at each node in the HOPACH tree. 30 | #' @param pSig A numeric indicating the p-value cutoff for features 31 | #' @param prob_threshold A numeric indicating the probability threshold for KNN/WKNN/DWKNN. 32 | #' @param cor_threshold_static A numeric indicating the static correlation threshold. 33 | #' @param cor_threshold_high A numeric indicating the highest correlation threshold 34 | #' @param returnList A logical input indicating whether the output will be a list 35 | #' @param parallel A logical input indicating whether to run in parallel 36 | #' @param BPPARAM A \code{BiocParallelParam} class object 37 | #' from the \code{BiocParallel} package. Default is SerialParam(). 38 | #' @param verbose A logical input indicating whether the intermediate steps will be printed 39 | #' 40 | #' @return A list of the results, including testRes storing the results of the testing information, 41 | #' and trainRes storing the training model information. 
42 | #' 43 | #' @author Yingxin Lin 44 | #' 45 | #' @examples 46 | #' 47 | #' data("scClassify_example") 48 | #' xin_cellTypes <- scClassify_example$xin_cellTypes 49 | #' exprsMat_xin_subset <- scClassify_example$exprsMat_xin_subset 50 | #' wang_cellTypes <- scClassify_example$wang_cellTypes 51 | #' exprsMat_wang_subset <- scClassify_example$exprsMat_wang_subset 52 | #' 53 | #' scClassify_res <- scClassify(exprsMat_train = exprsMat_xin_subset, 54 | #' cellTypes_train = xin_cellTypes, 55 | #' exprsMat_test = list(wang = exprsMat_wang_subset), 56 | #' cellTypes_test = list(wang = wang_cellTypes), 57 | #' tree = "HOPACH", 58 | #' algorithm = "WKNN", 59 | #' selectFeatures = c("limma"), 60 | #' similarity = c("pearson"), 61 | #' returnList = FALSE, 62 | #' verbose = FALSE) 63 | #' 64 | #' @importFrom S4Vectors DataFrame 65 | #' @importFrom methods is 66 | #' @importFrom BiocParallel SerialParam 67 | #' @export 68 | 69 | 70 | scClassify <- function(exprsMat_train = NULL, 71 | cellTypes_train = NULL, 72 | exprsMat_test = NULL, 73 | cellTypes_test = NULL, 74 | tree = "HOPACH", 75 | algorithm = "WKNN", 76 | selectFeatures = "limma", 77 | similarity = "pearson", 78 | cutoff_method = c("dynamic", "static"), 79 | weighted_ensemble = FALSE, 80 | weights = NULL, 81 | weighted_jointClassification = TRUE, 82 | cellType_tree = NULL, 83 | k = 10, 84 | topN = 50, 85 | hopach_kmax = 5, 86 | pSig = 0.01, 87 | prob_threshold = 0.7, 88 | cor_threshold_static = 0.5, 89 | cor_threshold_high = 0.7, 90 | returnList = TRUE, 91 | parallel = FALSE, 92 | BPPARAM = BiocParallel::SerialParam(), 93 | verbose = FALSE) { 94 | 95 | 96 | # check input 97 | if (is.null(exprsMat_train) | is.null(cellTypes_train) | 98 | is.null(exprsMat_test)) { 99 | stop("exprsMat_train or cellTypes_train or exprsMat_test is NULL!") 100 | } 101 | 102 | if (!is.null(cellTypes_test)) { 103 | if ("character" %in% is(cellTypes_test)) { 104 | if (length(cellTypes_test) != ncol(exprsMat_test)) { 105 | stop("Length of testing 
cell types does not match 106 | with number of column of testing expression matrix") 107 | } 108 | } 109 | 110 | if ("list" %in% is(cellTypes_test)) { 111 | if (sum(unlist(lapply(cellTypes_test, length)) != 112 | unlist(lapply(exprsMat_test, ncol))) != 0) { 113 | stop("Length of testing cell types does not match 114 | with number of column of testing expression matrix") 115 | } 116 | } 117 | 118 | } 119 | 120 | if ("list" %in% is(exprsMat_train) ) { 121 | if (sum(unlist(lapply(cellTypes_train, length)) != 122 | unlist(lapply(exprsMat_train, ncol))) != 0) { 123 | stop("Length of training cell types does not match with 124 | number of column of training expression matrix") 125 | } 126 | }else { 127 | if (length(cellTypes_train) != ncol(exprsMat_train)) { 128 | stop("Length of training cell types does not match with 129 | number of column of training expression matrix") 130 | } 131 | } 132 | 133 | if ("list" %in% is(exprsMat_train)) { 134 | if (any(vapply(cellTypes_train, function(x) any(table(x) == 1), logical(1L)))) { 135 | stop("There is a cell type with only one cell, 136 | please check cellTypes_train") 137 | } 138 | }else { 139 | if (any(table(cellTypes_train) == 1)) { 140 | stop("There is a cell type with only one cell, 141 | please check cellTypes_train") 142 | } 143 | } 144 | 145 | 146 | 147 | 148 | tree <- match.arg(tree, c("HOPACH", "HC"), several.ok = FALSE) 149 | selectFeatures <- match.arg(selectFeatures, 150 | c("limma", "DV", "DD", "chisq", "BI", "Cepo"), 151 | several.ok = TRUE) 152 | 153 | algorithm <- match.arg(algorithm, 154 | c("WKNN", "KNN", "DWKNN"), 155 | several.ok = TRUE) 156 | 157 | similarity <- match.arg(similarity, c("pearson", "spearman", 158 | "cosine", "jaccard", "kendall", 159 | "weighted_rank","manhattan"), 160 | several.ok = TRUE) 161 | cutoff_method <- match.arg(cutoff_method, c("dynamic", "static")) 162 | 163 | 164 | 165 | # Check whether weighted ensemble learning is needed 166 | 167 | if ((length(selectFeatures) > 1 | length(algorithm) > 1 | 168 
| length(similarity) > 1) ) { 169 | if (weighted_ensemble) { 170 | weighted_ensemble <- TRUE 171 | ensemble <- TRUE 172 | 173 | if (verbose) { 174 | cat("Performing weighted ensemble learning... \n") 175 | } 176 | 177 | } else { 178 | weighted_ensemble <- FALSE 179 | ensemble <- TRUE 180 | 181 | if (verbose) { 182 | cat("Performing unweighted ensemble learning... \n") 183 | } 184 | 185 | } 186 | } else { 187 | weighted_ensemble <- FALSE 188 | ensemble <- FALSE 189 | 190 | if (verbose) { 191 | cat("Ensemble learning is disabled... \n") 192 | } 193 | 194 | } 195 | 196 | # To check if need to run weighted joint classification 197 | 198 | if ("list" %in% is(exprsMat_train) & 199 | length(exprsMat_train) > 1 & 200 | weighted_jointClassification) { 201 | cat("Performing weighted joint classification \n") 202 | weighted_jointClassification <- TRUE 203 | } else { 204 | weighted_jointClassification <- FALSE 205 | } 206 | 207 | 208 | 209 | ensemble_methods <- as.matrix(expand.grid(similarity = similarity, 210 | algorithm = algorithm, 211 | features = selectFeatures)) 212 | 213 | if (!is.null(weights)) { 214 | if (length(weights) != nrow(ensemble_methods)) { 215 | stop("The length of weights is not equal to 216 | the number of combination of ensemble methods") 217 | } 218 | } 219 | 220 | 221 | 222 | # calculate the weights for train model 223 | if (weighted_jointClassification | (weighted_ensemble & is.null(weights))) { 224 | weightsCal = TRUE 225 | } else { 226 | weightsCal = FALSE 227 | } 228 | 229 | 230 | 231 | ### train_scClassify 232 | trainRes <- train_scClassify(exprsMat_train, 233 | cellTypes_train, 234 | tree = tree, 235 | selectFeatures = selectFeatures, 236 | topN = topN, 237 | hopach_kmax = hopach_kmax, 238 | pSig = pSig, 239 | cellType_tree = cellType_tree, 240 | weightsCal = weightsCal, 241 | parallel = parallel, 242 | BPPARAM = BPPARAM, 243 | verbose = verbose, 244 | k = k, 245 | prob_threshold = prob_threshold, 246 | cor_threshold_static = 
cor_threshold_static, 247 | cor_threshold_high = cor_threshold_high, 248 | algorithm = algorithm, 249 | similarity = similarity, 250 | cutoff_method = cutoff_method 251 | ) 252 | 253 | 254 | 255 | 256 | 257 | if (verbose) { 258 | cat("Predicting using followings parameter combinations: \n") 259 | print(ensemble_methods) 260 | } 261 | 262 | ### if there are multiple testing datasets 263 | if (verbose) { 264 | cat("================== Start classifying on test dataset ==== \n") 265 | } 266 | 267 | if ("list" %in% is(exprsMat_test)) { 268 | testRes <- list() 269 | 270 | for (testDataset_idx in seq_len(length(exprsMat_test))) { 271 | 272 | if (verbose) { 273 | cat("Predicting: ") 274 | print(names(exprsMat_test)[testDataset_idx]) 275 | } 276 | 277 | if ("list" %in% is(exprsMat_train)) { 278 | # for the case there are multiple training datasets 279 | # 280 | predictRes <- list() 281 | 282 | 283 | for (train_list_idx in seq_len(length(exprsMat_train))) { 284 | 285 | if (verbose) { 286 | cat("Training using ") 287 | cat(names(exprsMat_train)[train_list_idx], "\n") 288 | } 289 | 290 | predictRes[[train_list_idx]] <- predict_scClassify(exprsMat_test = exprsMat_test[[testDataset_idx]], 291 | trainRes = trainRes[[train_list_idx]], 292 | cellTypes_test = cellTypes_test[[testDataset_idx]], 293 | k = k, 294 | prob_threshold = prob_threshold, 295 | cor_threshold_static = cor_threshold_static, 296 | cor_threshold_high = cor_threshold_high, 297 | algorithm = algorithm, 298 | features = selectFeatures, 299 | similarity = similarity, 300 | cutoff_method = cutoff_method, 301 | weighted_ensemble = weighted_ensemble, 302 | weights = weights, 303 | parallel = parallel, 304 | BPPARAM = BPPARAM, 305 | verbose = verbose) 306 | 307 | } 308 | names(predictRes) <- paste("Trained_by", 309 | names(trainRes), sep = "_") 310 | 311 | 312 | jointRes <- getJointRes(predictRes, trainRes) 313 | 314 | rownames(jointRes) <- colnames(exprsMat_test) 315 | 316 | 317 | predictRes$jointRes <- jointRes 318 | 
319 | 320 | }else { 321 | predictRes <- predict_scClassify(exprsMat_test = exprsMat_test[[testDataset_idx]], 322 | trainRes = trainRes, 323 | cellTypes_test = cellTypes_test[[testDataset_idx]], 324 | k = k, 325 | prob_threshold = prob_threshold, 326 | cor_threshold_static = cor_threshold_static, 327 | cor_threshold_high = cor_threshold_high, 328 | algorithm = algorithm, 329 | features = selectFeatures, 330 | similarity = similarity, 331 | cutoff_method = cutoff_method, 332 | weighted_ensemble = weighted_ensemble, 333 | weights = weights, 334 | parallel = parallel, 335 | BPPARAM = BPPARAM, 336 | verbose = verbose) 337 | 338 | 339 | } 340 | 341 | testRes[[testDataset_idx]] <- predictRes 342 | } 343 | 344 | names(testRes) <- names(exprsMat_test) 345 | }else{ 346 | # else only one dataset as a matrix in the test 347 | if ("list" %in% is(exprsMat_train)) { 348 | # for the case there are multiple training datasets 349 | # 350 | testRes <- list() 351 | for (train_list_idx in seq_len(length(exprsMat_train))) { 352 | 353 | if (verbose) { 354 | cat("Training using ") 355 | cat(names(exprsMat_train)[train_list_idx], "\n") 356 | } 357 | testRes[[train_list_idx]] <- predict_scClassify(exprsMat_test = exprsMat_test, 358 | trainRes = trainRes[[train_list_idx]], 359 | cellTypes_test = cellTypes_test, 360 | k = k, 361 | prob_threshold = prob_threshold, 362 | cor_threshold_static = cor_threshold_static, 363 | cor_threshold_high = cor_threshold_high, 364 | algorithm = algorithm, 365 | features = selectFeatures, 366 | similarity = similarity, 367 | cutoff_method = cutoff_method, 368 | weighted_ensemble = weighted_ensemble, 369 | weights = weights, 370 | parallel = parallel, 371 | BPPARAM = BPPARAM, 372 | verbose = verbose) 373 | 374 | } 375 | 376 | 377 | names(testRes) <- paste("Trained_by", names(trainRes), sep = "_") 378 | 379 | jointRes <- getJointRes(testRes, trainRes) 380 | 381 | rownames(jointRes) <- colnames(exprsMat_test) 382 | 383 | 384 | testRes$jointRes <- jointRes 385 | 
386 | }else { 387 | predictRes <- predict_scClassify(exprsMat_test = exprsMat_test, 388 | trainRes = trainRes, 389 | cellTypes_test = cellTypes_test, 390 | k = k, 391 | prob_threshold = prob_threshold, 392 | cor_threshold_static = cor_threshold_static, 393 | cor_threshold_high = cor_threshold_high, 394 | algorithm = algorithm, 395 | features = selectFeatures, 396 | similarity = similarity, 397 | cutoff_method = cutoff_method, 398 | weighted_ensemble = weighted_ensemble, 399 | weights = weights, 400 | parallel = parallel, 401 | BPPARAM = BPPARAM, 402 | verbose = verbose) 403 | testRes <- list(test = predictRes) 404 | } 405 | 406 | 407 | 408 | 409 | } 410 | 411 | 412 | if (returnList) { 413 | return(list(testRes = testRes, trainRes = trainRes)) 414 | } else { 415 | 416 | if ("list" %in% is(exprsMat_train)) { 417 | trainClassList <- list() 418 | for (train_list_idx in seq_len(length(trainRes))) { 419 | trainClassList[[train_list_idx]] <- .scClassifyTrainModel( 420 | name = names(trainRes)[train_list_idx], 421 | cellTypeTree = trainRes[[train_list_idx]]$cutree_list, 422 | cellTypeTrain = as.character(trainRes[[train_list_idx]]$cellTypes_train), 423 | features = names(trainRes[[train_list_idx]]$hierarchyKNNRes), 424 | model = trainRes[[train_list_idx]]$hierarchyKNNRes, 425 | modelweights = trainRes[[train_list_idx]]$modelweights, 426 | metaData = S4Vectors::DataFrame()) 427 | 428 | } 429 | trainClassList <- scClassifyTrainModelList(trainClassList) 430 | } else { 431 | trainClassList <- .scClassifyTrainModel( 432 | name = "training", 433 | cellTypeTree = trainRes$cutree_list, 434 | cellTypeTrain = as.character(trainRes$cellTypes_train), 435 | features = names(trainRes$hierarchyKNNRes), 436 | model = trainRes$hierarchyKNNRes, 437 | modelweights = trainRes$modelweights, 438 | metaData = S4Vectors::DataFrame()) 439 | } 440 | 441 | 442 | return(list(testRes = testRes, trainRes = trainClassList)) 443 | 444 | 445 | } 446 | 447 | 448 | } 449 | 
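The ensemble that `scClassify()` enumerates via `expand.grid()` is simply the Cartesian product of the chosen similarity, algorithm, and feature-selection options, and any user-supplied `weights` must have one entry per row of this grid. A minimal illustration, using example option values rather than the package defaults:

```r
# Each row of this grid is one base classifier in the ensemble;
# expand.grid() enumerates every similarity x algorithm x features combination.
ensemble_methods <- as.matrix(expand.grid(
    similarity = c("pearson", "spearman"),
    algorithm = "WKNN",
    features = c("limma", "DV")))

nrow(ensemble_methods)  # 2 similarities x 1 algorithm x 2 feature methods = 4
```

This is why the check on `weights` above compares `length(weights)` against `nrow(ensemble_methods)`.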
-------------------------------------------------------------------------------- /R/scClassifyTrainClass.R: -------------------------------------------------------------------------------- 1 | 2 | setClassUnion("numeric_OR_NULL", members = c("numeric","NULL")) 3 | 4 | 5 | #' An S4 class to store the training model for scClassify 6 | #' 7 | #' @slot name Name of the training dataset 8 | #' @slot cellTypeTrain A vector of cell types in the training dataset 9 | #' @slot cellTypeTree A list indicating a cell type tree 10 | #' @slot features A character vector indicating the 11 | #' features that are trained for this data 12 | #' @slot model A list storing the training model, 13 | #' including the features that are selected 14 | #' and the cell expression matrix that is used for training 15 | #' @slot modelweights A numeric vector indicating the weights of each model 16 | #' @slot metaData A DataFrame storing the metadata of the training model 17 | #' 18 | #' @importClassesFrom S4Vectors character_OR_NULL 19 | #' @importClassesFrom S4Vectors DataFrame 20 | #' 21 | #' 22 | #' @export 23 | setClass("scClassifyTrainModel", 24 | slots = c( 25 | name = "character_OR_NULL", 26 | cellTypeTrain = "character", 27 | cellTypeTree = "list", 28 | features = "character", 29 | model = "list", 30 | modelweights = "numeric_OR_NULL", 31 | metaData = "DataFrame")) 32 | 33 | 34 | 35 | #' The scClassifyTrainModel class 36 | #' 37 | #' The scClassifyTrainModel class is designed to store the 38 | #' training model for scClassify 39 | #' @param name Name of the training dataset 40 | #' @param cellTypeTrain A vector of cell types in the training dataset 41 | #' @param cellTypeTree A list indicating a cell type tree 42 | #' @param features A character vector indicating the 43 | #' features that are trained for this data 44 | #' @param model A list storing the training model, 45 | #' including the features that are selected 46 | #' and the cell expression matrix that is used for training 47 | #' @param modelweights A 
numeric vector indicating the weights of each model 48 | #' @param metaData A DataFrame storing the metadata of the training model 49 | #' 50 | #' @return A scClassifyTrainModel object 51 | #' 52 | #' @author Yingxin Lin 53 | #' 54 | #' 55 | #' @importFrom methods new 56 | 57 | .scClassifyTrainModel <- function(name, 58 | cellTypeTree, 59 | cellTypeTrain, 60 | features, 61 | model, 62 | modelweights, 63 | metaData) { 64 | 65 | scClassify_train_obj <- new("scClassifyTrainModel", 66 | name = name, 67 | cellTypeTree = cellTypeTree, 68 | cellTypeTrain = cellTypeTrain, 69 | features = features, 70 | model = model, 71 | modelweights = modelweights, 72 | metaData = metaData) 73 | names(scClassify_train_obj@modelweights) <- names(modelweights) 74 | return(scClassify_train_obj) 75 | 76 | } 77 | 78 | 79 | 80 | 81 | setMethod("show", "scClassifyTrainModel", 82 | function(object) { 83 | cat(paste("Class:", class(object), "\n")) 84 | cat(paste("Model name:", object@name), "\n") 85 | cat("Feature selection methods: ") 86 | cat(object@features, "\n") 87 | cat("Number of cells in the training data: ") 88 | cat(length(object@cellTypeTrain), "\n") 89 | cat("Number of cell types in the training data: ") 90 | cat(length(object@cellTypeTree[[length(object@cellTypeTree)]]), 91 | "\n") 92 | }) 93 | 94 | 95 | 96 | #' An S4 class to store a list of training models from scClassify 97 | #' 98 | #' 99 | #' @importClassesFrom S4Vectors SimpleList 100 | #' @export 101 | 102 | setClass("scClassifyTrainModelList", 103 | contains = "SimpleList") 104 | 105 | 106 | 107 | #' The scClassifyTrainModelList class 108 | #' 109 | #' @param ... 
scClassifyTrainModel objects 110 | #' 111 | #' @return A scClassifyTrainModelList object 112 | #' 113 | #' @examples 114 | #' 115 | #' data("trainClassExample_xin") 116 | #' data("trainClassExample_wang") 117 | #' trainClassExampleList <- scClassifyTrainModelList(trainClassExample_xin, 118 | #' trainClassExample_wang 119 | #' ) 120 | #' 121 | #' @importFrom S4Vectors SimpleList 122 | #' 123 | #' @export 124 | scClassifyTrainModelList <- function(...) { 125 | l <- new("scClassifyTrainModelList", S4Vectors::SimpleList(...)) 126 | name_list <- lapply(l@listData, function(x) x@name) 127 | if (any(duplicated(name_list))) { 128 | dup_names <- unique(name_list[duplicated(name_list)]) 129 | for (i in seq_along(dup_names)) { 130 | idx_to_change <- which(name_list %in% dup_names[i]) 131 | name_list[idx_to_change] <- paste(name_list[idx_to_change], 132 | seq_along(idx_to_change), 133 | sep = "_") 134 | } 135 | } 136 | names(l@listData) <- name_list 137 | l 138 | } 139 | 140 | 141 | 142 | 143 | setMethod("show", "scClassifyTrainModelList", function(object) { 144 | cat(paste("Class:", class(object), "\n")) 145 | cat("Number of training models: ") 146 | cat(length(object), "\n") 147 | 148 | cat("Training data names:", 149 | names(object@listData), "\n") 150 | }) 151 | 152 | 153 | #' Accessors of cellTypeTree for scClassifyTrainModel 154 | #' 155 | #' Methods to access various components of the `scClassifyTrainModel` object. 156 | #' 157 | #' @usage cellTypeTree(x) 158 | #' 159 | #' @param x A `scClassifyTrainModel` object. 
160 | #' 161 | #' @examples 162 | #' 163 | #' data(trainClassExample_xin) 164 | #' cellTypeTree(trainClassExample_xin) 165 | #' 166 | #' @return cellTypeTree of the scClassifyTrainModel slot 167 | #' 168 | #' @aliases 169 | #' cellTypeTree,scClassifyTrainModel-method 170 | #' cellTypeTree 171 | #' 172 | #' @export 173 | setGeneric("cellTypeTree", function(x) 174 | standardGeneric("cellTypeTree")) 175 | setMethod("cellTypeTree", "scClassifyTrainModel", 176 | function(x) { 177 | x@cellTypeTree 178 | }) 179 | 180 | 181 | 182 | 183 | setGeneric("cellTypeTree<-", function(x, value) 184 | standardGeneric("cellTypeTree<-")) 185 | setReplaceMethod("cellTypeTree", "scClassifyTrainModel", 186 | function(x, value) { 187 | x@cellTypeTree <- value 188 | x 189 | }) 190 | 191 | 192 | 193 | #' Accessors of features for scClassifyTrainModel 194 | #' 195 | #' Methods to access various components of the `scClassifyTrainModel` object. 196 | #' 197 | #' @usage features(x) 198 | #' 199 | #' @param x A `scClassifyTrainModel` object. 200 | #' 201 | #' @examples 202 | #' 203 | #' data(trainClassExample_xin) 204 | #' features(trainClassExample_xin) 205 | #' 206 | #' @return features of the scClassifyTrainModel slot 207 | #' 208 | #' @aliases 209 | #' features,scClassifyTrainModel-method 210 | #' features 211 | #' 212 | #' @export 213 | setGeneric("features", function(x) standardGeneric("features")) 214 | setMethod("features", "scClassifyTrainModel", function(x) { 215 | x@features 216 | }) 217 | 218 | 219 | 220 | setGeneric("features<-", function(x, value) 221 | standardGeneric("features<-")) 222 | setReplaceMethod("features", "scClassifyTrainModel", 223 | function(x, value) { 224 | x@features <- value 225 | x 226 | }) 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | #' Accessors of name for scClassifyTrainModel 236 | #' 237 | #' Methods to access various components of the `scClassifyTrainModel` object. 
238 | #' 239 | #' @usage name(x) 240 | #' 241 | #' @param x A `scClassifyTrainModel` object. 242 | #' 243 | #' @examples 244 | #' 245 | #' data(trainClassExample_xin) 246 | #' name(trainClassExample_xin) 247 | #' 248 | #' @return name of the scClassifyTrainModel slot 249 | #' 250 | #' @aliases 251 | #' name,scClassifyTrainModel-method 252 | #' name 253 | #' 254 | #' 255 | #' @export 256 | setGeneric("name", function(x) standardGeneric("name")) 257 | setMethod("name", "scClassifyTrainModel", function(x) { 258 | x@name 259 | }) 260 | 261 | 262 | 263 | setGeneric("name<-", function(x, value) 264 | standardGeneric("name<-")) 265 | setReplaceMethod("name", "scClassifyTrainModel", 266 | function(x, value) { 267 | x@name <- value 268 | x 269 | }) 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | #' Accessors of cellTypeTrain for scClassifyTrainModel 279 | #' 280 | #' Methods to access various components of the `scClassifyTrainModel` object. 281 | #' 282 | #' @usage cellTypeTrain(x) 283 | #' 284 | #' @param x A `scClassifyTrainModel` object. 
285 | #' 286 | #' @examples 287 | #' 288 | #' data(trainClassExample_xin) 289 | #' cellTypeTrain(trainClassExample_xin) 290 | #' 291 | #' @return cellTypeTrain of the scClassifyTrainModel slot 292 | #' 293 | #' @aliases 294 | #' cellTypeTrain,scClassifyTrainModel-method 295 | #' cellTypeTrain 296 | #' 297 | #' 298 | #' @export 299 | setGeneric("cellTypeTrain", function(x) standardGeneric("cellTypeTrain")) 300 | setMethod("cellTypeTrain", "scClassifyTrainModel", function(x) { 301 | x@cellTypeTrain 302 | }) 303 | 304 | 305 | 306 | setGeneric("cellTypeTrain<-", function(x, value) 307 | standardGeneric("cellTypeTrain<-")) 308 | setReplaceMethod("cellTypeTrain", "scClassifyTrainModel", 309 | function(x, value) { 310 | x@cellTypeTrain <- value 311 | x 312 | }) 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | #' Accessors of modelweights for scClassifyTrainModel 322 | #' 323 | #' Methods to access various components of the `scClassifyTrainModel` object. 324 | #' 325 | #' @usage modelweights(x) 326 | #' 327 | #' @param x A `scClassifyTrainModel` object. 
328 | #' 329 | #' @examples 330 | #' 331 | #' data(trainClassExample_xin) 332 | #' modelweights(trainClassExample_xin) 333 | #' 334 | #' @return modelweights of the scClassifyTrainModel slot 335 | #' 336 | #' 337 | #' @aliases 338 | #' modelweights,scClassifyTrainModel-method 339 | #' modelweights 340 | #' 341 | #' 342 | #' @export 343 | setGeneric("modelweights", function(x) standardGeneric("modelweights")) 344 | setMethod("modelweights", "scClassifyTrainModel", function(x) { 345 | x@modelweights 346 | }) 347 | 348 | 349 | 350 | setGeneric("modelweights<-", function(x, value) 351 | standardGeneric("modelweights<-")) 352 | setReplaceMethod("modelweights", "scClassifyTrainModel", 353 | function(x, value) { 354 | x@modelweights <- value 355 | x 356 | }) 357 | 358 | 359 | 360 | 361 | 362 | 363 | #' Accessors of model for scClassifyTrainModel 364 | #' 365 | #' Methods to access various components of the `scClassifyTrainModel` object. 366 | #' 367 | #' @usage model(x) 368 | #' 369 | #' @param x A `scClassifyTrainModel` object. 
370 | #' 371 | #' @return model of the scClassifyTrainModel slot 372 | #' 373 | #' @examples 374 | #' 375 | #' data(trainClassExample_xin) 376 | #' model(trainClassExample_xin) 377 | #' 378 | #' @aliases 379 | #' model,scClassifyTrainModel-method 380 | #' model 381 | #' 382 | #' 383 | #' @export 384 | setGeneric("model", function(x) standardGeneric("model")) 385 | setMethod("model", "scClassifyTrainModel", function(x) { 386 | x@model 387 | }) 388 | 389 | 390 | 391 | setGeneric("model<-", function(x, value) 392 | standardGeneric("model<-")) 393 | setReplaceMethod("model", "scClassifyTrainModel", 394 | function(x, value) { 395 | x@model <- value 396 | x 397 | }) 398 | 399 | -------------------------------------------------------------------------------- /R/train_scClassify.R: -------------------------------------------------------------------------------- 1 | #' Training scClassify model 2 | #' 3 | #' @param exprsMat_train A matrix of log-transformed expression matrix of reference dataset 4 | #' @param cellTypes_train A vector of cell types of reference dataset 5 | #' @param tree A vector indicates the method to build hierarchical tree, 6 | #' set as "HOPACH" by default. 7 | #' This should be one of "HOPACH" and "HC" (using stats::hclust). 8 | #' @param selectFeatures A vector indicates the gene selection method, 9 | #' set as "limma" by default. 10 | #' This should be one or more of "limma", "DV", "DD", "chisq", "BI", "Cepo". 11 | #' @param topN An integer indicates the top number of features that are selected 12 | #' @param hopach_kmax An integer between 1 and 9 specifying the maximum number of 13 | #' children at each node in the HOPACH tree. 14 | #' @param pSig A numeric indicates the cutoff of pvalue for features 15 | #' @param cellType_tree A list indicates the cell type tree provided by user. 16 | #' (By default, it is NULL) 17 | #' @param weightsCal A logical input indicates whether we need to 18 | #' calculate the weights for the model. 
19 | #' @param parallel A logical input indicating whether the algorithms will run in parallel 20 | #' @param BPPARAM A \code{BiocParallelParam} class object 21 | #' from the \code{BiocParallel} package. Default is SerialParam(). 22 | #' @param verbose A logical input indicating whether the intermediate steps will be printed 23 | #' @param returnList A logical input indicating whether the output will be returned as a list 24 | #' @param ... Other inputs for predict_scClassify for the case when weights calculation 25 | #' of the pretrained model is performed 26 | #' @return A list of results or an object of \code{scClassifyTrainModel} 27 | #' @author Yingxin Lin 28 | #' 29 | #' @examples 30 | #' data("scClassify_example") 31 | #' xin_cellTypes <- scClassify_example$xin_cellTypes 32 | #' exprsMat_xin_subset <- scClassify_example$exprsMat_xin_subset 33 | #' trainClass <- train_scClassify(exprsMat_train = exprsMat_xin_subset, 34 | #' cellTypes_train = xin_cellTypes, 35 | #' selectFeatures = c("limma", "BI"), 36 | #' returnList = FALSE 37 | #' ) 38 | #' 39 | #' @importFrom stats na.omit 40 | #' @importFrom methods is 41 | #' @importFrom BiocParallel SerialParam 42 | #' @export 43 | 44 | 45 | 46 | 47 | train_scClassify <- function(exprsMat_train, 48 | cellTypes_train, 49 | tree = "HOPACH", 50 | selectFeatures = "limma", 51 | topN = 50, 52 | hopach_kmax = 5, 53 | pSig = 0.05, 54 | cellType_tree = NULL, 55 | weightsCal = FALSE, 56 | parallel = FALSE, 57 | BPPARAM = BiocParallel::SerialParam(), 58 | verbose = TRUE, 59 | returnList = TRUE, 60 | ...){ 61 | 62 | 63 | if (is.null(exprsMat_train) | is.null(cellTypes_train)) { 64 | stop("exprsMat_train or cellTypes_train is NULL!") 65 | } 66 | 67 | # Matching the argument of the tree construction method 68 | tree <- match.arg(tree, c("HOPACH", "HC"), several.ok = FALSE) 69 | 70 | # Matching the argument of feature selection method 71 | selectFeatures <- match.arg(selectFeatures, 72 | c("limma", "DV", "DD", "chisq",
"BI", "Cepo"), 73 | several.ok = TRUE) 74 | 75 | 76 | if ("list" %in% is(exprsMat_train)) { 77 | if (sum(unlist(lapply(cellTypes_train, length)) != 78 | unlist(lapply(exprsMat_train, ncol))) != 0) { 79 | stop("Length of training cell types does not match with 80 | number of column of training expression matrix") 81 | } 82 | }else { 83 | if (length(cellTypes_train) != ncol(exprsMat_train)) { 84 | stop("Length of training cell types does not match with 85 | number of column of training expression matrix") 86 | } 87 | } 88 | 89 | 90 | 91 | 92 | # To rename the train list if name is null (only when there are multiple training datasets) 93 | if ( "list" %in% is(exprsMat_train)) { 94 | if (is.null(names(exprsMat_train))) { 95 | names(exprsMat_train) <- names(cellTypes_train) <- 96 | paste("TrainData", 97 | seq_len(length(exprsMat_train)), 98 | sep = "_") 99 | } else if (sum(names(exprsMat_train) == "") != 0) { 100 | names(exprsMat_train)[names(exprsMat_train) == ""] <- 101 | names(cellTypes_train)[names(cellTypes_train) == ""] <- 102 | paste("TrainData", which(names(exprsMat_train) == ""), sep = "_") 103 | } 104 | } 105 | 106 | # QC for the training data set 107 | 108 | if (any(c("matrix", "dgCMatrix") %in% is(exprsMat_train))) { 109 | 110 | zeros <- apply(exprsMat_train, 1, function(x) sum(x == 0)/length(x)) 111 | minPctCell <- min(table(cellTypes_train)/length(cellTypes_train)) 112 | exprsMat_train <- exprsMat_train[zeros <= max(1 - minPctCell, 0.95), ] 113 | if (verbose) { 114 | cat("after filtering not expressed genes \n") 115 | print(dim(exprsMat_train)) 116 | } 117 | } else { 118 | for (train_list_idx in seq_len(length(exprsMat_train))) { 119 | zeros <- apply(exprsMat_train[[train_list_idx]], 1, 120 | function(x) sum(x == 0)/length(x)) 121 | minPctCell <- min(table(cellTypes_train[[train_list_idx]])/length(cellTypes_train[[train_list_idx]])) 122 | exprsMat_train[[train_list_idx]] <- exprsMat_train[[train_list_idx]][zeros <= max(1 - minPctCell, 0.95), ] 123 | } 124 
| if (verbose) { 125 | cat("after filtering not expressed genes \n") 126 | print(lapply(exprsMat_train, dim)) 127 | } 128 | } 129 | 130 | ### train_scClassify 131 | if ("list" %in% is(exprsMat_train)) { 132 | trainRes <- list() 133 | for (train_list_idx in seq_len(length(exprsMat_train))) { 134 | trainRes[[train_list_idx]] <- train_scClassifySingle(exprsMat_train[[train_list_idx]], 135 | cellTypes_train[[train_list_idx]], 136 | tree = tree, 137 | selectFeatures = selectFeatures, 138 | topN = topN, 139 | hopach_kmax = hopach_kmax, 140 | pSig = pSig, 141 | weightsCal = weightsCal, 142 | parallel = parallel, 143 | BPPARAM = BPPARAM, 144 | verbose = verbose, 145 | ...) 146 | } 147 | names(trainRes) <- names(exprsMat_train) 148 | } else{ 149 | 150 | trainRes <- train_scClassifySingle(exprsMat_train, 151 | cellTypes_train, 152 | tree = tree, 153 | selectFeatures = selectFeatures, 154 | topN = topN, 155 | hopach_kmax = hopach_kmax, 156 | pSig = pSig, 157 | cellType_tree = cellType_tree, 158 | weightsCal = weightsCal, 159 | parallel = parallel, 160 | BPPARAM = BPPARAM, 161 | verbose = verbose, 162 | ...) 
163 | } 164 | 165 | 166 | 167 | 168 | 169 | # return the results 170 | if (returnList) { 171 | 172 | return(trainRes) 173 | 174 | } else { 175 | if ("list" %in% is(exprsMat_train)) { 176 | trainClassList <- list() 177 | for (train_list_idx in seq_len(length(trainRes))) { 178 | trainClassList[[train_list_idx]] <- .scClassifyTrainModel( 179 | name = names(trainRes)[train_list_idx], 180 | cellTypeTree = trainRes[[train_list_idx]]$cutree_list, 181 | cellTypeTrain = as.character(trainRes[[train_list_idx]]$cellTypes_train), 182 | features = names(trainRes[[train_list_idx]]$hierarchyKNNRes), 183 | model = trainRes[[train_list_idx]]$hierarchyKNNRes, 184 | modelweights = trainRes[[train_list_idx]]$modelweights, 185 | metaData = S4Vectors::DataFrame()) 186 | 187 | } 188 | trainClassList <- scClassifyTrainModelList(trainClassList) 189 | } else { 190 | trainClassList <- .scClassifyTrainModel( 191 | name = "training", 192 | cellTypeTree = trainRes$cutree_list, 193 | cellTypeTrain = as.character(trainRes$cellTypes_train), 194 | features = names(trainRes$hierarchyKNNRes), 195 | model = trainRes$hierarchyKNNRes, 196 | modelweights = trainRes$modelweights, 197 | metaData = S4Vectors::DataFrame()) 198 | } 199 | 200 | return(trainClassList) 201 | 202 | } 203 | 204 | 205 | } 206 | 207 | 208 | 209 | 210 | #' @importFrom BiocParallel SerialParam bplapply 211 | 212 | train_scClassifySingle <- function(exprsMat_train, 213 | cellTypes_train, 214 | tree = "HOPACH", 215 | selectFeatures = "limma", 216 | topN = 50, 217 | hopach_kmax = 5, 218 | pSig = 0.05, 219 | cellType_tree = NULL, 220 | weightsCal = FALSE, 221 | parallel = FALSE, 222 | BPPARAM = BiocParallel::SerialParam(), 223 | verbose = TRUE, 224 | ...){ 225 | 226 | if (is.null(rownames(exprsMat_train))) { 227 | stop("rownames of the exprsMat_train is NULL!") 228 | } 229 | 230 | # Check the column names (not the row names again) before testing uniqueness 231 | if (is.null(colnames(exprsMat_train)) | 232 | sum(duplicated(colnames(exprsMat_train))) != 0) { 233 | stop("colnames of exprsMat_train is NULL or not unique") 234
| } 234 | 235 | if (length(cellTypes_train) != ncol(exprsMat_train)) { 236 | stop("Length of training cell types does not match with 237 | number of column of training expression matrix") 238 | } 239 | 240 | if (all(exprsMat_train %% 1 == 0)) { 241 | warning("exprsMat_train looks like a count matrix 242 | (scClassify requires a log-transformed normalised input)") 243 | } 244 | 245 | # Matching the argument of the tree construction method 246 | tree <- match.arg(tree, c("HOPACH", "HC"), several.ok = FALSE) 247 | 248 | # Matching the argument of feature selection method 249 | selectFeatures <- match.arg(selectFeatures, 250 | c("limma", "DV", "DD", "chisq", "BI", "Cepo"), 251 | several.ok = TRUE) 252 | 253 | 254 | if (verbose) { 255 | print("Feature Selection...") 256 | } 257 | 258 | 259 | # Select the features to construct tree 260 | tt <- doLimma(exprsMat_train, cellTypes_train) 261 | de <- Reduce(union, lapply(tt, function(t) 262 | rownames(t)[seq_len(max(min(50, sum(t$adj.P.Val < 0.001)), 30))])) 263 | de <- na.omit(de) 264 | if (verbose) { 265 | print(paste("Number of genes selected to construct HOPACH tree", 266 | length(de))) 267 | } 268 | 269 | 270 | if (is.null(cellType_tree)) { 271 | # Calculate the centroid matrix for tree 272 | # construction using selected features 273 | centroidMat <- do.call(cbind, lapply(unique(cellTypes_train), function(x) 274 | Matrix::rowMeans(as.matrix(exprsMat_train[de, cellTypes_train == x])))) 275 | 276 | colnames(centroidMat) <- unique(cellTypes_train) 277 | 278 | # Constructing the tree using selected tree method 279 | if (verbose) { 280 | print("Constructing tree ...") 281 | } 282 | 283 | cutree_list <- constructTree(centroidMat, 284 | tree = tree, 285 | hopach_kmax = hopach_kmax, 286 | plot = verbose) 287 | 288 | 289 | } else { 290 | cutree_list <- cellType_tree 291 | } 292 | 293 | if (verbose) { 294 | print("Training....") 295 | } 296 | 297 | 298 | 299 | 300 | if (parallel) { 301 | hierarchyKNNRes <- 
BiocParallel::bplapply(seq_len(length(selectFeatures)), 302 | function(ft) 303 | hierarchyKNNcor(exprsMat_train, 304 | cellTypes_train, 305 | cutree_list, 306 | feature = selectFeatures[ft], 307 | topN = topN, 308 | pSig = pSig, 309 | verbose = verbose), 310 | BPPARAM = BPPARAM) 311 | names(hierarchyKNNRes) <- selectFeatures 312 | }else{ 313 | hierarchyKNNRes <- list() 314 | for (ft in seq_len(length(selectFeatures))) { 315 | 316 | if (verbose) { 317 | print(paste("=== selecting features by:", selectFeatures[ft], "====")) 318 | } 319 | 320 | hierarchyKNNRes[[ft]] <- hierarchyKNNcor(exprsMat_train, 321 | cellTypes_train, 322 | cutree_list, 323 | feature = selectFeatures[ft], 324 | topN = topN, 325 | pSig = pSig, 326 | verbose = verbose) 327 | } 328 | 329 | names(hierarchyKNNRes) <- selectFeatures 330 | } 331 | 332 | 333 | trainRes <- list(hierarchyKNNRes = hierarchyKNNRes, 334 | cutree_list = cutree_list, 335 | cellTypes_train = cellTypes_train) 336 | 337 | 338 | if (weightsCal) { 339 | if (verbose) { 340 | cat("========= SelfTraining to calculate weight ================= \n") 341 | } 342 | 343 | selfTrainRes <- predict_scClassify(exprsMat_test = exprsMat_train, 344 | trainRes = trainRes, 345 | cellTypes_test = cellTypes_train, 346 | parallel = parallel, 347 | BPPARAM = BPPARAM, 348 | verbose = verbose, 349 | features = selectFeatures, 350 | ...) 
351 | trainRes$selfTrainRes <- selfTrainRes 352 | trainRes$modelweights <- getTrainWeights(selfTrainRes) 353 | } 354 | 355 | 356 | 357 | return(trainRes) 358 | } 359 | 360 | 361 | 362 | # Function to construct tree using HOPACH or HC 363 | 364 | constructTree <- function(centroidMat, 365 | tree = c("HOPACH", "HC"), 366 | hopach_kmax = hopach_kmax, 367 | plot= TRUE){ 368 | 369 | 370 | tree <- match.arg(tree, c("HOPACH", "HC"), several.ok = FALSE) 371 | 372 | 373 | 374 | if (tree == "HOPACH") { 375 | 376 | res <- runHOPACH(data = t(centroidMat), 377 | plot = plot, 378 | kmax = hopach_kmax) 379 | cutree_list <- res$cutree_list 380 | 381 | }else{ 382 | distMat <- as.dist(1 - stats::cor(centroidMat)) 383 | hc <- stats::hclust(distMat, method = "complete") 384 | # plot(hc) 385 | cutree_list <- cutree_iterative(hc, depth = 1) 386 | } 387 | 388 | return(cutree_list) 389 | } 390 | 391 | 392 | # Cut the Hierarchical tree for each level 393 | # depth here indicates the deep the tree is cut 394 | 395 | cutree_iterative <- function(hc, depth = 1){ 396 | 397 | height_sort <- sort(hc$height, decreasing = TRUE) 398 | cutree_list <- list() 399 | 400 | for (i in seq_len(sum(height_sort >= 401 | height_sort[round(length(height_sort)*depth)]))) { 402 | cutree_list[[i]] <- cutree(hc, h = height_sort[i]) 403 | } 404 | 405 | # Last level is distinct number 406 | cutree_list[[length(cutree_list) + 1]] <- seq_len(length(hc$labels)) 407 | names(cutree_list[[length(cutree_list)]]) <- hc$labels 408 | 409 | return(cutree_list) 410 | } 411 | 412 | currentClass <- function(cellTypes, cutree_res){ 413 | cellTypes <- as.character(cellTypes) 414 | res <- cutree_res[cellTypes] 415 | return(res) 416 | } 417 | 418 | # Function to perform hierarchical feature selection 419 | 420 | hierarchyKNNcor <- function(exprsMat, 421 | cellTypes, 422 | cutree_list, 423 | feature = c("limma", "DV", "DD", "chisq", "BI"), 424 | topN = 50, 425 | pSig = 0.001, 426 | verbose= TRUE){ 427 | feature <- match.arg(feature, 
c("limma", "DV", "DD", "chisq", "BI", "Cepo")) 428 | numHierchy <- length(cutree_list) 429 | levelModel <- list() 430 | levelHVG <- list() 431 | for (i in seq_len(numHierchy)) { 432 | #Make sure not all same cluster 433 | if (length(unique(cutree_list[[i]])) != 1) { 434 | 435 | model <- list() 436 | hvg <- list() 437 | class_tmp <- currentClass(cellTypes, cutree_list[[i]]) 438 | names(class_tmp) <- colnames(exprsMat) 439 | for (j in unique(cutree_list[[i - 1]])) { 440 | # print(paste("Group", j)) 441 | trainIdx <- which(cellTypes %in% names(cutree_list[[i - 1]])[cutree_list[[i - 1]] == j]) 442 | trainClass <- class_tmp[trainIdx] 443 | if (length(unique(trainClass)) != 1) { 444 | 445 | hvg[[j]] <- featureSelection(exprsMat[,trainIdx], 446 | trainClass, 447 | feature = feature, 448 | topN = topN, 449 | pSig = pSig 450 | ) 451 | 452 | model[[j]] <- list(train = Matrix::t(exprsMat[na.omit(hvg[[j]]), 453 | trainIdx, 454 | drop = FALSE]), 455 | y = as.factor(trainClass)) 456 | 457 | }else{ 458 | model[[j]] <- "noModel" 459 | } 460 | } 461 | levelHVG[[i]] <- hvg 462 | levelModel[[i]] <- model 463 | 464 | 465 | } 466 | } 467 | res <- list(model = levelModel, hvg = levelHVG) 468 | return(res) 469 | } 470 | -------------------------------------------------------------------------------- /R/utils_scClassify.R: -------------------------------------------------------------------------------- 1 | 2 | # A function coded by Hadley Wickham to suppress print/cat output 3 | quiet <- function(x) { 4 | sink(tempfile()) 5 | on.exit(sink()) 6 | invisible(force(x)) 7 | } 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scClassify: hierarchical classification of cells 2 | 3 | 4 | 5 | Single cell classification via cell-type hierarchies based on ensemble learning and sample size estimation. 
6 | 7 | 8 | ## Installation 9 | 10 | 11 | Install the Bioconductor packages `S4Vectors`, `hopach` and `limma` using `BiocManager`: 12 | 13 | ```r 14 | # install.packages("BiocManager") 15 | BiocManager::install(c("S4Vectors", "hopach", "limma")) 16 | ``` 17 | 18 | Then install the latest `scClassify` using `devtools` (for R >= 4.0): 19 | 20 | ```r 21 | library(devtools) 22 | devtools::install_github("SydneyBioX/scClassify") 23 | ``` 24 | 25 | For R >= 3.6, install `scClassify` (v0.2.3) via 26 | 27 | ```r 28 | devtools::install_github("SydneyBioX/scClassify@085c72f") 29 | ``` 30 | 31 | ## Vignettes and Shiny app 32 | 33 | You can find the vignettes on the package website (https://sydneybiox.github.io/scClassify/index.html): 34 | 35 | + scClassify Model Building and Prediction: https://sydneybiox.github.io/scClassify/articles/scClassify.html 36 | + Sample size calculation: https://sydneybiox.github.io/scClassify/articles/webOnly/sampleSizeCal.html 37 | + Performing scClassify using pretrained models: https://sydneybiox.github.io/scClassify/articles/pretrainedModel.html 38 | 39 | 40 | You can also find our interactive Shiny application (beta) at: 41 | http://shiny.maths.usyd.edu.au/scClassify.
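
As a quick sketch, a pretrained model from the table below can be used to classify your own cells with `predict_scClassify()`. Here the file name `trainXinClass.rds` refers to one of the downloads listed below, and `exprsMat_query` is a placeholder for your own log-transformed, normalised expression matrix (genes as rows, cells as columns, gene names in the same format as the model):

```r
library(scClassify)

# Load a downloaded pretrained model (scClassifyTrainModel object);
# printing it shows the model name, features and training cell types
trained_model <- readRDS("trainXinClass.rds")
trained_model

# Classify the query cells; `exprsMat_query` is assumed to be your
# own log-transformed normalised matrix (not raw counts)
pred_res <- predict_scClassify(exprsMat_test = exprsMat_query,
                               trainRes = trained_model,
                               verbose = TRUE)
```

See the "Performing scClassify using pretrained models" vignette above for the full workflow, including joint classification with multiple references.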
42 | 43 | 44 | 45 | ## Pretrained models 46 | 47 | Currently available pre-trained scClassify models (in `scClassifyTrainModel` class) 48 | 49 | | Tissue | Organism | Training Data | Accession | Summary | Download `.rds` | Gene Name Format | 50 | | :--: | :--: | :--: | :--: | :--: | :--: | :--: | 51 | | Primary visual cortex | mouse | Tasic (2018) | [GSE115746](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115746) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Tasic2018_mouseNeuronal.html)| [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainTasicV2resClass.rds) | Mm Gene Symbol| 52 | | Primary visual cortex | mouse | Tasic (2016) | [GSE71585](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE71585) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Tasic2016_mouseNeuronal.html)| [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainTasicV1resClass.rds) | Mm Gene Symbol| 53 | | Visual cortex | mouse | Hrvatin | [GSE102827](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE102827) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Hrvatin_mouseNeuronal.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainHrvatinresClass.rds) | Mm Gene Symbol| 54 | | Lung | mouse | Cohen | [GSE119228](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE119228) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Cohen_mouseLung.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainCohenresClass.rds) | Mm Gene Symbol| 55 | | Kidney | mouse | Park | [GSE107585](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE107585) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Park_mouseKidney.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainParkresClass.rds) | Mm Gene Symbol| 56 | | Liver | human | MacParland | [GSE115469](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115469) | 
[link](https://SydneyBioX.github.io/scClassify/articles/webOnly/MacParlandres_humanLiver.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainMacParlandresClass.rds) | Hs Gene Symbol| 57 | | Liver | human | Aizarani | [GSE124395](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE124395) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Aizarani_humanLiver.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainAizaraniresClass.rds) | Hs Gene Symbol| 58 | | Pancreas | human | Xin | [GSE81608](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81608) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Xin_humanPancreas.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainXinClass.rds) | Hs Gene Symbol| 59 | | Pancreas | human | Wang | [GSE83139](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE83139) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Wang_humanPancreas.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainWangClass.rds) | Hs Gene Symbol| 60 | | Pancreas | human | Lawlor | [GSE86469](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE86469) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Lawlor_humanPancreas.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainLawlorClass.rds) | Hs Gene Symbol| 61 | | Pancreas | human | Segerstolpe | [E-MTAB-5061](https://www.ebi.ac.uk/arrayexpress/E-MTAB-5061) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Segerstolpe_humanPancreas.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainSegerstolpeClass.rds) | Hs Gene Symbol| 62 | | Pancreas | human | Muraro | [GSE85241](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85241) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Muraro_humanPancreas.html) | 
[link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainMuraroClass.rds) | Hs Gene Symbol| 63 | | Pancreas | human | Baron | [GSE84133](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE84133) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Baron_humanPancreas.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainBaronClass.rds) | Hs Gene Symbol| 64 | | Pancreas | human | joint | - | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Joint_humanPancreas.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/jointPancreasClass.rds) | Hs Gene Symbol| 65 | | Melanoma | human | Li | [GSE123139](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE123139) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Li_humanMelanoma.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainLiresClass.rds) | Hs Gene Symbol| 66 | | PBMC | human | Ding (joint) | - | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/Joint_humanPBMC.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/jointPBMCClass.rds) | Mm EMSEMBL ID| 67 | | Tabula Muris | mouse | Tabula Muris | [GSE109774](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE109774) | [link](https://SydneyBioX.github.io/scClassify/articles/webOnly/TabulaMuris.html) | [link](http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scClassify/trainTMresClass.rds) | Mm Gene Symbol| 68 | 69 | 70 | 71 | ## Contact us 72 | 73 | If you have any enquiries, especially about performing `scClassify` to classify your cells or to build your own models, please contact or . 74 | 75 | 76 | ## Reference 77 | 78 | **scClassify: sample size estimation and multiscale classification of cells using single and multiple reference** 79 | 80 | Yingxin Lin, Yue Cao, Hani Kim, Agus Salim, Terence Speed, David Lin, Pengyi Yang† & Jean Yang†. ***Molecular Systems Biology***, 2020, 16, e9389. 
[Full Text](https://doi.org/10.15252/msb.20199389); [BioC R package](http://www.bioconductor.org/packages/release/bioc/html/scClassify.html) 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | template: 2 | params: 3 | bootswatch: cosmo 4 | navbar: 5 | title: ~ 6 | left: 7 | - text: Classify Your Cells 8 | menu: 9 | - text: Prediction using pretrained model 10 | href: articles/pretrainedModel.html 11 | - text: Build Your Own Models 12 | menu: 13 | - text: scClassify Model Building and Prediction 14 | href: articles/scClassify.html 15 | - text: Sample size learning 16 | href: articles/webOnly/sampleSizeCal.html 17 | - text: Case study - Tabula Muris Heart 18 | href: articles/webOnly/scClassify_TM_heart.html 19 | - text: Functions 20 | href: reference/index.html 21 | - text: Pretrained Models 22 | menu: 23 | - text: Tasic (2016) (Mouse neuronal) 24 | href: articles/webOnly/Tasic2016_mouseNeuronal.html 25 | - text: Tasic (2018) (Mouse neuronal) 26 | href: articles/webOnly/Tasic2018_mouseNeuronal.html 27 | - text: Hrvatin (Mouse neuronal) 28 | href: articles/webOnly/Hrvatin_mouseNeuronal.html 29 | - text: Cohen (Mouse lung) 30 | href: articles/webOnly/Cohen_mouseLung.html 31 | - text: Park (Mouse kidney) 32 | href: articles/webOnly/Park_mouseKidney.html 33 | - text: MacParland (Human Liver) 34 | href: articles/webOnly/MacParlandres_humanLiver.html 35 | - text: Aizarani (Human Liver) 36 | href: articles/webOnly/Aizarani_humanLiver.html 37 | - text: Baron (Human Pancreas) 38 | href: articles/webOnly/Baron_humanPancreas.html 39 | - text: Muraro (Human Pancreas) 40 | href: articles/webOnly/Muraro_humanPancreas.html 41 | - text: Lawlor (Human Pancreas) 42 | href: articles/webOnly/Lawlor_humanPancreas.html 43 | - text: Xin (Human Pancreas) 44 | href: articles/webOnly/Xin_humanPancreas.html 45 | - text: Wang (Human Pancreas) 46 | 
href: articles/webOnly/Wang_humanPancreas.html 47 | - text: Segerstolpe (Human Pancreas) 48 | href: articles/webOnly/Segerstolpe_humanPancreas.html 49 | - text: Joint (Human Pancreas) 50 | href: articles/webOnly/Joint_humanPancreas.html 51 | - text: Joint (Human PBMC) 52 | href: articles/webOnly/Joint_humanPBMC.html 53 | - text: Li (Human Melanoma) 54 | href: articles/webOnly/Li_humanMelanoma.html 55 | - text: Tabula Muris (Mouse cell atlas) 56 | href: articles/webOnly/TabulaMuris.html 57 | - text: Interactive scClassify shiny app (beta) 58 | icon: fa-star 59 | href: http://shiny.maths.usyd.edu.au/scClassify/ 60 | right: 61 | - icon: fa-github fa-lg 62 | href: https://github.com/SydneyBioX/scClassify 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /data/scClassify_example.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SydneyBioX/scClassify/74fe32784b401492da0fd203736f39bcd6f2a2c5/data/scClassify_example.rda -------------------------------------------------------------------------------- /data/trainClassExample_wang.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SydneyBioX/scClassify/74fe32784b401492da0fd203736f39bcd6f2a2c5/data/trainClassExample_wang.rda -------------------------------------------------------------------------------- /data/trainClassExample_xin.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SydneyBioX/scClassify/74fe32784b401492da0fd203736f39bcd6f2a2c5/data/trainClassExample_xin.rda -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite scClassify in publications use:") 2 | citEntry(entry = "Article", 3 | title = "scClassify: sample size 
estimation and multiscale classification of cells using single and multiple reference", 4 | author = personList(as.person("Yingxin Lin"), 5 | as.person("Yue Cao"), 6 | as.person("Hani Jieun Kim"), 7 | as.person("Agus Salim"), 8 | as.person("Terence P Speed"), 9 | as.person("David M Lin"), 10 | as.person("Pengyi Yang"), 11 | as.person("Jean Yee Hwa Yang")), 12 | journal = "Molecular systems biology", 13 | year = "2020", 14 | volume = "16", 15 | number = "6", 16 | pages = "e9389", 17 | url = "https://doi.org/10.15252/msb.20199389", 18 | textVersion = 19 | paste("Lin, Yingxin, Yue Cao, Hani Jieun Kim, Agus Salim, Terence P. Speed, David M. Lin, Pengyi Yang, and Jean Yee Hwa Yang. 2020.", 20 | "scClassify: sample size estimation and multiscale classification of cells using single and multiple reference.", 21 | "Molecular systems biology, 16(6), p.e9389.", 22 | "doi: https://doi.org/10.15252/msb.20199389") 23 | ) -------------------------------------------------------------------------------- /man/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /man/cellTypeTrain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scClassifyTrainClass.R 3 | \name{cellTypeTrain} 4 | \alias{cellTypeTrain} 5 | \alias{cellTypeTrain,scClassifyTrainModel-method} 6 | \title{Accessors of cellTypeTrain for scClassifyTrainModel} 7 | \usage{ 8 | cellTypeTrain(x) 9 | } 10 | \arguments{ 11 | \item{x}{A `scClassifyTrainModel` object.} 12 | } 13 | \value{ 14 | cellTypeTrain of the scClassifyTrainModel slot 15 | } 16 | \description{ 17 | Methods to access various components of the `scClassifyTrainModel` object. 
18 | } 19 | \examples{ 20 | 21 | data(trainClassExample_xin) 22 | cellTypeTrain(trainClassExample_xin) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/cellTypeTree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scClassifyTrainClass.R 3 | \name{cellTypeTree} 4 | \alias{cellTypeTree} 5 | \alias{cellTypeTree,scClassifyTrainModel-method} 6 | \title{Accessors of cellTypeTree for scClassifyTrainModel} 7 | \usage{ 8 | cellTypeTree(x) 9 | } 10 | \arguments{ 11 | \item{x}{A `scClassifyTrainModel` object.} 12 | } 13 | \value{ 14 | cellTypeTree of the scClassifyTrainModel slot 15 | } 16 | \description{ 17 | Methods to access various components of the `scClassifyTrainModel` object. 18 | } 19 | \examples{ 20 | 21 | data(trainClassExample_xin) 22 | cellTypeTree(trainClassExample_xin) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/dot-scClassifyTrainModel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scClassifyTrainClass.R 3 | \name{.scClassifyTrainModel} 4 | \alias{.scClassifyTrainModel} 5 | \title{The scClassifyTrainModel class} 6 | \usage{ 7 | .scClassifyTrainModel( 8 | name, 9 | cellTypeTree, 10 | cellTypeTrain, 11 | features, 12 | model, 13 | modelweights, 14 | metaData 15 | ) 16 | } 17 | \arguments{ 18 | \item{name}{Name of the training dataset} 19 | 20 | \item{cellTypeTree}{A list indicating the cell type tree} 21 | 22 | \item{cellTypeTrain}{A vector of cell types in the training dataset} 23 | 24 | \item{features}{A character vector indicating the 25 | features that are trained for this data} 26 | 27 | \item{model}{A list storing the training model, 28 | including the features that are selected 29 | and the cell expression
matrix that is used for training} 30 | 31 | \item{modelweights}{A numeric vector indicating the weights of each model} 32 | 33 | \item{metaData}{A DataFrame storing the metadata of the training model} 34 | } 35 | \value{ 36 | A scClassifyTrainModel object 37 | } 38 | \description{ 39 | The scClassifyTrainModel class is designed to store the 40 | training model for scClassify 41 | } 42 | \author{ 43 | Yingxin Lin 44 | } 45 | -------------------------------------------------------------------------------- /man/features.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scClassifyTrainClass.R 3 | \name{features} 4 | \alias{features} 5 | \alias{features,scClassifyTrainModel-method} 6 | \title{Accessors of features for scClassifyTrainModel} 7 | \usage{ 8 | features(x) 9 | } 10 | \arguments{ 11 | \item{x}{A `scClassifyTrainModel` object.} 12 | } 13 | \value{ 14 | features of the scClassifyTrainModel slot 15 | } 16 | \description{ 17 | Methods to access various components of the `scClassifyTrainModel` object. 
18 | } 19 | \examples{ 20 | 21 | data(trainClassExample_xin) 22 | features(trainClassExample_xin) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/figures/scClassifySticker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SydneyBioX/scClassify/74fe32784b401492da0fd203736f39bcd6f2a2c5/man/figures/scClassifySticker.png -------------------------------------------------------------------------------- /man/getN.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/learningCurve.R 3 | \name{getN} 4 | \alias{getN} 5 | \title{Function to get the required N given by the accuracy and 6 | the learning curve model} 7 | \usage{ 8 | getN(res, acc = 0.9) 9 | } 10 | \arguments{ 11 | \item{res}{model results returned by the \code{learningCurve} function} 12 | 13 | \item{acc}{accuracy that is required} 14 | } 15 | \value{ 16 | sample size that is required 17 | } 18 | \description{ 19 | Function to get the required N given by the accuracy and 20 | the learning curve model 21 | } 22 | \examples{ 23 | set.seed(2019) 24 | n <- seq(20, 10000, 100) 25 | accMat <- do.call(cbind, lapply(1:length(n), function(i){ 26 | tmp_n <- rep(n[i], 50) 27 | y <- -2/(tmp_n^0.8) + 0.95 + rnorm(length(tmp_n), 0, 0.02) 28 | })) 29 | res <- learningCurve(accMat = accMat, n) 30 | N <- getN(res, acc = 0.9) 31 | 32 | } 33 | -------------------------------------------------------------------------------- /man/learningCurve.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/learningCurve.R 3 | \name{learningCurve} 4 | \alias{learningCurve} 5 | \title{Fit learning curve for accuracy matrix} 6 | \usage{ 7 | learningCurve( 8 | accMat, 9 | n, 10 | auto_initial =
TRUE, 11 | a = NULL, 12 | b = NULL, 13 | c = NULL, 14 | d_list = NULL, 15 | fitmodel = c("nls", "nls_mix", "gam"), 16 | plot = TRUE, 17 | verbose = TRUE 18 | ) 19 | } 20 | \arguments{ 21 | \item{accMat}{Matrix of accuracy rates where columns indicate 22 | different sample sizes} 23 | 24 | \item{n}{A vector indicating the sample sizes} 25 | 26 | \item{auto_initial}{whether to automatically initialise} 27 | 28 | \item{a}{input the starting point of parameter a} 29 | 30 | \item{b}{input the starting point of parameter b} 31 | 32 | \item{c}{input the starting point of parameter c} 33 | 34 | \item{d_list}{range of d} 35 | 36 | \item{fitmodel}{"nls", "nls_mix", "gam"} 37 | 38 | \item{plot}{indicates whether to plot or not} 39 | 40 | \item{verbose}{indicates whether to be verbose or not} 41 | } 42 | \value{ 43 | list of results 44 | } 45 | \description{ 46 | Fit learning curve for accuracy matrix 47 | } 48 | \examples{ 49 | set.seed(2019) 50 | n <- seq(20, 10000, 100) 51 | accMat <- do.call(cbind, lapply(1:length(n), function(i){ 52 | tmp_n <- rep(n[i], 50) 53 | y <- -2/(tmp_n^0.8) + 0.95 + rnorm(length(tmp_n), 0, 0.02) 54 | })) 55 | res <- learningCurve(accMat = accMat, n) 56 | N <- getN(res, acc = 0.9) 57 | 58 | } 59 | \author{ 60 | Yingxin Lin 61 | } 62 | -------------------------------------------------------------------------------- /man/model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scClassifyTrainClass.R 3 | \name{model} 4 | \alias{model} 5 | \alias{model,scClassifyTrainModel-method} 6 | \title{Accessors of model for scClassifyTrainModel} 7 | \usage{ 8 | model(x) 9 | } 10 | \arguments{ 11 | \item{x}{A `scClassifyTrainModel` object.} 12 | } 13 | \value{ 14 | model of the scClassifyTrainModel slot 15 | } 16 | \description{ 17 | Methods to access various components of the `scClassifyTrainModel` object. 
18 | } 19 | \examples{ 20 | 21 | data(trainClassExample_xin) 22 | model(trainClassExample_xin) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/modelweights.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scClassifyTrainClass.R 3 | \name{modelweights} 4 | \alias{modelweights} 5 | \alias{modelweights,scClassifyTrainModel-method} 6 | \title{Accessors of modelweights for scClassifyTrainModel} 7 | \usage{ 8 | modelweights(x) 9 | } 10 | \arguments{ 11 | \item{x}{A `scClassifyTrainModel` object.} 12 | } 13 | \value{ 14 | modelweights of the scClassifyTrainModel slot 15 | } 16 | \description{ 17 | Methods to access various components of the `scClassifyTrainModel` object. 18 | } 19 | \examples{ 20 | 21 | data(trainClassExample_xin) 22 | modelweights(trainClassExample_xin) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/name.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scClassifyTrainClass.R 3 | \name{name} 4 | \alias{name} 5 | \alias{name,scClassifyTrainModel-method} 6 | \title{Accessors of name for scClassifyTrainModel} 7 | \usage{ 8 | name(x) 9 | } 10 | \arguments{ 11 | \item{x}{A `scClassifyTrainModel` object.} 12 | } 13 | \value{ 14 | name of the scClassifyTrainModel slot 15 | } 16 | \description{ 17 | Methods to access various components of the `scClassifyTrainModel` object. 
18 | } 19 | \examples{ 20 | 21 | data(trainClassExample_xin) 22 | name(trainClassExample_xin) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /man/plotCellTypeTree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/runHOPACH.R 3 | \name{plotCellTypeTree} 4 | \alias{plotCellTypeTree} 5 | \title{To plot cell type tree} 6 | \usage{ 7 | plotCellTypeTree(cutree_list, group_level = NULL) 8 | } 9 | \arguments{ 10 | \item{cutree_list}{A list indicating the hierarchical cell type tree} 11 | 12 | \item{group_level}{Indicates the level of the tree to plot (optional)} 13 | } 14 | \value{ 15 | A ggplot object visualising the HOPACH tree 16 | } 17 | \description{ 18 | To plot cell type tree 19 | } 20 | \examples{ 21 | 22 | data("trainClassExample_xin") 23 | 24 | plotCellTypeTree(cellTypeTree(trainClassExample_xin)) 25 | 26 | } 27 | -------------------------------------------------------------------------------- /man/predict_scClassify.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predict_scClassify.R 3 | \name{predict_scClassify} 4 | \alias{predict_scClassify} 5 | \title{Testing scClassify model} 6 | \usage{ 7 | predict_scClassify( 8 | exprsMat_test, 9 | trainRes, 10 | cellTypes_test = NULL, 11 | k = 10, 12 | prob_threshold = 0.7, 13 | cor_threshold_static = 0.5, 14 | cor_threshold_high = 0.7, 15 | features = "limma", 16 | algorithm = "WKNN", 17 | similarity = "pearson", 18 | cutoff_method = c("dynamic", "static"), 19 | weighted_ensemble = FALSE, 20 | weights = NULL, 21 | parallel = FALSE, 22 | BPPARAM = BiocParallel::SerialParam(), 23 | verbose = FALSE 24 | ) 25 | } 26 | \arguments{ 27 | \item{exprsMat_test}{A list or a matrix indicates the log-transformed 28 | expression matrices of the query datasets} 29 | 30
| \item{trainRes}{A `scClassifyTrainModel` or a `list` indicates 31 | scClassify trained model} 32 | 33 | \item{cellTypes_test}{A list or a vector indicates cell types 34 | of the query datasets (Optional).} 35 | 36 | \item{k}{An integer indicates the number of neighbours} 37 | 38 | \item{prob_threshold}{A numeric indicates the probability threshold 39 | for KNN/WKNN/DWKNN.} 40 | 41 | \item{cor_threshold_static}{A numeric indicates the static 42 | correlation threshold.} 43 | 44 | \item{cor_threshold_high}{A numeric indicates the highest 45 | correlation threshold} 46 | 47 | \item{features}{A vector indicates the gene selection method, 48 | set as "limma" by default. 49 | This should be one or more of "limma", "DV", "DD", "chisq", "BI".} 50 | 51 | \item{algorithm}{A vector indicates the KNN method that are used, 52 | set as "WKNN" by default. 53 | This should be one or more of "WKNN", "KNN", "DWKNN".} 54 | 55 | \item{similarity}{A vector indicates the similarity measure that are used, 56 | set as "pearson" by default. 57 | This should be one or more of "pearson", "spearman", "cosine", 58 | "jaccard", "kendall", "binomial", "weighted_rank","manhattan"} 59 | 60 | \item{cutoff_method}{A vector indicates the method to cutoff the 61 | correlation distribution. Set as "dynamic" by default.} 62 | 63 | \item{weighted_ensemble}{A logical input indicates in ensemble learning, 64 | whether the results are combined by a 65 | weighted score for each base classifier.} 66 | 67 | \item{weights}{A vector indicates the weights for the ensemble} 68 | 69 | \item{parallel}{A logical input indicates whether running in parallel or not} 70 | 71 | \item{BPPARAM}{A \code{BiocParallelParam} class object 72 | from the \code{BiocParallel} package is used. 
Default is SerialParam().} 73 | 74 | \item{verbose}{A logical input indicates whether the intermediate steps 75 | will be printed} 76 | } 77 | \value{ 78 | list of results 79 | } 80 | \description{ 81 | Testing scClassify model 82 | } 83 | \examples{ 84 | data("scClassify_example") 85 | wang_cellTypes <- scClassify_example$wang_cellTypes 86 | exprsMat_wang_subset <- scClassify_example$exprsMat_wang_subset 87 | data("trainClassExample_xin") 88 | 89 | pred_res <- predict_scClassify(exprsMat_test = exprsMat_wang_subset, 90 | trainRes = trainClassExample_xin, 91 | cellTypes_test = wang_cellTypes, 92 | algorithm = "WKNN", 93 | features = c("limma"), 94 | similarity = c("pearson"), 95 | prob_threshold = 0.7, 96 | verbose = TRUE) 97 | 98 | } 99 | \author{ 100 | Yingxin Lin 101 | } 102 | -------------------------------------------------------------------------------- /man/predict_scClassifyJoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/predict_scClassify.R 3 | \name{predict_scClassifyJoint} 4 | \alias{predict_scClassifyJoint} 5 | \title{Testing scClassify model (joint training)} 6 | \usage{ 7 | predict_scClassifyJoint( 8 | exprsMat_test, 9 | trainRes, 10 | cellTypes_test = NULL, 11 | k = 10, 12 | prob_threshold = 0.7, 13 | cor_threshold_static = 0.5, 14 | cor_threshold_high = 0.7, 15 | features = "limma", 16 | algorithm = "WKNN", 17 | similarity = "pearson", 18 | cutoff_method = c("dynamic", "static"), 19 | parallel = FALSE, 20 | BPPARAM = BiocParallel::SerialParam(), 21 | verbose = FALSE 22 | ) 23 | } 24 | \arguments{ 25 | \item{exprsMat_test}{A list or a matrix indicates the expression matrices of the testing datasets} 26 | 27 | \item{trainRes}{A `scClassifyTrainModel` or a `list` indicates scClassify training model} 28 | 29 | \item{cellTypes_test}{A list or a vector indicates cell types of the testing datasets (Optional).} 30 | 31 | 
\item{k}{An integer indicates the number of neighbours} 32 | 33 | \item{prob_threshold}{A numeric indicates the probability threshold for KNN/WKNN/DWKNN.} 34 | 35 | \item{cor_threshold_static}{A numeric indicates the static correlation threshold.} 36 | 37 | \item{cor_threshold_high}{A numeric indicates the highest correlation threshold} 38 | 39 | \item{features}{A vector indicates the method to select features, set as "limma" by default. 40 | This should be one or more of "limma", "DV", "DD", "chisq", "BI".} 41 | 42 | \item{algorithm}{A vector indicates the KNN method that are used, set as "WKNN" by default. 43 | This should be one or more of "WKNN", "KNN", "DWKNN".} 44 | 45 | \item{similarity}{A vector indicates the similarity measure that are used, 46 | set as "pearson" by default. 47 | This should be one or more of "pearson", "spearman", "cosine", "jaccard", "kendall", 48 | "binomial", "weighted_rank","manhattan"} 49 | 50 | \item{cutoff_method}{A vector indicates the method to cutoff the correlation distribution. 51 | Set as "dynamic" by default.} 52 | 53 | \item{parallel}{A logical input indicates whether running in parallel or not} 54 | 55 | \item{BPPARAM}{A \code{BiocParallelParam} class object 56 | from the \code{BiocParallel} package is used. 
Default is SerialParam().} 57 | 58 | \item{verbose}{A logical input indicates whether the intermediate steps will be printed} 59 | } 60 | \value{ 61 | list of results 62 | } 63 | \description{ 64 | Testing scClassify model (joint training) 65 | } 66 | \examples{ 67 | data("scClassify_example") 68 | wang_cellTypes <- scClassify_example$wang_cellTypes 69 | exprsMat_wang_subset <- scClassify_example$exprsMat_wang_subset 70 | data("trainClassExample_xin") 71 | data("trainClassExample_wang") 72 | 73 | trainClassExampleJoint <- scClassifyTrainModelList(trainClassExample_wang, 74 | trainClassExample_xin) 75 | 76 | pred_res_joint <- predict_scClassifyJoint(exprsMat_test = exprsMat_wang_subset, 77 | trainRes = trainClassExampleJoint, 78 | cellTypes_test = wang_cellTypes, 79 | algorithm = "WKNN", 80 | features = c("limma"), 81 | similarity = c("pearson"), 82 | prob_threshold = 0.7, 83 | verbose = FALSE) 84 | 85 | table(pred_res_joint$jointRes$cellTypes, wang_cellTypes) 86 | 87 | } 88 | \author{ 89 | Yingxin Lin 90 | } 91 | -------------------------------------------------------------------------------- /man/runHOPACH.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/runHOPACH.R 3 | \name{runHOPACH} 4 | \alias{runHOPACH} 5 | \title{Create HOPACH tree} 6 | \usage{ 7 | runHOPACH(data, plot = TRUE, kmax = 5) 8 | } 9 | \arguments{ 10 | \item{data}{A matrix of average expression matrix 11 | (each row indicates the gene, each column indicates the cell type)} 12 | 13 | \item{plot}{Indicates whether to plot or not} 14 | 15 | \item{kmax}{Integer between 1 and 9 specifying the maximum number of children 16 | at each node in the tree.} 17 | } 18 | \value{ 19 | Return a \code{list} where 20 | \itemize{ 21 | \item{cutree_list}: A list indicating the hierarchical cell type tree 22 | \item{plot}: A \code{ggplot} visualising the cell type tree 23 | } 24 | } 25 | \description{ 26
| A function generating a HOPACH tree using the average expression matrix for 27 | each cell type. 28 | } 29 | \examples{ 30 | 31 | data("scClassify_example") 32 | wang_cellTypes <- factor(scClassify_example$wang_cellTypes) 33 | exprsMat_wang_subset <- scClassify_example$exprsMat_wang_subset 34 | avgMat_wang <- apply(exprsMat_wang_subset, 1, function(x) 35 | aggregate(x, list(wang_cellTypes), mean)$x) 36 | rownames(avgMat_wang) <- levels(wang_cellTypes) 37 | res_hopach <- runHOPACH(avgMat_wang) 38 | res_hopach$plot 39 | 40 | } 41 | \references{ 42 | van der Laan, M. J. and Pollard, K. S. (2003) 43 | ‘A new algorithm for hybrid hierarchical clustering with 44 | visualization and the bootstrap’, 45 | Journal of Statistical Planning and Inference. 46 | doi: 10.1016/S0378-3758(02)00388-9. 47 | } 48 | \author{ 49 | Yingxin Lin 50 | } 51 | -------------------------------------------------------------------------------- /man/runSampleCal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sampleSizeCal.R 3 | \name{runSampleCal} 4 | \alias{runSampleCal} 5 | \title{Run sample size calculation for pilot data for reference dataset} 6 | \usage{ 7 | runSampleCal( 8 | exprsMat, 9 | cellTypes, 10 | n_list = c(20, 40, 60, 80, 100, seq(200, 500, 100)), 11 | num_repeat = 20, 12 | level = NULL, 13 | cellType_tree = NULL, 14 | BPPARAM = BiocParallel::SerialParam(), 15 | subset_test = FALSE, 16 | num_test = NULL, 17 | ... 
18 | ) 19 | } 20 | \arguments{ 21 | \item{exprsMat}{An expression matrix of the pilot dataset 22 | (log-transformed or normalised)} 23 | 24 | \item{cellTypes}{A vector of cell types of the pilot dataset} 25 | 26 | \item{n_list}{A vector of integers indicating the sample sizes to run.} 27 | 28 | \item{num_repeat}{An integer indicating the number of times 29 | each sample size will be repeated.} 30 | 31 | \item{level}{An integer indicates the accuracy rate is calculated 32 | based on the n-th level from the top of the cell type tree. 33 | If it is NULL (by default), it will be the bottom of the cell type tree. 34 | It cannot be larger than the total number of levels of the tree.} 35 | 36 | \item{cellType_tree}{A list indicates the cell type tree (optional), 37 | if it is NULL, the accuracy rate is calculated 38 | based on the provided cellTypes.} 39 | 40 | \item{BPPARAM}{A \code{BiocParallelParam} class object 41 | from the \code{BiocParallel} package is used. Default is SerialParam().} 42 | 43 | \item{subset_test}{A logical input indicates whether we use a subset of data 44 | (fixed number for each sample size) 45 | to test instead of all remaining data. By default, it is FALSE.} 46 | 47 | \item{num_test}{An integer indicates the size of the test data.} 48 | 49 | \item{...}{other parameters from scClassify} 50 | } 51 | \value{ 52 | An accuracy matrix, where columns correspond to different 53 | sample sizes and rows correspond to the number of repetitions. 
54 | } 55 | \description{ 56 | Run sample size calculation for pilot data for reference dataset 57 | } 58 | \examples{ 59 | data("scClassify_example") 60 | xin_cellTypes <- scClassify_example$xin_cellTypes 61 | exprsMat_xin_subset <- scClassify_example$exprsMat_xin_subset 62 | 63 | exprsMat_xin_subset <- as(exprsMat_xin_subset, "dgCMatrix") 64 | set.seed(2019) 65 | accMat <- runSampleCal(exprsMat_xin_subset, 66 | xin_cellTypes, 67 | n_list = seq(20, 100, 20), 68 | num_repeat = 5, BPPARAM = BiocParallel::SerialParam()) 69 | 70 | } 71 | -------------------------------------------------------------------------------- /man/scClassify.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scClassify.R 3 | \name{scClassify} 4 | \alias{scClassify} 5 | \title{Train and test scClassify model} 6 | \usage{ 7 | scClassify( 8 | exprsMat_train = NULL, 9 | cellTypes_train = NULL, 10 | exprsMat_test = NULL, 11 | cellTypes_test = NULL, 12 | tree = "HOPACH", 13 | algorithm = "WKNN", 14 | selectFeatures = "limma", 15 | similarity = "pearson", 16 | cutoff_method = c("dynamic", "static"), 17 | weighted_ensemble = FALSE, 18 | weights = NULL, 19 | weighted_jointClassification = TRUE, 20 | cellType_tree = NULL, 21 | k = 10, 22 | topN = 50, 23 | hopach_kmax = 5, 24 | pSig = 0.01, 25 | prob_threshold = 0.7, 26 | cor_threshold_static = 0.5, 27 | cor_threshold_high = 0.7, 28 | returnList = TRUE, 29 | parallel = FALSE, 30 | BPPARAM = BiocParallel::SerialParam(), 31 | verbose = FALSE 32 | ) 33 | } 34 | \arguments{ 35 | \item{exprsMat_train}{A matrix of log-transformed expression matrix of reference dataset} 36 | 37 | \item{cellTypes_train}{A vector of cell types of reference dataset} 38 | 39 | \item{exprsMat_test}{A list or a matrix indicates the expression matrices of the query datasets} 40 | 41 | \item{cellTypes_test}{A list or a vector indicates cell types of the query datasets 
(Optional).} 42 | 43 | \item{tree}{A vector indicates the method to build the hierarchical tree, set as "HOPACH" by default. 44 | This should be one of "HOPACH" and "HC" (using hclust).} 45 | 46 | \item{algorithm}{A vector indicates the KNN method that are used, set as 47 | "WKNN" by default. This should be one or more of "WKNN", "KNN", "DWKNN".} 48 | 49 | \item{selectFeatures}{A vector indicates the gene selection method, set as "limma" by default. 50 | This should be one or more of "limma", "DV", "DD", "chisq", "BI" and "Cepo".} 51 | 52 | \item{similarity}{A vector indicates the similarity measure that are used, 53 | set as "pearson" by default. This should be one or more of "pearson", 54 | "spearman", "cosine", "jaccard", "kendall", "binomial", "weighted_rank","manhattan"} 55 | 56 | \item{cutoff_method}{A vector indicates the method to cutoff the correlation distribution. 57 | Set as "dynamic" by default.} 58 | 59 | \item{weighted_ensemble}{A logical input indicates in ensemble learning, 60 | whether the results are combined by a weighted score for each base classifier.} 61 | 62 | \item{weights}{A vector indicates the weights for the ensemble} 63 | 64 | \item{weighted_jointClassification}{A logical input indicates in joint classification 65 | using multiple training datasets, 66 | whether the results are combined by a weighted score for each training model.} 67 | 68 | \item{cellType_tree}{A list indicates the cell type tree provided by the user. 
69 | (By default, it is NULL) (Only for one training data input)} 70 | 71 | \item{k}{An integer indicates the number of neighbours} 72 | 73 | \item{topN}{An integer indicates the top number of features that are selected} 74 | 75 | \item{hopach_kmax}{An integer between 1 and 9 specifying the maximum number of 76 | children at each node in the HOPACH tree.} 77 | 78 | \item{pSig}{A numeric indicates the p-value cutoff for features} 79 | 80 | \item{prob_threshold}{A numeric indicates the probability threshold for KNN/WKNN/DWKNN.} 81 | 82 | \item{cor_threshold_static}{A numeric indicates the static correlation threshold.} 83 | 84 | \item{cor_threshold_high}{A numeric indicates the highest correlation threshold} 85 | 86 | \item{returnList}{A logical input indicates whether the output will be a list} 87 | 88 | \item{parallel}{A logical input indicates whether running in parallel or not} 89 | 90 | \item{BPPARAM}{A \code{BiocParallelParam} class object 91 | from the \code{BiocParallel} package is used. Default is SerialParam().} 92 | 93 | \item{verbose}{A logical input indicates whether the intermediate steps will be printed} 94 | } 95 | \value{ 96 | A list of the results, including testRes storing the results of the testing information, 97 | and trainRes storing the training model information. 
98 | } 99 | \description{ 100 | Train and test scClassify model 101 | } 102 | \examples{ 103 | 104 | data("scClassify_example") 105 | xin_cellTypes <- scClassify_example$xin_cellTypes 106 | exprsMat_xin_subset <- scClassify_example$exprsMat_xin_subset 107 | wang_cellTypes <- scClassify_example$wang_cellTypes 108 | exprsMat_wang_subset <- scClassify_example$exprsMat_wang_subset 109 | 110 | scClassify_res <- scClassify(exprsMat_train = exprsMat_xin_subset, 111 | cellTypes_train = xin_cellTypes, 112 | exprsMat_test = list(wang = exprsMat_wang_subset), 113 | cellTypes_test = list(wang = wang_cellTypes), 114 | tree = "HOPACH", 115 | algorithm = "WKNN", 116 | selectFeatures = c("limma"), 117 | similarity = c("pearson"), 118 | returnList = FALSE, 119 | verbose = FALSE) 120 | 121 | } 122 | \author{ 123 | Yingxin Lin 124 | } 125 | -------------------------------------------------------------------------------- /man/scClassifyTrainModel-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scClassifyTrainClass.R 3 | \docType{class} 4 | \name{scClassifyTrainModel-class} 5 | \alias{scClassifyTrainModel-class} 6 | \title{An S4 class to store the training model for scClassify} 7 | \description{ 8 | An S4 class to store the training model for scClassify 9 | } 10 | \section{Slots}{ 11 | 12 | \describe{ 13 | \item{\code{name}}{Name of the training dataset} 14 | 15 | \item{\code{cellTypeTrain}}{A vector of cell types in the training dataset} 16 | 17 | \item{\code{cellTypeTree}}{A list indicating the cell type tree} 18 | 19 | \item{\code{features}}{A character vector indicating the 20 | features that are trained for this data} 21 | 22 | \item{\code{model}}{A list storing the training model, 23 | including the features that are selected 24 | and the cell expression matrix that is used for training} 25 | 26 | \item{\code{modelweights}}{A numeric vector indicating the weights 
of each model} 27 | 28 | \item{\code{metaData}}{A DataFrame storing the metadata of the training model} 29 | }} 30 | 31 | -------------------------------------------------------------------------------- /man/scClassifyTrainModelList-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scClassifyTrainClass.R 3 | \docType{class} 4 | \name{scClassifyTrainModelList-class} 5 | \alias{scClassifyTrainModelList-class} 6 | \title{An S4 class to store a list of training models from scClassify} 7 | \description{ 8 | An S4 class to store a list of training models from scClassify 9 | } 10 | -------------------------------------------------------------------------------- /man/scClassifyTrainModelList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scClassifyTrainClass.R 3 | \name{scClassifyTrainModelList} 4 | \alias{scClassifyTrainModelList} 5 | \title{The scClassifyTrainModelList class} 6 | \usage{ 7 | scClassifyTrainModelList(...) 
8 | } 9 | \arguments{ 10 | \item{...}{scClassifyTrainModel objects} 11 | } 12 | \value{ 13 | A scClassifyTrainModelList object 14 | } 15 | \description{ 16 | The scClassifyTrainModelList class 17 | } 18 | \examples{ 19 | 20 | data("trainClassExample_xin") 21 | data("trainClassExample_wang") 22 | trainClassExampleList <- scClassifyTrainModelList(trainClassExample_xin, 23 | trainClassExample_wang 24 | ) 25 | 26 | } 27 | -------------------------------------------------------------------------------- /man/scClassify_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{scClassify_example} 5 | \alias{scClassify_example} 6 | \title{Example data used in scClassify package} 7 | \format{ 8 | An object of class \code{list} of length 4. 9 | } 10 | \source{ 11 | Wang YJ, Schug J, Won K-J, Liu C, Naji A, Avrahami D, Golson ML & 12 | Kaestner KH (2016) Single cell transcriptomics of the human endocrine 13 | pancreas. Diabetes: db160405 14 | 15 | Xin Y, Kim J, Okamoto H, Ni M, Wei Y, Adler C, Murphy AJ, Yancopoulos GD, 16 | Lin C & Gromada J (2016) RNA sequencing of single human islet cells reveals 17 | type 2 diabetes genes. Cell Metab. 24: 608–615 18 | } 19 | \usage{ 20 | data(scClassify_example, package = 'scClassify') 21 | } 22 | \description{ 23 | A list including expression matrices and cell types of subsets of 24 | Wang et al. and Xin et al. 
25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/trainClassExample_wang.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{trainClassExample_wang} 5 | \alias{trainClassExample_wang} 6 | \title{Subset of pretrained model of Wang et al.} 7 | \format{ 8 | An object of class \code{scClassifyTrainModel} of length 1. 9 | } 10 | \source{ 11 | Wang YJ, Schug J, Won K-J, Liu C, Naji A, Avrahami D, Golson ML & 12 | Kaestner KH (2016) Single cell transcriptomics of the human endocrine 13 | pancreas. Diabetes: db160405 14 | } 15 | \usage{ 16 | data(trainClassExample_wang, package = 'scClassify') 17 | } 18 | \description{ 19 | An object of scClassifyTrainModel for Wang et al. 20 | } 21 | \keyword{datasets} 22 | -------------------------------------------------------------------------------- /man/trainClassExample_xin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{trainClassExample_xin} 5 | \alias{trainClassExample_xin} 6 | \title{Subset of pretrained model of Xin et al.} 7 | \format{ 8 | An object of class \code{scClassifyTrainModel} of length 1. 9 | } 10 | \source{ 11 | Xin Y, Kim J, Okamoto H, Ni M, Wei Y, Adler C, Murphy AJ, Yancopoulos GD, 12 | Lin C & Gromada J (2016) RNA sequencing of single human islet cells reveals 13 | type 2 diabetes genes. Cell Metab. 24: 608–615 14 | } 15 | \usage{ 16 | data(trainClassExample_xin, package = 'scClassify') 17 | } 18 | \description{ 19 | An object of scClassifyTrainModel for Xin et al.
20 | } 21 | \keyword{datasets} 22 | -------------------------------------------------------------------------------- /man/train_scClassify.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/train_scClassify.R 3 | \name{train_scClassify} 4 | \alias{train_scClassify} 5 | \title{Training scClassify model} 6 | \usage{ 7 | train_scClassify( 8 | exprsMat_train, 9 | cellTypes_train, 10 | tree = "HOPACH", 11 | selectFeatures = "limma", 12 | topN = 50, 13 | hopach_kmax = 5, 14 | pSig = 0.05, 15 | cellType_tree = NULL, 16 | weightsCal = FALSE, 17 | parallel = FALSE, 18 | BPPARAM = BiocParallel::SerialParam(), 19 | verbose = TRUE, 20 | returnList = TRUE, 21 | ... 22 | ) 23 | } 24 | \arguments{ 25 | \item{exprsMat_train}{A log-transformed expression matrix of the reference dataset} 26 | 27 | \item{cellTypes_train}{A vector of cell types of the reference dataset} 28 | 29 | \item{tree}{A vector indicating the method used to build the hierarchical tree, 30 | set as "HOPACH" by default. 31 | This should be one of "HOPACH" and "HC" (using stats::hclust).} 32 | 33 | \item{selectFeatures}{A vector indicating the gene selection method, 34 | set as "limma" by default. 35 | This should be one or more of "limma", "DV", "DD", "chisq", "BI", "Cepo".} 36 | 37 | \item{topN}{An integer indicating the number of top features to select} 38 | 39 | \item{hopach_kmax}{An integer between 1 and 9 specifying the maximum number of 40 | children at each node in the HOPACH tree.} 41 | 42 | \item{pSig}{A numeric indicating the p-value cutoff for features} 43 | 44 | \item{cellType_tree}{A list indicating the cell type tree provided by the user.
45 | (By default, it is NULL)} 46 | 47 | \item{weightsCal}{A logical input indicating whether to 48 | calculate the weights for the model.} 49 | 50 | \item{parallel}{A logical input indicating whether the algorithms will run in parallel} 51 | 52 | \item{BPPARAM}{A \code{BiocParallelParam} class object 53 | from the \code{BiocParallel} package to be used. Default is SerialParam().} 54 | 55 | \item{verbose}{A logical input indicating whether the intermediate steps will be printed} 56 | 57 | \item{returnList}{A logical input indicating whether the output will be a list} 58 | 59 | \item{...}{Other inputs passed to predict_scClassify when weights calculation 60 | of the pretrained model is performed} 61 | } 62 | \value{ 63 | A list of results or an object of \code{scClassifyTrainModel} 64 | } 65 | \description{ 66 | Training scClassify model 67 | } 68 | \examples{ 69 | data("scClassify_example") 70 | xin_cellTypes <- scClassify_example$xin_cellTypes 71 | exprsMat_xin_subset <- scClassify_example$exprsMat_xin_subset 72 | trainClass <- train_scClassify(exprsMat_train = exprsMat_xin_subset, 73 | cellTypes_train = xin_cellTypes, 74 | selectFeatures = c("limma", "BI"), 75 | returnList = FALSE 76 | ) 77 | 78 | } 79 | \author{ 80 | Yingxin Lin 81 | } 82 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | webOnly 4 | .DS_Store 5 | -------------------------------------------------------------------------------- /vignettes/pretrainedModel.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Performing scClassify using pretrained model" 3 | author: 4 | - name: Yingxin Lin 5 | affiliation: School of Mathematics and Statistics, The University of Sydney, Australia 6 | date: "`r BiocStyle::doc_date()`" 7 | output: 8 | BiocStyle::html_document: 9 | toc: true 10 |
toc_float: true 11 | vignette: > 12 | %\VignetteIndexEntry{pretrainedModel} 13 | %\VignetteEngine{knitr::rmarkdown} 14 | %\VignetteEncoding{UTF-8} 15 | --- 16 | 17 | 18 | 19 | 20 | 21 | ```{r, include = FALSE} 22 | knitr::opts_chunk$set( 23 | collapse = TRUE, 24 | warning = FALSE, 25 | message = FALSE, 26 | comment = "#>" 27 | ) 28 | ``` 29 | 30 | 31 | 32 | # Introduction 33 | 34 | A common application of single-cell RNA sequencing (scRNA-seq) data is 35 | to identify discrete cell types. To take advantage of the large collection 36 | of well-annotated scRNA-seq datasets, the `scClassify` package implements 37 | a set of methods to perform accurate cell type classification based on 38 | *ensemble learning* and *sample size calculation*. 39 | 40 | This vignette provides an example showing how users can use a pretrained 41 | model of scClassify to predict cell types. A pretrained model is a 42 | `scClassifyTrainModel` object returned by `train_scClassify()`. 43 | A list of pretrained models can be found at 44 | https://sydneybiox.github.io/scClassify/index.html. 45 | 46 | 47 | First, to install `scClassify`, install `BiocManager` and use 48 | `BiocManager::install` to install the `scClassify` package. 49 | 50 | ```{r eval = FALSE} 51 | # installation of scClassify 52 | if (!requireNamespace("BiocManager", quietly = TRUE)) { 53 | install.packages("BiocManager") 54 | } 55 | BiocManager::install("scClassify") 56 | ``` 57 | 58 | 59 | 60 | 61 | 62 | # Setting up the data 63 | 64 | We assume that you have *log-transformed* (size-factor normalized) matrices as 65 | query datasets, where each row refers to a gene and each column a cell. 66 | For demonstration purposes, we will take a subset of single-cell pancreas 67 | datasets from one independent study (Wang et al.).
68 | 69 | 70 | ```{r setup} 71 | library(scClassify) 72 | data("scClassify_example") 73 | wang_cellTypes <- scClassify_example$wang_cellTypes 74 | exprsMat_wang_subset <- scClassify_example$exprsMat_wang_subset 75 | exprsMat_wang_subset <- as(exprsMat_wang_subset, "dgCMatrix") 76 | ``` 77 | 78 | Here, we load our pretrained model using a subset of the Xin et al. 79 | human pancreas dataset as our reference data. 80 | 81 | First, let us check basic information relating to our pretrained model. 82 | 83 | ```{r} 84 | data("trainClassExample_xin") 85 | trainClassExample_xin 86 | ``` 87 | 88 | In this pretrained model, we have selected the genes based on differential 89 | expression using limma. To check the genes that are available 90 | in the pretrained model: 91 | 92 | 93 | ```{r} 94 | features(trainClassExample_xin) 95 | ``` 96 | 97 | 98 | We can also visualise the cell type tree of the reference data. 99 | 100 | ```{r} 101 | plotCellTypeTree(cellTypeTree(trainClassExample_xin)) 102 | ``` 103 | 104 | # Running scClassify 105 | 106 | Next, we perform `predict_scClassify` with our pretrained model 107 | `trainRes = trainClassExample_xin` to predict the cell types of our 108 | query data matrix `exprsMat_wang_subset`. Here, 109 | we use `pearson` and `spearman` as similarity metrics. 110 | 111 | ```{r} 112 | pred_res <- predict_scClassify(exprsMat_test = exprsMat_wang_subset, 113 | trainRes = trainClassExample_xin, 114 | cellTypes_test = wang_cellTypes, 115 | algorithm = "WKNN", 116 | features = c("limma"), 117 | similarity = c("pearson", "spearman"), 118 | prob_threshold = 0.7, 119 | verbose = TRUE) 120 | ``` 121 | 122 | Note that `cellTypes_test` is not a required input. 123 | For datasets with unknown labels, users can simply leave it 124 | as `cellTypes_test = NULL`.
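As a minimal sketch of such a label-free call (the object name `pred_res_unlab` is illustrative, and the remaining arguments simply mirror the call above):

```{r eval = FALSE}
# Sketch: predict without known query labels; cellTypes_test defaults to NULL
pred_res_unlab <- predict_scClassify(exprsMat_test = exprsMat_wang_subset,
                                     trainRes = trainClassExample_xin,
                                     algorithm = "WKNN",
                                     features = c("limma"),
                                     similarity = c("pearson"),
                                     prob_threshold = 0.7,
                                     verbose = FALSE)
# Tabulate the predicted cell types of the query cells
table(pred_res_unlab$pearson_WKNN_limma$predRes)
```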
125 | 126 | 127 | 128 | Prediction results for pearson as the similarity metric: 129 | 130 | 131 | ```{r} 132 | table(pred_res$pearson_WKNN_limma$predRes, wang_cellTypes) 133 | ``` 134 | 135 | Prediction results for spearman as the similarity metric: 136 | 137 | ```{r} 138 | table(pred_res$spearman_WKNN_limma$predRes, wang_cellTypes) 139 | ``` 140 | 141 | 142 | 143 | 144 | # Session Info 145 | 146 | ```{r} 147 | sessionInfo() 148 | ``` 149 | -------------------------------------------------------------------------------- /vignettes/scClassify.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "scClassify Model Building and Prediction" 3 | author: 4 | - name: Yingxin Lin 5 | affiliation: 6 | - School of Mathematics and Statistics, The University of Sydney, Australia 7 | - Charles Perkins Centre, The University of Sydney, Australia 8 | date: "`r BiocStyle::doc_date()`" 9 | output: 10 | BiocStyle::html_document: 11 | toc: true 12 | toc_float: true 13 | vignette: > 14 | %\VignetteIndexEntry{scClassify} 15 | %\VignetteEngine{knitr::rmarkdown} 16 | %\VignetteEncoding{UTF-8} 17 | --- 18 | 19 | 20 | ```{r, include = FALSE} 21 | knitr::opts_chunk$set( 22 | collapse = TRUE, 23 | warning = FALSE, 24 | message = FALSE, 25 | comment = "#>" 26 | ) 27 | ``` 28 | 29 | 30 | 31 | 32 | 33 | # Introduction 34 | 35 | A common application of single-cell RNA sequencing (scRNA-seq) data is 36 | to identify discrete cell types. To take advantage of the large collection 37 | of well-annotated scRNA-seq datasets, the `scClassify` package implements 38 | a set of methods to perform accurate cell type classification based on 39 | *ensemble learning* and *sample size calculation*. 40 | This vignette demonstrates the usage of `scClassify`, 41 | providing a brief description of each method with workable examples. 42 | 43 | First, install `scClassify` via `BiocManager`.
44 | 45 | ```{r eval = FALSE} 46 | # installation of scClassify 47 | if (!requireNamespace("BiocManager", quietly = TRUE)) { 48 | install.packages("BiocManager") 49 | } 50 | BiocManager::install("scClassify") 51 | ``` 52 | 53 | # Setting up the data 54 | 55 | We assume that you have *log-transformed* (size-factor normalized) matrices 56 | where each row is a gene and each column a cell for a reference dataset 57 | and a query dataset. For demonstration purposes, we will take a subset of 58 | single-cell pancreas datasets from two independent studies 59 | (Wang et al. and Xin et al.). 60 | 61 | 62 | ```{r setup} 63 | library("scClassify") 64 | data("scClassify_example") 65 | xin_cellTypes <- scClassify_example$xin_cellTypes 66 | exprsMat_xin_subset <- scClassify_example$exprsMat_xin_subset 67 | wang_cellTypes <- scClassify_example$wang_cellTypes 68 | exprsMat_wang_subset <- scClassify_example$exprsMat_wang_subset 69 | exprsMat_xin_subset <- as(exprsMat_xin_subset, "dgCMatrix") 70 | exprsMat_wang_subset <- as(exprsMat_wang_subset, "dgCMatrix") 71 | ``` 72 | 73 | 74 | The original cell type annotations and compositions of the example datasets 75 | can be easily accessed as shown below. 76 | 77 | 78 | ```{r} 79 | table(xin_cellTypes) 80 | table(wang_cellTypes) 81 | ``` 82 | 83 | We can see that the Xin et al. data only have 4 cell types, 84 | while the Wang et al. data have 7 cell types. 85 | 86 | 87 | # scClassify 88 | 89 | 90 | 91 | ## Non-ensemble scClassify 92 | 93 | We first perform non-ensemble `scClassify` by using Xin et al. 94 | as our reference dataset and Wang et al. data as our query data. 95 | We use `WKNN` as the KNN algorithm, `DE` (differential expression genes) 96 | as the gene selection method, and lastly `pearson` as 97 | the similarity calculation method.
98 | 99 | ```{r} 100 | scClassify_res <- scClassify(exprsMat_train = exprsMat_xin_subset, 101 | cellTypes_train = xin_cellTypes, 102 | exprsMat_test = list(wang = exprsMat_wang_subset), 103 | cellTypes_test = list(wang = wang_cellTypes), 104 | tree = "HOPACH", 105 | algorithm = "WKNN", 106 | selectFeatures = c("limma"), 107 | similarity = c("pearson"), 108 | returnList = FALSE, 109 | verbose = FALSE) 110 | 111 | ``` 112 | 113 | We can check the cell type tree generated from the reference data: 114 | 115 | ```{r warning=FALSE} 116 | scClassify_res$trainRes 117 | plotCellTypeTree(cellTypeTree(scClassify_res$trainRes)) 118 | ``` 119 | 120 | Note that `scClassify_res$trainRes` is a `scClassifyTrainModel` object. 121 | 122 | 123 | Check the prediction results. 124 | 125 | ```{r} 126 | table(scClassify_res$testRes$wang$pearson_WKNN_limma$predRes, wang_cellTypes) 127 | ``` 128 | 129 | 130 | 131 | 132 | ## Ensemble scClassify 133 | 134 | 135 | We next perform ensemble `scClassify` by using Xin et al. 136 | as our reference dataset and Wang et al. data as our query data. 137 | We use `WKNN` as the KNN algorithm, `DE` as the gene selection method, 138 | and `pearson` and `cosine` as the similarity calculation methods. 139 | Thus, we will generate two combinations of gene selection models and 140 | similarity metrics as training classifiers: 141 | 142 | 1. `WKNN` + `DE` + `pearson` 143 | 2. `WKNN` + `DE` + `cosine` 144 | 145 | Here, we will weight these two classifiers equally by setting 146 | `weighted_ensemble = FALSE`. By default this is set as `TRUE`, 147 | so each base classifier will be weighted by the accuracy rates trained 148 | in the reference data.
149 | 150 | ```{r} 151 | scClassify_res_ensemble <- scClassify(exprsMat_train = exprsMat_xin_subset, 152 | cellTypes_train = xin_cellTypes, 153 | exprsMat_test = list(wang = exprsMat_wang_subset), 154 | cellTypes_test = list(wang = wang_cellTypes), 155 | tree = "HOPACH", 156 | algorithm = "WKNN", 157 | selectFeatures = c("limma"), 158 | similarity = c("pearson", "cosine"), 159 | weighted_ensemble = FALSE, 160 | returnList = FALSE, 161 | verbose = FALSE) 162 | 163 | ``` 164 | 165 | We can compare the predictions of the two base classifiers below. 166 | 167 | ```{r} 168 | table(scClassify_res_ensemble$testRes$wang$pearson_WKNN_limma$predRes, 169 | scClassify_res_ensemble$testRes$wang$cosine_WKNN_limma$predRes) 170 | ``` 171 | 172 | 173 | Now, check the final ensemble results: 174 | 175 | 176 | ```{r} 177 | table(scClassify_res_ensemble$testRes$wang$ensembleRes$cellTypes, 178 | wang_cellTypes) 179 | ``` 180 | 181 | 182 | 183 | 184 | # Train your own model 185 | 186 | You can also train your own `scClassifyTrainModel` using 187 | `train_scClassify()`. Note that by setting `weightsCal = TRUE`, 188 | we will calculate the training error of the reference data as 189 | the weights for the individual classifiers. 190 | 191 | Here, we illustrate the training function with gene selection methods 192 | based on differential expression ("limma") and bimodal distribution ("BI"). 193 | 194 | ```{r} 195 | trainClass <- train_scClassify(exprsMat_train = exprsMat_xin_subset, 196 | cellTypes_train = xin_cellTypes, 197 | selectFeatures = c("limma", "BI"), 198 | returnList = FALSE 199 | ) 200 | ``` 201 | 202 | 203 | ```{r} 204 | trainClass 205 | ``` 206 | 207 | 208 | 209 | # Session Info 210 | 211 | ```{r} 212 | sessionInfo() 213 | ``` 214 | --------------------------------------------------------------------------------