├── .gitignore
├── .Rbuildignore
├── inst
    ├── docs
    │   ├── scImpute.pdf
    │   ├── scImpute-news.Rmd
    │   └── scImpute-news.html
    ├── extdata
    │   ├── labels.rds
    │   └── .Rapp.history
    └── comparison
    │   ├── .Rhistory
    │   ├── 3-run-syn.R
    │   ├── 3-run.R
    │   ├── 1-filter_data.R
    │   ├── 2-run-magic.ipynb
    │   ├── .ipynb_checkpoints
    │       └── 2-run-magic-checkpoint.ipynb
    │   ├── 4-plot-syn.R
    │   └── 4-plot.R
├── R
    ├── dmix.R
    ├── rmix.R
    ├── calculate_weight.R
    ├── write_count.R
    ├── read_count.R
    ├── get_mix_parameters.R
    ├── scimpute.R
    ├── scImpute-internal.R
    └── imputation_model.R
├── NAMESPACE
├── DESCRIPTION
├── man
    └── scimpute.Rd
├── README.md
├── README.Rmd
├── vignettes
    ├── scImpute-vignette.Rmd
    └── scImpute-vignette.html
└── .Rhistory


/.gitignore:
--------------------------------------------------------------------------------
1 | inst/doc
2 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^README\.Rmd$
2 | 


--------------------------------------------------------------------------------
/inst/docs/scImpute.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vivianstats/scImpute/HEAD/inst/docs/scImpute.pdf


--------------------------------------------------------------------------------
/inst/extdata/labels.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vivianstats/scImpute/HEAD/inst/extdata/labels.rds


--------------------------------------------------------------------------------
/inst/extdata/.Rapp.history:
--------------------------------------------------------------------------------
1 | load("/Users/wei/Dropbox/scrna_impute/codes/scimpute_dev/scImpute/inst/extdata/labels.rds")
2 | 


--------------------------------------------------------------------------------
/R/dmix.R:
--------------------------------------------------------------------------------
# Density of the two-component mixture used to model one gene's expression:
# a gamma component (weight pars[1]) plus a normal component (weight
# 1 - pars[1]).
#
# x    : numeric vector of points at which the density is evaluated.
# pars : numeric vector of length 5 laid out as
#        (gamma weight, gamma shape, gamma rate, normal mean, normal sd).
#
# Returns a numeric vector of mixture densities, one per element of `x`.
dmix <- function(x, pars) {
  gamma_weight <- pars[1]
  gamma_part <- gamma_weight * dgamma(x, shape = pars[2], rate = pars[3])
  normal_part <- (1 - gamma_weight) * dnorm(x, mean = pars[4], sd = pars[5])
  gamma_part + normal_part
}
7 | 


--------------------------------------------------------------------------------
/inst/comparison/.Rhistory:
--------------------------------------------------------------------------------
1 | 71038836/4
2 | 2882668/4
3 | 5000/9.25 +9/2.8
4 | 5000/925 +9/2.8
5 | fpkm = sapply(1:length(fpkm), function(i){
6 | if(class[i] == "transcript") return(fpkm[3])
7 | if(class[i] == "exon") return(fpkm[4])
8 | })
9 | 


--------------------------------------------------------------------------------
/R/rmix.R:
--------------------------------------------------------------------------------
 1 | rmix <-
 2 | function (pars, n) 
 3 | {
 4 |     n1 = round(n * pars[1])
 5 |     n2 = n - n1
 6 |     x1 = rgamma(n1, shape = pars[2], rate = pars[3])
 7 |     x2 = rnorm(n2, mean = pars[4], sd = pars[5])
 8 |     return(c(x1, x2))
 9 | }
10 | 


--------------------------------------------------------------------------------
/R/calculate_weight.R:
--------------------------------------------------------------------------------
 1 | calculate_weight <-
 2 | function (x, paramt) 
 3 | {
 4 |     pz1 = paramt[1] * dgamma(x, shape = paramt[2], rate = paramt[3])
 5 |     pz2 = (1 - paramt[1]) * dnorm(x, mean = paramt[4], sd = paramt[5])
 6 |     pz = pz1/(pz1 + pz2)
 7 |     pz[pz1 == 0] = 0
 8 |     return(cbind(pz, 1 - pz))
 9 | }
10 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(scimpute)
 4 | import(doParallel)
 5 | import(foreach)
 6 | import(parallel)
 7 | import(penalized)
 8 | importFrom(kernlab,specc)
 9 | importFrom(rsvd,rpca)
10 | importFrom(stats,complete.cases)
11 | importFrom(stats,dgamma)
12 | importFrom(stats,dnorm)
13 | importFrom(stats,prcomp)
14 | importFrom(stats,quantile)
15 | importFrom(stats,rgamma)
16 | importFrom(stats,rnorm)
17 | importFrom(stats,sd)
18 | importFrom(stats,uniroot)
19 | importFrom(utils,read.csv)
20 | importFrom(utils,read.table)
21 | importFrom(utils,write.csv)
22 | importFrom(utils,write.table)
23 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: scImpute
 2 | Type: Package
 3 | Title: Accurate and robust imputation of single-cell RNA sequencing data
 4 | Version: 0.0.9
 5 | Date: 2018-08-15
 6 | Author: Wei Vivian Li, Jingyi Jessica Li
 7 | Maintainer: Wei Vivian Li <liw@ucla.edu>
 8 | Description: scRNA-seq analysis is complicated by the excess of zero or near zero counts in the data, which are the so-called dropouts due to low amounts of mRNA within each individual cell. scImpute is developed to simultaneously determine which expression values are affected by dropout events in scRNA-seq data and perform imputation only on dropout entries.
 9 | Depends: R (>= 3.3.2), parallel, stats, penalized, utils, doParallel, foreach
10 | Imports: kernlab, rsvd
11 | License: GPL
12 | RoxygenNote: 6.0.1
13 | Suggests: knitr,
14 |     rmarkdown
15 | VignetteBuilder: knitr
16 | 


--------------------------------------------------------------------------------
/inst/comparison/3-run-syn.R:
--------------------------------------------------------------------------------
 1 | data_dir = "./"
 2 | count_path = paste0(data_dir, "zeisel_samp.rds")
 3 | 
 4 | count = readRDS(count_path)
 5 | write.csv(count, file = paste0(data_dir, "zeisel_samp.csv"),
 6 |           row.names = TRUE, col.names = TRUE)
 7 | ############ run scImpute
 8 | #library(devtools)
 9 | #install_github("Vivianstats/scImpute", ref = "895c262") #v0.0.3
10 | library(scImpute)
11 | 
12 | count_path = paste0(data_dir, "zeisel_samp.csv")
13 | 
14 | out_dir = paste0(data_dir, "rerun/samp/")
15 | dir.create(out_dir)
16 | scimpute(count_path = count_path, infile = "csv", outfile = "csv",
17 |          Kcluster = 9,
18 |          out_dir = out_dir, drop_thre = 0.5, ncores = 36)
19 | 
20 | count = read.csv(paste0(out_dir, "scimpute_count.csv"), row.names = 1)
21 | saveRDS(count, file = paste0(out_dir, "zeisel_scimpute_k9.rds"))
22 | 
23 | 


--------------------------------------------------------------------------------
/R/write_count.R:
--------------------------------------------------------------------------------
 1 | write_count <-
 2 | function (count_imp, filetype, out_dir, type, genelen) 
 3 | {
 4 |     totalCounts_by_cell = readRDS(paste0(out_dir, "totalCounts_by_cell.rds"))
 5 |     count_imp = sweep(count_imp, MARGIN = 2, totalCounts_by_cell/10^6, 
 6 |         FUN = "*")
 7 |     if(type == "TPM"){
 8 |       count_imp = sweep(count_imp, 1, genelen, FUN = "/")
 9 |     }
10 |     count_imp = round(count_imp, digits = 2)
11 |     if (filetype == "csv") {
12 |         write.csv(count_imp, file = paste0(out_dir, "scimpute_count.csv"))
13 |     }
14 |     else if (filetype == "txt") {
15 |         write.table(count_imp, file = paste0(out_dir, "scimpute_count.txt"), 
16 |             quote = FALSE)
17 |     }else if (filetype == "rds") {
18 |         saveRDS(count_imp, file = paste0(out_dir, "scimpute_count.rds"))
19 |     }else {
20 |         print("filetype can be 'csv', 'txt', or 'rds'!")
21 |         stop()
22 |     }
23 |     return(0)
24 | }
25 | 


--------------------------------------------------------------------------------
/inst/comparison/3-run.R:
--------------------------------------------------------------------------------
 1 | data_dir = "./"
 2 | count_path = paste0(data_dir, "rerun/zeisel_raw.csv")
 3 | 
 4 | ############ run scImpute
 5 | library(devtools)
 6 | install_github("Vivianstats/scImpute", ref = "895c262") #v0.0.3
 7 | library(scImpute)
 8 | 
 9 | out_dir = paste0(data_dir, "rerun/")
10 | dir.create(out_dir)
11 | scimpute(count_path = count_path, infile = "csv", outfile = "csv",
12 |          Kcluster = 9,
13 |          out_dir = out_dir, drop_thre = 0.5, ncores = 36)
14 | 
15 | count = read.csv(paste0(out_dir, "scimpute_count_k9.csv"), row.names = 1)
16 | saveRDS(count, file = paste0(out_dir, "zeisel_scimpute_k9.rds"))
17 | 
18 | # ############ run SAVER
19 | # devtools::install_github("mohuangx/SAVER", ref="b64a077") #v1.0.0
20 | 
21 | library(SAVER)
22 | library(doParallel)
23 | 
24 | count_path = paste0(data_dir, "rerun/zeisel_raw.rds")
25 | 
26 | cl = makeCluster(35, outfile = "")
27 | registerDoParallel(cl)
28 | 
29 | dat = readRDS(count_path)
30 | out = saver(dat)
31 | 
32 | saveRDS(out, file = paste0(data_dir, "rerun/zeisel_saver.rds"))
33 | 
34 | 
35 | # # ############ run MAGIC
36 | # dat = read.csv(paste0(data_dir, "rerun/zeisel_magic.csv"))
37 | # rownames(dat) = dat[, 1]
38 | # dat = dat[,-1]
39 | # saveRDS(dat, paste0(data_dir, "rerun/zeisel_magic.rds"))
40 | 
41 | 


--------------------------------------------------------------------------------
/R/read_count.R:
--------------------------------------------------------------------------------
 1 | read_count <-
 2 | function (filetype, path, out_dir, type, genelen) 
 3 | {
 4 |     if(filetype == "csv") {
 5 |         raw_count = read.csv(path, header = TRUE, row.names = 1)
 6 |     }else if(filetype == "txt") {
 7 |         raw_count = read.table(path, header = TRUE, row.names = 1)
 8 |     }else if(filetype == "rds") {
 9 |         raw_count = readRDS(path)
10 |     }else{
11 |         print("filetype can be 'csv', 'txt', or 'rds'!")
12 |         stop()
13 |     }
14 |     raw_count = as.matrix(raw_count)
15 |     print(paste("number of genes in raw count matrix", nrow(raw_count)))
16 |     print(paste("number of cells in raw count matrix", ncol(raw_count)))
17 |     
18 |     if(type == "TPM"){
19 |       if(length(genelen) != nrow(raw_count)) stop("number of genes in 'genelen' and count matrix do not match! ")
20 |       raw_count = sweep(raw_count, 1, genelen, FUN = "*")
21 |     }
22 |     
23 |     totalCounts_by_cell = colSums(raw_count)
24 |     saveRDS(totalCounts_by_cell, file = paste0(out_dir, "totalCounts_by_cell.rds"))
25 |     totalCounts_by_cell[totalCounts_by_cell == 0] = 1
26 |     raw_count = sweep(raw_count, MARGIN = 2, 10^6/totalCounts_by_cell, FUN = "*")
27 |     if (min(raw_count) < 0) {
28 |         stop("smallest read count cannot be negative!")
29 |     }
30 |     count_lnorm = log10(raw_count + 1.01)
31 |     return(count_lnorm)
32 | }
33 | 


--------------------------------------------------------------------------------
/inst/docs/scImpute-news.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "scImpute Updates"
 3 | author: "Wei Vivian Li"
 4 | date: "`r Sys.Date()`"
 5 | output: rmarkdown::html_vignette
 6 | #output: pdf_document
 7 | ---
 8 | 
 9 | 
10 | ## Updates
11 | 
12 | > 2018/08/15:
13 | 
14 | -   Version 0.0.9 is released!
15 | -   More robust implementation of dimension reduction.
16 | -   Faster calculation of cell similarity.
17 | 
18 | > 2018/06/27:
19 | 
20 | -   Version 0.0.8 is released!
21 | -   Faster implementation of dimension reduction.
22 | 
23 | > 2018/06/08:
24 | 
25 | -   Version 0.0.7 is released!
26 | -   New option for application on TPM values.
27 | 
28 | > 2018/03/16:
29 | 
30 |  + Version 0.0.6 is released!
31 |  + The scImpute method is published at [*Nature Communications*](https://www.nature.com/articles/s41467-018-03405-7).
32 |  + scImpute now supports input and output in the format of R objects (.rds).
33 |  
34 | > 2018/01/12:
35 | 
36 |  + Version 0.0.5 is released!
37 |  + It is now possible to apply scImpute on just one cell population by setting `Kcluster = 1`.
38 | 
39 | > 2017/10/27:
40 | 
41 |  + Version 0.0.4 is released!
42 |  + scImpute now supports multi-core parallelism.
43 |  
44 | > 2017/10/22:
45 | 
46 |  + Version 0.0.3 is released!
47 |  + Estimation of dropout probabilities is more accurate.
48 |  + Imputation step is more robust.
49 |  + `scimpute()` incorporates a new parameter `Kcluster` to specify the number of cell subpopulations.
50 |  + `scImpute` is now able to detect outlier cells.
51 |  
52 | > 2017/07/01: 
53 |   
54 |   + Version 0.0.2 is released! 
55 |   + This version speeds up the first step in `scImpute`, and the program now completes in a few seconds when applied to a dataset with 10,000 genes and 100 cells (using a single core).


--------------------------------------------------------------------------------
/inst/comparison/1-filter_data.R:
--------------------------------------------------------------------------------
 1 | ###############################################################################
 2 | ## Zeisel data
 3 | 
 4 | ## expression_mRNA_17-Aug-2014.txt
 5 | ## can be downloaded from
 6 | ## https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mRNA_17-Aug-2014.txt
 7 | 
 8 | data_dir = "./"
 9 | x = read.table(paste0(data_dir, "expression_mRNA_17-Aug-2014.txt"), skip = 11,
10 |                 header = FALSE, stringsAsFactors = FALSE,
11 |                 row.names = 1)
12 | x = x[, -1]
13 | cellnames = read.table(paste0(data_dir, "expression_mRNA_17-Aug-2014.txt"), 
14 |                         skip = 7, nrows = 1, row.names = 1, 
15 |                         stringsAsFactors = FALSE)
16 | colnames(x) = cellnames[-1]
17 | 
18 | labels = read.table(paste0(data_dir, "expression_mRNA_17-Aug-2014.txt"), 
19 |                         skip = 1, nrows = 1, row.names = 1, 
20 |                         stringsAsFactors = FALSE)
21 | labels = unlist(labels)
22 | table(labels)
23 | 
24 | 
25 | matching = c("1"="Interneurons", "2"="S1-Pyramidal", "3"="CA1-Pyramidal",
26 |               "4"="Oligodendrocytes", "5"="Microglia", "6"="Endothelial",
27 |               "7" = "Astrocytes", "8" = "Ependymal",
28 |               "9"="Mural")
29 | labels = matching[as.character(labels)]
30 | 
31 | saveRDS(x, paste0(data_dir, "rerun/zeisel_raw.rds"))
32 | saveRDS(labels, paste0(data_dir, "rerun/zeisel_label9.rds"))
33 | 
34 | 
35 | ### level2 classes
36 | labels = read.table(paste0(data_dir, "expression_mRNA_17-Aug-2014.txt"), 
37 |                     skip = 9, nrows = 1, row.names = 1, 
38 |                     stringsAsFactors = FALSE)
39 | labels = unlist(labels)
40 | table(labels)
41 | saveRDS(labels, paste0(data_dir, "rerun/zeisel_label47.rds"))
42 | 
43 | 
44 | 
45 | 
46 | write.csv(x, paste0(data_dir, "rerun/zeisel_raw.csv"), quote = FALSE)
47 | 
48 | 


--------------------------------------------------------------------------------
/inst/comparison/2-run-magic.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 15,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "Calculating MAGIC...\n",
13 |       "Calculating graph and diffusion operator...\n",
14 |       "Calculating PCA...\n",
15 |       "Calculated PCA in 7.59 seconds.\n",
16 |       "Calculating KNN search...\n",
17 |       "Calculated KNN search in 1.40 seconds.\n",
18 |       "Calculating affinities...\n",
19 |       "Calculated affinities in 0.12 seconds.\n",
20 |       "Calculated graph and diffusion operator in 9.47 seconds.\n",
21 |       "Calculating imputation...\n",
22 |       "Automatically selected t = 7\n",
23 |       "Calculated imputation in 0.17 seconds.\n",
24 |       "Calculated MAGIC in 12.30 seconds.\n",
25 |       "--- 21.142620086669922 seconds ---\n"
26 |      ]
27 |     }
28 |    ],
29 |    "source": [
30 |     "import magic\n",
31 |     "import pandas as pd\n",
32 |     "import matplotlib.pyplot as plt\n",
33 |     "import time\n",
34 |     "start_time = time.time()\n",
35 |     "X = pd.read_csv(\"~/rerun/zeisel_raw.csv\",header = 0,index_col=0)\n",
36 |     "X = X.transpose()\n",
37 |     "magic_operator = magic.MAGIC()\n",
38 |     "X_magic = magic_operator.fit_transform(X)\n",
39 |     "print(\"--- %s seconds ---\" % (time.time() - start_time))"
40 |    ]
41 |   },
42 |   {
43 |    "cell_type": "code",
44 |    "execution_count": 20,
45 |    "metadata": {},
46 |    "outputs": [],
47 |    "source": [
48 |     "pd.DataFrame.to_csv(X_magic.transpose(), \"~/rerun/zeisel_magic.csv\")"
49 |    ]
50 |   }
51 |  ],
52 |  "metadata": {
53 |   "kernelspec": {
54 |    "display_name": "Python 3",
55 |    "language": "python",
56 |    "name": "python3"
57 |   },
58 |   "language_info": {
59 |    "codemirror_mode": {
60 |     "name": "ipython",
61 |     "version": 3
62 |    },
63 |    "file_extension": ".py",
64 |    "mimetype": "text/x-python",
65 |    "name": "python",
66 |    "nbconvert_exporter": "python",
67 |    "pygments_lexer": "ipython3",
68 |    "version": "3.6.4"
69 |   }
70 |  },
71 |  "nbformat": 4,
72 |  "nbformat_minor": 2
73 | }
74 | 


--------------------------------------------------------------------------------
/inst/comparison/.ipynb_checkpoints/2-run-magic-checkpoint.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 15,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "Calculating MAGIC...\n",
13 |       "Calculating graph and diffusion operator...\n",
14 |       "Calculating PCA...\n",
15 |       "Calculated PCA in 7.59 seconds.\n",
16 |       "Calculating KNN search...\n",
17 |       "Calculated KNN search in 1.40 seconds.\n",
18 |       "Calculating affinities...\n",
19 |       "Calculated affinities in 0.12 seconds.\n",
20 |       "Calculated graph and diffusion operator in 9.47 seconds.\n",
21 |       "Calculating imputation...\n",
22 |       "Automatically selected t = 7\n",
23 |       "Calculated imputation in 0.17 seconds.\n",
24 |       "Calculated MAGIC in 12.30 seconds.\n",
25 |       "--- 21.142620086669922 seconds ---\n"
26 |      ]
27 |     }
28 |    ],
29 |    "source": [
30 |     "import magic\n",
31 |     "import pandas as pd\n",
32 |     "import matplotlib.pyplot as plt\n",
33 |     "import time\n",
34 |     "start_time = time.time()\n",
35 |     "X = pd.read_csv(\"~/rerun/zeisel_raw.csv\",header = 0,index_col=0)\n",
36 |     "X = X.transpose()\n",
37 |     "magic_operator = magic.MAGIC()\n",
38 |     "X_magic = magic_operator.fit_transform(X)\n",
39 |     "print(\"--- %s seconds ---\" % (time.time() - start_time))"
40 |    ]
41 |   },
42 |   {
43 |    "cell_type": "code",
44 |    "execution_count": 20,
45 |    "metadata": {},
46 |    "outputs": [],
47 |    "source": [
48 |     "pd.DataFrame.to_csv(X_magic.transpose(), \"~/rerun/zeisel_magic.csv\")"
49 |    ]
50 |   }
51 |  ],
52 |  "metadata": {
53 |   "kernelspec": {
54 |    "display_name": "Python 3",
55 |    "language": "python",
56 |    "name": "python3"
57 |   },
58 |   "language_info": {
59 |    "codemirror_mode": {
60 |     "name": "ipython",
61 |     "version": 3
62 |    },
63 |    "file_extension": ".py",
64 |    "mimetype": "text/x-python",
65 |    "name": "python",
66 |    "nbconvert_exporter": "python",
67 |    "pygments_lexer": "ipython3",
68 |    "version": "3.6.4"
69 |   }
70 |  },
71 |  "nbformat": 4,
72 |  "nbformat_minor": 2
73 | }
74 | 


--------------------------------------------------------------------------------
/inst/comparison/4-plot-syn.R:
--------------------------------------------------------------------------------
 1 | library(Rtsne)
 2 | library(ggplot2)
 3 | library(parallel)
 4 | library(ClusterR)
 5 | library(tidyr)
 6 | library(dplyr)
 7 | library(gridExtra)
 8 | library(grid)
 9 | #library(kernlab)
10 | 
plot_dir <- "./plots/"
# NOTE(review): data_dir already ends in "rerun/" and every path below adds
# another "rerun/" component; the sibling scripts use data_dir = "./".
# Left unchanged here -- confirm the intended directory layout.
data_dir <- "./rerun/"
# ggsave() does not create the target directory.
dir.create(plot_dir, recursive = TRUE, showWarnings = FALSE)


### data in Huang et al.
### downloaded from https://www.dropbox.com/sh/ri6fa3mbhvgapqk/AADwOzHfiCcLSqYnX9CTyd7_a?dl=0

### labels
# count_raw / count_samp were referenced below without ever being loaded, so
# the script could not run from a clean session.
# NOTE(review): the file names are inferred from the other scripts
# (zeisel_raw.rds is written by 1-filter_data.R; zeisel_samp.rds is the "syn"
# input read in the t-SNE loop below) -- confirm.
count_raw <- readRDS(paste0(data_dir, "rerun/zeisel_raw.rds"))
count_samp <- readRDS(paste0(data_dir, "rerun/samp/zeisel_samp.rds"))

labels <- readRDS(paste0(data_dir, "rerun/zeisel_label9.rds"))
# Pick the label of every sampled cell by matching its column name against
# the columns of the full matrix.
labels_samp <- labels[match(colnames(count_samp), colnames(count_raw))]

info <- readRDS(paste0(data_dir, "rerun/samp/fig2d_tsne.rds"))
ident <- info[[2]]
labels_Huang <- ident[[4]][[1]]
write.table(table(labels_samp, labels_Huang),
            file = paste0(data_dir, "rerun/samp/label-matrix.txt"))


#####################################################
### tSNE

methods <- c("syn", "scImpute", "MAGIC", "SAVER")
name_appends <- c("samp", "samp_scimpute", "samp_magic", "samp_saver")
names(name_appends) <- methods

# Number of t-SNE dimensions (kept out of the name `dim`, which would shadow
# base::dim()).
n_dims <- 2

for (method in methods) {
  set.seed(1234)
  print(method)
  # A separate name keeps the full raw matrix loaded above intact.
  count_method <- readRDS(paste0(data_dir, "rerun/samp/zeisel_",
                                 name_appends[method], ".rds"))
  if (method == "SAVER") count_method <- count_method$estimate
  count <- log10(count_method + 1)
  tsne <- Rtsne(t(count), dims = n_dims)$Y
  saveRDS(tsne, file = paste0(data_dir, "rerun/samp/zeisel-", method,
                              "-tsne", n_dims, ".rds"))
  gc()
}


### tSNE
data <- lapply(methods, function(method) {
  tsne <- readRDS(file = paste0(data_dir, "rerun/samp/zeisel-", method,
                                "-tsne2", ".rds"))
  pdata <- data.frame(tSNE1 = tsne[, 1], tSNE2 = tsne[, 2],
                      type = labels_samp)
  pdata$method <- method
  pdata
})
data <- do.call(rbind, data)
data$method <- factor(data$method, levels = methods)
gt <- ggplot(data, aes(x = tSNE1, y = tSNE2, color = type)) +
  geom_point(alpha = 0.8, cex = 0.8) + facet_wrap(~method, nrow = 1) +
  theme_bw() +
  theme(strip.background = element_blank(),
        legend.position = "bottom",
        text = element_text(size = 12))
ggsave(paste0(plot_dir, "Fig-samp.pdf"), gt, width = 11, height = 4)
65 | 
66 | 


--------------------------------------------------------------------------------
/R/get_mix_parameters.R:
--------------------------------------------------------------------------------
 1 | ### root-finding equation
 2 | fn = function(alpha, target){
 3 |   log(alpha) - digamma(alpha) - target
 4 | }
 5 | 
 6 | ### update parameters in gamma distribution
 7 | update_gmm_pars = function(x, wt){
 8 |   tp_s = sum(wt)
 9 |   tp_t = sum(wt * x)
10 |   tp_u = sum(wt * log(x))
11 |   tp_v = -tp_u / tp_s - log(tp_s / tp_t)
12 |   if (tp_v <= 0){
13 |     alpha = 20
14 |   }else{
15 |     alpha0 = (3 - tp_v + sqrt((tp_v - 3)^2 + 24 * tp_v)) / 12 / tp_v
16 |     if (alpha0 >= 20){alpha = 20
17 |     }else{
18 |       alpha = uniroot(fn, c(0.9, 1.1) * alpha0, target = tp_v, 
19 |                       extendInt = "yes")$root
20 |     }
21 |   }
22 |   ## need to solve log(x) - digamma(x) = tp_v
23 |   ## We use this approximation to compute the initial value
24 |   beta = tp_s / tp_t * alpha
25 |   return(c(alpha, beta))
26 | }
27 | 
### estimate parameters in the mixture distribution
# Iteratively fit the gamma-normal mixture to one vector of values.
#
# xdata : numeric vector of values.
# point : the value treated as the baseline; the share of entries equal to it
#         seeds the gamma weight, and entries above it seed the normal
#         component.
#
# Returns the fitted parameter vector
# (gamma weight, gamma shape, gamma rate, normal mean, normal sd).
get_mix <- function(xdata, point) {
  # Starting values.
  rate0 <- sum(xdata == point) / length(xdata)
  if (rate0 == 0) {
    rate0 <- 0.01
  }
  above_point <- xdata[xdata > point]
  mean0 <- mean(above_point)
  sd0 <- sd(above_point)
  if (is.na(sd0)) {
    sd0 <- 0
  }
  paramt <- c(rate0, 0.5, 1, mean0, sd0)

  change <- 10
  n_iter <- 0
  prev_loglik <- 0

  # Iterate until the squared change in log-likelihood drops to 0.5 or below,
  # with a hard stop once the iteration counter exceeds 100.
  while (change > 0.5) {
    wt <- calculate_weight(xdata, paramt)
    w_gamma <- wt[, 1]
    w_normal <- wt[, 2]

    paramt[1] <- sum(w_gamma) / nrow(wt)
    paramt[4] <- sum(w_normal * xdata) / sum(w_normal)
    paramt[5] <- sqrt(sum(w_normal * (xdata - paramt[4])^2) / sum(w_normal))
    paramt[2:3] <- update_gmm_pars(x = xdata, wt = w_gamma)

    loglik <- sum(log10(dmix(xdata, paramt)))
    change <- (loglik - prev_loglik)^2
    prev_loglik <- loglik

    n_iter <- n_iter + 1
    if (n_iter > 100) {
      break
    }
  }
  paramt
}
58 | 
# Fit the gamma-normal mixture to every row of `count` and save the result.
#
# count  : matrix-like object; each row is fitted separately.
# point  : baseline value; rows whose entries sum to point * ncol(count)
#          (within 1e-10) are not fitted and get a row of NA.
# path   : file the parameter matrix is written to with saveRDS().
# ncores : number of cores passed to mclapply().
#
# The saved matrix has one row per row of `count` and the columns
# "rate", "alpha", "beta", "mu", "sigma". Rows whose fit raised an error are
# NA. Returns 0.
get_mix_parameters <- function(count, point = log10(1.01), path, ncores = 8) {
  count <- as.matrix(count)
  null_genes <- which(abs(rowSums(count) - point * ncol(count)) < 1e-10)

  # seq_len() keeps the loop empty for a zero-row matrix (1:0 would not).
  parslist <- mclapply(seq_len(nrow(count)), function(ii) {
    if (ii %% 2000 == 0) {
      gc()
      print(ii)
    }
    if (ii %in% null_genes) {
      return(rep(NA, 5))
    }
    paramt <- try(get_mix(count[ii, ], point), silent = TRUE)
    if (inherits(paramt, "try-error")) {
      paramt <- rep(NA, 5)
    }
    paramt
  }, mc.cores = ncores)

  # do.call(rbind, ...) always yields a matrix, including for a single row,
  # where folding with Reduce() returned a bare vector and broke colnames<-.
  if (length(parslist) > 0) {
    parslist <- do.call(rbind, parslist)
  } else {
    parslist <- matrix(NA_real_, nrow = 0, ncol = 5)
  }
  colnames(parslist) <- c("rate", "alpha", "beta", "mu", "sigma")
  saveRDS(parslist, file = path)
  0
}
85 | 
86 | 


--------------------------------------------------------------------------------
/man/scimpute.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/scimpute.R
 3 | \name{scimpute}
 4 | \alias{scimpute}
 5 | \title{use scImpute to impute dropout values in scRNA-seq data}
 6 | \usage{
 7 | scimpute(count_path, infile = "csv", outfile = "csv", type = "count",
 8 |   out_dir, labeled = FALSE, drop_thre = 0.5, Kcluster = NULL,
 9 |   labels = NULL, genelen = NULL, ncores = 5)
10 | }
11 | \arguments{
12 | \item{count_path}{A character specifying the full path of the raw count matrix;}
13 | 
14 | \item{infile}{A character specifying the type of file storing the raw count matrix;
15 | can be "csv", "txt", or "rds". The input file should have rows representing genes and
16 | columns representing cells, with its first row as cell names 
17 | and first column as gene names.}
18 | 
19 | \item{outfile}{A character specifying the type of file storing the imputed count matrix;
20 | can be "csv", "txt", or "rds".}
21 | 
22 | \item{type}{A character specifying the type of values in the expression matrix. 
23 | Can be "count" (default) or "TPM".}
24 | 
25 | \item{out_dir}{A character specifying the full path of the output directory, 
26 | which is used to store all intermediate and final outputs.}
27 | 
28 | \item{labeled}{A logical value indicating whether cell type information is available.
29 | \code{labels} must be specified if \code{labeled = TRUE}.}
30 | 
31 | \item{drop_thre}{A number between 0 and 1, 
32 | specifying the threshold to determine dropout values.}
33 | 
34 | \item{Kcluster}{An integer specifying the number of cell subpopulations. 
35 | This parameter can be determined based on prior knowledge or clustering of raw data.
36 | \code{Kcluster} is used to determine the candidate neighbors of each cell.}
37 | 
38 | \item{labels}{A character vector specifying the cell type of 
39 | each column in the raw count matrix. Only needed when \code{labeled = TRUE}.
40 | Each cell type should have at least two cells for imputation.}
41 | 
42 | \item{genelen}{An integer vector giving the length of each gene. 
43 | Order must match the gene orders in the expression matrix.
44 | \code{genelen} must be specified if \code{type = "TPM"}.}
45 | 
46 | \item{ncores}{An integer specifying the number of cores used for parallel computation.}
47 | }
48 | \value{
49 | scImpute returns a vector giving the column indices of outlier cells.
50 | It saves the imputed count matrix to scimpute_count.csv, scimpute_count.txt, or scimpute_count.rds 
51 | (depending on \code{outfile}) to \code{out_dir}.
52 | }
53 | \description{
54 | use scImpute to impute dropout values in scRNA-seq data
55 | }
56 | \references{
57 | Li, W. V., & Li, J. J. (2018). An accurate and robust imputation method 
58 | scImpute for single-cell RNA-seq data. \emph{Nature Communications}, 9(1), 997.
59 | }
60 | \author{
61 | Wei Vivian Li, \email{liw@ucla.edu}
62 | 
63 | Jingyi Jessica Li, \email{jli@stat.ucla.edu}
64 | }
65 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | scImpute: accurate and robust imputation of scRNA-seq data
 2 | ================
 3 | Wei Vivian Li, Jingyi Jessica Li
 4 | 2019-08-20
 5 | 
 6 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 7 | Latest News
 8 | -----------
 9 | 
10 | > 2019/08/20:
11 | 
12 | -   Since the development of scImpute, new imputation methods have been proposed for scRNA-seq data. These methods have different model assumptions and diverse performances on different datasets. Discussing and comparing existing imputation methods contributes to both method development and bioinformatic applications. However, we have noticed several issues in the existing evaluation and comparison of imputation methods, and we discuss these issues in our commentary, which is available at [arxiv](https://arxiv.org/abs/1908.07084).
13 | 
14 | > 2018/08/15:
15 | 
16 | -   Version 0.0.9 is released!
17 | -   More robust implementation of dimension reduction.
18 | -   Faster calculation of cell similarity.
19 | 
20 | Introduction
21 | ------------
22 | 
23 | `scImpute` is developed to accurately and robustly impute the dropout values in scRNA-seq data. `scImpute` can be applied to raw read count matrix before the users perform downstream analyses such as
24 | 
25 | -   dimension reduction of scRNA-seq data
26 | -   normalization of scRNA-seq data
27 | -   clustering of cell populations
28 | -   differential gene expression analysis
29 | -   time-series analysis of gene expression dynamics
30 | 
31 | The users can refer to our paper [An accurate and robust imputation method scImpute for single-cell RNA-seq data](https://www.nature.com/articles/s41467-018-03405-7) for a detailed description of the modeling and applications.
32 | 
33 | Any suggestions on the package are welcome! For technical problems, please report to [Issues](https://github.com/Vivianstats/scImpute/issues). For suggestions and comments on the method, please contact Wei (<liw@ucla.edu>) or Dr. Jessica Li (<jli@stat.ucla.edu>).
34 | 
35 | Installation
36 | ------------
37 | 
38 | The package is not on CRAN yet. To install it, please run the following code in `R`
39 | 
40 | ``` r
41 | install.packages("devtools")
42 | library(devtools)
43 | 
44 | install_github("Vivianstats/scImpute")
45 | ```
46 | 
47 | Quick start
48 | -----------
49 | 
50 | `scImpute` can be easily incorporated into existing pipeline of scRNA-seq analysis. Its only input is the raw count matrix with rows representing genes and columns representing cells. It will output an imputed count matrix with the same dimension. In the simplest case, the imputation task can be done with one single function `scimpute`:
51 | 
52 | ``` r
53 | scimpute(# full path to raw count matrix
54 |          count_path = system.file("extdata", "raw_count.csv", package = "scImpute"), 
55 |          infile = "csv",           # format of input file
56 |          outfile = "csv",          # format of output file
57 |          out_dir = "./",           # full path to output directory
58 |          labeled = FALSE,          # cell type labels not available
59 |          drop_thre = 0.5,          # threshold set on dropout probability
60 |          Kcluster = 2,             # 2 cell subpopulations
61 |          ncores = 10)              # number of cores used in parallel computation
62 | ```
63 | 
64 | This function returns the column indices of outlier cells, and creates a new file `scimpute_count.csv` in `out_dir` to store the imputed count matrix. Please note that we recommend applying scImpute on the whole-genome count matrix. A filtering step on genes is acceptable but most genes should be present to ensure robust identification of dropouts.
65 | 
66 | For detailed usage, please refer to the package [manual](https://github.com/Vivianstats/scImpute/blob/master/inst/docs/) or [vignette](https://github.com/Vivianstats/scImpute/blob/master/vignettes/scImpute-vignette.Rmd).
67 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "scImpute: accurate and robust  imputation of scRNA-seq data"
 3 | author: "Wei Vivian Li, Jingyi Jessica Li"
 4 | 
 5 | date: "`r Sys.Date()`"
 6 | output: github_document
 7 | ---
 8 | 
 9 | <!-- README.md is generated from README.Rmd. Please edit that file -->
10 | 
11 | ```{r, echo = FALSE}
12 | knitr::opts_chunk$set(
13 |   collapse = TRUE,
14 |   comment = "#>",
15 |   fig.path = "README-"
16 | )
17 | ```
18 | 
19 | ## Latest News
20 | 
21 | > 2019/08/20:
22 | 
23 | -  Since the development of scImpute, new imputation methods have been proposed for scRNA-seq data. These methods have different model assumptions and diverse performances on different datasets. It contributes to both method development and bioinformatic applications to discuss and compare existing imputation methods. However, we have identified several issues in existing evaluations and comparisons of imputation methods and discuss these issues in our commentary, which is available at [arXiv](https://arxiv.org/abs/1908.07084).
24 | 
25 | > 2018/08/15:
26 | 
27 | -   Version 0.0.9 is released!
28 | -   More robust implementation of dimension reduction.
29 | -   Faster calculation of cell similarity.
30 | 
31 | ## Introduction
32 | `scImpute` is developed to accurately and robustly impute the dropout values in scRNA-seq data. `scImpute` can be applied to raw read count matrix before the users perform downstream analyses such as
33 | 
34 | - dimension reduction of scRNA-seq data
35 | - normalization of scRNA-seq data
36 | - clustering of cell populations
37 | - differential gene expression analysis
38 | - time-series analysis of gene expression dynamics
39 | 
40 | The users can refer to our paper [An accurate and robust imputation method scImpute for single-cell RNA-seq data](https://www.nature.com/articles/s41467-018-03405-7) for a detailed description of the modeling and applications.
41 | 
42 | Any suggestions on the package are welcome! For technical problems, please report to [Issues](https://github.com/Vivianstats/scImpute/issues). For suggestions and comments on the method, please contact Wei (<liw@ucla.edu>) or Dr. Jessica Li (<jli@stat.ucla.edu>). 
43 | 
44 | ## Installation
45 | The package is not on CRAN yet. For installation please use the following codes in `R`
46 | ```{r eval = FALSE}
47 | install.packages("devtools")
48 | library(devtools)
49 | 
50 | install_github("Vivianstats/scImpute")
51 | ```
52 | 
53 | ## Quick start
54 | 
55 | `scImpute` can be easily incorporated into existing pipeline of scRNA-seq analysis.
56 | Its only input is the raw count matrix with rows representing genes and columns representing cells. It will output an imputed count matrix with the same dimension.
57 | In the simplest case, the imputation task can be done with one single function `scimpute`:
58 | ```{r eval = FALSE}
59 | scimpute(# full path to raw count matrix
60 |          count_path = system.file("extdata", "raw_count.csv", package = "scImpute"), 
61 |          infile = "csv",           # format of input file
62 |          outfile = "csv",          # format of output file
63 |          out_dir = "./",           # full path to output directory
64 |          labeled = FALSE,          # cell type labels not available
65 |          drop_thre = 0.5,          # threshold set on dropout probability
66 |          Kcluster = 2,             # 2 cell subpopulations
67 |          ncores = 10)              # number of cores used in parallel computation
68 | ```
69 | This function returns the column indices of outlier cells, and creates a new file `scimpute_count.csv` in `out_dir` to store the imputed count matrix. Please note that we recommend applying scImpute on the whole-genome count matrix. A filtering step on genes is acceptable but most genes should be present to ensure robust identification of dropouts. 
70 | 
71 | For detailed usage, please refer to the package [manual](https://github.com/Vivianstats/scImpute/blob/master/inst/docs/) or [vignette](https://github.com/Vivianstats/scImpute/blob/master/vignettes/scImpute-vignette.Rmd).
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/R/scimpute.R:
--------------------------------------------------------------------------------
 1 | #' use scImpute to impute dropout values in scRNA-seq data
 2 | #'
 3 | #' @param count_path A character specifying the full path of the raw count matrix.
 4 | #' @param infile A character specifying the type of file storing the raw count matrix;
 5 | #' can be "csv", "txt", or "rds". The input file should have rows representing genes and
 6 | #' columns representing cells, with its first row as cell names 
 7 | #' and first column as gene names.
 8 | #' @param outfile A character specifying the type of file storing the imputed count matrix;
 9 | #' can be "csv", "txt", or "rds".
10 | #' @param out_dir A character specifying the full path of the output directory, 
11 | #' which is used to store all intermediate and final outputs. It is created if it does not exist.
12 | #' @param type A character specifying the type of values in the expression matrix. 
13 | #' Can be "count" (default) or "TPM".
14 | #' @param labeled A logical value indicating whether cell type information is available.
15 | #' \code{labels} must be specified if \code{labeled = TRUE}.
16 | #' @param genelen An integer vector giving the length of each gene. 
17 | #' Order must match the gene orders in the expression matrix.
18 | #' \code{genelen} must be specified if \code{type = "TPM"}.
19 | #' @param drop_thre A number between 0 and 1, 
20 | #' specifying the threshold to determine dropout values.
21 | #' @param Kcluster An integer specifying the number of cell subpopulations. 
22 | #' This parameter can be determined based on prior knowledge or clustering of raw data.
23 | #' \code{Kcluster} is used to determine the candidate neighbors of each cell; it must be specified if \code{labeled = FALSE}.
24 | #' @param labels A character vector specifying the cell type of 
25 | #' each column in the raw count matrix. Only needed when \code{labeled = TRUE}.
26 | #' Each cell type should have at least two cells for imputation.
27 | #' @param ncores An integer specifying the number of cores used for parallel computation.
28 | #' @return scImpute returns a vector giving the column indices of outlier cells.
29 | #' It saves the imputed count matrix to scimpute_count.csv, scimpute_count.txt, or scimpute_count.rds 
30 | #' (depending on \code{outfile}) to \code{out_dir}.
31 | #' @export
32 | #' @import parallel
33 | #' @import doParallel
34 | #' @import foreach
35 | #' @importFrom stats complete.cases dgamma dnorm prcomp quantile rgamma rnorm sd uniroot
36 | #' @importFrom kernlab specc
37 | #' @import penalized 
38 | #' @importFrom utils read.csv read.table write.csv write.table
39 | #' @importFrom rsvd rpca
40 | #' @author Wei Vivian Li, \email{liw@ucla.edu}
41 | #' @author Jingyi Jessica Li, \email{jli@stat.ucla.edu}
42 | #' @references Li, W. V., & Li, J. J. (2018). An accurate and robust imputation method 
43 | #' scImpute for single-cell RNA-seq data. \emph{Nature Communications}, 9(1), 997.
44 | scimpute <-
45 | function (count_path, infile = "csv", outfile = "csv", type = "count", out_dir, labeled = FALSE, 
46 |           drop_thre = 0.5, Kcluster = NULL, labels = NULL, genelen = NULL, ncores = 5) 
47 | {   
48 |     if(labeled == TRUE && is.null(labels)){
49 |       stop("'labels' must be specified when 'labeled = TRUE'!")
50 |     }
51 |     if(labeled == FALSE && is.null(Kcluster)){
52 |       stop("'Kcluster' must be specified when 'labeled = FALSE'!")
53 |     }
54 |     if(!(type %in% c("count", "TPM"))){ stop("expression values can be either 'count' or 'TPM'!") }
55 |     if(type == "TPM" && is.null(genelen)){ stop("'genelen' must be specified when type = 'TPM'!") }
56 |   
57 |     # out_dir is created only when missing, so an existing directory (e.g. "./") raises no warning
58 |     print("reading in raw count matrix ...")
59 |     if(!dir.exists(out_dir)){ dir.create(out_dir, recursive = TRUE) }
60 |     count_lnorm = read_count(filetype = infile, path = count_path, out_dir = out_dir, 
61 |                              type = type, genelen = genelen)
62 |     print("reading finished!")
63 |     
64 |     if(labeled == TRUE){
65 |       if(length(labels) != ncol(count_lnorm)){
66 |         stop("number of cells does not match number of labels !")
67 |       }
68 |     }
69 |     genenames = rownames(count_lnorm)
70 |     cellnames = colnames(count_lnorm)
71 |     
72 |     print("imputation starts ...")
73 |     if (labeled == FALSE){
74 |       res_imp = imputation_model8(count = count_lnorm, labeled = FALSE, 
75 |                                   point = log10(1.01), drop_thre = drop_thre, 
76 |                                   Kcluster = Kcluster, 
77 |                                   out_dir = out_dir, ncores = ncores)
78 |     }else{
79 |       res_imp = imputation_wlabel_model8(count = count_lnorm, labeled = TRUE, 
80 |                                          cell_labels = labels, point = log10(1.01), 
81 |                                          drop_thre = drop_thre, 
82 |                                          Kcluster = NULL, out_dir = out_dir, 
83 |                                          ncores = ncores)
84 |     }
85 |     count_imp = res_imp$count_imp
86 |     outliers = res_imp$outlier
87 |     count_imp = 10^count_imp - 1.01  # presumably inverts a log10(x + 1.01) transform done in read_count -- confirm
88 |     rownames(count_imp) = genenames
89 |     colnames(count_imp) = cellnames
90 |     print("writing imputed count matrix ...")
91 |     write_count(count_imp, filetype = outfile, out_dir, type = type, genelen = genelen)
92 |     return(outliers)
93 | }
94 | 
95 | 
96 | 
97 | 


--------------------------------------------------------------------------------
/inst/comparison/4-plot.R:
--------------------------------------------------------------------------------
  1 | library(Rtsne)
  2 | library(ggplot2)
  3 | library(parallel)
  4 | library(ClusterR)
  5 | library(tidyr)
  6 | library(dplyr)
  7 | library(gridExtra)
  8 | library(grid)
  9 | #library(kernlab)
 10 | ###source("~/Dropbox/Rpkgs-dev/scimpute_dev/diagnosis/SAVER-paper/v2/comparison/supp.R")
 11 | 
 12 | plot_dir = "./plots/"
 13 | data_dir = "./rerun/"
 14 | 
 15 | 
 16 | #####################################################
 17 | ### data characteristics
 18 | count_raw = readRDS(paste0(data_dir, "rerun/zeisel_raw.rds"))
 19 | ### data in Huang et al.
 20 | ### downloaded from https://www.dropbox.com/sh/ri6fa3mbhvgapqk/AADwOzHfiCcLSqYnX9CTyd7_a?dl=0
 21 | count_samp = readRDS(paste0(data_dir, "zeisel_samp.rds"))
 22 | 
 23 | ### per-row mean and sd (both as log10(x + 1)) and fraction of zero entries,
 24 | ### tagged with the name of the data set they were computed from
 25 | gene_summary = function(count, tag){
 26 |   stats = data.frame(mean = log10(rowMeans(count)+1),
 27 |                      sd = log10(apply(count, 1, sd)+1),
 28 |                      zero = rowSums(count == 0)/ncol(count))
 29 |   stats$data = tag
 30 |   return(stats)
 31 | }
 32 | summary = rbind(gene_summary(count_raw, "raw"),
 33 |                 gene_summary(count_samp, "SAVER-paper"))
 34 | 
 35 | ### layers shared by all panels
 36 | ### (the legend is suppressed in every panel)
 37 | two_cols = c("#999999", "#CC0C00")
 38 | panel_theme = theme_bw() +
 39 |   theme(legend.position = "none",
 40 |         text = element_text(size=14))
 41 | 
 42 | ### Fig2a: sd and zero fraction against the mean
 43 | g1 = ggplot(summary, aes(x = mean, y = sd, color = data)) +
 44 |   geom_point(alpha = 0.6, cex = 0.2) +
 45 |   labs(x = "log10(mean + 1)", y = "log10(sd + 1)") +
 46 |   scale_color_manual(values = two_cols) +
 47 |   panel_theme
 48 | g2 = ggplot(summary, aes(x = mean, y = zero, color = data)) +
 49 |   geom_point(alpha = 0.6, cex = 0.2) +
 50 |   labs(x = "log10(mean + 1)", y = "zero fraction") +
 51 |   scale_color_manual(values = two_cols) +
 52 |   panel_theme
 53 | g = arrangeGrob(g1, g2, nrow = 2)
 54 | ggsave(paste0(plot_dir, "Fig2a.pdf"), g, width = 3, height = 5)
 55 | 
 56 | ### Fig2b: density of each summary statistic
 57 | g3 = ggplot(summary, aes(x = mean, fill = data)) +
 58 |   geom_density(alpha = 0.6) +
 59 |   xlim(c(0,1.5)) +
 60 |   xlab("log10(mean + 1)") +
 61 |   scale_fill_manual(values = two_cols) +
 62 |   panel_theme
 63 | g4 = ggplot(summary, aes(x = sd, fill = data)) +
 64 |   geom_density(alpha = 0.6) +
 65 |   xlim(c(0,1.5)) +
 66 |   xlab("log10(sd + 1)") +
 67 |   scale_fill_manual(values = two_cols) +
 68 |   panel_theme
 69 | g5 = ggplot(summary, aes(x = zero, fill = data)) +
 70 |   geom_density(alpha = 0.6) +
 71 |   xlab("zero fraction") +
 72 |   scale_fill_manual(values = two_cols) +
 73 |   panel_theme
 74 | gg = arrangeGrob(g3, g4, g5, nrow = 3)
 75 | ggsave(paste0(plot_dir, "Fig2b.pdf"), gg, width = 3, height = 5)
 76 | 
 77 | 
 78 | 
 79 | #####################################################
 80 | ### tSNE
 81 | 
 82 | methods = c("Raw", "scImpute", "MAGIC", "SAVER")
 83 | ### file-name fragment of each method, indexed by method name
 84 | name_appends = setNames(c("raw", "scimpute_k9", "magic", "saver"), methods)
 85 | labels = readRDS(paste0(data_dir, "rerun/zeisel_label9.rds"))
 86 | 
 87 | # for(method in methods){
 88 | #   set.seed(1234)
 89 | #   print(method)
 90 | #   dim = 2
 91 | #   count_raw = readRDS(paste0(data_dir, "rerun/zeisel_", name_appends[method], ".rds"))
 92 | #   if(method == "SAVER") count_raw = count_raw$estimate
 93 | #   count = log10(count_raw + 1)
 94 | #   tsne = Rtsne(t(count), dims = dim)$Y
 95 | #   saveRDS(tsne, file = paste0(data_dir, "rerun/zeisel-", method, "-tsne", dim, ".rds"))
 96 | #   gc()
 97 | # }
 98 | 
 99 | 
100 | ### tSNE: stack the saved coordinates (columns 1 and 2) of all methods, one facet per method
101 | load_tsne = function(method){
102 |   emb = readRDS(file = paste0(data_dir, "rerun/zeisel-", method,
103 |                               "-tsne2", ".rds"))
104 |   out = data.frame(tSNE1 = emb[,1], tSNE2 = emb[,2], type = labels)
105 |   out$method = method
106 |   out
107 | }
108 | data = do.call(rbind, lapply(methods, load_tsne))
109 | data$method = factor(data$method, levels = c("Raw", "scImpute", "MAGIC", "SAVER"))
110 | gt = ggplot(data, aes(x = tSNE1, y = tSNE2, color = type)) +
111 |   facet_wrap(~method, nrow = 1) +
112 |   geom_point(alpha = 0.8, cex = 0.8) +
113 |   theme_bw() +
114 |   theme(text = element_text(size=12), legend.position = "bottom",
115 |         strip.background = element_blank())
116 | ggsave(paste0(plot_dir,"Fig2e.pdf"), gt, width = 11, height = 4)
117 | 
118 | 
119 | 
120 | ##########################################################
121 | ### clustering
122 | ### hierarchical clustering of the leading PCA columns of each method, scored
123 | ### against `labels`; error bars are standard deviations over bootstrap replicates
124 | 
125 | B = 100     # number of bootstrap replicates
126 | J = 3005    # each replicate samples J indices from 1:J with replacement
127 | n_pc = 10   # leading columns kept from each PCA matrix
128 | measures = c("jaccard_index", "adjusted_rand_index", "purity", "nmi")
129 | measure_names = c("Jaccard index", "adjusted Rand index", "purity", "nmi")
130 | ## at most 36 workers, and never more than detectCores() reports
131 | n_workers = min(36, detectCores(), na.rm = TRUE)
132 | 
133 | ## read every PCA matrix once instead of once per replicate, method and kk
134 | pca_mats = lapply(methods, function(method){
135 |   readRDS(paste0(data_dir, "rerun/zeisel-", method, "-pca.rds"))[, 1:n_pc]
136 | })
137 | names(pca_mats) = methods
138 | 
139 | ## cut a median-linkage tree of the rows of `mat` into kk clusters and
140 | ## return one agreement score with `truth` per entry of `measures`
141 | score_clusters = function(mat, truth, kk){
142 |   clusts = hclust(dist(mat), method = "median")
143 |   Clabel = as.numeric(cutree(clusts, kk))
144 |   vapply(measures, function(x){
145 |     external_validation(as.numeric(factor(truth)), Clabel, method = x)
146 |   }, numeric(1))
147 | }
148 | 
149 | ## measures x methods matrix -> long table (measure, metric, value); `metric` holds the method name
150 | to_long = function(val){
151 |   tab = as.data.frame(val)
152 |   colnames(tab) = methods
153 |   tab$measure = measure_names
154 |   tab %>% gather(metric, value, -measure)
155 | }
156 | 
157 | for(kk in c(9,47)){
158 |   print(kk)
159 |   temp_res = mclapply(seq_len(B), function(b){
160 |     set.seed(b)
161 |     if(b %% 20 == 0) print(b)
162 |     ind = sample(1:J, J, replace = TRUE)
163 |     val = vapply(pca_mats, function(mat){
164 |       score_clusters(mat[ind, ], labels[ind], kk)
165 |     }, numeric(length(measures)))
166 |     gc()
167 |     to_long(val)
168 |   }, mc.cores = n_workers)
169 |   ## mclapply returns try-error objects for failed workers; do not rbind those
170 |   failed = vapply(temp_res, inherits, logical(1), what = "try-error")
171 |   if(any(failed)) stop(sum(failed), " bootstrap replicate(s) failed")
172 |   da = do.call(rbind, temp_res)
173 | 
174 |   ## scores on the full data, then the sd of each score over the bootstrap replicates
175 |   mat = to_long(vapply(pca_mats, score_clusters, numeric(length(measures)),
176 |                        truth = labels, kk = kk))
177 |   mat$sd = vapply(seq_len(nrow(mat)), function(i){
178 |     boot = dplyr::filter(da, measure == mat[i,"measure"] & metric == mat[i,"metric"])
179 |     sd(boot$value)
180 |   }, numeric(1))
181 | 
182 |   mat$metric = factor(mat$metric, levels = c("Raw", "scImpute", "MAGIC", "SAVER"))
183 |   p_clust = ggplot(mat, aes(x = metric, y = value, fill = metric)) +
184 |     geom_bar(stat = "identity", width = .7) + 
185 |     facet_wrap(~measure, nrow = 4, scales = "free") +
186 |     geom_errorbar(aes(ymin = value - sd, ymax = value+sd), width = .2) +
187 |     theme_bw() + ylab("") + ylim(0,NA) +
188 |     theme(strip.background = element_rect(colour="white", fill="white"),
189 |           text = element_text(size=12),
190 |           axis.text.x=element_text(size=8),
191 |           axis.ticks.x=element_blank(),
192 |           legend.position = "none") +
193 |     scale_fill_manual(values = c("#999999", "#56B4E9", "#CC79A7", "#E69F00"))
194 |   ggsave(paste0(plot_dir, "Fig2c-kk", kk, ".pdf"), p_clust, width = 2.5, height = 5)
195 | }
196 | 


--------------------------------------------------------------------------------
/vignettes/scImpute-vignette.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Introduction to scImpute"
  3 | author: "Wei Vivian Li, Jingyi Jessica Li"
  4 | # author:
  5 | # - name: Wei Vivian Li, Jingyi Jessica Li
  6 | #   affiliation: 
  7 | #   - Department of Statistics, University of California, Los Angeles
  8 | date: "`r Sys.Date()`"
  9 | output: rmarkdown::html_vignette
 10 | #output: pdf_document
 11 | vignette: >
 12 |   %\VignetteIndexEntry{scImpute: accurate and robust imputation for scRNA-seq data}
 13 |   %\VignetteEngine{knitr::rmarkdown}
 14 |   %\VignetteEncoding{UTF-8}
 15 | ---
 16 | 
 17 | 
 18 | The emerging single cell RNA sequencing (scRNA-seq) technologies enable the investigation of transcriptomic landscape at single-cell resolution. However, scRNA-seq analysis is complicated by the excess of zero or near zero counts in the data, which are the so-called dropouts due to low amounts of mRNA within each individual cell. Consequently, downstream analysis of scRNA-seq would be severely biased if the dropout events are not properly corrected. `scImpute` is developed to accurately and efficiently impute the dropout values in scRNA-seq data.
 19 | 
 20 | `scImpute` can be applied to raw data count before the users perform downstream analyses such as
 21 | 
 22 | - dimension reduction of scRNA-seq data
 23 | - normalization of scRNA-seq data
 24 | - clustering of cell populations
 25 | - differential gene expression analysis
 26 | - time-series analysis of gene expression dynamics
 27 | 
 28 | 
 29 | ## Quick start
 30 | 
 31 | `scImpute` can be easily incorporated into existing pipeline of scRNA-seq analysis.
 32 | Its only input is the raw count matrix with rows representing genes and columns representing cells. It will output an imputed count matrix with the same dimension.
 33 | In the simplest case, the imputation task can be done with one single function `scimpute`:
 34 | ```{r eval = FALSE}
 35 | scimpute(# full path to raw count matrix
 36 |          count_path = system.file("extdata", "raw_count.csv", package = "scImpute"), 
 37 |          infile = "csv",           # format of input file
 38 |          outfile = "csv",          # format of output file
 39 |          out_dir = "./",           # full path to output directory
 40 |          labeled = FALSE,          # cell type labels not available
 41 |          drop_thre = 0.5,          # threshold set on dropout probability
 42 |          Kcluster = 2,             # 2 cell subpopulations
 43 |          ncores = 10)              # number of cores used in parallel computation
 44 | ```
 45 | This function returns the column indices of outlier cells, and creates a new file `scimpute_count.csv` in `out_dir` to store the imputed count matrix.
 46 | 
 47 | 
 48 | ## Step-by-step description
 49 | 
 50 | The input file can be a `.csv` file, `.txt` file, or `.rds` file. In all cases, the **first column** should give the gene names and the **first row** should give the cell names. We use the example files in the package as illustration. If the raw counts are stored in a `.csv` file, and we also hope to output the imputed matrix into a `.csv` file, then specify this information with
 51 | ```{r eval = FALSE}
 52 | # full path of the input file
 53 | count_path = system.file("extdata", "raw_count.csv", package = "scImpute")
 54 | infile = "csv"
 55 | outfile = "csv"
 56 | ```
 57 | Similarly, if the raw counts are stored in a `.txt` file, and we also hope to output the imputed matrix into a `.txt` file, then specify this information with
 58 | ```{r eval = FALSE}
 59 | # full path of the input file
 60 | count_path = system.file("extdata", "raw_count.txt", package = "scImpute")
 61 | infile = "txt"
 62 | outfile = "txt"
 63 | ```
 64 | Next, we need to set up the directory to store all the temporary and final outputs:
 65 | ```{r eval = FALSE}
 66 | # a '/' sign is necessary at the end of the path
 67 | out_dir = "~/output/"
 68 | ```
 69 | 
 70 | 
 71 | We highly recommend using parallel computing with `scImpute`, which will significantly reduce the computation time. Suppose we would like to use 20 cores, then we can run the `scimpute` function with `ncores = 20`.
 72 | 
 73 | `scImpute` has two statistical parameters. 
 74 | The **first parameter is `Kcluster`**, which determines the **number of initial clusters** to help identify candidate neighbors of each cell. The imputation results do not heavily rely on the choice of `Kcluster`, since `scImpute` uses a model-based method to select similar cells in a later stage. `Kcluster` can be specified based on the number of known cell types and users' biological expertise, and it may also be learned by clustering the raw data and inspecting the clustering results.
 75 | The **second parameter** is `drop_thre`. Only the values that have **dropout probability** larger than `drop_thre` are imputed by `scImpute`. A default threshold `drop_thre = 0.5` is sufficient for most scRNA-seq data. 
 76 | 
 77 | Now to get the imputed matrix, all we need is the main `scimpute` function
 78 | ```{r eval = FALSE}
 79 | Kcluster = 2
 80 | drop_thre = 0.5
 81 | ncores = 10   # arguments after outfile are passed by name: 'type' precedes 'out_dir' in scimpute()
 82 | scimpute(count_path, infile, outfile, out_dir = out_dir, labeled = FALSE, drop_thre = drop_thre, Kcluster = Kcluster, ncores = ncores)
 83 | ```
 84 | If `outfile = "csv"`, this function will create a new file `scimpute_count.csv` in `out_dir` to store the imputed count matrix; if `outfile = "txt"`, this function will create a new file `scimpute_count.txt` in `out_dir`.
 85 | 
 86 | Note that the order of parameters matters in R functions, so we suggest using the format in **Quick start** to specify parameters and avoid mistakes. If the users would like to apply `scImpute` on data coming from homogeneous cells, this can be achieved by setting `Kcluster = 1` and `labeled = FALSE`. 
 87 | 
 88 | ## Apply `scImpute` with cell type information
 89 | 
 90 | Sometimes users may have the cell type (or subpopulation) information of the single cells and `scimpute` can take advantage of this information to impute among each cell type. To do this, we need a character vector `labels` specifying the cell type of each column in the raw count matrix. In other words, the length of `labels` equals the number of cells and the order of elements in `labels` should match the order of columns in the raw count matrix. Then we just need to specify `labeled = TRUE` in `scimpute` (default is `FALSE`) and specify the `labels` argument. `Kcluster` is not used when `labeled = TRUE`.
 91 | ```{r eval = FALSE}
 92 | labels = readRDS(system.file("extdata", "labels.rds", package = "scImpute"))
 93 | labels[1:5]
 94 | > [1] "c1" "c1" "c1" "c2" "c2"
 95 | 
 96 | scimpute(count_path, 
 97 |          infile = "csv", 
 98 |          outfile = "csv", 
 99 |          out_dir = out_dir,
100 |          labeled = TRUE, 
101 |          drop_thre = 0.5,
102 |          labels = labels, 
103 |          ncores = 10)
104 | ```
105 | 
106 | ## Apply `scImpute` to TPM values
107 | 
108 | We strongly suggest using `scImpute` on count matrices. However, if only TPM values are available, users can apply `scImpute` with gene lengths supplied. `scImpute` will use the gene lengths (sum of exon lengths) to scale the data, which ensures a good fitting of the mixture models. In this case, users need to specify `type = "TPM"` (`type = "count"` by default), and supply a vector `genelen` of gene lengths. The order of genes in `genelen` should match the order in the expression matrix. For example:
109 | ```{r eval = FALSE}
110 | > genelen[1:3]
111 | ENSMUSG00000021252 ENSMUSG00000007777 ENSMUSG00000024442
112 |               4235                998               2404
113 | 
114 | scimpute(count_path, 
115 |          infile = "csv", 
116 |          outfile = "csv", 
117 |          out_dir = out_dir,
118 |          type = "TPM",                    # expression values are TPM, so genelen is required
119 |          genelen = genelen,
120 |          drop_thre = 0.5, Kcluster = 2,   # Kcluster is required because labeled = FALSE by default
121 |          ncores = 10)
122 | ```
123 | 
124 | ## How to save computation time with `scImpute`
125 | 
126 | `scImpute` benefits from parallel computation, and each processor does not require heavy memory cost. `scimpute` completes computation in seconds when applied to a dataset with 10,000 genes and 100 cells, running with 10 cores. The memory requirement for this data set is around 2G. The running time mostly depends on
127 | 
128 | * number of processors (`ncores`)
129 | * number of cells in the scRNA-seq data
130 | 
131 | When the number of cells is extremely large, a filtering step on the cells can save the computation time of `scImpute`.


--------------------------------------------------------------------------------
/inst/docs/scImpute-news.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | 
  3 | <html xmlns="http://www.w3.org/1999/xhtml">
  4 | 
  5 | <head>
  6 | 
  7 | <meta charset="utf-8" />
  8 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  9 | <meta name="generator" content="pandoc" />
 10 | 
 11 | <meta name="viewport" content="width=device-width, initial-scale=1">
 12 | 
 13 | <meta name="author" content="Wei Vivian Li" />
 14 | 
 15 | <meta name="date" content="2018-08-15" />
 16 | 
 17 | <title>scImpute Updates</title>
 18 | 
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | <link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20700px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Open%20Sans%22%2C%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%201%2E35%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20both%3B%0Amargin%3A%200%200%2010px%2010px%3B%0Apadding%3A%204px%3B%0Awidth%3A%20400px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Aborder%2Dradius%3A%205px%3B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Afont%2Dsize%3A%2013px%3B%0Aline%2Dheight%3A%201%2E3%3B%0A%7D%0A%23TOC%20%2Etoctitle%20%7B%0Afont%2Dweight%3A%20bold%3B%0Afont%2Dsize%3A%2015px%3B%0Amargin%2Dleft%3A%205px%3B%0A%7D%0A%23TOC%20ul%20%7B%0Apadding%2Dleft%3A%2040px%3B%0Amargin%2Dleft%3A%20%2D1%2E5em%3B%0Amargin%2Dtop%3A%205px%3B%0Amargin%2Dbottom%3A%205px%3B%0A%7D%0A%23TOC%20ul%20ul%20%7B%0Amargin%2Dleft%3A%20%2D2em%3B%0A%7D%0A%23TOC%20li%20%7B%0Aline%2Dheight%3A%2016px%3B%0A%7D%0Atable%20%7B%0Amargin%3A%201em%20auto%3B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dcolor%3A%20%23DDDDDD%3B%0Aborder%2Dstyle%3A%20outset%3B%0Aborder%2Dcollapse%3A%20collapse%3B%0A%7D%0Atable%20th%20%7B%0Aborder%2Dwidth%3A%202px%3B%0Apadding%3A%205px%3B%0Aborder%2Dstyle%3A%20inset%3B%0A%7D%0Atable%20td%20%7B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dstyle%3A%20inset%3B%0Aline%2Dheight%3A%2018px%3B%0Apadding%3A%205px%205px%3B%0A%7D%0Atable%2C%20table%20th%2C%20table%20td%20%7B%0Aborder%2Dleft%2Dstyle%3A%20none%3B%0Aborder%2Dright%2Dstyle%3A%20none%3B%0A%7D%0Atable%20thead%2C%20table%20tr%2Eeven%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0A%7D%0Ap%20%7B%0Amargin%3A%200%2E5em%200%3B%0A%7D%0Ablockquote%20%7B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Apadding%3A%200%2E25em%200%2E75em%3B%0A%7D%0Ahr%20%7B%0Aborder%2Dstyle%3A%20solid%3B%0Aborder%3A%20none%3B%0
Aborder%2Dtop%3A%201px%20solid%20%23777%3B%0Amargin%3A%2028px%200%3B%0A%7D%0Adl%20%7B%0Amargin%2Dleft%3A%200%3B%0A%7D%0Adl%20dd%20%7B%0Amargin%2Dbottom%3A%2013px%3B%0Amargin%2Dleft%3A%2013px%3B%0A%7D%0Adl%20dt%20%7B%0Afont%2Dweight%3A%20bold%3B%0A%7D%0Aul%20%7B%0Amargin%2Dtop%3A%200%3B%0A%7D%0Aul%20li%20%7B%0Alist%2Dstyle%3A%20circle%20outside%3B%0A%7D%0Aul%20ul%20%7B%0Amargin%2Dbottom%3A%200%3B%0A%7D%0Apre%2C%20code%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0Aborder%2Dradius%3A%203px%3B%0Acolor%3A%20%23333%3B%0Awhite%2Dspace%3A%20pre%2Dwrap%3B%20%0A%7D%0Apre%20%7B%0Aborder%2Dradius%3A%203px%3B%0Amargin%3A%205px%200px%2010px%200px%3B%0Apadding%3A%2010px%3B%0A%7D%0Apre%3Anot%28%5Bclass%5D%29%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0A%7D%0Acode%20%7B%0Afont%2Dfamily%3A%20Consolas%2C%20Monaco%2C%20%27Courier%20New%27%2C%20monospace%3B%0Afont%2Dsize%3A%2085%25%3B%0A%7D%0Ap%20%3E%20code%2C%20li%20%3E%20code%20%7B%0Apadding%3A%202px%200px%3B%0A%7D%0Adiv%2Efigure%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0Aimg%20%7B%0Abackground%2Dcolor%3A%20%23FFFFFF%3B%0Apadding%3A%202px%3B%0Aborder%3A%201px%20solid%20%23DDDDDD%3B%0Aborder%2Dradius%3A%203px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Amargin%3A%200%205px%3B%0A%7D%0Ah1%20%7B%0Amargin%2Dtop%3A%200%3B%0Afont%2Dsize%3A%2035px%3B%0Aline%2Dheight%3A%2040px%3B%0A%7D%0Ah2%20%7B%0Aborder%2Dbottom%3A%204px%20solid%20%23f7f7f7%3B%0Apadding%2Dtop%3A%2010px%3B%0Apadding%2Dbottom%3A%202px%3B%0Afont%2Dsize%3A%20145%25%3B%0A%7D%0Ah3%20%7B%0Aborder%2Dbottom%3A%202px%20solid%20%23f7f7f7%3B%0Apadding%2Dtop%3A%2010px%3B%0Afont%2Dsize%3A%20120%25%3B%0A%7D%0Ah4%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23f7f7f7%3B%0Amargin%2Dleft%3A%208px%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Ah5%2C%20h6%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23ccc%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Aa%20%7B%0Acolor%3A%20%230033dd%3B%0Atext%2Ddecoration%3A%20none%3B%0A%7D%0Aa%3Ahover%20%7B%0Acolor%3A%20%236666ff%3B%20%7D%0Aa%3Avisited%20%7B%0Acolor%3A%20%
23800080%3B%20%7D%0Aa%3Avisited%3Ahover%20%7B%0Acolor%3A%20%23BB00BB%3B%20%7D%0Aa%5Bhref%5E%3D%22http%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0Aa%5Bhref%5E%3D%22https%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0A%0Acode%20%3E%20span%2Ekw%20%7B%20color%3A%20%23555%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Edt%20%7B%20color%3A%20%23902000%3B%20%7D%20%0Acode%20%3E%20span%2Edv%20%7B%20color%3A%20%2340a070%3B%20%7D%20%0Acode%20%3E%20span%2Ebn%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Efl%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Ech%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Est%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Eco%20%7B%20color%3A%20%23888888%3B%20font%2Dstyle%3A%20italic%3B%20%7D%20%0Acode%20%3E%20span%2Eot%20%7B%20color%3A%20%23007020%3B%20%7D%20%0Acode%20%3E%20span%2Eal%20%7B%20color%3A%20%23ff0000%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Efu%20%7B%20color%3A%20%23900%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%20code%20%3E%20span%2Eer%20%7B%20color%3A%20%23a61717%3B%20background%2Dcolor%3A%20%23e3d2d2%3B%20%7D%20%0A" rel="stylesheet" type="text/css" />
 25 | 
 26 | </head>
 27 | 
 28 | <body>
 29 | 
 30 | 
 31 | 
 32 | 
 33 | <h1 class="title toc-ignore">scImpute Updates</h1>
 34 | <h4 class="author"><em>Wei Vivian Li</em></h4>
 35 | <h4 class="date"><em>2018-08-15</em></h4>
 36 | 
 37 | 
 38 | 
 39 | <div id="updates" class="section level2">
 40 | <h2>Updates</h2>
 41 | <blockquote>
 42 | <p>2018/08/15:</p>
 43 | </blockquote>
 44 | <ul>
 45 | <li>Version 0.0.9 is released!</li>
 46 | <li>More robust implementation of dimension reduction.</li>
 47 | <li>Faster calculation of cell similarity.</li>
 48 | </ul>
 49 | <blockquote>
 50 | <p>2018/06/27:</p>
 51 | </blockquote>
 52 | <ul>
 53 | <li>Version 0.0.8 is released!</li>
 54 | <li>Faster implementation of dimension reduction.</li>
 55 | </ul>
 56 | <blockquote>
 57 | <p>2018/06/08:</p>
 58 | </blockquote>
 59 | <ul>
 60 | <li>Version 0.0.7 is released!</li>
 61 | <li>New option for application on TPM values.</li>
 62 | </ul>
 63 | <blockquote>
 64 | <p>2018/03/16:</p>
 65 | </blockquote>
 66 | <ul>
 67 | <li>Version 0.0.6 is released!</li>
 68 | <li>The scImpute method is published at <a href="https://www.nature.com/articles/s41467-018-03405-7"><em>Nature Communications</em></a>.</li>
 69 | <li>scImpute now supports input and output in the format of R objects (.rds).</li>
 70 | </ul>
 71 | <blockquote>
 72 | <p>2018/01/12:</p>
 73 | </blockquote>
 74 | <ul>
 75 | <li>Version 0.0.5 is released!</li>
 76 | <li>It is now possible to apply scImpute on just one cell population by setting <code>Kcluster = 1</code>.</li>
 77 | </ul>
 78 | <blockquote>
 79 | <p>2017/10/27:</p>
 80 | </blockquote>
 81 | <ul>
 82 | <li>Version 0.0.4 is released!</li>
 83 | <li>scImpute now supports multi-core parallelism.</li>
 84 | </ul>
 85 | <blockquote>
 86 | <p>2017/10/22:</p>
 87 | </blockquote>
 88 | <ul>
 89 | <li>Version 0.0.3 is released!</li>
 90 | <li>Estimation of dropout probabilities is more accurate.</li>
 91 | <li>Imputation step is more robust.</li>
 92 | <li><code>scimpute()</code> incorporates a new parameter <code>Kcluster</code> to specify the number of cell subpopulations.</li>
 93 | <li><code>scImpute</code> is now able to detect outlier cells.</li>
 94 | </ul>
 95 | <blockquote>
 96 | <p>2017/07/01:</p>
 97 | </blockquote>
 98 | <ul>
 99 | <li>Version 0.0.2 is released!</li>
100 | <li>This version speeds up the first step in <code>scImpute</code>, and the program now completes in a few seconds when applied to a dataset with 10,000 genes and 100 cells (using a single core).</li>
101 | </ul>
102 | </div>
103 | 
104 | 
105 | 
106 | <!-- dynamically load mathjax for compatibility with self-contained -->
107 | <script>
108 |   (function () {
109 |     var script = document.createElement("script");
110 |     script.type = "text/javascript";
111 |     script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
112 |     document.getElementsByTagName("head")[0].appendChild(script);
113 |   })();
114 | </script>
115 | 
116 | </body>
117 | </html>
118 | 


--------------------------------------------------------------------------------
/R/scImpute-internal.R:
--------------------------------------------------------------------------------
  1 | .Random.seed <-
  2 | c(403L, 3L, 198228743L, -491873571L, 144704468L, -743308550L, 
  3 | 226498117L, 1699344031L, -1917922266L, -718012240L, -1564772013L, 
  4 | -880941967L, 559159712L, -369683138L, -1330072503L, -50265861L, 
  5 | 772499658L, -1932136916L, -543649233L, -1234744603L, -1582657636L, 
  6 | -1522144782L, -490912659L, 855975335L, -710914098L, 6262056L, 
  7 | 536162091L, 1867830601L, 1744083544L, 907531014L, 62832353L, 
  8 | 2022791795L, 813933282L, -2096939436L, -910145705L, 1531870029L, 
  9 | -205992220L, -1680732470L, 1285017397L, -1570298481L, 302891670L, 
 10 | 615832128L, 189786883L, -377455839L, -1591968752L, -806016018L, 
 11 | -1271432295L, 575218731L, -251051750L, 1360737244L, -1512327713L, 
 12 | -2084798027L, 1516785484L, 1963435426L, 472294653L, -1855821321L, 
 13 | -645585122L, -1336960552L, -395267269L, -1937672999L, 1999304808L, 
 14 | 167841174L, -1937631983L, 1805260483L, 244397778L, 1280782180L, 
 15 | 303747751L, 1946369021L, -1252397580L, 1786473178L, 535702565L, 
 16 | 1037893951L, 871495366L, 1826102096L, 1957774195L, -58935919L, 
 17 | -575825984L, -16899682L, 463670121L, -1820999781L, 823068330L, 
 18 | -2105741108L, 1455600015L, 2087093701L, 1080117628L, -1927267886L, 
 19 | -902322867L, 1697167367L, 1441945582L, -1094918456L, 179048907L, 
 20 | 831267881L, -509668744L, -1055170650L, 366598593L, -1991004589L, 
 21 | -886938366L, -468351948L, -1871260233L, 739981741L, 1037785796L, 
 22 | -1844363542L, -2052239019L, 498140911L, 1364915446L, 2053833760L, 
 23 | 604714467L, -1748162943L, -216461072L, 1835093902L, 461275385L, 
 24 | 788359819L, -2114131270L, 808830396L, -248221953L, 878928469L, 
 25 | -1404037780L, 1453088194L, 781819421L, 1340368535L, -68603906L, 
 26 | -1737548872L, -270936165L, 690593273L, 564731208L, 1909053558L, 
 27 | -1610586703L, 403925987L, 1346631858L, -342731516L, 1793685063L, 
 28 | -1835966179L, -1771032684L, -660630726L, 927129477L, -1819988897L, 
 29 | -1473452314L, -789279376L, -1110772077L, -1712935119L, 174765536L, 
 30 | 737038974L, -1761287927L, 2064909115L, 299629322L, -347450644L, 
 31 | 589729647L, -178892635L, 1614010972L, 1881725490L, 1001378093L, 
 32 | -1315612697L, 75841294L, -835407640L, 2121769707L, 1103653769L, 
 33 | -993470696L, 796678726L, -1713268063L, -1372676813L, -1112144734L, 
 34 | -416453740L, 754356759L, 826447373L, 575435556L, -674590966L, 
 35 | 2090140277L, -1192853041L, 1658850006L, -990237056L, 1469616323L, 
 36 | -347812255L, -557349168L, 117260334L, 905864665L, -313345557L, 
 37 | 706407514L, 1319138844L, -787859681L, 2078952309L, 2020561420L, 
 38 | 1093280738L, -329420611L, 694656823L, -1429813282L, -576137960L, 
 39 | 679082107L, -92897767L, -308357464L, -1307888554L, -1826582319L, 
 40 | -302453629L, -311881966L, 441948836L, 346875495L, 1856141245L, 
 41 | 217262388L, -317191526L, 1875790053L, 413186687L, 1097835526L, 
 42 | 2089006992L, 1556511283L, -1133792815L, -261573760L, 1395027550L, 
 43 | 1130845609L, 1527854171L, 925863018L, -1007135476L, -1552861361L, 
 44 | 5415941L, -1983685956L, -1435178094L, 1183246477L, 652766919L, 
 45 | -680335442L, 1135505928L, 1046246923L, 931118825L, 184015288L, 
 46 | -760310170L, 447902465L, 1772270995L, 553602626L, 2001154548L, 
 47 | -714585097L, -860819475L, 1354176644L, -111331528L, -133743462L, 
 48 | -868584752L, -1215635268L, -1814534508L, 271640194L, 1809292480L, 
 49 | -986959556L, 886833296L, -1716839902L, -2026211832L, -1211723508L, 
 50 | 473916060L, -284377582L, -1851822720L, -627151340L, 916497960L, 
 51 | -468282054L, 1439672272L, -1353465236L, 1679820596L, 1761440402L, 
 52 | 1428715872L, 2037342540L, -1301282848L, 1199091618L, -1774487640L, 
 53 | 310860684L, -1115662660L, 2031573874L, -1361590032L, -1270474684L, 
 54 | 999734616L, 1414562874L, 83026800L, -781040580L, -1530922220L, 
 55 | -1599157822L, -1543876384L, 1326319484L, -1923322160L, -1777867102L, 
 56 | 403637288L, -1481766644L, 966632380L, 202665266L, 1317224384L, 
 57 | 1478619700L, 1562184648L, -734657350L, -1528975472L, 1252746732L, 
 58 | -2041978796L, -1484968302L, 899472160L, -588505652L, 566348768L, 
 59 | -680488254L, -1587680728L, 1293409964L, 2092020412L, 812581362L, 
 60 | -79984336L, 1268710564L, -1693633864L, 619629402L, 943222160L, 
 61 | 1242194300L, -987407084L, 190114114L, -1086078208L, 1222671292L, 
 62 | -1824232624L, -1767349982L, 1800097096L, -1722337268L, -1326688164L, 
 63 | -579119918L, -2027313152L, -7132332L, -1390702104L, 713312250L, 
 64 | -2133309744L, -727542484L, 1851029748L, 1976825746L, -417026976L, 
 65 | -2003161844L, -1728936224L, 1786105186L, 970195624L, -1823724980L, 
 66 | -635361412L, -375395534L, 2052107888L, -1172316860L, 1731865560L, 
 67 | 1064910970L, -617507280L, -1228531012L, 231970644L, 729152066L, 
 68 | -1861067872L, 1662499004L, -2045677104L, -297302686L, 673932264L, 
 69 | 1705387980L, -1840388356L, 240373746L, 2001665152L, 1776486644L, 
 70 | 1317786760L, 1655043386L, 528368464L, 929016940L, -649236332L, 
 71 | 1518078290L, 1516379872L, -389537588L, 1504843936L, 691607938L, 
 72 | -1865484056L, 741845228L, 404301884L, 2055528562L, -129406864L, 
 73 | -1847533020L, -341278152L, -44434918L, 32694480L, -194968004L, 
 74 | -1539388268L, -1140718462L, 1678747584L, -1316520644L, 2039811728L, 
 75 | -275536862L, -885075960L, 1703697548L, 1077284892L, 551318674L, 
 76 | -308859776L, -753810412L, -1529593048L, -1227598790L, -1799115568L, 
 77 | 1219777644L, -1269717324L, -1030229998L, -32407584L, -1557861428L, 
 78 | -492642848L, 130669602L, 1084138792L, 952867084L, 380964412L, 
 79 | -1748208398L, 787106032L, -1270642876L, -346812328L, 1412350522L, 
 80 | -1216743184L, -1277570372L, -1721239020L, -365237054L, -1024375200L, 
 81 | -856435332L, -480856880L, -1188772830L, 1719642280L, 400051596L, 
 82 | -2062945860L, -675620814L, -1443560640L, 608849588L, 857039176L, 
 83 | 1418412730L, -1989886192L, 1427284204L, 1738762324L, 925001490L, 
 84 | 1879630240L, -270104628L, -1531945120L, -336593726L, 318475816L, 
 85 | -1323017812L, 1772282684L, -389893262L, 854503216L, 1329422372L, 
 86 | -1035514312L, 926667482L, -1490208880L, -1504086788L, -477782636L, 
 87 | 1081111362L, -624875008L, 866740284L, -141422000L, -1736699742L, 
 88 | 317858632L, 748292620L, -1524000036L, 1686851666L, -2020037504L, 
 89 | -82385068L, 317902824L, 1392313082L, 434718544L, -1430838228L, 
 90 | -2120853132L, 881200786L, 870720096L, 1055084684L, -1017278112L, 
 91 | 1541167330L, -469259096L, -390076724L, -1864164868L, 9132978L, 
 92 | -1136942096L, 1435881924L, 873401688L, -617848454L, -1551011408L, 
 93 | -1058422738L, 2089055963L, -1286975555L, 1408166186L, -48562744L, 
 94 | -1506917647L, 61885539L, 1836701796L, 1778914194L, 1198520423L, 
 95 | -162527183L, 371644790L, 1139435156L, 1691659429L, -634998353L, 
 96 | 1940154984L, 382390854L, 747173875L, 1162364757L, 904001842L, 
 97 | 1691929856L, -1035514391L, 1405392987L, -734229844L, -563727174L, 
 98 | -789219729L, -2046368775L, -39384434L, -376255652L, -1026349107L, 
 99 | -1105803465L, 627250624L, 1729067678L, -1287113877L, 238299885L, 
100 | 1265868634L, -1341991912L, 1548235137L, -1578711245L, -1435959980L, 
101 | -1002145950L, 339987447L, 99333185L, 1138303174L, 1788120580L, 
102 | -1233180235L, 273192607L, -349817096L, 1984264918L, 697413475L, 
103 | -79440091L, -987820862L, -529225296L, 2077041113L, 1138536267L, 
104 | 773129660L, 2131080682L, 363942431L, -899893079L, -941625282L, 
105 | 187138348L, -32363651L, -412573241L, -2110450512L, -334052658L, 
106 | -2131888133L, 2094530653L, -1474656374L, -1608267864L, -2067976943L, 
107 | -2059258877L, -708296956L, -1133505742L, 1612951943L, 1944041425L, 
108 | 985725270L, -1961134284L, 95607365L, -1323784561L, 386887688L, 
109 | -2103290842L, -1626329901L, -865702347L, 144000402L, 575854944L, 
110 | -171138871L, -783404037L, 210056140L, -1256255910L, -325282993L, 
111 | 1238706713L, 585828398L, -873953732L, 1819360173L, 1305375831L, 
112 | 377012640L, 90004222L, -657753653L, -1493647411L, 1962660602L, 
113 | 878266680L, -845753631L, -1343113837L, -405905996L, -775659838L, 
114 | 1868803799L, 321666849L, -37087514L, -1471746076L, -2076180715L, 
115 | 1433990975L, -546004264L, 1597089526L, -1489940221L, 1972154949L, 
116 | -2054618526L, -569531952L, -373768455L, 861240363L, 99220380L, 
117 | 310571978L, -1380234049L, -888672759L, -131160290L, 1157746380L, 
118 | -1810411875L, -38188249L, 512157008L, -436155410L, 554284187L, 
119 | 176037629L, 194340330L, -2046085496L, 2013397041L, 25287331L, 
120 | 609526436L, 640889682L, 1906268839L, -268892431L, -865196490L, 
121 | 579250900L, -2111996315L, -1263432593L, 1897100968L, -1353567226L, 
122 | 1781801267L, -947159659L, -460721678L, -695828928L, 1850630057L, 
123 | -396115429L, 1907276780L, -1667728390L, -348047313L, 1887156281L, 
124 | -2034234802L, 1598360220L, 1691029517L, 2121878647L, 741821568L, 
125 | -1788974370L, -66807381L, -1118956883L, 450561818L, -40626216L, 
126 | 1527157825L, -1438850317L, 392176916L, -637202270L, 633612670L
127 | )
128 | 


--------------------------------------------------------------------------------
/R/imputation_model.R:
--------------------------------------------------------------------------------
  1 | 
  2 | find_hv_genes = function(count, I, J){
  3 |   count_nzero = lapply(1:I, function(i) setdiff(count[i, ], log10(1.01)))
  4 |   mu = sapply(count_nzero, mean)
  5 |   mu[is.na(mu)] = 0
  6 |   sd = sapply(count_nzero, sd)
  7 |   sd[is.na(sd)] = 0
  8 |   cv = sd/mu
  9 |   cv[is.na(cv)] = 0
 10 |   # sum(mu >= 1 & cv >= quantile(cv, 0.25), na.rm = TRUE)
 11 |   high_var_genes = which(mu >= 1 & cv >= quantile(cv, 0.25))
 12 |   if(length(high_var_genes) < 500){ 
 13 |     high_var_genes = 1:I}
 14 |   count_hv = count[high_var_genes, ]
 15 |   return(count_hv)
 16 | }
 17 | 
 18 | find_neighbors = function(count_hv, labeled, J, Kcluster = NULL, 
 19 |                           ncores, cell_labels = NULL){
 20 |   if(labeled == TRUE){
 21 |     if(class(cell_labels) == "character"){
 22 |       labels_uniq = unique(cell_labels)
 23 |       labels_mth = 1:length(labels_uniq)
 24 |       names(labels_mth) = labels_uniq
 25 |       clust = labels_mth[cell_labels]
 26 |     }else{
 27 |       clust = cell_labels
 28 |     }
 29 |     nclust = length(unique(clust))
 30 |     print("calculating cell distances ...")
 31 |     dist_list = lapply(1:nclust, function(ll){
 32 |       cell_inds = which(clust == ll)
 33 |       count_hv_sub = count_hv[, cell_inds, drop = FALSE]
 34 |       if(length(cell_inds) < 1000){
 35 |         var_thre = 0.4
 36 |         pca = prcomp(t(count_hv_sub))
 37 |         eigs = (pca$sdev)^2
 38 |         var_cum = cumsum(eigs)/sum(eigs)
 39 |         if(max(var_cum) <= var_thre){
 40 |           npc = length(var_cum)
 41 |         }else{
 42 |           npc = which.max(var_cum > var_thre)
 43 |           if (labeled == FALSE){ npc = max(npc, Kcluster) }
 44 |         }
 45 |       }else{
 46 |         var_thre = 0.6
 47 |         pca = rpca(t(count_hv_sub), k = 1000, center = TRUE, scale = FALSE) 
 48 |         eigs = (pca$sdev)^2
 49 |         var_cum = cumsum(eigs)/sum(eigs)
 50 |         if(max(var_cum) <= var_thre){
 51 |           npc = length(var_cum)
 52 |         }else{
 53 |           npc = which.max(var_cum > var_thre)
 54 |           if (labeled == FALSE){ npc = max(npc, Kcluster) }
 55 |         }
 56 |       }
 57 |       
 58 |       if (npc < 3){ npc = 3 }
 59 |       mat_pcs = t(pca$x[, 1:npc]) 
 60 |       
 61 |       dist_cells_list = mclapply(1:length(cell_inds), function(id1){
 62 |         d = sapply(1:id1, function(id2){
 63 |           sse = sum((mat_pcs[, id1] - mat_pcs[, id2])^2)
 64 |           sqrt(sse)
 65 |         })
 66 |         return(c(d, rep(0, length(cell_inds)-id1)))
 67 |       }, mc.cores = ncores)
 68 |       dist_cells = matrix(0, nrow = length(cell_inds), ncol = length(cell_inds))
 69 |       for(cellid in 1:length(cell_inds)){dist_cells[cellid, ] = dist_cells_list[[cellid]]}
 70 |       dist_cells = dist_cells + t(dist_cells)
 71 |       return(dist_cells)
 72 |     })
 73 | 
 74 |     return(list(dist_list = dist_list, clust = clust))
 75 |   }
 76 |   
 77 |   if(labeled == FALSE){
 78 |     ## dimeansion reduction
 79 |     print("dimension reduction ...")
 80 |     if(J < 5000){
 81 |       var_thre = 0.4
 82 |       pca = prcomp(t(count_hv))
 83 |       eigs = (pca$sdev)^2
 84 |       var_cum = cumsum(eigs)/sum(eigs)
 85 |       if(max(var_cum) <= var_thre){
 86 |         npc = length(var_cum)
 87 |       }else{
 88 |         npc = which.max(var_cum > var_thre)
 89 |         if (labeled == FALSE){ npc = max(npc, Kcluster) }
 90 |       }
 91 |     }else{
 92 |       var_thre = 0.6
 93 |       pca = rpca(t(count_hv), k = 1000, center = TRUE, scale = FALSE) 
 94 |       eigs = (pca$sdev)^2
 95 |       var_cum = cumsum(eigs)/sum(eigs)
 96 |       if(max(var_cum) <= var_thre){
 97 |         npc = length(var_cum)
 98 |       }else{
 99 |         npc = which.max(var_cum > var_thre)
100 |         if (labeled == FALSE){ npc = max(npc, Kcluster) }
101 |       }
102 |     }
103 |     if (npc < 3){ npc = 3 }
104 |     mat_pcs = t(pca$x[, 1:npc]) # columns are cells
105 |     
106 |     ## detect outliers
107 |     print("calculating cell distances ...")
108 |     dist_cells_list = mclapply(1:J, function(id1){
109 |       d = sapply(1:id1, function(id2){
110 |         sse = sum((mat_pcs[, id1] - mat_pcs[, id2])^2)
111 |         sqrt(sse)
112 |       })
113 |       return(c(d, rep(0, J-id1)))
114 |     }, mc.cores = ncores)
115 |     dist_cells = matrix(0, nrow = J, ncol = J)
116 |     for(cellid in 1:J){dist_cells[cellid, ] = dist_cells_list[[cellid]]}
117 |     dist_cells = dist_cells + t(dist_cells)
118 |     
119 |     min_dist = sapply(1:J, function(i){
120 |       min(dist_cells[i, -i])
121 |     })
122 |     iqr = quantile(min_dist, 0.75) - quantile(min_dist, 0.25)
123 |     outliers = which(min_dist > 1.5 * iqr + quantile(min_dist, 0.75))
124 |     
125 |     ## clustering
126 |     non_out = setdiff(1:J, outliers)
127 |     spec_res = specc(t(mat_pcs[, non_out]), centers = Kcluster, kernel = "rbfdot")
128 |     print("cluster sizes:")
129 |     print(spec_res@size)
130 |     nbs = rep(NA, J)
131 |     nbs[non_out] = spec_res
132 |     
133 |     return(list(dist_cells = dist_cells, clust = nbs))
134 |   }
135 | }
136 | 
# Identify the genes of one cell group that are eligible for imputation.
#
# Arguments:
#   parslist  matrix of per-gene mixture-model parameters with (at least) the
#             columns "alpha", "beta", "mu" and "sigma"; one row per gene.
#   subcount  gene-by-cell expression matrix of the group (same row order).
#
# A gene is a candidate when its total expression exceeds log10(1.01) per
# cell and none of its parameters is missing. Candidates are then discarded
# when
#   * the estimated normal mean `mu` is at most log10(1 + 1.01), or
#   * `mu` is at most 1 and, evaluated at mu + 1, the Gamma density
#     (shape alpha, rate beta) is at least as large as the Normal density
#     (mean mu, sd sigma).
#
# Returns the row indices of the remaining genes.
find_va_genes <- function(parslist, subcount) {
  zero_level <- log10(1.01)
  is_expressed <- rowSums(subcount) > zero_level * ncol(subcount)
  valid_genes <- which(is_expressed & complete.cases(parslist))
  if (length(valid_genes) == 0) {
    return(valid_genes)
  }

  # genes that violate the model assumption
  mu <- parslist[, "mu"]
  eval_at <- mu + 1
  gamma_dens <- dgamma(eval_at, shape = parslist[, "alpha"],
                       rate = parslist[, "beta"])
  normal_dens <- dnorm(eval_at, mean = mu, sd = parslist[, "sigma"])

  low_mean <- which(mu <= log10(1 + 1.01))
  gamma_dominates <- which(gamma_dens >= normal_dens & mu <= 1)

  setdiff(valid_genes, union(low_mean, gamma_dominates))
}
154 | 
# Impute the dropout genes of one cell by non-negative regression on the
# other cells of the same group.
#
# Arguments:
#   Ic           number of genes (rows of `subcount`).
#   cellid       column index of the cell to impute; also used as RNG seed.
#   subcount     gene-by-cell expression matrix of the group.
#   droprate     not used inside this function; kept for interface
#                compatibility with existing callers.
#   geneid_drop  row indices of the genes treated as dropouts in this cell.
#   geneid_obs   row indices of the genes treated as observed in this cell.
#   nbs          column indices of the candidate neighbour cells.
#   distc        cell-by-cell distance matrix of the group; row `cellid`
#                without column `cellid` must line up with `nbs`.
#
# The observed genes of the cell are regressed on the same genes of the
# neighbours with penalized() (positive = TRUE, no L1/L2 penalty, no
# intercept); the fit predicts the dropout genes. Imputed values are capped
# at each gene's maximum over `subcount`.
#
# Returns a numeric vector of length Ic. The cell's own column is returned
# unchanged when no gene or every gene is a dropout.
impute_nnls <- function(Ic, cellid, subcount, droprate, geneid_drop,
                        geneid_obs, nbs, distc) {
  yobs <- subcount[, cellid]
  # nothing to impute, or nothing observed to fit the regression on
  if (length(geneid_drop) == 0 || length(geneid_drop) == Ic) {
    return(yobs)
  }
  yimpute <- rep(0, Ic)

  # drop = FALSE keeps the design matrices two-dimensional when there is a
  # single neighbour, a single observed gene or a single dropout gene;
  # without it ncol()/column indexing below fail and the cell would be left
  # un-imputed by the caller's try().
  xx <- subcount[geneid_obs, nbs, drop = FALSE]
  yy <- subcount[geneid_obs, cellid]
  ximpute <- subcount[geneid_drop, nbs, drop = FALSE]

  # With at least as many neighbours as min(500, #observed genes), keep only
  # the closest ones so the regression is not over-parameterised.
  num_thre <- 500
  if (ncol(xx) >= min(num_thre, nrow(xx))) {
    if (num_thre >= nrow(xx)) {
      new_thre <- round((2 * nrow(xx) / 3))
    } else {
      new_thre <- num_thre
    }
    filterid <- order(distc[cellid, -cellid])[seq_len(new_thre)]
    xx <- xx[, filterid, drop = FALSE]
    ximpute <- ximpute[, filterid, drop = FALSE]
  }

  set.seed(cellid)
  nnls <- penalized(yy, penalized = xx, unpenalized = ~0,
                    positive = TRUE, lambda1 = 0, lambda2 = 0,
                    maxiter = 3000, trace = FALSE)
  ynew <- penalized::predict(nnls, penalized = ximpute, unpenalized = ~0)[, 1]
  yimpute[geneid_drop] <- ynew
  yimpute[geneid_obs] <- yobs[geneid_obs]

  # never impute above the largest value observed for the gene in the group
  maxobs <- apply(subcount, 1, max)
  yimpute[yimpute > maxobs] <- maxobs[yimpute > maxobs]
  return(yimpute)
}
185 | 
186 | 
# Impute dropout values when cell types are unknown.
#
# Arguments:
#   count      gene-by-cell expression matrix (coerced with as.matrix()).
#   labeled    kept for interface compatibility; clusters are always inferred
#              (or a single cluster is used when Kcluster == 1).
#   point      lower bound applied to the imputed matrix; smaller values are
#              raised to `point`.
#   drop_thre  entries whose dropout rate exceeds this threshold are imputed.
#   Kcluster   number of cell clusters; 1 skips clustering.
#   out_dir    prefix for the files written along the way ("clust.rds" and
#              "pars<k>.rds"); used with paste0(), so it should end with a
#              path separator.
#   ncores     number of cores for the distance computation and the
#              foreach/doParallel cluster.
#
# Returns list(count_imp, outlier): the imputed matrix and the indices of the
# cells that were not assigned to any cluster (NA in the cluster vector).
imputation_model8 <- function(count, labeled, point, drop_thre = 0.5, Kcluster = 10,
                              out_dir, ncores) {
  count <- as.matrix(count)
  I <- nrow(count)
  J <- ncol(count)
  count_imp <- count

  # find highly variable genes
  count_hv <- find_hv_genes(count, I, J)
  print("searching candidate neighbors ... ")
  if (Kcluster == 1) {
    # single population: only the distances between cells are needed
    clust <- rep(1, J)
    if (J < 5000) {
      var_thre <- 0.4
      pca <- prcomp(t(count_hv))
    } else {
      var_thre <- 0.6
      pca <- rpca(t(count_hv), k = 1000, center = TRUE, scale = FALSE)
    }
    eigs <- (pca$sdev)^2
    var_cum <- cumsum(eigs) / sum(eigs)
    # anyNA(): identical cells give zero variances and NaN fractions
    if (anyNA(var_cum) || max(var_cum) <= var_thre) {
      npc <- length(var_cum)
    } else {
      npc <- which.max(var_cum > var_thre)
    }
    # at least 3 PCs, but never more than the PCA produced
    npc <- min(max(npc, 3), ncol(pca$x))
    mat_pcs <- t(pca$x[, seq_len(npc), drop = FALSE]) # columns are cells

    # lower triangle of the distance matrix, one vectorised row per cell
    dist_cells_list <- mclapply(seq_len(J), function(id1) {
      diffs <- mat_pcs[, seq_len(id1), drop = FALSE] - mat_pcs[, id1]
      c(sqrt(colSums(diffs^2)), rep(0, J - id1))
    }, mc.cores = ncores)
    dist_cells <- matrix(0, nrow = J, ncol = J)
    for (cellid in seq_len(J)) {
      dist_cells[cellid, ] <- dist_cells_list[[cellid]]
    }
    dist_cells <- dist_cells + t(dist_cells)
  } else {
    print("inferring cell similarities ...")
    set.seed(Kcluster)
    neighbors_res <- find_neighbors(count_hv = count_hv, labeled = FALSE, J = J,
                                    Kcluster = Kcluster, ncores = ncores)
    dist_cells <- neighbors_res$dist_cells
    clust <- neighbors_res$clust
  }

  saveRDS(clust, file = paste0(out_dir, "clust.rds"))
  # mixture model
  nclust <- sum(!is.na(unique(clust)))
  cl <- makeCluster(ncores, outfile = "")
  # release the worker processes even when an error interrupts the loop
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)

  for (cc in seq_len(nclust)) {
    print(paste("estimating dropout probability for type", cc, "..."))
    cells <- which(clust == cc)
    pars_path <- paste0(out_dir, "pars", cc, ".rds")
    get_mix_parameters(count = count[, cells, drop = FALSE],
                       point = log10(1.01),
                       path = pars_path, ncores = ncores)

    if (length(cells) <= 1) { next }
    parslist <- readRDS(pars_path)
    print("searching for valid genes ...")
    valid_genes <- find_va_genes(parslist, subcount = count[, cells])
    if (length(valid_genes) <= 10) { next }

    subcount <- count[valid_genes, cells, drop = FALSE]
    Ic <- length(valid_genes)
    Jc <- ncol(subcount)
    parslist <- parslist[valid_genes, , drop = FALSE]

    # genes x cells matrix built from the first column returned by
    # calculate_weight() (presumably the dropout probability)
    droprate <- t(sapply(seq_len(Ic), function(i) {
      wt <- calculate_weight(subcount[i, ], parslist[i, ])
      return(wt[, 1])
    }))
    # values above the gene's normal mean are never treated as dropouts
    mucheck <- sweep(subcount, MARGIN = 1, parslist[, "mu"], FUN = ">")
    droprate[mucheck & droprate > drop_thre] <- 0
    # dropouts
    setA <- lapply(seq_len(Jc), function(cellid) {
      which(droprate[, cellid] > drop_thre)
    })
    # non-dropouts
    setB <- lapply(seq_len(Jc), function(cellid) {
      which(droprate[, cellid] <= drop_thre)
    })
    # imputation
    gc()
    print(paste("imputing dropout values for type", cc, "..."))
    # subset once instead of once per cell inside the parallel loop
    dist_sub <- dist_cells[cells, cells]
    subres <- foreach(cellid = 1:Jc, .packages = c("penalized"),
                      .combine = cbind, .export = c("impute_nnls")) %dopar% {
      if (cellid %% 10 == 0) {gc()}
      if (cellid %% 100 == 0) {print(cellid)}
      nbs <- setdiff(1:Jc, cellid)
      if (length(nbs) == 0) {return(NULL)}
      geneid_drop <- setA[[cellid]]
      geneid_obs <- setB[[cellid]]
      y <- try(impute_nnls(Ic, cellid, subcount, droprate, geneid_drop,
                           geneid_obs, nbs, distc = dist_sub),
               silent = TRUE)
      # best effort: keep the observed values when the regression fails
      if (inherits(y, "try-error")) {
        # print(y)
        y <- subcount[, cellid, drop = FALSE]
      }
      return(y)
    }
    count_imp[valid_genes, cells] <- subres
  }
  outlier <- which(is.na(clust))
  count_imp[count_imp < point] <- point
  return(list(count_imp = count_imp, outlier = outlier))
}
312 | 
# Impute dropout values when the cell type of every cell is known.
#
# Arguments:
#   count        gene-by-cell expression matrix (coerced with as.matrix()).
#   labeled      kept for interface compatibility; labels are always used.
#   cell_labels  character or numeric vector with one label per cell.
#   point        lower bound applied to the imputed matrix; smaller values
#                are raised to `point`.
#   drop_thre    entries whose dropout rate exceeds this threshold are
#                imputed.
#   Kcluster     not used; kept for interface compatibility.
#   out_dir      prefix for the "pars<k>.rds" files written per cell type;
#                used with paste0(), so it should end with a path separator.
#   ncores       number of cores for the distance computation and the
#                foreach/doParallel cluster.
#
# Returns list(count_imp, outlier): the imputed matrix and an empty integer
# vector (no outlier detection is done when labels are supplied).
#
# Stops with an error when `cell_labels` is neither character nor numeric.
imputation_wlabel_model8 <- function(count, labeled, cell_labels = NULL, point, drop_thre,
                                     Kcluster = NULL, out_dir, ncores) {
  # is.character()/is.numeric() give a single TRUE/FALSE even for objects
  # that carry more than one class
  if (!(is.character(cell_labels) || is.numeric(cell_labels))) {
    stop("cell_labels should be a character or integer vector!")
  }

  count <- as.matrix(count)
  I <- nrow(count)
  J <- ncol(count)
  count_imp <- count

  count_hv <- find_hv_genes(count, I, J)
  print("searching candidate neighbors ... ")
  neighbors_res <- find_neighbors(count_hv = count_hv, labeled = TRUE, J = J,
                                  ncores = ncores, cell_labels = cell_labels)
  dist_list <- neighbors_res$dist_list
  clust <- neighbors_res$clust

  # mixture model
  nclust <- sum(!is.na(unique(clust)))
  cl <- makeCluster(ncores, outfile = "")
  # release the worker processes even when an error interrupts the loop
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)

  for (cc in seq_len(nclust)) {
    print(paste("estimating dropout probability for type", cc, "..."))
    cells <- which(clust == cc)
    pars_path <- paste0(out_dir, "pars", cc, ".rds")
    get_mix_parameters(count = count[, cells, drop = FALSE],
                       point = log10(1.01),
                       path = pars_path, ncores = ncores)

    if (length(cells) <= 1) { next }
    parslist <- readRDS(pars_path)
    print("searching for valid genes ...")
    valid_genes <- find_va_genes(parslist, subcount = count[, cells])
    if (length(valid_genes) <= 10) { next }

    subcount <- count[valid_genes, cells, drop = FALSE]
    Ic <- length(valid_genes)
    Jc <- ncol(subcount)
    parslist <- parslist[valid_genes, , drop = FALSE]

    # genes x cells matrix built from the first column returned by
    # calculate_weight() (presumably the dropout probability)
    droprate <- t(sapply(seq_len(Ic), function(i) {
      wt <- calculate_weight(subcount[i, ], parslist[i, ])
      return(wt[, 1])
    }))
    # values above the gene's normal mean are never treated as dropouts
    mucheck <- sweep(subcount, MARGIN = 1, parslist[, "mu"], FUN = ">")
    droprate[mucheck & droprate > drop_thre] <- 0
    # dropouts
    setA <- lapply(seq_len(Jc), function(cellid) {
      which(droprate[, cellid] > drop_thre)
    })
    # non-dropouts
    setB <- lapply(seq_len(Jc), function(cellid) {
      which(droprate[, cellid] <= drop_thre)
    })
    # imputation
    gc()
    print(paste("imputing dropout values for type", cc, "..."))

    # only this cell type's distance matrix has to reach the workers
    dist_sub <- dist_list[[cc]]
    cellid <- NULL
    subres <- foreach(cellid = 1:Jc, .packages = c("penalized"),
                      .combine = cbind, .export = c("impute_nnls")) %dopar% {
      ##sink(paste0(out_dir, "log.txt"), append=TRUE))
      ##cat(paste("imputing dropout values for type", cc, "\n")
      if (cellid %% 10 == 0) {gc()}
      if (cellid %% 100 == 0) {print(cellid)}
      nbs <- setdiff(1:Jc, cellid)
      if (length(nbs) == 0) {return(NULL)}
      geneid_drop <- setA[[cellid]]
      geneid_obs <- setB[[cellid]]
      y <- try(impute_nnls(Ic, cellid = cellid, subcount, droprate, geneid_drop,
                           geneid_obs, nbs, distc = dist_sub),
               silent = TRUE)
      # best effort: keep the observed values when the regression fails
      if (inherits(y, "try-error")) {
        # print(y)
        y <- subcount[, cellid, drop = FALSE]
      }
      return(y)
    }
    count_imp[valid_genes, cells] <- subres
  }
  outlier <- integer(0)
  count_imp[count_imp < point] <- point
  return(list(count_imp = count_imp, outlier = outlier))

}
401 | 


--------------------------------------------------------------------------------
/.Rhistory:
--------------------------------------------------------------------------------
  1 | rdata_dir = "~/Dropbox/Iso Discovery/Codes/discovery/rdata/simu_polyester/"
  2 | data = lapply(1:9, function(sp){
  3 | da = lapply(1:9 , function(gtfid){
  4 | tp = lapply(1:4, function(mm){
  5 | if(mm == 1){
  6 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData"))
  7 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)]
  8 | tv = sapply(LRT_simu_res, function(x) x[[1]]$tv)
  9 | }else if(mm == 2){
 10 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/cufflinks/sp",
 11 | sp, "gtf", gtfid, ".rds"))
 12 | }else if(mm == 3){
 13 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/stringtie/sp",
 14 | sp, "gtf", gtfid, ".rds"))
 15 | }else{
 16 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/slide/sp",
 17 | sp, "gtf", gtfid, ".rds"))
 18 | }
 19 | tv = unlist(tv)
 20 | return(data.frame(tv = tv, method = methods[mm]))
 21 | })
 22 | tp = Reduce(rbind, tp)
 23 | tp$annotation = gtfid
 24 | return(tp)
 25 | })
 26 | da = Reduce(rbind, da)
 27 | da$sample = sp
 28 | return(da)
 29 | })
 30 | methods = c("LRT", "Cufflinks", "Stringtie", "SLIDE")
 31 | data = lapply(1:9, function(sp){
 32 | da = lapply(1:9 , function(gtfid){
 33 | tp = lapply(1:4, function(mm){
 34 | if(mm == 1){
 35 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData"))
 36 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)]
 37 | tv = sapply(LRT_simu_res, function(x) x[[1]]$tv)
 38 | }else if(mm == 2){
 39 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/cufflinks/sp",
 40 | sp, "gtf", gtfid, ".rds"))
 41 | }else if(mm == 3){
 42 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/stringtie/sp",
 43 | sp, "gtf", gtfid, ".rds"))
 44 | }else{
 45 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/slide/sp",
 46 | sp, "gtf", gtfid, ".rds"))
 47 | }
 48 | tv = unlist(tv)
 49 | return(data.frame(tv = tv, method = methods[mm]))
 50 | })
 51 | tp = Reduce(rbind, tp)
 52 | tp$annotation = gtfid
 53 | return(tp)
 54 | })
 55 | da = Reduce(rbind, da)
 56 | da$sample = sp
 57 | return(da)
 58 | })
 59 | data = lapply(1:8, function(sp){
 60 | da = lapply(1:9 , function(gtfid){
 61 | tp = lapply(1:4, function(mm){
 62 | if(mm == 1){
 63 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData"))
 64 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)]
 65 | tv = sapply(LRT_simu_res, function(x) x[[1]]$tv)
 66 | }else if(mm == 2){
 67 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/cufflinks/sp",
 68 | sp, "gtf", gtfid, ".rds"))
 69 | }else if(mm == 3){
 70 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/stringtie/sp",
 71 | sp, "gtf", gtfid, ".rds"))
 72 | }else{
 73 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/slide/sp",
 74 | sp, "gtf", gtfid, ".rds"))
 75 | }
 76 | tv = unlist(tv)
 77 | return(data.frame(tv = tv, method = methods[mm]))
 78 | })
 79 | tp = Reduce(rbind, tp)
 80 | tp$annotation = gtfid
 81 | return(tp)
 82 | })
 83 | da = Reduce(rbind, da)
 84 | da$sample = sp
 85 | return(da)
 86 | })
 87 | data = Reduce(rbind, data)
 88 | data$annotation = factor(data$annotation, levels = 1:9)
 89 | cols = c(Cufflinks = "#619CFF", LRT = "#D53E4F",
 90 | Stringtie = "#00BA38", SLIDE = "#B79F00")
 91 | ggplot(data, aes(x = annotation, y = tv, fill = method)) +
 92 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) +
 93 | theme_bw() +
 94 | facet_grid(sample ~.) +
 95 | scale_fill_manual(values = cols) +
 96 | theme(strip.background = element_rect(fill = "white"))
 97 | ggsave(paste0(dir, "tv_boxplot.pdf"), width = 6, height = 15)
 98 | library(ggplot2)
 99 | ggplot(data, aes(x = annotation, y = tv, fill = method)) +
100 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) +
101 | theme_bw() +
102 | facet_grid(sample ~.) +
103 | scale_fill_manual(values = cols) +
104 | theme(strip.background = element_rect(fill = "white"))
105 | ggsave(paste0(dir, "tv_boxplot.pdf"), width = 6, height = 15)
106 | dir = "~/Dropbox/Iso Discovery/Codes/compare_abundance/"
107 | ggsave(paste0(dir, "tv_boxplot.pdf"), width = 6, height = 15)
108 | dir = "~/Dropbox/Iso Discovery/Codes/compare_transcripts/simulation/pr_by_gene/"
109 | methods = c("LRT", "Cufflinks", "Stringtie", "SLIDE")
110 | data = lapply(1:1, function(sp){
111 | tp2 = lapply(1:9, function(gtfid){
112 | tp1 = lapply(1:4, function(mm){
113 | if(mm == 1){
114 | rdata_dir = "~/Dropbox/Iso Discovery/Codes/discovery/rdata/simu_polyester/"
115 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData"))
116 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)]
117 | pr = t(sapply(LRT_simu_res, function(x) c(x[[1]]$prec, x[[1]]$reca)))
118 | }else if(mm == 2){
119 | pr = readRDS(paste0(dir, "cufflinks/", sp, "gtf", gtfid, ".rds"))
120 | pr = Reduce(rbind, pr)
121 | }else if(mm == 3){
122 | pr = readRDS(paste0(dir, "stringtie/", sp, "gtf", gtfid, ".rds"))
123 | pr = Reduce(rbind, pr)
124 | }else{
125 | pr = readRDS(paste0(dir, "slide/", sp, "gtf", gtfid, ".rds"))
126 | pr = Reduce(rbind, pr)
127 | }
128 | res = data.frame(precision = pr[,1], recall = pr[,2], method = methods[mm])
129 | return(res)
130 | })
131 | tp1 = Reduce(rbind, tp1)
132 | tp1$annotation = gtfid
133 | return(tp1)
134 | })
135 | tp2 = Reduce(rbind, tp2)
136 | tp2$sample = sp
137 | return(tp2)
138 | })
139 | data = Reduce(rbind, data)
140 | data$annotation = factor(data$annotation, levels = 1:9)
141 | cols = c(Cufflinks = "#619CFF", LRT = "#D53E4F",
142 | Stringtie = "#00BA38", SLIDE = "#B79F00")
143 | ggplot(data, aes(x = annotation, y = precision, fill = method)) +
144 | geom_boxplot() +
145 | theme_bw() +
146 | facet_grid(sample ~.) +
147 | scale_fill_manual(values = cols) +
148 | theme(strip.background = element_rect(fill = "white"))
149 | ggplot(data, aes(x = annotation, y = precision, fill = method)) +
150 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) +
151 | theme_bw() +
152 | facet_grid(sample ~.) +
153 | scale_fill_manual(values = cols) +
154 | theme(strip.background = element_rect(fill = "white"))
155 | ggplot(data, aes(x = annotation, y = recall, fill = method)) +
156 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) +
157 | theme_bw() +
158 | facet_grid(sample ~.) +
159 | scale_fill_manual(values = cols) +
160 | theme(strip.background = element_rect(fill = "white"))
161 | methods = c("LRT", "Cufflinks", "Stringtie", "SLIDE")
162 | data = lapply(1:1, function(sp){
163 | tp2 = lapply(1:9, function(gtfid){
164 | tp1 = lapply(1:4, function(mm){
165 | if(mm == 1){
166 | rdata_dir = "~/Dropbox/Iso Discovery/Codes/discovery/rdata/simu_polyester/"
167 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData"))
168 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)]
169 | pr = t(sapply(LRT_simu_res, function(x) c(x[[1]]$prec, x[[1]]$reca)))
170 | }else if(mm == 2){
171 | pr = readRDS(paste0(dir, "cufflinks/", sp, "gtf", gtfid, ".rds"))
172 | pr = Reduce(rbind, pr)
173 | }else if(mm == 3){
174 | pr = readRDS(paste0(dir, "stringtie/", sp, "gtf", gtfid, ".rds"))
175 | pr = Reduce(rbind, pr)
176 | }else{
177 | pr = readRDS(paste0(dir, "slide/", sp, "gtf", gtfid, ".rds"))
178 | pr = Reduce(rbind, pr)
179 | }
180 | res = data.frame(precision = pr[,1], recall = pr[,2], method = methods[mm])
181 | return(res)
182 | })
183 | tp1 = Reduce(rbind, tp1)
184 | tp1$annotation = gtfid
185 | return(tp1)
186 | })
187 | tp2 = Reduce(rbind, tp2)
188 | tp2$sample = sp
189 | return(tp2)
190 | })
191 | data = Reduce(rbind, data)
192 | data$annotation = factor(data$annotation, levels = 1:9)
193 | dir
194 | cols = c(Cufflinks = "#619CFF", LRT = "#D53E4F",
195 | Stringtie = "#00BA38", SLIDE = "#B79F00")
196 | ggplot(data, aes(x = annotation, y = precision, fill = method)) +
197 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) +
198 | theme_bw() +
199 | facet_grid(sample ~.) +
200 | scale_fill_manual(values = cols) +
201 | theme(strip.background = element_rect(fill = "white"))
202 | ggsave(paste0(dir, "precision_boxplot.pdf"), width = 6, height = 15)
203 | data = lapply(1:8, function(sp){
204 | print(sp)
205 | tp2 = lapply(1:9, function(gtfid){
206 | tp1 = lapply(1:4, function(mm){
207 | if(mm == 1){
208 | rdata_dir = "~/Dropbox/Iso Discovery/Codes/discovery/rdata/simu_polyester/"
209 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData"))
210 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)]
211 | pr = t(sapply(LRT_simu_res, function(x) c(x[[1]]$prec, x[[1]]$reca)))
212 | }else if(mm == 2){
213 | pr = readRDS(paste0(dir, "cufflinks/", sp, "gtf", gtfid, ".rds"))
214 | pr = Reduce(rbind, pr)
215 | }else if(mm == 3){
216 | pr = readRDS(paste0(dir, "stringtie/", sp, "gtf", gtfid, ".rds"))
217 | pr = Reduce(rbind, pr)
218 | }else{
219 | pr = readRDS(paste0(dir, "slide/", sp, "gtf", gtfid, ".rds"))
220 | pr = Reduce(rbind, pr)
221 | }
222 | res = data.frame(precision = pr[,1], recall = pr[,2], method = methods[mm])
223 | return(res)
224 | })
225 | tp1 = Reduce(rbind, tp1)
226 | tp1$annotation = gtfid
227 | return(tp1)
228 | })
229 | tp2 = Reduce(rbind, tp2)
230 | tp2$sample = sp
231 | return(tp2)
232 | })
233 | data = Reduce(rbind, data)
234 | data$annotation = factor(data$annotation, levels = 1:9)
235 | cols = c(Cufflinks = "#619CFF", LRT = "#D53E4F",
236 | Stringtie = "#00BA38", SLIDE = "#B79F00")
237 | ggplot(data, aes(x = annotation, y = precision, fill = method)) +
238 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) +
239 | theme_bw() +
240 | facet_grid(sample ~.) +
241 | scale_fill_manual(values = cols) +
242 | theme(strip.background = element_rect(fill = "white"))
243 | ggsave(paste0(dir, "precision_boxplot.pdf"), width = 6, height = 15)
244 | ggplot(data, aes(x = annotation, y = recall, fill = method)) +
245 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) +
246 | theme_bw() +
247 | facet_grid(sample ~.) +
248 | scale_fill_manual(values = cols) +
249 | theme(strip.background = element_rect(fill = "white"))
250 | ggsave(paste0(dir, "recall_boxplot.pdf"), width = 6, height = 15)
251 | 41/56
252 | class(NULL)
253 | names = read.table("~/Dropbox/Iso Discovery/Codes/melanoma/DP2_names.txt",
254 | stringsAsFactors = FALSE)
255 | names = as.character(names[,1])
256 | names_split = strsplit(names, split = "-")
257 | pt = sapply(names_split, function(x) x[1])
258 | pt_uniq = unique(pt)
259 | ### star
260 | commands = lapply(1:length(pt_uniq), function(i){
261 | ptid = pt_uniq[i]
262 | titles = names_split[pt == ptid]
263 | titles = sapply(titles, function(x) paste0(x[1], "-", x[2], "_", x[3]))
264 | titles = paste0("${Dfastq}", titles)
265 | f1 = paste(paste0(titles, "_R1.fastq"), sep = "", collapse = ",")
266 | f2 = paste(paste0(titles, "_R2.fastq"), sep = "", collapse = ",")
267 | cd = paste("STAR --genomeDir ${GenomeDir} --readFilesIn",
268 | f1, f2,
269 | "--runThreadN 12 --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate",
270 | "--outFileNamePrefix", paste0("~/data/melanoma/", ptid, "-DP2"))
271 | return(cd)
272 | })
273 | commands = unlist(commands)
274 | write(commands, "~/Dropbox/Iso Discovery/Codes/melanoma/star_align_DP2.sh", sep="\n")
275 | ### samtools index
276 | commands = lapply(pt_uniq, function(pt){
277 | bam_path = paste0(pt, "-DP2Aligned.sortedByCoord.out.bam")
278 | cd = paste("samtools index", bam_path)
279 | return(cd)
280 | })
281 | commands = unlist(commands)
282 | write(commands, "~/Dropbox/Iso Discovery/Codes/melanoma/samtools_index_DP2.sh", sep="\n")
283 | names = read.table("~/Dropbox/Iso Discovery/Codes/melanoma/DP2_names.txt",
284 | stringsAsFactors = FALSE)
285 | names = as.character(names[,1])
286 | names_split = strsplit(names, split = "-")
287 | pt = sapply(names_split, function(x) x[1])
288 | pt_uniq = unique(pt)
289 | ### star
290 | commands = lapply(1:length(pt_uniq), function(i){
291 | ptid = pt_uniq[i]
292 | titles = names_split[pt == ptid]
293 | titles = sapply(titles, function(x) paste0(x[1], "-", x[2], "_", x[3]))
294 | titles = paste0("${Dfastq}", titles)
295 | f1 = paste(paste0(titles, "_R1.fastq"), sep = "", collapse = ",")
296 | f2 = paste(paste0(titles, "_R2.fastq"), sep = "", collapse = ",")
297 | cd = paste("STAR --genomeDir ${GenomeDir} --readFilesIn",
298 | f1, f2,
299 | "--runThreadN 12 --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate",
300 | "--outFileNamePrefix", paste0("~/data/melanoma/", ptid, "-DP2"))
301 | return(cd)
302 | })
303 | commands = unlist(commands)
304 | write(commands, "~/Dropbox/Iso Discovery/melanoma/star_align_DP2.sh", sep="\n")
305 | ### samtools index
306 | commands = lapply(pt_uniq, function(pt){
307 | bam_path = paste0(pt, "-DP2Aligned.sortedByCoord.out.bam")
308 | cd = paste("samtools index", bam_path)
309 | return(cd)
310 | })
311 | commands = unlist(commands)
312 | write(commands, "~/Dropbox/Iso Discovery/melanoma/samtools_index_DP2.sh", sep="\n")
313 | names = read.table("~/Dropbox/Iso Discovery/melanoma/DP2_names.txt",
314 | stringsAsFactors = FALSE)
315 | names = as.character(names[,1])
316 | names_split = strsplit(names, split = "-")
317 | pt = sapply(names_split, function(x) x[1])
318 | pt_uniq = unique(pt)
319 | ### star
320 | commands = lapply(1:length(pt_uniq), function(i){
321 | ptid = pt_uniq[i]
322 | titles = names_split[pt == ptid]
323 | titles = sapply(titles, function(x) paste0(x[1], "-", x[2], "_", x[3]))
324 | titles = paste0("${Dfastq}", titles)
325 | f1 = paste(paste0(titles, "_R1.fastq"), sep = "", collapse = ",")
326 | f2 = paste(paste0(titles, "_R2.fastq"), sep = "", collapse = ",")
327 | cd = paste("STAR --genomeDir ${GenomeDir} --readFilesIn",
328 | f1, f2,
329 | "--runThreadN 12 --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate",
330 | "--outFileNamePrefix", paste0("~/data/melanoma/", ptid, "-DP2"))
331 | return(cd)
332 | })
333 | commands = unlist(commands)
334 | write(commands, "~/Dropbox/Iso Discovery/melanoma/star_align_DP2.sh", sep="\n")
335 | ### samtools index
336 | commands = lapply(pt_uniq, function(pt){
337 | bam_path = paste0(pt, "-DP2Aligned.sortedByCoord.out.bam")
338 | cd = paste("samtools index", bam_path)
339 | return(cd)
340 | })
341 | commands = unlist(commands)
342 | write(commands, "~/Dropbox/Iso Discovery/melanoma/samtools_index_DP2.sh", sep="\n")
343 | names = read.table("~/Dropbox/Iso Discovery/melanoma/DP2_names.txt",
344 | stringsAsFactors = FALSE)
345 | names = as.character(names[,1])
346 | names_split = strsplit(names, split = "-")
347 | pt = sapply(names_split, function(x) x[1])
348 | pt_uniq = unique(pt)
349 | pt_uniq
350 | names = read.table("~/Dropbox/Iso Discovery/Codes/melanoma/baseline_names.txt",
351 | stringsAsFactors = FALSE)
352 | names = as.character(names[,1])
353 | names_split = strsplit(names, split = "-")
354 | pt = sapply(names_split, function(x) x[1])
355 | pt_uniq = unique(pt)
356 | names = read.table("~/Dropbox/Iso Discovery/melanoma/baseline_names.txt",
357 | stringsAsFactors = FALSE)
358 | names = as.character(names[,1])
359 | names_split = strsplit(names, split = "-")
360 | pt = sapply(names_split, function(x) x[1])
361 | pt_uniq = unique(pt)
362 | pt_uniq
363 | names = read.table("~/Dropbox/Iso Discovery/Codes/melanoma/DP1_names.txt",
364 | stringsAsFactors = FALSE)
365 | names = as.character(names[,1])
366 | names_split = strsplit(names, split = "-")
367 | pt = sapply(names_split, function(x) x[1])
368 | pt_uniq = unique(pt)
369 | names = read.table("~/Dropbox/Iso Discovery/melanoma/DP1_names.txt",
370 | stringsAsFactors = FALSE)
371 | names = as.character(names[,1])
372 | names_split = strsplit(names, split = "-")
373 | pt = sapply(names_split, function(x) x[1])
374 | pt_uniq = unique(pt)
375 | pt_uniq
376 | library("ggplot2")
377 | 109 + 289 +199 +109
378 | 706-30
379 | 676 - 20*3
380 | 706-50
381 | 656-60
382 | c(1:6, 8:10, 15:20)
383 | c(1, 3:6, 8:10, 15:17, 20)
384 | intersect(c(1:6, 8:10, 15:20), c(1, 3:6, 8:10, 15:17, 20))
385 | log10(10)
386 | log10(0.1)
387 | log(1)
388 | log(2)
389 | install.packages("RPEnsemble")  # one-off install of the RPEnsemble package
390 | devtools::install_github("AndreaCirilloAC/updateR")  # updateR is installed from GitHub here
391 | library(updateR)
392 | updateR(admin_password = Sys.getenv("ADMIN_PASSWORD"))  # credential comes from the ADMIN_PASSWORD env var; never store it in a committed file
393 | R.version  # print the R version details after the update
394 | 


--------------------------------------------------------------------------------
/vignettes/scImpute-vignette.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | 
  3 | <html xmlns="http://www.w3.org/1999/xhtml">
  4 | 
  5 | <head>
  6 | 
  7 | <meta charset="utf-8" />
  8 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  9 | <meta name="generator" content="pandoc" />
 10 | 
 11 | <meta name="viewport" content="width=device-width, initial-scale=1">
 12 | 
 13 | <meta name="author" content="Wei Vivian Li, Jingyi Jessica Li" />
 14 | 
 15 | <meta name="date" content="2018-06-08" />
 16 | 
 17 | <title>Introduction to scImpute</title>
 18 | 
 19 | 
 20 | 
 21 | <style type="text/css">code{white-space: pre;}</style>
 22 | <style type="text/css">
 23 | div.sourceCode { overflow-x: auto; }
 24 | table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
 25 |   margin: 0; padding: 0; vertical-align: baseline; border: none; }
 26 | table.sourceCode { width: 100%; line-height: 100%; }
 27 | td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
 28 | td.sourceCode { padding-left: 5px; }
 29 | code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
 30 | code > span.dt { color: #902000; } /* DataType */
 31 | code > span.dv { color: #40a070; } /* DecVal */
 32 | code > span.bn { color: #40a070; } /* BaseN */
 33 | code > span.fl { color: #40a070; } /* Float */
 34 | code > span.ch { color: #4070a0; } /* Char */
 35 | code > span.st { color: #4070a0; } /* String */
 36 | code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
 37 | code > span.ot { color: #007020; } /* Other */
 38 | code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
 39 | code > span.fu { color: #06287e; } /* Function */
 40 | code > span.er { color: #ff0000; font-weight: bold; } /* Error */
 41 | code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
 42 | code > span.cn { color: #880000; } /* Constant */
 43 | code > span.sc { color: #4070a0; } /* SpecialChar */
 44 | code > span.vs { color: #4070a0; } /* VerbatimString */
 45 | code > span.ss { color: #bb6688; } /* SpecialString */
 46 | code > span.im { } /* Import */
 47 | code > span.va { color: #19177c; } /* Variable */
 48 | code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
 49 | code > span.op { color: #666666; } /* Operator */
 50 | code > span.bu { } /* BuiltIn */
 51 | code > span.ex { } /* Extension */
 52 | code > span.pp { color: #bc7a00; } /* Preprocessor */
 53 | code > span.at { color: #7d9029; } /* Attribute */
 54 | code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
 55 | code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
 56 | code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
 57 | code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
 58 | </style>
 59 | 
 60 | 
 61 | 
 62 | <link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20700px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Open%20Sans%22%2C%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%201%2E35%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20both%3B%0Amargin%3A%200%200%2010px%2010px%3B%0Apadding%3A%204px%3B%0Awidth%3A%20400px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Aborder%2Dradius%3A%205px%3B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Afont%2Dsize%3A%2013px%3B%0Aline%2Dheight%3A%201%2E3%3B%0A%7D%0A%23TOC%20%2Etoctitle%20%7B%0Afont%2Dweight%3A%20bold%3B%0Afont%2Dsize%3A%2015px%3B%0Amargin%2Dleft%3A%205px%3B%0A%7D%0A%23TOC%20ul%20%7B%0Apadding%2Dleft%3A%2040px%3B%0Amargin%2Dleft%3A%20%2D1%2E5em%3B%0Amargin%2Dtop%3A%205px%3B%0Amargin%2Dbottom%3A%205px%3B%0A%7D%0A%23TOC%20ul%20ul%20%7B%0Amargin%2Dleft%3A%20%2D2em%3B%0A%7D%0A%23TOC%20li%20%7B%0Aline%2Dheight%3A%2016px%3B%0A%7D%0Atable%20%7B%0Amargin%3A%201em%20auto%3B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dcolor%3A%20%23DDDDDD%3B%0Aborder%2Dstyle%3A%20outset%3B%0Aborder%2Dcollapse%3A%20collapse%3B%0A%7D%0Atable%20th%20%7B%0Aborder%2Dwidth%3A%202px%3B%0Apadding%3A%205px%3B%0Aborder%2Dstyle%3A%20inset%3B%0A%7D%0Atable%20td%20%7B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dstyle%3A%20inset%3B%0Aline%2Dheight%3A%2018px%3B%0Apadding%3A%205px%205px%3B%0A%7D%0Atable%2C%20table%20th%2C%20table%20td%20%7B%0Aborder%2Dleft%2Dstyle%3A%20none%3B%0Aborder%2Dright%2Dstyle%3A%20none%3B%0A%7D%0Atable%20thead%2C%20table%20tr%2Eeven%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0A%7D%0Ap%20%7B%0Amargin%3A%200%2E5em%200%3B%0A%7D%0Ablockquote%20%7B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Apadding%3A%200%2E25em%200%2E75em%3B%0A%7D%0Ahr%20%7B%0Aborder%2Dstyle%3A%20solid%3B%0Aborder%3A%20none%3B%0
Aborder%2Dtop%3A%201px%20solid%20%23777%3B%0Amargin%3A%2028px%200%3B%0A%7D%0Adl%20%7B%0Amargin%2Dleft%3A%200%3B%0A%7D%0Adl%20dd%20%7B%0Amargin%2Dbottom%3A%2013px%3B%0Amargin%2Dleft%3A%2013px%3B%0A%7D%0Adl%20dt%20%7B%0Afont%2Dweight%3A%20bold%3B%0A%7D%0Aul%20%7B%0Amargin%2Dtop%3A%200%3B%0A%7D%0Aul%20li%20%7B%0Alist%2Dstyle%3A%20circle%20outside%3B%0A%7D%0Aul%20ul%20%7B%0Amargin%2Dbottom%3A%200%3B%0A%7D%0Apre%2C%20code%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0Aborder%2Dradius%3A%203px%3B%0Acolor%3A%20%23333%3B%0Awhite%2Dspace%3A%20pre%2Dwrap%3B%20%0A%7D%0Apre%20%7B%0Aborder%2Dradius%3A%203px%3B%0Amargin%3A%205px%200px%2010px%200px%3B%0Apadding%3A%2010px%3B%0A%7D%0Apre%3Anot%28%5Bclass%5D%29%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0A%7D%0Acode%20%7B%0Afont%2Dfamily%3A%20Consolas%2C%20Monaco%2C%20%27Courier%20New%27%2C%20monospace%3B%0Afont%2Dsize%3A%2085%25%3B%0A%7D%0Ap%20%3E%20code%2C%20li%20%3E%20code%20%7B%0Apadding%3A%202px%200px%3B%0A%7D%0Adiv%2Efigure%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0Aimg%20%7B%0Abackground%2Dcolor%3A%20%23FFFFFF%3B%0Apadding%3A%202px%3B%0Aborder%3A%201px%20solid%20%23DDDDDD%3B%0Aborder%2Dradius%3A%203px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Amargin%3A%200%205px%3B%0A%7D%0Ah1%20%7B%0Amargin%2Dtop%3A%200%3B%0Afont%2Dsize%3A%2035px%3B%0Aline%2Dheight%3A%2040px%3B%0A%7D%0Ah2%20%7B%0Aborder%2Dbottom%3A%204px%20solid%20%23f7f7f7%3B%0Apadding%2Dtop%3A%2010px%3B%0Apadding%2Dbottom%3A%202px%3B%0Afont%2Dsize%3A%20145%25%3B%0A%7D%0Ah3%20%7B%0Aborder%2Dbottom%3A%202px%20solid%20%23f7f7f7%3B%0Apadding%2Dtop%3A%2010px%3B%0Afont%2Dsize%3A%20120%25%3B%0A%7D%0Ah4%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23f7f7f7%3B%0Amargin%2Dleft%3A%208px%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Ah5%2C%20h6%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23ccc%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Aa%20%7B%0Acolor%3A%20%230033dd%3B%0Atext%2Ddecoration%3A%20none%3B%0A%7D%0Aa%3Ahover%20%7B%0Acolor%3A%20%236666ff%3B%20%7D%0Aa%3Avisited%20%7B%0Acolor%3A%20%
23800080%3B%20%7D%0Aa%3Avisited%3Ahover%20%7B%0Acolor%3A%20%23BB00BB%3B%20%7D%0Aa%5Bhref%5E%3D%22http%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0Aa%5Bhref%5E%3D%22https%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0A%0Acode%20%3E%20span%2Ekw%20%7B%20color%3A%20%23555%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Edt%20%7B%20color%3A%20%23902000%3B%20%7D%20%0Acode%20%3E%20span%2Edv%20%7B%20color%3A%20%2340a070%3B%20%7D%20%0Acode%20%3E%20span%2Ebn%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Efl%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Ech%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Est%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Eco%20%7B%20color%3A%20%23888888%3B%20font%2Dstyle%3A%20italic%3B%20%7D%20%0Acode%20%3E%20span%2Eot%20%7B%20color%3A%20%23007020%3B%20%7D%20%0Acode%20%3E%20span%2Eal%20%7B%20color%3A%20%23ff0000%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Efu%20%7B%20color%3A%20%23900%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%20code%20%3E%20span%2Eer%20%7B%20color%3A%20%23a61717%3B%20background%2Dcolor%3A%20%23e3d2d2%3B%20%7D%20%0A" rel="stylesheet" type="text/css" />
 63 | 
 64 | </head>
 65 | 
 66 | <body>
 67 | 
 68 | 
 69 | 
 70 | 
 71 | <h1 class="title toc-ignore">Introduction to scImpute</h1>
 72 | <h4 class="author"><em>Wei Vivian Li, Jingyi Jessica Li</em></h4>
 73 | <h4 class="date"><em>2018-06-08</em></h4>
 74 | 
 75 | 
 76 | 
 77 | <p>The emerging single cell RNA sequencing (scRNA-seq) technologies enable the investigation of the transcriptomic landscape at single-cell resolution. However, scRNA-seq analysis is complicated by the excess of zero or near zero counts in the data, which are the so-called dropouts due to low amounts of mRNA within each individual cell. Consequently, downstream analysis of scRNA-seq data would be severely biased if the dropout events are not properly corrected. <code>scImpute</code> is developed to accurately and efficiently impute the dropout values in scRNA-seq data.</p>
 78 | <p><code>scImpute</code> can be applied to raw count data before users perform downstream analyses such as</p>
 79 | <ul>
 80 | <li>dimension reduction of scRNA-seq data</li>
 81 | <li>normalization of scRNA-seq data</li>
 82 | <li>clustering of cell populations</li>
 83 | <li>differential gene expression analysis</li>
 84 | <li>time-series analysis of gene expression dynamics</li>
 85 | </ul>
 86 | <div id="quick-start" class="section level2">
 87 | <h2>Quick start</h2>
 88 | <p><code>scImpute</code> can be easily incorporated into existing pipelines of scRNA-seq analysis. Its only input is the raw count matrix with rows representing genes and columns representing cells. It will output an imputed count matrix with the same dimensions. In the simplest case, the imputation task can be done with a single function <code>scimpute</code>:</p>
 89 | <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">scimpute</span>(<span class="co"># full path to raw count matrix</span>
 90 |          <span class="dt">count_path =</span> <span class="kw">system.file</span>(<span class="st">&quot;extdata&quot;</span>, <span class="st">&quot;raw_count.csv&quot;</span>, <span class="dt">package =</span> <span class="st">&quot;scImpute&quot;</span>), 
 91 |          <span class="dt">infile =</span> <span class="st">&quot;csv&quot;</span>,           <span class="co"># format of input file</span>
 92 |          <span class="dt">outfile =</span> <span class="st">&quot;csv&quot;</span>,          <span class="co"># format of output file</span>
 93 |          <span class="dt">out_dir =</span> <span class="st">&quot;./&quot;</span>,           <span class="co"># full path to output directory</span>
 94 |          <span class="dt">labeled =</span> <span class="ot">FALSE</span>,          <span class="co"># cell type labels not available</span>
 95 |          <span class="dt">drop_thre =</span> <span class="fl">0.5</span>,          <span class="co"># threshold set on dropout probability</span>
 96 |          <span class="dt">Kcluster =</span> <span class="dv">2</span>,             <span class="co"># 2 cell subpopulations</span>
 97 |          <span class="dt">ncores =</span> <span class="dv">10</span>)              <span class="co"># number of cores used in parallel computation</span></code></pre></div>
 98 | <p>This function returns the column indices of outlier cells, and creates a new file <code>scimpute_count.csv</code> in <code>out_dir</code> to store the imputed count matrix.</p>
 99 | </div>
100 | <div id="step-by-step-description" class="section level2">
101 | <h2>Step-by-step description</h2>
102 | <p>The input file can be a <code>.csv</code> file, <code>.txt</code> file, or <code>.rds</code> file. In all cases, the <strong>first column</strong> should give the gene names and the <strong>first row</strong> should give the cell names. We use the example files in the package as illustration. If the raw counts are stored in a <code>.csv</code> file, and we also hope to output the imputed matrix into a <code>.csv</code> file, then specify this information with</p>
103 | <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># full path of the input file</span>
104 | count_path =<span class="st"> </span><span class="kw">system.file</span>(<span class="st">&quot;extdata&quot;</span>, <span class="st">&quot;raw_count.csv&quot;</span>, <span class="dt">package =</span> <span class="st">&quot;scImpute&quot;</span>)
105 | infile =<span class="st"> &quot;csv&quot;</span>
106 | outfile =<span class="st"> &quot;csv&quot;</span></code></pre></div>
107 | <p>Similarly, if the raw counts are stored in a <code>.txt</code> file, and we also hope to output the imputed matrix into a <code>.txt</code> file, then specify this information with</p>
108 | <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># full path of the input file</span>
109 | count_path =<span class="st"> </span><span class="kw">system.file</span>(<span class="st">&quot;extdata&quot;</span>, <span class="st">&quot;raw_count.txt&quot;</span>, <span class="dt">package =</span> <span class="st">&quot;scImpute&quot;</span>)
110 | infile =<span class="st"> &quot;txt&quot;</span>
111 | outfile =<span class="st"> &quot;txt&quot;</span></code></pre></div>
112 | <p>Next, we need to set up the directory to store all the temporary and final outputs:</p>
113 | <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># a '/' sign is necessary at the end of the path</span>
114 | out_dir =<span class="st"> &quot;~/output/&quot;</span></code></pre></div>
115 | <p>We highly recommend using parallel computing with <code>scImpute</code>, which will significantly reduce the computation time. If we would like to use 20 cores, we can run the <code>scimpute</code> function with <code>ncores = 20</code>.</p>
116 | <p><code>scImpute</code> has two statistical parameters. The <strong>first parameter is <code>Kcluster</code></strong>, which determines the <strong>number of initial clusters</strong> to help identify candidate neighbors of each cell. The imputation results do not heavily rely on the choice of <code>Kcluster</code>, since <code>scImpute</code> uses a model-based method to select similar cells in a later stage. <code>Kcluster</code> can be specified based on the number of known cell types and users’ biological expertise, and it may also be learned by clustering the raw data and inspecting the clustering results. The <strong>second parameter</strong> is <code>drop_thre</code>. Only the values that have <strong>dropout probability</strong> larger than <code>drop_thre</code> are imputed by <code>scImpute</code>. A default threshold <code>drop_thre = 0.5</code> is sufficient for most scRNA-seq data.</p>
117 | <p>Now to get the imputed matrix, all we need is the main <code>scimpute</code> function</p>
118 | <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">Kcluster =<span class="st"> </span><span class="dv">2</span>
119 | drop_thre =<span class="st"> </span><span class="fl">0.5</span>
120 | ncores =<span class="st"> </span><span class="dv">10</span>
121 | <span class="kw">scimpute</span>(count_path, infile, outfile, out_dir, <span class="dt">labeled =</span> <span class="ot">FALSE</span>,  drop_thre, Kcluster, ncores)  </code></pre></div>
122 | <p>If <code>outfile = &quot;csv&quot;</code>, this function will create a new file <code>scimpute_count.csv</code> in <code>out_dir</code> to store the imputed count matrix; if <code>outfile = &quot;txt&quot;</code>, this function will create a new file <code>scimpute_count.txt</code> in <code>out_dir</code>.</p>
123 | <p>Note that the order of parameters matters in R functions, so we suggest using the format in <strong>Quick start</strong> to specify parameters and avoid mistakes. If the users would like to apply <code>scImpute</code> on data coming from homogeneous cells, this can be achieved by setting <code>Kcluster = 1</code> and <code>labeled = FALSE</code>.</p>
124 | </div>
125 | <div id="apply-scimpute-with-cell-type-information" class="section level2">
126 | <h2>Apply <code>scImpute</code> with cell type information</h2>
127 | <p>Sometimes users may have the cell type (or subpopulation) information of the single cells and <code>scimpute</code> can take advantage of this information to impute within each cell type. To do this, we need a character vector <code>labels</code> specifying the cell type of each column in the raw count matrix. In other words, the length of <code>labels</code> equals the number of cells and the order of elements in <code>labels</code> should match the order of columns in the raw count matrix. Then we just need to specify <code>labeled = TRUE</code> in <code>scimpute</code> (default is <code>FALSE</code>) and specify the <code>labels</code> argument. <code>Kcluster</code> is not used when <code>labeled = TRUE</code>.</p>
128 | <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">labels =<span class="st"> </span><span class="kw">readRDS</span>(<span class="kw">system.file</span>(<span class="st">&quot;extdata&quot;</span>, <span class="st">&quot;labels.rds&quot;</span>, <span class="dt">package =</span> <span class="st">&quot;scImpute&quot;</span>))
129 | labels[<span class="dv">1</span><span class="op">:</span><span class="dv">5</span>]
130 | <span class="op">&gt;</span><span class="st"> </span>[<span class="dv">1</span>] <span class="st">&quot;c1&quot;</span> <span class="st">&quot;c1&quot;</span> <span class="st">&quot;c1&quot;</span> <span class="st">&quot;c2&quot;</span> <span class="st">&quot;c2&quot;</span>
131 | 
132 | <span class="kw">scimpute</span>(count_path, 
133 |          <span class="dt">infile =</span> <span class="st">&quot;csv&quot;</span>, 
134 |          <span class="dt">outfile =</span> <span class="st">&quot;csv&quot;</span>, 
135 |          <span class="dt">out_dir =</span> out_dir,
136 |          <span class="dt">labeled =</span> <span class="ot">TRUE</span>, 
137 |          <span class="dt">drop_thre =</span> <span class="fl">0.5</span>,
138 |          <span class="dt">labels =</span> labels, 
139 |          <span class="dt">ncores =</span> <span class="dv">10</span>)</code></pre></div>
140 | </div>
141 | <div id="apply-scimpute-to-tpm-values" class="section level2">
142 | <h2>Apply <code>scImpute</code> to TPM values</h2>
143 | <p>We strongly suggest using <code>scImpute</code> on count matrices. However, if only TPM values are available, users can apply <code>scImpute</code> with gene lengths supplied. <code>scImpute</code> will use the gene lengths (sum of exon lengths) to scale the data, which ensures a good fit of the mixture models. In this case, users need to specify <code>type = &quot;TPM&quot;</code> (<code>type = &quot;count&quot;</code> by default), and supply a vector <code>genelen</code> of gene lengths. The order of genes in <code>genelen</code> should match the order in the expression matrix. For example:</p>
144 | <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="op">&gt;</span><span class="st"> </span>genelen[<span class="dv">1</span><span class="op">:</span><span class="dv">3</span>]
145 | ENSMUSG00000021252 ENSMUSG00000007777 ENSMUSG00000024442
146 |               <span class="dv">4235</span>                <span class="dv">998</span>               <span class="dv">2404</span>
147 | 
148 | <span class="kw">scimpute</span>(count_path, 
149 |          <span class="dt">infile =</span> <span class="st">&quot;csv&quot;</span>, 
150 |          <span class="dt">outfile =</span> <span class="st">&quot;csv&quot;</span>, 
151 |          <span class="dt">out_dir =</span> out_dir,
152 |          <span class="dt">type =</span> <span class="st">&quot;TPM&quot;</span>,
153 |          <span class="dt">genelen =</span> genelen,
154 |          <span class="dt">drop_thre =</span> <span class="fl">0.5</span>,
155 |          <span class="dt">ncores =</span> <span class="dv">10</span>)</code></pre></div>
156 | </div>
157 | <div id="how-to-save-computation-time-with-scimpute" class="section level2">
158 | <h2>How to save computation time with <code>scImpute</code></h2>
159 | <p><code>scImpute</code> benefits from parallel computation, and each processor does not require much memory. <code>scimpute</code> completes computation in seconds when applied to a dataset with 10,000 genes and 100 cells, running with 10 cores. The memory requirement for this dataset is around 2 GB. The running time mostly depends on</p>
160 | <ul>
161 | <li>number of processors (<code>ncores</code>)</li>
162 | <li>number of cells in the scRNA-seq data</li>
163 | </ul>
164 | <p>When the number of cells is extremely large, a filtering step on the cells can reduce the computation time of <code>scImpute</code>.</p>
165 | </div>
166 | 
167 | 
168 | 
169 | <!-- dynamically load mathjax for compatibility with self-contained -->
170 | <script>
171 |   (function () {
172 |     var script = document.createElement("script");
173 |     script.type = "text/javascript";
174 |     script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
175 |     document.getElementsByTagName("head")[0].appendChild(script);
176 |   })();
177 | </script>
178 | 
179 | </body>
180 | </html>
181 | 


--------------------------------------------------------------------------------