├── .gitignore ├── .Rbuildignore ├── inst ├── docs │ ├── scImpute.pdf │ ├── scImpute-news.Rmd │ └── scImpute-news.html ├── extdata │ ├── labels.rds │ └── .Rapp.history └── comparison │ ├── .Rhistory │ ├── 3-run-syn.R │ ├── 3-run.R │ ├── 1-filter_data.R │ ├── 2-run-magic.ipynb │ ├── .ipynb_checkpoints │ └── 2-run-magic-checkpoint.ipynb │ ├── 4-plot-syn.R │ └── 4-plot.R ├── R ├── dmix.R ├── rmix.R ├── calculate_weight.R ├── write_count.R ├── read_count.R ├── get_mix_parameters.R ├── scimpute.R ├── scImpute-internal.R └── imputation_model.R ├── NAMESPACE ├── DESCRIPTION ├── man └── scimpute.Rd ├── README.md ├── README.Rmd ├── vignettes ├── scImpute-vignette.Rmd └── scImpute-vignette.html └── .Rhistory /.gitignore: -------------------------------------------------------------------------------- 1 | inst/doc 2 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^README\.Rmd$ 2 | -------------------------------------------------------------------------------- /inst/docs/scImpute.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vivianstats/scImpute/HEAD/inst/docs/scImpute.pdf -------------------------------------------------------------------------------- /inst/extdata/labels.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vivianstats/scImpute/HEAD/inst/extdata/labels.rds -------------------------------------------------------------------------------- /inst/extdata/.Rapp.history: -------------------------------------------------------------------------------- 1 | load("/Users/wei/Dropbox/scrna_impute/codes/scimpute_dev/scImpute/inst/extdata/labels.rds") 2 | -------------------------------------------------------------------------------- /R/dmix.R: 
-------------------------------------------------------------------------------- 1 | dmix <- 2 | function (x, pars) 3 | { 4 | pars[1] * dgamma(x, shape = pars[2], rate = pars[3]) + (1 - 5 | pars[1]) * dnorm(x, mean = pars[4], sd = pars[5]) 6 | } 7 | -------------------------------------------------------------------------------- /inst/comparison/.Rhistory: -------------------------------------------------------------------------------- 1 | 71038836/4 2 | 2882668/4 3 | 5000/9.25 +9/2.8 4 | 5000/925 +9/2.8 5 | fpkm = sapply(1:length(fpkm), function(i){ 6 | if(class[i] == "transcript") return(fpkm[3]) 7 | if(class[i] == "exon") return(fpkm[4]) 8 | }) 9 | -------------------------------------------------------------------------------- /R/rmix.R: -------------------------------------------------------------------------------- 1 | rmix <- 2 | function (pars, n) 3 | { 4 | n1 = round(n * pars[1]) 5 | n2 = n - n1 6 | x1 = rgamma(n1, shape = pars[2], rate = pars[3]) 7 | x2 = rnorm(n2, mean = pars[4], sd = pars[5]) 8 | return(c(x1, x2)) 9 | } 10 | -------------------------------------------------------------------------------- /R/calculate_weight.R: -------------------------------------------------------------------------------- 1 | calculate_weight <- 2 | function (x, paramt) 3 | { 4 | pz1 = paramt[1] * dgamma(x, shape = paramt[2], rate = paramt[3]) 5 | pz2 = (1 - paramt[1]) * dnorm(x, mean = paramt[4], sd = paramt[5]) 6 | pz = pz1/(pz1 + pz2) 7 | pz[pz1 == 0] = 0 8 | return(cbind(pz, 1 - pz)) 9 | } 10 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(scimpute) 4 | import(doParallel) 5 | import(foreach) 6 | import(parallel) 7 | import(penalized) 8 | importFrom(kernlab,specc) 9 | importFrom(rsvd,rpca) 10 | importFrom(stats,complete.cases) 11 | importFrom(stats,dgamma) 12 | 
importFrom(stats,dnorm) 13 | importFrom(stats,prcomp) 14 | importFrom(stats,quantile) 15 | importFrom(stats,rgamma) 16 | importFrom(stats,rnorm) 17 | importFrom(stats,sd) 18 | importFrom(stats,uniroot) 19 | importFrom(utils,read.csv) 20 | importFrom(utils,read.table) 21 | importFrom(utils,write.csv) 22 | importFrom(utils,write.table) 23 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: scImpute 2 | Type: Package 3 | Title: Accurate and robust imputation of single-cell RNA sequencing data 4 | Version: 0.0.9 5 | Date: 2018-08-15 6 | Author: Wei Vivian Li, Jingyi Jessica Li 7 | Maintainer: Wei Vivian Li 8 | Description: scRNA-seq analysis is complicated by the excess of zero or near zero counts in the data, which are the so-called dropouts due to low amounts of mRNA within each individual cell. scImpute is developed to simultaneously determine which expression values are affected by dropout events in scRNA-seq data and perform imputation only on dropout entries. 
9 | Depends: R (>= 3.3.2), parallel, stats, penalized, utils, doParallel, foreach 10 | Imports: kernlab, rsvd 11 | License: GPL 12 | RoxygenNote: 6.0.1 13 | Suggests: knitr, 14 | rmarkdown 15 | VignetteBuilder: knitr 16 | -------------------------------------------------------------------------------- /inst/comparison/3-run-syn.R: -------------------------------------------------------------------------------- 1 | data_dir = "./" 2 | count_path = paste0(data_dir, "zeisel_samp.rds") 3 | 4 | count = readRDS(count_path) 5 | write.csv(count, file = paste0(data_dir, "zeisel_samp.csv"), 6 | row.names = TRUE, col.names = TRUE) 7 | ############ run scImpute 8 | #library(devtools) 9 | #install_github("Vivianstats/scImpute", ref = "895c262") #v0.0.3 10 | library(scImpute) 11 | 12 | count_path = paste0(data_dir, "zeisel_samp.csv") 13 | 14 | out_dir = paste0(data_dir, "rerun/samp/") 15 | dir.create(out_dir) 16 | scimpute(count_path = count_path, infile = "csv", outfile = "csv", 17 | Kcluster = 9, 18 | out_dir = out_dir, drop_thre = 0.5, ncores = 36) 19 | 20 | count = read.csv(paste0(out_dir, "scimpute_count.csv"), row.names = 1) 21 | saveRDS(count, file = paste0(out_dir, "zeisel_scimpute_k9.rds")) 22 | 23 | -------------------------------------------------------------------------------- /R/write_count.R: -------------------------------------------------------------------------------- 1 | write_count <- 2 | function (count_imp, filetype, out_dir, type, genelen) 3 | { 4 | totalCounts_by_cell = readRDS(paste0(out_dir, "totalCounts_by_cell.rds")) 5 | count_imp = sweep(count_imp, MARGIN = 2, totalCounts_by_cell/10^6, 6 | FUN = "*") 7 | if(type == "TPM"){ 8 | count_imp = sweep(count_imp, 1, genelen, FUN = "/") 9 | } 10 | count_imp = round(count_imp, digits = 2) 11 | if (filetype == "csv") { 12 | write.csv(count_imp, file = paste0(out_dir, "scimpute_count.csv")) 13 | } 14 | else if (filetype == "txt") { 15 | write.table(count_imp, file = paste0(out_dir, "scimpute_count.txt"), 16 | 
quote = FALSE) 17 | }else if (filetype == "rds") { 18 | saveRDS(count_imp, file = paste0(out_dir, "scimpute_count.rds")) 19 | }else { 20 | ## unsupported output format: raise an error that carries the message itself 21 | stop("filetype can be 'csv', 'txt', or 'rds'!") 22 | } 23 | return(0) 24 | } 25 | -------------------------------------------------------------------------------- /inst/comparison/3-run.R: -------------------------------------------------------------------------------- 1 | data_dir = "./" 2 | count_path = paste0(data_dir, "rerun/zeisel_raw.csv") 3 | 4 | ############ run scImpute 5 | library(devtools) 6 | install_github("Vivianstats/scImpute", ref = "895c262") #v0.0.3 7 | library(scImpute) 8 | 9 | out_dir = paste0(data_dir, "rerun/") 10 | dir.create(out_dir) 11 | scimpute(count_path = count_path, infile = "csv", outfile = "csv", 12 | Kcluster = 9, 13 | out_dir = out_dir, drop_thre = 0.5, ncores = 36) 14 | 15 | count = read.csv(paste0(out_dir, "scimpute_count_k9.csv"), row.names = 1) 16 | saveRDS(count, file = paste0(out_dir, "zeisel_scimpute_k9.rds")) 17 | 18 | # ############ run SAVER 19 | # devtools::install_github("mohuangx/SAVER", ref="b64a077") #v1.0.0 20 | 21 | library(SAVER) 22 | library(doParallel) 23 | 24 | count_path = paste0(data_dir, "rerun/zeisel_raw.rds") 25 | 26 | cl = makeCluster(35, outfile = "") 27 | registerDoParallel(cl) 28 | 29 | dat = readRDS(count_path) 30 | out = saver(dat) 31 | 32 | saveRDS(out, file = paste0(data_dir, "rerun/zeisel_saver.rds")) 33 | 34 | 35 | # # ############ run MAGIC 36 | # dat = read.csv(paste0(data_dir, "rerun/zeisel_magic.csv")) 37 | # rownames(dat) = dat[, 1] 38 | # dat = dat[,-1] 39 | # saveRDS(dat, paste0(data_dir, "rerun/zeisel_magic.rds")) 40 | 41 | -------------------------------------------------------------------------------- /R/read_count.R: -------------------------------------------------------------------------------- 1 | read_count <- 2 | function (filetype, path, out_dir, type, genelen) 3 | { 4 | if(filetype == "csv") { 5 | raw_count = read.csv(path, 
header = TRUE, row.names = 1) 6 | }else if(filetype == "txt") { 7 | raw_count = read.table(path, header = TRUE, row.names = 1) 8 | }else if(filetype == "rds") { 9 | raw_count = readRDS(path) 10 | }else{ 11 | ## unsupported input format: raise an error that carries the message itself 12 | stop("filetype can be 'csv', 'txt', or 'rds'!") 13 | } 14 | raw_count = as.matrix(raw_count) 15 | print(paste("number of genes in raw count matrix", nrow(raw_count))) 16 | print(paste("number of cells in raw count matrix", ncol(raw_count))) 17 | 18 | if(type == "TPM"){ 19 | if(length(genelen) != nrow(raw_count)) stop("number of genes in 'genelen' and count matrix do not match! ") 20 | raw_count = sweep(raw_count, 1, genelen, FUN = "*") 21 | } 22 | 23 | totalCounts_by_cell = colSums(raw_count) 24 | saveRDS(totalCounts_by_cell, file = paste0(out_dir, "totalCounts_by_cell.rds")) 25 | totalCounts_by_cell[totalCounts_by_cell == 0] = 1 26 | raw_count = sweep(raw_count, MARGIN = 2, 10^6/totalCounts_by_cell, FUN = "*") 27 | if (min(raw_count) < 0) { 28 | stop("smallest read count cannot be negative!") 29 | } 30 | count_lnorm = log10(raw_count + 1.01) 31 | return(count_lnorm) 32 | } 33 | -------------------------------------------------------------------------------- /inst/docs/scImpute-news.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "scImpute Updates" 3 | author: "Wei Vivian Li" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | #output: pdf_document 7 | --- 8 | 9 | 10 | ## Updates 11 | 12 | > 2018/08/15: 13 | 14 | - Version 0.0.9 is released! 15 | - More robust implementation of dimension reduction. 16 | - Faster calculation of cell similarity. 17 | 18 | > 2018/06/27: 19 | 20 | - Version 0.0.8 is released! 21 | - Faster implementation of dimension reduction. 22 | 23 | > 2018/06/08: 24 | 25 | - Version 0.0.7 is released! 26 | - New option for application on TPM values. 27 | 28 | > 2018/03/16: 29 | 30 | + Version 0.0.6 is released! 
31 | + The scImpute method is published at [*Nature Communications*](https://www.nature.com/articles/s41467-018-03405-7). 32 | + scImpute now supports input and output in the format of R objects (.rds). 33 | 34 | > 2018/01/12: 35 | 36 | + Version 0.0.5 is released! 37 | + It is now possible to apply scImpute on just one cell population by setting `Kcluster = 1`. 38 | 39 | > 2017/10/27: 40 | 41 | + Version 0.0.4 is released! 42 | + scImpute now supports multi-core parallelism. 43 | 44 | > 2017/10/22: 45 | 46 | + Version 0.0.3 is released! 47 | + Estimation of dropout probabilities is more accurate. 48 | + Imputation step is more robust. 49 | + `scimpute()` incorporates a new parameter `Kcluster` to specify the number of cell subpopulations. 50 | + `scImpute` is now able to detect outlier cells. 51 | 52 | > 2017/07/01: 53 | 54 | + Version 0.0.2 is released! 55 | + This version speeds up the first step in `scImpute` and the program now completes in a few seconds when applied to a dataset with 10,000 genes and 100 cells (using a single core). 
-------------------------------------------------------------------------------- /inst/comparison/1-filter_data.R: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | ## Zeisel data 3 | 4 | ## expression_mRNA_17-Aug-2014.txt 5 | ## can be downloaded from 6 | ## https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mRNA_17-Aug-2014.txt 7 | 8 | data_dir = "./" 9 | x = read.table(paste0(data_dir, "expression_mRNA_17-Aug-2014.txt"), skip = 11, 10 | header = FALSE, stringsAsFactors = FALSE, 11 | row.names = 1) 12 | x = x[, -1] 13 | cellnames = read.table(paste0(data_dir, "expression_mRNA_17-Aug-2014.txt"), 14 | skip = 7, nrows = 1, row.names = 1, 15 | stringsAsFactors = FALSE) 16 | colnames(x) = cellnames[-1] 17 | 18 | labels = read.table(paste0(data_dir, "expression_mRNA_17-Aug-2014.txt"), 19 | skip = 1, nrows = 1, row.names = 1, 20 | stringsAsFactors = FALSE) 21 | labels = unlist(labels) 22 | table(labels) 23 | 24 | 25 | matching = c("1"="Interneurons", "2"="S1-Pyramidal", "3"="CA1-Pyramidal", 26 | "4"="Oligodendrocytes", "5"="Microglia", "6"="Endothelial", 27 | "7" = "Astrocytes", "8" = "Ependymal", 28 | "9"="Mural") 29 | labels = matching[as.character(labels)] 30 | 31 | saveRDS(x, paste0(data_dir, "rerun/zeisel_raw.rds")) 32 | saveRDS(labels, paste0(data_dir, "rerun/zeisel_label9.rds")) 33 | 34 | 35 | ### level2 classes 36 | labels = read.table(paste0(data_dir, "expression_mRNA_17-Aug-2014.txt"), 37 | skip = 9, nrows = 1, row.names = 1, 38 | stringsAsFactors = FALSE) 39 | labels = unlist(labels) 40 | table(labels) 41 | saveRDS(labels, paste0(data_dir, "rerun/zeisel_label47.rds")) 42 | 43 | 44 | 45 | 46 | write.csv(x, paste0(data_dir, "rerun/zeisel_raw.csv"), quote = FALSE) 47 | 48 | -------------------------------------------------------------------------------- /inst/comparison/2-run-magic.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Calculating MAGIC...\n", 13 | "Calculating graph and diffusion operator...\n", 14 | "Calculating PCA...\n", 15 | "Calculated PCA in 7.59 seconds.\n", 16 | "Calculating KNN search...\n", 17 | "Calculated KNN search in 1.40 seconds.\n", 18 | "Calculating affinities...\n", 19 | "Calculated affinities in 0.12 seconds.\n", 20 | "Calculated graph and diffusion operator in 9.47 seconds.\n", 21 | "Calculating imputation...\n", 22 | "Automatically selected t = 7\n", 23 | "Calculated imputation in 0.17 seconds.\n", 24 | "Calculated MAGIC in 12.30 seconds.\n", 25 | "--- 21.142620086669922 seconds ---\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import magic\n", 31 | "import pandas as pd\n", 32 | "import matplotlib.pyplot as plt\n", 33 | "import time\n", 34 | "start_time = time.time()\n", 35 | "X = pd.read_csv(\"~/rerun/zeisel_raw.csv\",header = 0,index_col=0)\n", 36 | "X = X.transpose()\n", 37 | "magic_operator = magic.MAGIC()\n", 38 | "X_magic = magic_operator.fit_transform(X)\n", 39 | "print(\"--- %s seconds ---\" % (time.time() - start_time))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 20, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "pd.DataFrame.to_csv(X_magic.transpose(), \"~/rerun/zeisel_magic.csv\")" 49 | ] 50 | } 51 | ], 52 | "metadata": { 53 | "kernelspec": { 54 | "display_name": "Python 3", 55 | "language": "python", 56 | "name": "python3" 57 | }, 58 | "language_info": { 59 | "codemirror_mode": { 60 | "name": "ipython", 61 | "version": 3 62 | }, 63 | "file_extension": ".py", 64 | "mimetype": "text/x-python", 65 | "name": "python", 66 | "nbconvert_exporter": "python", 67 | "pygments_lexer": "ipython3", 68 | "version": "3.6.4" 69 | } 70 | }, 71 | 
"nbformat": 4, 72 | "nbformat_minor": 2 73 | } 74 | -------------------------------------------------------------------------------- /inst/comparison/.ipynb_checkpoints/2-run-magic-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Calculating MAGIC...\n", 13 | "Calculating graph and diffusion operator...\n", 14 | "Calculating PCA...\n", 15 | "Calculated PCA in 7.59 seconds.\n", 16 | "Calculating KNN search...\n", 17 | "Calculated KNN search in 1.40 seconds.\n", 18 | "Calculating affinities...\n", 19 | "Calculated affinities in 0.12 seconds.\n", 20 | "Calculated graph and diffusion operator in 9.47 seconds.\n", 21 | "Calculating imputation...\n", 22 | "Automatically selected t = 7\n", 23 | "Calculated imputation in 0.17 seconds.\n", 24 | "Calculated MAGIC in 12.30 seconds.\n", 25 | "--- 21.142620086669922 seconds ---\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import magic\n", 31 | "import pandas as pd\n", 32 | "import matplotlib.pyplot as plt\n", 33 | "import time\n", 34 | "start_time = time.time()\n", 35 | "X = pd.read_csv(\"~/rerun/zeisel_raw.csv\",header = 0,index_col=0)\n", 36 | "X = X.transpose()\n", 37 | "magic_operator = magic.MAGIC()\n", 38 | "X_magic = magic_operator.fit_transform(X)\n", 39 | "print(\"--- %s seconds ---\" % (time.time() - start_time))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 20, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "pd.DataFrame.to_csv(X_magic.transpose(), \"~/rerun/zeisel_magic.csv\")" 49 | ] 50 | } 51 | ], 52 | "metadata": { 53 | "kernelspec": { 54 | "display_name": "Python 3", 55 | "language": "python", 56 | "name": "python3" 57 | }, 58 | "language_info": { 59 | "codemirror_mode": { 60 | "name": "ipython", 61 | "version": 3 62 | }, 63 | 
"file_extension": ".py", 64 | "mimetype": "text/x-python", 65 | "name": "python", 66 | "nbconvert_exporter": "python", 67 | "pygments_lexer": "ipython3", 68 | "version": "3.6.4" 69 | } 70 | }, 71 | "nbformat": 4, 72 | "nbformat_minor": 2 73 | } 74 | -------------------------------------------------------------------------------- /inst/comparison/4-plot-syn.R: -------------------------------------------------------------------------------- 1 | library(Rtsne) 2 | library(ggplot2) 3 | library(parallel) 4 | library(ClusterR) 5 | library(tidyr) 6 | library(dplyr) 7 | library(gridExtra) 8 | library(grid) 9 | #library(kernlab) 10 | 11 | plot_dir = "./plots/" 12 | data_dir = "./rerun/" 13 | 14 | 15 | ### data in Huang et al. 16 | ### downloaded from https://www.dropbox.com/sh/ri6fa3mbhvgapqk/AADwOzHfiCcLSqYnX9CTyd7_a?dl=0 17 | 18 | ### labels 19 | labels = readRDS(paste0(data_dir, "rerun/zeisel_label9.rds")) 20 | labels_samp = labels[match(colnames(count_samp), colnames(count_raw))] 21 | 22 | info = readRDS(paste0(data_dir, "rerun/samp/fig2d_tsne.rds")) 23 | ident = info[[2]] 24 | labels_Huang = ident[[4]][[1]] 25 | write.table(table(labels_samp, labels_Huang), file = paste0(data_dir, "rerun/samp/label-matrix.txt")) 26 | 27 | 28 | ##################################################### 29 | ### tSNE 30 | 31 | methods = c("syn", "scImpute", "MAGIC", "SAVER") 32 | name_appends = c("samp", "samp_scimpute", "samp_magic", "samp_saver") 33 | names(name_appends) = methods 34 | 35 | for(method in methods){ 36 | set.seed(1234) 37 | print(method) 38 | dim = 2 39 | count_raw = readRDS(paste0(data_dir, "rerun/samp/zeisel_", name_appends[method], ".rds")) 40 | if(method == "SAVER") count_raw = count_raw$estimate 41 | count = log10(count_raw + 1) 42 | tsne = Rtsne(t(count), dims = dim)$Y 43 | saveRDS(tsne, file = paste0(data_dir, "rerun/samp/zeisel-", method, "-tsne", dim, ".rds")) 44 | gc() 45 | } 46 | 47 | 48 | ### tSNE 49 | data = lapply(methods, function(method){ 50 | tsne = 
readRDS(file = paste0(data_dir, "rerun/samp/zeisel-", method, 51 | "-tsne2", ".rds")) 52 | pdata = data.frame(tSNE1 = tsne[,1], tSNE2 = tsne[,2], type = labels_samp) 53 | pdata$method = method 54 | return(pdata) 55 | }) 56 | data = Reduce(rbind, data) 57 | data$method = factor(data$method, levels = c("syn", "scImpute", "MAGIC", "SAVER")) 58 | gt = ggplot(data, aes(x = tSNE1, y = tSNE2, color = type)) + 59 | geom_point(alpha = 0.8, cex = 0.8) + facet_wrap(~method, nrow = 1) + 60 | theme_bw() + 61 | theme(strip.background = element_blank(), 62 | legend.position = "bottom", 63 | text = element_text(size=12)) 64 | ggsave(paste0(plot_dir,"Fig-samp.pdf"), gt, width = 11, height = 4) 65 | 66 | -------------------------------------------------------------------------------- /R/get_mix_parameters.R: -------------------------------------------------------------------------------- 1 | ### root-finding equation 2 | fn = function(alpha, target){ 3 | log(alpha) - digamma(alpha) - target 4 | } 5 | 6 | ### update parameters in gamma distribution 7 | update_gmm_pars = function(x, wt){ 8 | tp_s = sum(wt) 9 | tp_t = sum(wt * x) 10 | tp_u = sum(wt * log(x)) 11 | tp_v = -tp_u / tp_s - log(tp_s / tp_t) 12 | if (tp_v <= 0){ 13 | alpha = 20 14 | }else{ 15 | alpha0 = (3 - tp_v + sqrt((tp_v - 3)^2 + 24 * tp_v)) / 12 / tp_v 16 | if (alpha0 >= 20){alpha = 20 17 | }else{ 18 | alpha = uniroot(fn, c(0.9, 1.1) * alpha0, target = tp_v, 19 | extendInt = "yes")$root 20 | } 21 | } 22 | ## need to solve log(x) - digamma(x) = tp_v 23 | ## We use this approximation to compute the initial value 24 | beta = tp_s / tp_t * alpha 25 | return(c(alpha, beta)) 26 | } 27 | 28 | ### estimate parameters in the mixture distribution 29 | get_mix = function(xdata, point){ 30 | inits = rep(0, 5) 31 | inits[1] = sum(xdata == point)/length(xdata) 32 | if (inits[1] == 0) {inits[1] = 0.01} 33 | inits[2:3] = c(0.5, 1) 34 | xdata_rm = xdata[xdata > point] 35 | inits[4:5] = c(mean(xdata_rm), sd(xdata_rm)) 36 | if 
(is.na(inits[5])) {inits[5] = 0} 37 | paramt = inits 38 | eps = 10 39 | iter = 0 40 | loglik_old = 0 41 | 42 | while(eps > 0.5) { 43 | wt = calculate_weight(xdata, paramt) 44 | paramt[1] = sum(wt[, 1])/nrow(wt) 45 | paramt[4] = sum(wt[, 2] * xdata)/sum(wt[, 2]) 46 | paramt[5] = sqrt(sum(wt[, 2] * (xdata - paramt[4])^2)/sum(wt[, 2])) 47 | paramt[2:3] = update_gmm_pars(x=xdata, wt=wt[,1]) 48 | 49 | loglik = sum(log10(dmix(xdata, paramt))) 50 | eps = (loglik - loglik_old)^2 51 | loglik_old = loglik 52 | iter = iter + 1 53 | if (iter > 100) 54 | break 55 | } 56 | return(paramt) 57 | } 58 | 59 | ### fit the mixture with get_mix() for every row of count (in parallel) and save 60 | ### the estimates to path as a matrix with columns rate, alpha, beta, mu, sigma; 61 | ### rows whose sum is within 1e-10 of point * ncol(count), or whose fit fails, are NA 62 | get_mix_parameters <- 63 | function (count, point = log10(1.01), path, ncores = 8) 64 | { 65 | count = as.matrix(count) 66 | null_genes = which(abs(rowSums(count) - point * ncol(count)) < 1e-10) 67 | parslist = mclapply(seq_len(nrow(count)), function(ii) { 68 | if (ii %% 2000 == 0) { 69 | gc() 70 | print(ii) 71 | } 72 | if (ii %in% null_genes) { 73 | return(rep(NA, 5)) 74 | } 75 | xdata = count[ii, ] 76 | paramt = try(get_mix(xdata, point), silent = TRUE) 77 | if (inherits(paramt, "try-error")){ 78 | paramt = rep(NA, 5) 79 | } 80 | return(paramt) 81 | }, mc.cores = ncores) 82 | save(parslist, file = path) 83 | ## do.call binds all rows in one call and returns a matrix even for a single gene 84 | parslist = do.call(rbind, parslist) 85 | colnames(parslist) = c("rate", "alpha", "beta", "mu", "sigma") 86 | saveRDS(parslist, file = path) 87 | return(0) 88 | } 89 | -------------------------------------------------------------------------------- /man/scimpute.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scimpute.R 3 | \name{scimpute} 4 | \alias{scimpute} 5 | \title{use scImpute to impute dropout values in scRNA-seq data} 6 | \usage{ 7 | scimpute(count_path, infile = "csv", outfile = "csv", type = "count", 8 | out_dir, labeled = FALSE, drop_thre = 0.5, Kcluster = NULL, 9 | labels = NULL, genelen = NULL, ncores = 5) 10 | } 11 | \arguments{ 12 | 
\item{count_path}{A character specifying the full path of the raw count matrix;} 13 | 14 | \item{infile}{A character specifying the type of file storing the raw count matrix; 15 | can be "csv", "txt", or "rds". The input file should have rows representing genes and 16 | columns representing cells, with its first row as cell names 17 | and first column as gene names.} 18 | 19 | \item{outfile}{A character specifying the type of file storing the imputed count matrix; 20 | can be "csv", "txt", or "rds".} 21 | 22 | \item{type}{A character specifying the type of values in the expression matrix. 23 | Can be "count" (default) or "TPM".} 24 | 25 | \item{out_dir}{A character specifying the full path of the output directory, 26 | which is used to store all intermediate and final outputs.} 27 | 28 | \item{labeled}{A logical value indicating whether cell type information is available. 29 | \code{labels} must be specified if \code{labeled = TRUE}.} 30 | 31 | \item{drop_thre}{A number between 0 and 1, 32 | specifying the threshold to determine dropout values.} 33 | 34 | \item{Kcluster}{An integer specifying the number of cell subpopulations. 35 | This parameter can be determined based on prior knowledge or clustering of raw data. 36 | \code{Kcluster} is used to determine the candidate neighbors of each cell.} 37 | 38 | \item{labels}{A character vector specifying the cell type of 39 | each column in the raw count matrix. Only needed when \code{labeled = TRUE}. 40 | Each cell type should have at least two cells for imputation.} 41 | 42 | \item{genelen}{An integer vector giving the length of each gene. 43 | Order must match the gene orders in the expression matrix. 44 | \code{genelen} must be specified if \code{type = "TPM"}.} 45 | 46 | \item{ncores}{An integer specifying the number of cores used for parallel computation.} 47 | } 48 | \value{ 49 | scImpute returns a vector giving the column indices of outlier cells. 
50 | It saves the imputed count matrix to scimpute_count.csv, scimpute_count.txt, or scimpute_count.rds 51 | (depending on \code{outfile}) to \code{out_dir}. 52 | } 53 | \description{ 54 | use scImpute to impute dropout values in scRNA-seq data 55 | } 56 | \references{ 57 | Li, W. V., & Li, J. J. (2018). An accurate and robust imputation method 58 | scImpute for single-cell RNA-seq data. \emph{Nature Communications}, 9(1), 997. 59 | } 60 | \author{ 61 | Wei Vivian Li, \email{liw@ucla.edu} 62 | 63 | Jingyi Jessica Li, \email{jli@stat.ucla.edu} 64 | } 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | scImpute: accurate and robust imputation of scRNA-seq data 2 | ================ 3 | Wei Vivian Li, Jingyi Jessica Li 4 | 2019-08-20 5 | 6 | 7 | Latest News 8 | ----------- 9 | 10 | > 2019/08/20: 11 | 12 | - Since the development of scImpute, new imputation methods have been proposed for scRNA-seq data. These methods have different model assumptions and diverse performances on different datasets. It contributes to both method development and bioinformatic applications to discuss and compare existing imputation methods. However, we realize several issues in existing evaluation and comparison of imputation methods and discuss these issues in our commentary, which is available at [arxiv](https://arxiv.org/abs/1908.07084). 13 | 14 | > 2018/08/15: 15 | 16 | - Version 0.0.9 is released! 17 | - More robust implementation of dimension reduction. 18 | - Faster calculation of cell similarity. 19 | 20 | Introduction 21 | ------------ 22 | 23 | `scImpute` is developed to accurately and robustly impute the dropout values in scRNA-seq data. 
`scImpute` can be applied to raw read count matrix before the users perform downstream analyses such as 24 | 25 | - dimension reduction of scRNA-seq data 26 | - normalization of scRNA-seq data 27 | - clustering of cell populations 28 | - differential gene expression analysis 29 | - time-series analysis of gene expression dynamics 30 | 31 | The users can refer to our paper [An accurate and robust imputation method scImpute for single-cell RNA-seq data](https://www.nature.com/articles/s41467-018-03405-7) for a detailed description of the modeling and applications. 32 | 33 | Any suggestions on the package are welcome! For technical problems, please report to [Issues](https://github.com/Vivianstats/scImpute/issues). For suggestions and comments on the method, please contact Wei () or Dr. Jessica Li (). 34 | 35 | Installation 36 | ------------ 37 | 38 | The package is not on CRAN yet. For installation please use the following codes in `R` 39 | 40 | ``` r 41 | install.packages("devtools") 42 | library(devtools) 43 | 44 | install_github("Vivianstats/scImpute") 45 | ``` 46 | 47 | Quick start 48 | ----------- 49 | 50 | `scImpute` can be easily incorporated into existing pipeline of scRNA-seq analysis. Its only input is the raw count matrix with rows representing genes and columns representing cells. It will output an imputed count matrix with the same dimension. 
In the simplest case, the imputation task can be done with one single function `scimpute`: 51 | 52 | ``` r 53 | scimpute(# full path to raw count matrix 54 | count_path = system.file("extdata", "raw_count.csv", package = "scImpute"), 55 | infile = "csv", # format of input file 56 | outfile = "csv", # format of output file 57 | out_dir = "./", # full path to output directory 58 | labeled = FALSE, # cell type labels not available 59 | drop_thre = 0.5, # threshold set on dropout probability 60 | Kcluster = 2, # 2 cell subpopulations 61 | ncores = 10) # number of cores used in parallel computation 62 | ``` 63 | 64 | This function returns the column indices of outlier cells, and creates a new file `scimpute_count.csv` in `out_dir` to store the imputed count matrix. Please note that we recommend applying scImpute on the whole-genome count matrix. A filtering step on genes is acceptable but most genes should be present to ensure robust identification of dropouts. 65 | 66 | For detailed usage, please refer to the package [manual](https://github.com/Vivianstats/scImpute/blob/master/inst/docs/) or [vignette](https://github.com/Vivianstats/scImpute/blob/master/vignettes/scImpute-vignette.Rmd). 67 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "scImpute: accurate and robust imputation of scRNA-seq data" 3 | author: "Wei Vivian Li, Jingyi Jessica Li" 4 | 5 | date: "`r Sys.Date()`" 6 | output: github_document 7 | --- 8 | 9 | 10 | 11 | ```{r, echo = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = "#>", 15 | fig.path = "README-" 16 | ) 17 | ``` 18 | 19 | ## Latest News 20 | 21 | > 2019/08/20: 22 | 23 | - Since the development of scImpute, new imputation methods have been proposed for scRNA-seq data. These methods have different model assumptions and diverse performances on different datasets. 
It contributes to both method development and bioinformatic applications to discuss and compare existing imputation methods. However, we realize several issues in existing evaluation and comparison of imputation methods and discuss these issues in our commentary, which is available at [arxiv](https://arxiv.org/abs/1908.07084). 24 | 25 | > 2018/08/15: 26 | 27 | - Version 0.0.9 is released! 28 | - More robust implementation of dimension reduction. 29 | - Faster calculation of cell similarity. 30 | 31 | ## Introduction 32 | `scImpute` is developed to accurately and robustly impute the dropout values in scRNA-seq data. `scImpute` can be applied to raw read count matrix before the users perform downstream analyses such as 33 | 34 | - dimension reduction of scRNA-seq data 35 | - normalization of scRNA-seq data 36 | - clustering of cell populations 37 | - differential gene expression analysis 38 | - time-series analysis of gene expression dynamics 39 | 40 | The users can refer to our paper [An accurate and robust imputation method scImpute for single-cell RNA-seq data](https://www.nature.com/articles/s41467-018-03405-7) for a detailed description of the modeling and applications. 41 | 42 | Any suggestions on the package are welcome! For technical problems, please report to [Issues](https://github.com/Vivianstats/scImpute/issues). For suggestions and comments on the method, please contact Wei () or Dr. Jessica Li (). 43 | 44 | ## Installation 45 | The package is not on CRAN yet. For installation please use the following codes in `R` 46 | ```{r eval = FALSE} 47 | install.packages("devtools") 48 | library(devtools) 49 | 50 | install_github("Vivianstats/scImpute") 51 | ``` 52 | 53 | ## Quick start 54 | 55 | `scImpute` can be easily incorporated into existing pipeline of scRNA-seq analysis. 56 | Its only input is the raw count matrix with rows representing genes and columns representing cells. It will output an imputed count matrix with the same dimension. 
57 | In the simplest case, the imputation task can be done with one single function `scimpute`: 58 | ```{r eval = FALSE} 59 | scimpute(# full path to raw count matrix 60 | count_path = system.file("extdata", "raw_count.csv", package = "scImpute"), 61 | infile = "csv", # format of input file 62 | outfile = "csv", # format of output file 63 | out_dir = "./", # full path to output directory 64 | labeled = FALSE, # cell type labels not available 65 | drop_thre = 0.5, # threshold set on dropout probability 66 | Kcluster = 2, # 2 cell subpopulations 67 | ncores = 10) # number of cores used in parallel computation 68 | ``` 69 | This function returns the column indices of outlier cells, and creates a new file `scimpute_count.csv` in `out_dir` to store the imputed count matrix. Please note that we recommend applying scImpute on the whole-genome count matrix. A filtering step on genes is acceptable but most genes should be present to ensure robust identification of dropouts. 70 | 71 | For detailed usage, please refer to the package [manual](https://github.com/Vivianstats/scImpute/blob/master/inst/docs/) or [vignette](https://github.com/Vivianstats/scImpute/blob/master/vignettes/scImpute-vignette.Rmd). 72 | 73 | 74 | -------------------------------------------------------------------------------- /R/scimpute.R: -------------------------------------------------------------------------------- 1 | #' use scImpute to impute dropout values in scRNA-seq data 2 | #' 3 | #' @param count_path A character specifying the full path of the raw count matrix; 4 | #' @param infile A character specifying the type of file storing the raw count matrix; 5 | #' can be "csv", "txt", or "rds". The input file should have rows representing genes and 6 | #' columns representing cells, with its first row as cell names 7 | #' and first column as gene names. 8 | #' @param outfile A character specifying the type of file storing the imputed count matrix; 9 | #' can be "csv", "txt", or "rds". 
10 | #' @param out_dir A character specifying the full path of the output directory, 11 | #' which is used to store all intermediate and final outputs. 12 | #' @param type A character specifying the type of values in the expression matrix. 13 | #' Can be "count" (default) or "TPM". 14 | #' @param labeled A logical value indicating whether cell type information is available. 15 | #' \code{labels} must be specified if \code{labeled = TRUE}. 16 | #' @param genelen An integer vector giving the length of each gene. 17 | #' Order must match the gene orders in the expression matrix. 18 | #' \code{genelen} must be specified if \code{type = "TPM"}. 19 | #' @param drop_thre A number between 0 and 1, 20 | #' specifying the threshold to determine dropout values. 21 | #' @param Kcluster An integer specifying the number of cell subpopulations. 22 | #' This parameter can be determined based on prior knowledge or clustering of raw data. 23 | #' \code{Kcluster} is used to determine the candidate neighbors of each cell. 24 | #' @param labels A character vector specifying the cell type of 25 | #' each column in the raw count matrix. Only needed when \code{labeled = TRUE}. 26 | #' Each cell type should have at least two cells for imputation. 27 | #' @param ncores An integer specifying the number of cores used for parallel computation. 28 | #' @return scImpute returns a vector giving the column indices of outlier cells. 29 | #' It saves the imputed count matrix to scimpute_count.csv, scimpute_count.txt, or scimpute_count.rds 30 | #' (depending on \code{outfile}) to \code{out_dir}. 
31 | #' @export 32 | #' @import parallel 33 | #' @import doParallel 34 | #' @import foreach 35 | #' @importFrom stats complete.cases dgamma dnorm prcomp quantile rgamma rnorm sd uniroot 36 | #' @importFrom kernlab specc 37 | #' @import penalized 38 | #' @importFrom utils read.csv read.table write.csv write.table 39 | #' @importFrom rsvd rpca 40 | #' @author Wei Vivian Li, \email{liw@ucla.edu} 41 | #' @author Jingyi Jessica Li, \email{jli@stat.ucla.edu} 42 | #' @references Li, W. V., & Li, J. J. (2018). An accurate and robust imputation method 43 | #' scImpute for single-cell RNA-seq data. \emph{Nature Communications}, 9(1), 997. 44 | scimpute <- 45 | function (count_path, infile = "csv", outfile = "csv", type = "count", out_dir, labeled = FALSE, 46 | drop_thre = 0.5, Kcluster = NULL, labels = NULL, genelen = NULL, ncores = 5) 47 | { 48 | if(labeled == TRUE && is.null(labels)){ 49 | stop("'labels' must be specified when 'labeled = TRUE'!") 50 | } 51 | if(labeled == FALSE && is.null(Kcluster)){ 52 | stop("'Kcluster' must be specified when 'labeled = FALSE'!") 53 | } 54 | if(!(type %in% c("count", "TPM"))){ stop("expression values can be either 'count' or 'TPM'!") } 55 | if(type == "TPM" && is.null(genelen)){ stop("'genelen' must be specified when type = 'TPM'!") } 56 | 57 | # print(drop_thre) 58 | print("reading in raw count matrix ...") 59 | dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) # out_dir may already exist; do not warn in that case 60 | count_lnorm = read_count(filetype = infile, path = count_path, out_dir = out_dir, 61 | type = type, genelen = genelen) 62 | print("reading finished!") 63 | 64 | if(labeled == TRUE){ 65 | if(length(labels) != ncol(count_lnorm)){ 66 | stop("number of cells does not match number of labels !") 67 | } 68 | } 69 | genenames = rownames(count_lnorm) 70 | cellnames = colnames(count_lnorm) 71 | 72 | print("imputation starts ...") 73 | if (labeled == FALSE){ 74 | res_imp = imputation_model8(count = count_lnorm, labeled = FALSE, 75 | point = 
log10(1.01), drop_thre = drop_thre, 76 | Kcluster = 
Kcluster, 77 | out_dir = out_dir, ncores = ncores) 78 | }else{ 79 | res_imp = imputation_wlabel_model8(count = count_lnorm, labeled = TRUE, 80 | cell_labels = labels, point = log10(1.01), 81 | drop_thre = drop_thre, 82 | Kcluster = NULL, out_dir = out_dir, 83 | ncores = ncores) 84 | } 85 | count_imp = res_imp$count_imp 86 | outliers = res_imp$outlier 87 | count_imp = 10^count_imp - 1.01 88 | rownames(count_imp) = genenames 89 | colnames(count_imp) = cellnames 90 | print("writing imputed count matrix ...") 91 | write_count(count_imp, filetype = outfile, out_dir, type = type, genelen = genelen) 92 | return(outliers) 93 | } 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /inst/comparison/4-plot.R: -------------------------------------------------------------------------------- 1 | library(Rtsne) 2 | library(ggplot2) 3 | library(parallel) 4 | library(ClusterR) 5 | library(tidyr) 6 | library(dplyr) 7 | library(gridExtra) 8 | library(grid) 9 | #library(kernlab) 10 | ###source("~/Dropbox/Rpkgs-dev/scimpute_dev/diagnosis/SAVER-paper/v2/comparison/supp.R") 11 | 12 | plot_dir = "./plots/" 13 | data_dir = "./rerun/" 14 | 15 | 16 | ##################################################### 17 | ### data characteristics 18 | count_raw = readRDS(paste0(data_dir, "rerun/zeisel_raw.rds")) 19 | ### data in Huang et al. 
20 | ### downloaded from https://www.dropbox.com/sh/ri6fa3mbhvgapqk/AADwOzHfiCcLSqYnX9CTyd7_a?dl=0 21 | count_samp = readRDS(paste0(data_dir, "zeisel_samp.rds")) 22 | # per-gene log10(mean + 1), log10(sd + 1) and zero fraction, computed for count_raw and count_samp 23 | datas =c("raw", "SAVER-paper") 24 | summary = lapply(1:2, function(i){ 25 | if(i == 1){count = count_raw} 26 | if(i == 2){count = count_samp} 27 | mean = log10(rowMeans(count)+1) 28 | sd = log10(apply(count, 1, sd)+1) 29 | zero = rowSums(count == 0)/ncol(count) 30 | da = data.frame(mean, sd, zero) 31 | da$data = datas[i] 32 | return(da) 33 | }) 34 | summary = Reduce(rbind, summary) 35 | g1 = ggplot(summary, aes(x = mean, y = sd, color = data)) + 36 | geom_point(alpha = 0.6, cex = 0.2) + 37 | xlab("log10(mean + 1)") + ylab("log10(sd + 1)") + 38 | scale_color_manual(values = c("#999999", "#CC0C00")) + 39 | # scale_y_continuous(labels = scaleFUN) + 40 | theme_bw() + 41 | theme(legend.position = "none", 42 | text = element_text(size=14)) 43 | g2 = ggplot(summary, aes(x = mean, y = zero, color = data)) + 44 | geom_point(alpha = 0.6, cex = 0.2) + 45 | xlab("log10(mean + 1)") + ylab("zero fraction") + 46 | scale_color_manual(values = c("#999999", "#CC0C00")) + 47 | theme_bw() + 48 | theme(legend.position = "none", 49 | text = element_text(size=14)) 50 | g = arrangeGrob(g1,g2,nrow = 2) 51 | ggsave(paste0(plot_dir, "Fig2a.pdf"), g, width = 3, height = 5) 52 | # density plots of the same three per-gene statistics (g3, g4, g5) 53 | g3 = ggplot(summary, aes(x = mean, fill = data)) + 54 | geom_density(alpha = 0.6) + xlim(c(0,1.5)) + 55 | xlab("log10(mean + 1)") + 56 | scale_fill_manual(values = c("#999999", "#CC0C00")) + 57 | theme_bw() + 58 | theme(legend.position = "none", 59 | text = element_text(size=14)) 60 | g4 = ggplot(summary, aes(x = sd, fill = data)) + 61 | geom_density(alpha = 0.6) + xlim(c(0,1.5)) + 62 | xlab("log10(sd + 1)") + 63 | scale_fill_manual(values = c("#999999", "#CC0C00")) + 64 | theme_bw() + 65 | 
theme(legend.position = "none", 66 | text = element_text(size=14)) 67 | g5 = ggplot(summary, aes(x = zero, fill = data)) + 68 | geom_density(alpha = 0.6) + 
69 | xlab("zero fraction") + 70 | scale_fill_manual(values = c("#999999", "#CC0C00")) + 71 | theme_bw() + 72 | theme(legend.position = "none", 73 | text = element_text(size=14)) 74 | gg = arrangeGrob(g3,g4,g5,nrow = 3) 75 | ggsave(paste0(plot_dir, "Fig2b.pdf"), gg, width = 3, height = 5) 76 | 77 | 78 | 79 | ##################################################### 80 | ### tSNE 81 | 82 | methods = c("Raw", "scImpute", "MAGIC", "SAVER") 83 | name_appends = c("raw", "scimpute_k9", "magic", "saver") 84 | names(name_appends) = methods 85 | labels = readRDS(paste0(data_dir, "rerun/zeisel_label9.rds")) 86 | # the commented-out loop below computes the 2-D tSNE of log10(count + 1) per method and saves it as zeisel-<method>-tsne2.rds 87 | # for(method in methods){ 88 | # set.seed(1234) 89 | # print(method) 90 | # dim = 2 91 | # count_raw = readRDS(paste0(data_dir, "rerun/zeisel_", name_appends[method], ".rds")) 92 | # if(method == "SAVER") count_raw = count_raw$estimate 93 | # count = log10(count_raw + 1) 94 | # tsne = Rtsne(t(count), dims = dim)$Y 95 | # saveRDS(tsne, file = paste0(data_dir, "rerun/zeisel-", method, "-tsne", dim, ".rds")) 96 | # gc() 97 | # } 98 | 99 | 100 | ### tSNE scatter plots from the saved embeddings, one facet per method, colored by `labels` 101 | data = lapply(methods, function(method){ 102 | tsne = readRDS(file = paste0(data_dir, "rerun/zeisel-", method, 103 | "-tsne2", ".rds")) 104 | pdata = data.frame(tSNE1 = tsne[,1], tSNE2 = tsne[,2], type = labels) 105 | pdata$method = method 106 | return(pdata) 107 | }) 108 | data = Reduce(rbind, data) 109 | data$method = factor(data$method, levels = c("Raw", "scImpute", "MAGIC", "SAVER")) 110 | gt = ggplot(data, aes(x = tSNE1, y = tSNE2, color = type)) + 111 | geom_point(alpha = 0.8, cex = 0.8) + facet_wrap(~method, nrow = 1) + 112 | theme_bw() + 113 | theme(strip.background = element_blank(), 114 | legend.position = "bottom", 115 | text = element_text(size=12)) 116 | ggsave(paste0(plot_dir,"Fig2e.pdf"), gt, width = 11, height = 4) 
117 | 118 | 119 | 120 | ########################################################## 121 | ### clustering: hclust on the first `dim` columns of each method's saved PCA matrix, cut into kk clusters and scored against `labels` 122 | # B = number of bootstrap resamples, J = number of cells drawn (with replacement) per resample 123 | B = 100 124 | J = 3005 125 | 126 | for(kk in c(9,47)){ 127 | print(kk) 
128 | dim = 10 129 | temp_res = mclapply(1:B, function(b){ 130 | set.seed(b) 131 | if(b %% 20 == 0) print(b) 132 | ind = sample(1:J, J, replace = TRUE) 133 | val = sapply(methods, function(method){ 134 | mat = readRDS(paste0(data_dir, "rerun/zeisel-", method, "-pca.rds")) 135 | mat = mat[ind, 1:dim] 136 | truel = labels[ind] 137 | clusts = hclust(dist(mat), method = "median") 138 | Clabel = cutree(clusts, kk) 139 | Clabel = as.numeric(Clabel) 140 | v = sapply(c("jaccard_index", "adjusted_rand_index", "purity", "nmi"), function(x){ 141 | external_validation(as.numeric(factor(truel)), Clabel, method = x) 142 | }) 143 | return(v) 144 | }) 145 | gc() 146 | mat = as.data.frame(val) 147 | colnames(mat) = methods 148 | mat$measure = c("Jaccard index", "adjusted Rand index", "purity", "nmi") 149 | mat = mat %>% gather(metric, value, -measure) 150 | return(mat) 151 | }, mc.cores = 36) 152 | da = Reduce(rbind, temp_res) 153 | # `da` stacks the B bootstrap replicates; their sd per (measure, method) is stored in mat$sd further down 154 | # same four scores computed once on all cells (no resampling) 155 | val = sapply(methods, function(method){ 156 | mat = readRDS(paste0(data_dir, "rerun/zeisel-", method, "-pca.rds")) 157 | mat = mat[, 1:dim] 158 | clusts = hclust(dist(mat), method = "median") 159 | Clabel = cutree(clusts, kk) 160 | Clabel = as.numeric(Clabel) 161 | v = sapply(c("jaccard_index", "adjusted_rand_index", "purity", "nmi"), function(x){ 162 | external_validation(as.numeric(factor(labels)), Clabel, method = x) 163 | }) 164 | return(v) 165 | }) 166 | mat = as.data.frame(val) 167 | colnames(mat) = methods 168 | mat$measure = c("Jaccard index", "adjusted Rand index", "purity", "nmi") 169 | mat = mat %>% gather(metric, value, -measure) 170 | sd = sapply(1:nrow(mat), function(i){ 171 | val = filter(da, measure == mat[i,"measure"] & metric == mat[i,"metric"]) 172 | return(sd(val$value)) 173 | }) 174 | mat$sd = sd 175 | 176 | mat$metric = factor(mat$metric, levels = c("Raw", "scImpute", 
"MAGIC", "SAVER")) 177 | gc = ggplot(mat, aes(x = metric, y = value, fill = metric)) + 178 | geom_bar(stat = "identity", width = .7) + 179 | 
facet_wrap(~measure, nrow = 4, scales = "free") + 180 | geom_errorbar(aes(ymin = value - sd, ymax = value+sd), width = .2) + 181 | theme_bw() + ylab("") + ylim(0,NA) + 182 | theme(strip.background = element_rect(colour="white", fill="white"), 183 | text = element_text(size=12), 184 | axis.text.x=element_text(size=8), 185 | axis.ticks.x=element_blank(), 186 | legend.position = "none") + 187 | scale_fill_manual(values = c("#999999", "#56B4E9", "#CC79A7", "#E69F00")) 188 | ggsave(paste0(plot_dir, "Fig2c-kk", kk, ".pdf"), gc, width = 2.5, height = 5) 189 | } 190 | 191 | 192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /vignettes/scImpute-vignette.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to scImpute" 3 | author: "Wei Vivian Li, Jingyi Jessica Li" 4 | # author: 5 | # - name: Wei Vivian Li, Jingyi Jessica Li 6 | # affiliation: 7 | # - Department of Statistics, University of California, Los Angeles 8 | date: "`r Sys.Date()`" 9 | output: rmarkdown::html_vignette 10 | #output: pdf_document 11 | vignette: > 12 | %\VignetteIndexEntry{scImpute: accurate and robust imputation for scRNA-seq data} 13 | %\VignetteEngine{knitr::rmarkdown} 14 | %\VignetteEncoding{UTF-8} 15 | --- 16 | 17 | 18 | The emerging single cell RNA sequencing (scRNA-seq) technologies enable the investigation of transcriptomic landscape at single-cell resolution. However, scRNA-seq analysis is complicated by the excess of zero or near zero counts in the data, which are the so-called dropouts due to low amounts of mRNA within each individual cell. Consequently, downstream analysis of scRNA-seq data would be severely biased if the dropout events are not properly corrected. `scImpute` is developed to accurately and efficiently impute the dropout values in scRNA-seq data. 
19 | 20 | `scImpute` can be applied to raw count data before the users perform downstream analyses such as 21 | 22 | - dimension reduction of scRNA-seq data 23 | - normalization of scRNA-seq data 24 | - clustering of cell populations 25 | - differential gene expression analysis 26 | - time-series analysis of gene expression dynamics 27 | 28 | 29 | ## Quick start 30 | 31 | `scImpute` can be easily incorporated into existing pipeline of scRNA-seq analysis. 32 | Its only input is the raw count matrix with rows representing genes and columns representing cells. It will output an imputed count matrix with the same dimension. 33 | In the simplest case, the imputation task can be done with one single function `scimpute`: 34 | ```{r eval = FALSE} 35 | scimpute(# full path to raw count matrix 36 | count_path = system.file("extdata", "raw_count.csv", package = "scImpute"), 37 | infile = "csv", # format of input file 38 | outfile = "csv", # format of output file 39 | out_dir = "./", # full path to output directory 40 | labeled = FALSE, # cell type labels not available 41 | drop_thre = 0.5, # threshold set on dropout probability 42 | Kcluster = 2, # 2 cell subpopulations 43 | ncores = 10) # number of cores used in parallel computation 44 | ``` 45 | This function returns the column indices of outlier cells, and creates a new file `scimpute_count.csv` in `out_dir` to store the imputed count matrix. 46 | 47 | 48 | ## Step-by-step description 49 | 50 | The input file can be a `.csv` file, `.txt` file, or `.rds` file. In all cases, the **first column** should give the gene names and the **first row** should give the cell names. We use the example files in the package as illustration. 
If the raw counts are stored in a `.csv` file, and we also hope to output the imputed matrix into a `.csv` file, then specify this information with 51 | ```{r eval = FALSE} 52 | # full path of the input file 53 | count_path = system.file("extdata", "raw_count.csv", package = "scImpute") 54 | infile = "csv" 55 | outfile = "csv" 56 | ``` 57 | Similarly, if the raw counts are stored in a `.txt` file, and we also hope to output the imputed matrix into a `.txt` file, then specify this information with 58 | ```{r eval = FALSE} 59 | # full path of the input file 60 | count_path = system.file("extdata", "raw_count.txt", package = "scImpute") 61 | infile = "txt" 62 | outfile = "txt" 63 | ``` 64 | Next, we need to set up the directory to store all the temporary and final outputs: 65 | ```{r eval = FALSE} 66 | # a '/' sign is necessary at the end of the path 67 | out_dir = "~/output/" 68 | ``` 69 | 70 | 71 | We highly recommend using parallel computing with `scImpute`, which will significantly reduce the computation time. Suppose we would like to use 20 cores, then we can run the `scimpute` function with `ncores = 20`. 72 | 73 | `scImpute` has two statistical parameters. 74 | The **first parameter is `Kcluster`**, which determines the **number of initial clusters** to help identify candidate neighbors of each cell. The imputation results do not heavily rely on the choice of `Kcluster`, since `scImpute` uses a model-based method to select similar cells in a later stage. `Kcluster` can be specified based on the number of known cell types and users' biological expertise, and it may also be learned by clustering the raw data and inspecting the clustering results. 75 | The **second parameter** is `drop_thre`. Only the values that have **dropout probability** larger than `drop_thre` are imputed by `scImpute`. A default threshold `drop_thre = 0.5` is sufficient for most scRNA-seq data. 
76 | 77 | Now to get the imputed matrix, all we need is the main `scimpute` function 78 | ```{r eval = FALSE} 79 | Kcluster = 2 80 | drop_thre = 0.5 81 | ncores = 10 82 | scimpute(count_path, infile = infile, outfile = outfile, out_dir = out_dir, labeled = FALSE, drop_thre = drop_thre, Kcluster = Kcluster, ncores = ncores) 83 | ``` 84 | If `outfile = "csv"`, this function will create a new file `scimpute_count.csv` in `out_dir` to store the imputed count matrix; if `outfile = "txt"`, this function will create a new file `scimpute_count.txt` in `out_dir`. 85 | 86 | Note that the order of parameters matters in R functions, so we suggest using the format in **Quick start** to specify parameters and avoid mistakes. If the users would like to apply `scImpute` on data coming from homogeneous cells, this can be achieved by setting `Kcluster = 1` and `labeled = FALSE`. 87 | 88 | ## Apply `scImpute` with cell type information 89 | 90 | Sometimes users may have the cell type (or subpopulation) information of the single cells and `scimpute` can take advantage of this information to impute among each cell type. To do this, we need a character vector `labels` specifying the cell type of each column in the raw count matrix. In other words, the length of `labels` equals the number of cells and the order of elements in `labels` should match the order of columns in the raw count matrix. Then we just need to specify `labeled = TRUE` in `scimpute` (default is `FALSE`) and specify the `labels` argument. `Kcluster` is not used when `labeled = TRUE`. 
91 | ```{r eval = FALSE} 92 | labels = readRDS(system.file("extdata", "labels.rds", package = "scImpute")) 93 | labels[1:5] 94 | > [1] "c1" "c1" "c1" "c2" "c2" 95 | 96 | scimpute(count_path, 97 | infile = "csv", 98 | outfile = "csv", 99 | out_dir = out_dir, 100 | labeled = TRUE, 101 | drop_thre = 0.5, 102 | labels = labels, 103 | ncores = 10) 104 | ``` 105 | 106 | ## Apply `scImpute` to TPM values 107 | 108 | We strongly suggest using `scImpute` on count matrices. 
However, if only TPM values are available, users can apply `scImpute` with gene lengths supplied. `scImpute` will use the gene lengths (sum of exon lengths) to scale the data, which ensures a good fitting of the mixture models. In this case, users need to specify `type = "TPM"` (`type = "count"` by default), and supply a vector `genelen` of gene lengths. The order of genes in `genelen` should match the order in the expression matrix. For example: 109 | ```{r eval = FALSE} 110 | > genelen[1:3] 111 | ENSMUSG00000021252 ENSMUSG00000007777 ENSMUSG00000024442 112 | 4235 998 2404 113 | 114 | scimpute(count_path, 115 | infile = "csv", 116 | outfile = "csv", 117 | out_dir = out_dir, 118 | type = "TPM", 119 | genelen = genelen, 120 | drop_thre = 0.5, Kcluster = 2, 121 | ncores = 10) 122 | ``` 123 | 124 | ## How to save computation time with `scImpute` 125 | 126 | `scImpute` benefits from parallel computation, and each processor does not require heavy memory cost. `scimpute` completes computation in seconds when applied to a dataset with 10,000 genes and 100 cells, running with 10 cores. The memory requirement for this data set is around 2G. The running time mostly depends on 127 | 128 | * number of processors (`ncores`) 129 | * number of cells in the scRNA-seq data 130 | 131 | When the number of cells is extremely large, a filtering step on the cells can save the computation time of `scImpute`. -------------------------------------------------------------------------------- /inst/docs/scImpute-news.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | scImpute Updates 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 |

scImpute Updates

34 |

Wei Vivian Li

35 |

2018-08-15

36 | 37 | 38 | 39 |
40 |

Updates

41 |
42 |

2018/08/15:

43 |
44 |
    45 |
  • Version 0.0.9 is released!
  • 46 |
  • More robust implementation of dimension reduction.
  • 47 |
  • Faster calculation of cell similarity.
  • 48 |
49 |
50 |

2018/06/27:

51 |
52 |
    53 |
  • Version 0.0.8 is released!
  • 54 |
  • Faster implementation of dimension reduction.
  • 55 |
56 |
57 |

2018/06/08:

58 |
59 |
    60 |
  • Version 0.0.7 is released!
  • 61 |
  • New option for application on TPM values.
  • 62 |
63 |
64 |

2018/03/16:

65 |
66 |
    67 |
  • Version 0.0.6 is released!
  • 68 |
  • The scImpute method is published at Nature Communications.
  • 69 |
  • scImpute now supports input and output in the format of R objects (.rds).
  • 70 |
71 |
72 |

2018/01/12:

73 |
74 |
    75 |
  • Version 0.0.5 is released!
  • 76 |
  • It is now possible to apply scImpute on just one cell population by setting Kcluster = 1.
  • 77 |
78 |
79 |

2017/10/27:

80 |
81 |
    82 |
  • Version 0.0.4 is released!
  • 83 |
  • scImpute now supports multi-core parallelism.
  • 84 |
85 |
86 |

2017/10/22:

87 |
88 |
    89 |
  • Version 0.0.3 is released!
  • 90 |
  • Estimation of dropout probabilities is more accurate.
  • 91 |
  • Imputation step is more robust.
  • 92 |
  • scimpute() incorporates a new parameter Kcluster to specify the number of cell subpopulations.
  • 93 |
  • scImpute is now able to detect outlier cells.
  • 94 |
95 |
96 |

2017/07/01:

97 |
98 |
    99 |
  • Version 0.0.2 is released!
  • 100 |
  • This version speeds up the first step in scImpute and program now completes in a few seconds when applied to a dataset with 10,000 genes and 100 cells (using single core).
  • 101 |
102 |
103 | 104 | 105 | 106 | 107 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /R/scImpute-internal.R: -------------------------------------------------------------------------------- 1 | .Random.seed <- 2 | c(403L, 3L, 198228743L, -491873571L, 144704468L, -743308550L, 3 | 226498117L, 1699344031L, -1917922266L, -718012240L, -1564772013L, 4 | -880941967L, 559159712L, -369683138L, -1330072503L, -50265861L, 5 | 772499658L, -1932136916L, -543649233L, -1234744603L, -1582657636L, 6 | -1522144782L, -490912659L, 855975335L, -710914098L, 6262056L, 7 | 536162091L, 1867830601L, 1744083544L, 907531014L, 62832353L, 8 | 2022791795L, 813933282L, -2096939436L, -910145705L, 1531870029L, 9 | -205992220L, -1680732470L, 1285017397L, -1570298481L, 302891670L, 10 | 615832128L, 189786883L, -377455839L, -1591968752L, -806016018L, 11 | -1271432295L, 575218731L, -251051750L, 1360737244L, -1512327713L, 12 | -2084798027L, 1516785484L, 1963435426L, 472294653L, -1855821321L, 13 | -645585122L, -1336960552L, -395267269L, -1937672999L, 1999304808L, 14 | 167841174L, -1937631983L, 1805260483L, 244397778L, 1280782180L, 15 | 303747751L, 1946369021L, -1252397580L, 1786473178L, 535702565L, 16 | 1037893951L, 871495366L, 1826102096L, 1957774195L, -58935919L, 17 | -575825984L, -16899682L, 463670121L, -1820999781L, 823068330L, 18 | -2105741108L, 1455600015L, 2087093701L, 1080117628L, -1927267886L, 19 | -902322867L, 1697167367L, 1441945582L, -1094918456L, 179048907L, 20 | 831267881L, -509668744L, -1055170650L, 366598593L, -1991004589L, 21 | -886938366L, -468351948L, -1871260233L, 739981741L, 1037785796L, 22 | -1844363542L, -2052239019L, 498140911L, 1364915446L, 2053833760L, 23 | 604714467L, -1748162943L, -216461072L, 1835093902L, 461275385L, 24 | 788359819L, -2114131270L, 808830396L, -248221953L, 878928469L, 25 | -1404037780L, 1453088194L, 781819421L, 1340368535L, -68603906L, 26 | -1737548872L, -270936165L, 690593273L, 564731208L, 1909053558L, 27 | 
-1610586703L, 403925987L, 1346631858L, -342731516L, 1793685063L, 28 | -1835966179L, -1771032684L, -660630726L, 927129477L, -1819988897L, 29 | -1473452314L, -789279376L, -1110772077L, -1712935119L, 174765536L, 30 | 737038974L, -1761287927L, 2064909115L, 299629322L, -347450644L, 31 | 589729647L, -178892635L, 1614010972L, 1881725490L, 1001378093L, 32 | -1315612697L, 75841294L, -835407640L, 2121769707L, 1103653769L, 33 | -993470696L, 796678726L, -1713268063L, -1372676813L, -1112144734L, 34 | -416453740L, 754356759L, 826447373L, 575435556L, -674590966L, 35 | 2090140277L, -1192853041L, 1658850006L, -990237056L, 1469616323L, 36 | -347812255L, -557349168L, 117260334L, 905864665L, -313345557L, 37 | 706407514L, 1319138844L, -787859681L, 2078952309L, 2020561420L, 38 | 1093280738L, -329420611L, 694656823L, -1429813282L, -576137960L, 39 | 679082107L, -92897767L, -308357464L, -1307888554L, -1826582319L, 40 | -302453629L, -311881966L, 441948836L, 346875495L, 1856141245L, 41 | 217262388L, -317191526L, 1875790053L, 413186687L, 1097835526L, 42 | 2089006992L, 1556511283L, -1133792815L, -261573760L, 1395027550L, 43 | 1130845609L, 1527854171L, 925863018L, -1007135476L, -1552861361L, 44 | 5415941L, -1983685956L, -1435178094L, 1183246477L, 652766919L, 45 | -680335442L, 1135505928L, 1046246923L, 931118825L, 184015288L, 46 | -760310170L, 447902465L, 1772270995L, 553602626L, 2001154548L, 47 | -714585097L, -860819475L, 1354176644L, -111331528L, -133743462L, 48 | -868584752L, -1215635268L, -1814534508L, 271640194L, 1809292480L, 49 | -986959556L, 886833296L, -1716839902L, -2026211832L, -1211723508L, 50 | 473916060L, -284377582L, -1851822720L, -627151340L, 916497960L, 51 | -468282054L, 1439672272L, -1353465236L, 1679820596L, 1761440402L, 52 | 1428715872L, 2037342540L, -1301282848L, 1199091618L, -1774487640L, 53 | 310860684L, -1115662660L, 2031573874L, -1361590032L, -1270474684L, 54 | 999734616L, 1414562874L, 83026800L, -781040580L, -1530922220L, 55 | -1599157822L, -1543876384L, 1326319484L, 
-1923322160L, -1777867102L, 56 | 403637288L, -1481766644L, 966632380L, 202665266L, 1317224384L, 57 | 1478619700L, 1562184648L, -734657350L, -1528975472L, 1252746732L, 58 | -2041978796L, -1484968302L, 899472160L, -588505652L, 566348768L, 59 | -680488254L, -1587680728L, 1293409964L, 2092020412L, 812581362L, 60 | -79984336L, 1268710564L, -1693633864L, 619629402L, 943222160L, 61 | 1242194300L, -987407084L, 190114114L, -1086078208L, 1222671292L, 62 | -1824232624L, -1767349982L, 1800097096L, -1722337268L, -1326688164L, 63 | -579119918L, -2027313152L, -7132332L, -1390702104L, 713312250L, 64 | -2133309744L, -727542484L, 1851029748L, 1976825746L, -417026976L, 65 | -2003161844L, -1728936224L, 1786105186L, 970195624L, -1823724980L, 66 | -635361412L, -375395534L, 2052107888L, -1172316860L, 1731865560L, 67 | 1064910970L, -617507280L, -1228531012L, 231970644L, 729152066L, 68 | -1861067872L, 1662499004L, -2045677104L, -297302686L, 673932264L, 69 | 1705387980L, -1840388356L, 240373746L, 2001665152L, 1776486644L, 70 | 1317786760L, 1655043386L, 528368464L, 929016940L, -649236332L, 71 | 1518078290L, 1516379872L, -389537588L, 1504843936L, 691607938L, 72 | -1865484056L, 741845228L, 404301884L, 2055528562L, -129406864L, 73 | -1847533020L, -341278152L, -44434918L, 32694480L, -194968004L, 74 | -1539388268L, -1140718462L, 1678747584L, -1316520644L, 2039811728L, 75 | -275536862L, -885075960L, 1703697548L, 1077284892L, 551318674L, 76 | -308859776L, -753810412L, -1529593048L, -1227598790L, -1799115568L, 77 | 1219777644L, -1269717324L, -1030229998L, -32407584L, -1557861428L, 78 | -492642848L, 130669602L, 1084138792L, 952867084L, 380964412L, 79 | -1748208398L, 787106032L, -1270642876L, -346812328L, 1412350522L, 80 | -1216743184L, -1277570372L, -1721239020L, -365237054L, -1024375200L, 81 | -856435332L, -480856880L, -1188772830L, 1719642280L, 400051596L, 82 | -2062945860L, -675620814L, -1443560640L, 608849588L, 857039176L, 83 | 1418412730L, -1989886192L, 1427284204L, 1738762324L, 925001490L, 84 | 
1879630240L, -270104628L, -1531945120L, -336593726L, 318475816L, 85 | -1323017812L, 1772282684L, -389893262L, 854503216L, 1329422372L, 86 | -1035514312L, 926667482L, -1490208880L, -1504086788L, -477782636L, 87 | 1081111362L, -624875008L, 866740284L, -141422000L, -1736699742L, 88 | 317858632L, 748292620L, -1524000036L, 1686851666L, -2020037504L, 89 | -82385068L, 317902824L, 1392313082L, 434718544L, -1430838228L, 90 | -2120853132L, 881200786L, 870720096L, 1055084684L, -1017278112L, 91 | 1541167330L, -469259096L, -390076724L, -1864164868L, 9132978L, 92 | -1136942096L, 1435881924L, 873401688L, -617848454L, -1551011408L, 93 | -1058422738L, 2089055963L, -1286975555L, 1408166186L, -48562744L, 94 | -1506917647L, 61885539L, 1836701796L, 1778914194L, 1198520423L, 95 | -162527183L, 371644790L, 1139435156L, 1691659429L, -634998353L, 96 | 1940154984L, 382390854L, 747173875L, 1162364757L, 904001842L, 97 | 1691929856L, -1035514391L, 1405392987L, -734229844L, -563727174L, 98 | -789219729L, -2046368775L, -39384434L, -376255652L, -1026349107L, 99 | -1105803465L, 627250624L, 1729067678L, -1287113877L, 238299885L, 100 | 1265868634L, -1341991912L, 1548235137L, -1578711245L, -1435959980L, 101 | -1002145950L, 339987447L, 99333185L, 1138303174L, 1788120580L, 102 | -1233180235L, 273192607L, -349817096L, 1984264918L, 697413475L, 103 | -79440091L, -987820862L, -529225296L, 2077041113L, 1138536267L, 104 | 773129660L, 2131080682L, 363942431L, -899893079L, -941625282L, 105 | 187138348L, -32363651L, -412573241L, -2110450512L, -334052658L, 106 | -2131888133L, 2094530653L, -1474656374L, -1608267864L, -2067976943L, 107 | -2059258877L, -708296956L, -1133505742L, 1612951943L, 1944041425L, 108 | 985725270L, -1961134284L, 95607365L, -1323784561L, 386887688L, 109 | -2103290842L, -1626329901L, -865702347L, 144000402L, 575854944L, 110 | -171138871L, -783404037L, 210056140L, -1256255910L, -325282993L, 111 | 1238706713L, 585828398L, -873953732L, 1819360173L, 1305375831L, 112 | 377012640L, 90004222L, 
-657753653L, -1493647411L, 1962660602L, 113 | 878266680L, -845753631L, -1343113837L, -405905996L, -775659838L, 114 | 1868803799L, 321666849L, -37087514L, -1471746076L, -2076180715L, 115 | 1433990975L, -546004264L, 1597089526L, -1489940221L, 1972154949L, 116 | -2054618526L, -569531952L, -373768455L, 861240363L, 99220380L, 117 | 310571978L, -1380234049L, -888672759L, -131160290L, 1157746380L, 118 | -1810411875L, -38188249L, 512157008L, -436155410L, 554284187L, 119 | 176037629L, 194340330L, -2046085496L, 2013397041L, 25287331L, 120 | 609526436L, 640889682L, 1906268839L, -268892431L, -865196490L, 121 | 579250900L, -2111996315L, -1263432593L, 1897100968L, -1353567226L, 122 | 1781801267L, -947159659L, -460721678L, -695828928L, 1850630057L, 123 | -396115429L, 1907276780L, -1667728390L, -348047313L, 1887156281L, 124 | -2034234802L, 1598360220L, 1691029517L, 2121878647L, 741821568L, 125 | -1788974370L, -66807381L, -1118956883L, 450561818L, -40626216L, 126 | 1527157825L, -1438850317L, 392176916L, -637202270L, 633612670L 127 | ) 128 | -------------------------------------------------------------------------------- /R/imputation_model.R: -------------------------------------------------------------------------------- 1 | # find_hv_genes: return the rows of `count` (I genes) whose non-dropout mean is >= 1 and whose CV is at or above the 25% quantile; all rows are kept if fewer than 500 qualify. `J` is not used here. 
2 | find_hv_genes = function(count, I, J){ 3 | count_nzero = lapply(seq_len(I), function(i) setdiff(count[i, ], log10(1.01))) # NOTE(review): setdiff() also drops duplicated values, so mean/sd below are over unique non-dropout values - confirm this is intended 4 | mu = sapply(count_nzero, mean) 5 | mu[is.na(mu)] = 0 6 | sd = sapply(count_nzero, sd) 7 | sd[is.na(sd)] = 0 8 | cv = sd/mu 9 | cv[is.na(cv)] = 0 10 | # sum(mu >= 1 & cv >= quantile(cv, 0.25), na.rm = TRUE) 11 | high_var_genes = which(mu >= 1 & cv >= quantile(cv, 0.25)) 12 | if(length(high_var_genes) < 500){ 13 | high_var_genes = seq_len(I)} 14 | count_hv = count[high_var_genes, , drop = FALSE] 15 | return(count_hv) 16 | } 17 | # find_neighbors: reduce `count_hv` (genes x cells) by PCA and compute Euclidean cell-cell distances in PC space. With labeled = TRUE it returns list(dist_list, clust): one distance matrix per cell type plus integer cluster ids. With labeled = FALSE it returns list(dist_cells, clust): one J x J distance matrix plus spectral-clustering assignments, NA for outlier cells. 18 | find_neighbors = function(count_hv, labeled, J, Kcluster = NULL, 19 | ncores, cell_labels = NULL){ 20 | if(labeled == TRUE){ 21 | if(is.character(cell_labels) || is.factor(cell_labels)){ 22 | labels_uniq = unique(as.character(cell_labels)) 23 | labels_mth = 
seq_along(labels_uniq) 24 | names(labels_mth) = labels_uniq 25 | clust = labels_mth[as.character(cell_labels)] # factor labels are mapped by level name, like character labels 26 | }else{ 27 | clust = cell_labels 28 | } 29 | nclust = length(unique(clust)) 30 | print("calculating cell distances ...") 31 | dist_list = lapply(1:nclust, function(ll){ 32 | cell_inds = which(clust == ll) 33 | count_hv_sub = count_hv[, cell_inds, drop = FALSE] 34 | if(length(cell_inds) < 1000){ 35 | var_thre = 0.4 36 | pca = prcomp(t(count_hv_sub)) 37 | eigs = (pca$sdev)^2 38 | var_cum = cumsum(eigs)/sum(eigs) 39 | if(max(var_cum) <= var_thre){ 40 | npc = length(var_cum) 41 | }else{ 42 | npc = which.max(var_cum > var_thre) 43 | if (labeled == FALSE){ npc = max(npc, Kcluster) } 44 | } 45 | }else{ 46 | var_thre = 0.6 47 | pca = rpca(t(count_hv_sub), k = 1000, center = TRUE, scale = FALSE) 48 | eigs = (pca$sdev)^2 49 | var_cum = cumsum(eigs)/sum(eigs) 50 | if(max(var_cum) <= var_thre){ 51 | npc = length(var_cum) 52 | }else{ 53 | npc = which.max(var_cum > var_thre) 54 | if (labeled == FALSE){ npc = max(npc, Kcluster) } 55 | } 56 | } 57 | 58 | npc = min(max(npc, 3), ncol(pca$x)) # use at least 3 PCs, but never more than exist (a 2-cell type only has 2) 59 | mat_pcs = t(pca$x[, 1:npc, drop = FALSE]) 60 | 61 | dist_cells_list = mclapply(seq_along(cell_inds), function(id1){ 62 | d = sapply(1:id1, function(id2){ 63 | sse = sum((mat_pcs[, id1] - mat_pcs[, id2])^2) 64 | sqrt(sse) 65 | }) 66 | return(c(d, rep(0, length(cell_inds)-id1))) 67 | }, mc.cores = ncores) 68 | dist_cells = matrix(0, nrow = length(cell_inds), ncol = length(cell_inds)) 69 | for(cellid in seq_along(cell_inds)){dist_cells[cellid, ] = dist_cells_list[[cellid]]} 70 | dist_cells = dist_cells + t(dist_cells) 71 | return(dist_cells) 72 | }) 73 | 74 | return(list(dist_list = dist_list, clust = clust)) 75 | } 76 | 77 | if(labeled == FALSE){ 78 | ## dimension reduction 79 | print("dimension reduction ...") 80 | if(J < 5000){ 81 | var_thre = 0.4 82 | pca = 
prcomp(t(count_hv)) 83 | eigs = (pca$sdev)^2 84 | var_cum = cumsum(eigs)/sum(eigs) 85 | if(max(var_cum) <= var_thre){ 86 | npc = length(var_cum) 87 | }else{ 88 | 
npc = which.max(var_cum > var_thre) 89 | if (labeled == FALSE){ npc = max(npc, Kcluster) } 90 | } 91 | }else{ 92 | var_thre = 0.6 93 | pca = rpca(t(count_hv), k = 1000, center = TRUE, scale = FALSE) 94 | eigs = (pca$sdev)^2 95 | var_cum = cumsum(eigs)/sum(eigs) 96 | if(max(var_cum) <= var_thre){ 97 | npc = length(var_cum) 98 | }else{ 99 | npc = which.max(var_cum > var_thre) 100 | if (labeled == FALSE){ npc = max(npc, Kcluster) } 101 | } 102 | } 103 | npc = min(max(npc, 3), ncol(pca$x)) # use at least 3 PCs, but never more than exist 104 | mat_pcs = t(pca$x[, 1:npc, drop = FALSE]) # columns are cells 105 | 106 | ## detect outliers 107 | print("calculating cell distances ...") 108 | dist_cells_list = mclapply(seq_len(J), function(id1){ 109 | d = sapply(1:id1, function(id2){ 110 | sse = sum((mat_pcs[, id1] - mat_pcs[, id2])^2) 111 | sqrt(sse) 112 | }) 113 | return(c(d, rep(0, J-id1))) 114 | }, mc.cores = ncores) 115 | dist_cells = matrix(0, nrow = J, ncol = J) 116 | for(cellid in seq_len(J)){dist_cells[cellid, ] = dist_cells_list[[cellid]]} 117 | dist_cells = dist_cells + t(dist_cells) 118 | 119 | min_dist = sapply(1:J, function(i){ 120 | min(dist_cells[i, -i]) 121 | }) 122 | iqr = quantile(min_dist, 0.75) - quantile(min_dist, 0.25) 123 | outliers = which(min_dist > 1.5 * iqr + quantile(min_dist, 0.75)) 124 | 125 | ## clustering 126 | non_out = setdiff(1:J, outliers) 127 | spec_res = specc(t(mat_pcs[, non_out]), centers = Kcluster, kernel = "rbfdot") 128 | print("cluster sizes:") 129 | print(spec_res@size) 130 | nbs = rep(NA, J) 131 | nbs[non_out] = spec_res 132 | 133 | return(list(dist_cells = dist_cells, clust = nbs)) 134 | } 135 | } 136 | 137 | find_va_genes = function(parslist, subcount){ 138 | point = log10(1.01) 139 | valid_genes = which( (rowSums(subcount) > point * ncol(subcount)) & 140 | complete.cases(parslist) ) 141 | if(length(valid_genes) == 0) return(valid_genes) 142 | # find out genes that violate assumption 143 | mu = parslist[, "mu"] 144 | sgene1 = which(mu <= 
log10(1+1.01)) 145 | # sgene2 = which(mu <= log10(10+1.01) & mu - parslist[,5] 
> log10(1.01)) 146 | 147 | dcheck1 = dgamma(mu+1, shape = parslist[, "alpha"], rate = parslist[, "beta"]) 148 | dcheck2 = dnorm(mu+1, mean = parslist[, "mu"], sd = parslist[, "sigma"]) 149 | sgene3 = which(dcheck1 >= dcheck2 & mu <= 1) 150 | sgene = union(sgene1, sgene3) 151 | valid_genes = setdiff(valid_genes, sgene) 152 | return(valid_genes) 153 | } 154 | 155 | impute_nnls = function(Ic, cellid, subcount, droprate, geneid_drop, 156 | geneid_obs, nbs, distc){ 157 | yobs = subcount[ ,cellid] 158 | if (length(geneid_drop) == 0 | length(geneid_drop) == Ic) { 159 | return(yobs) } 160 | yimpute = rep(0, Ic) 161 | 162 | xx = subcount[geneid_obs, nbs] 163 | yy = subcount[geneid_obs, cellid] 164 | ximpute = subcount[geneid_drop, nbs] 165 | num_thre = 500 166 | if(ncol(xx) >= min(num_thre, nrow(xx))){ 167 | if (num_thre >= nrow(xx)){ 168 | new_thre = round((2*nrow(xx)/3)) 169 | }else{ new_thre = num_thre} 170 | filterid = order(distc[cellid, -cellid])[1: new_thre] 171 | xx = xx[, filterid, drop = FALSE] 172 | ximpute = ximpute[, filterid, drop = FALSE] 173 | } 174 | set.seed(cellid) 175 | nnls = penalized(yy, penalized = xx, unpenalized = ~0, 176 | positive = TRUE, lambda1 = 0, lambda2 = 0, 177 | maxiter = 3000, trace = FALSE) 178 | ynew = penalized::predict(nnls, penalized = ximpute, unpenalized = ~0)[,1] 179 | yimpute[geneid_drop] = ynew 180 | yimpute[geneid_obs] = yobs[geneid_obs] 181 | maxobs = apply(subcount, 1, max) 182 | yimpute[yimpute > maxobs] = maxobs[yimpute > maxobs] 183 | return(yimpute) 184 | } 185 | 186 | 187 | imputation_model8 = function(count, labeled, point, drop_thre = 0.5, Kcluster = 10, 188 | out_dir, ncores){ 189 | count = as.matrix(count) 190 | I = nrow(count) 191 | J = ncol(count) 192 | count_imp = count 193 | 194 | # find highly variable genes 195 | count_hv = find_hv_genes(count, I, J) 196 | print("searching candidate neighbors ... 
") 197 | if(Kcluster == 1){ 198 | clust = rep(1, J) 199 | if(J < 5000){ 200 | var_thre = 0.4 201 | pca = prcomp(t(count_hv)) 202 | eigs = (pca$sdev)^2 203 | var_cum = cumsum(eigs)/sum(eigs) 204 | if(max(var_cum) <= var_thre){ 205 | npc = length(var_cum) 206 | }else{ 207 | npc = which.max(var_cum > var_thre) 208 | if (labeled == FALSE){ npc = max(npc, Kcluster) } 209 | } 210 | }else{ 211 | var_thre = 0.6 212 | pca = rpca(t(count_hv), k = 1000, center = TRUE, scale = FALSE) 213 | eigs = (pca$sdev)^2 214 | var_cum = cumsum(eigs)/sum(eigs) 215 | if(max(var_cum) <= var_thre){ 216 | npc = length(var_cum) 217 | }else{ 218 | npc = which.max(var_cum > var_thre) 219 | if (labeled == FALSE){ npc = max(npc, Kcluster) } 220 | } 221 | } 222 | 223 | if (npc < 3){ npc = 3 } 224 | mat_pcs = t(pca$x[, 1:npc]) # columns are cells 225 | 226 | dist_cells_list = mclapply(1:J, function(id1){ 227 | d = sapply(1:id1, function(id2){ 228 | sse = sum((mat_pcs[, id1] - mat_pcs[, id2])^2) 229 | sqrt(sse) 230 | }) 231 | return(c(d, rep(0, J-id1))) 232 | }, mc.cores = ncores) 233 | dist_cells = matrix(0, nrow = J, ncol = J) 234 | for(cellid in 1:J){dist_cells[cellid, ] = dist_cells_list[[cellid]]} 235 | dist_cells = dist_cells + t(dist_cells) 236 | }else{ 237 | print("inferring cell similarities ...") 238 | set.seed(Kcluster) 239 | neighbors_res = find_neighbors(count_hv = count_hv, labeled = FALSE, J = J, 240 | Kcluster = Kcluster, ncores = ncores) 241 | dist_cells = neighbors_res$dist_cells 242 | clust = neighbors_res$clust 243 | } 244 | 245 | saveRDS(clust, file = paste0(out_dir, "clust.rds")) 246 | # mixture model 247 | nclust = sum(!is.na(unique(clust))) 248 | cl = makeCluster(ncores, outfile="") 249 | registerDoParallel(cl) 250 | 251 | for(cc in 1:nclust){ 252 | print(paste("estimating dropout probability for type", cc, "...")) 253 | paste0(out_dir, "pars", cc, ".rds") 254 | get_mix_parameters(count = count[, which(clust == cc), drop = FALSE], 255 | point = log10(1.01), 256 | path = 
paste0(out_dir, "pars", cc, ".rds"), ncores = ncores) 257 | 258 | 259 | cells = which(clust == cc) 260 | if(length(cells) <= 1) { next } 261 | parslist = readRDS(paste0(out_dir, "pars", cc, ".rds")) 262 | print("searching for valid genes ...") 263 | valid_genes = find_va_genes(parslist, subcount = count[, cells]) 264 | if(length(valid_genes) <= 10){ next } 265 | 266 | subcount = count[valid_genes, cells, drop = FALSE] 267 | Ic = length(valid_genes) 268 | Jc = ncol(subcount) 269 | parslist = parslist[valid_genes, , drop = FALSE] 270 | 271 | droprate = t(sapply(1:Ic, function(i) { 272 | wt = calculate_weight(subcount[i, ], parslist[i, ]) 273 | return(wt[, 1]) 274 | })) 275 | mucheck = sweep(subcount, MARGIN = 1, parslist[, "mu"], FUN = ">") 276 | droprate[mucheck & droprate > drop_thre] = 0 277 | # dropouts 278 | setA = lapply(1:Jc, function(cellid){ 279 | which(droprate[, cellid] > drop_thre) 280 | }) 281 | # non-dropouts 282 | setB = lapply(1:Jc, function(cellid){ 283 | which(droprate[, cellid] <= drop_thre) 284 | }) 285 | # imputation 286 | gc() 287 | print(paste("imputing dropout values for type", cc, "...")) 288 | subres = foreach(cellid = 1:Jc, .packages = c("penalized"), 289 | .combine = cbind, .export = c("impute_nnls")) %dopar% { 290 | if (cellid %% 10 == 0) {gc()} 291 | if (cellid %% 100 == 0) {print(cellid)} 292 | nbs = setdiff(1:Jc, cellid) 293 | if (length(nbs) == 0) {return(NULL)} 294 | geneid_drop = setA[[cellid]] 295 | geneid_obs = setB[[cellid]] 296 | y = try(impute_nnls(Ic, cellid, subcount, droprate, geneid_drop, 297 | geneid_obs, nbs, distc = dist_cells[cells, cells]), 298 | silent = TRUE) 299 | if (class(y) == "try-error") { 300 | # print(y) 301 | y = subcount[, cellid, drop = FALSE] 302 | } 303 | return(y) 304 | } 305 | count_imp[valid_genes, cells] = subres 306 | } 307 | stopCluster(cl) 308 | outlier = which(is.na(clust)) 309 | count_imp[count_imp < point] = point 310 | return(list(count_imp = count_imp, outlier = outlier)) 311 | } 312 | 313 | 
imputation_wlabel_model8 = function(count, labeled, cell_labels = NULL, point, drop_thre, 314 | Kcluster = NULL, out_dir, ncores){ 315 | if(!(class(cell_labels) %in% c("character", "numeric", "integer"))){ 316 | stop("cell_labels should be a character or integer vector!") 317 | } 318 | 319 | count = as.matrix(count) 320 | I = nrow(count) 321 | J = ncol(count) 322 | count_imp = count 323 | 324 | count_hv = find_hv_genes(count, I, J) 325 | print("searching candidate neighbors ... ") 326 | neighbors_res = find_neighbors(count_hv = count_hv, labeled = TRUE, J = J, 327 | ncores = ncores, cell_labels = cell_labels) 328 | dist_list = neighbors_res$dist_list 329 | clust = neighbors_res$clust 330 | 331 | # mixture model 332 | nclust = sum(!is.na(unique(clust))) 333 | cl = makeCluster(ncores, outfile="") 334 | registerDoParallel(cl) 335 | 336 | for(cc in 1:nclust){ 337 | print(paste("estimating dropout probability for type", cc, "...")) 338 | paste0(out_dir, "pars", cc, ".rds") 339 | get_mix_parameters(count = count[, which(clust == cc), drop = FALSE], 340 | point = log10(1.01), 341 | path = paste0(out_dir, "pars", cc, ".rds"), ncores = ncores) 342 | 343 | cells = which(clust == cc) 344 | if(length(cells) <= 1){ next } 345 | parslist = readRDS(paste0(out_dir, "pars", cc, ".rds")) 346 | print("searching for valid genes ...") 347 | valid_genes = find_va_genes(parslist, subcount = count[, cells]) 348 | if(length(valid_genes) <= 10){ next } 349 | 350 | subcount = count[valid_genes, cells, drop = FALSE] 351 | Ic = length(valid_genes) 352 | Jc = ncol(subcount) 353 | parslist = parslist[valid_genes, , drop = FALSE] 354 | 355 | droprate = t(sapply(1:Ic, function(i) { 356 | wt = calculate_weight(subcount[i, ], parslist[i, ]) 357 | return(wt[, 1]) 358 | })) 359 | mucheck = sweep(subcount, MARGIN = 1, parslist[, "mu"], FUN = ">") 360 | droprate[mucheck & droprate > drop_thre] = 0 361 | # dropouts 362 | setA = lapply(1:Jc, function(cellid){ 363 | which(droprate[, cellid] > drop_thre) 
364 | }) 365 | # non-dropouts 366 | setB = lapply(1:Jc, function(cellid){ 367 | which(droprate[, cellid] <= drop_thre) 368 | }) 369 | # imputation 370 | gc() 371 | print(paste("imputing dropout values for type", cc, "...")) 372 | 373 | cellid = NULL 374 | subres = foreach(cellid = 1:Jc, .packages = c("penalized"), 375 | .combine = cbind, .export = c("impute_nnls")) %dopar% { 376 | ##sink(paste0(out_dir, "log.txt"), append=TRUE)) 377 | ##cat(paste("imputing dropout values for type", cc, "\n") 378 | if (cellid %% 10 == 0) {gc()} 379 | if (cellid %% 100 == 0) {print(cellid)} 380 | nbs = setdiff(1:Jc, cellid) 381 | if (length(nbs) == 0) {return(NULL)} 382 | geneid_drop = setA[[cellid]] 383 | geneid_obs = setB[[cellid]] 384 | y = try(impute_nnls(Ic, cellid = cellid, subcount, droprate, geneid_drop, 385 | geneid_obs, nbs, distc = dist_list[[cc]]), 386 | silent = TRUE) 387 | if (class(y) == "try-error") { 388 | # print(y) 389 | y = subcount[, cellid, drop = FALSE] 390 | } 391 | return(y) 392 | } 393 | count_imp[valid_genes, cells] = subres 394 | } 395 | stopCluster(cl) 396 | outlier = integer(0) 397 | count_imp[count_imp < point] = point 398 | return(list(count_imp = count_imp, outlier = outlier)) 399 | 400 | } 401 | -------------------------------------------------------------------------------- /.Rhistory: -------------------------------------------------------------------------------- 1 | rdata_dir = "~/Dropbox/Iso Discovery/Codes/discovery/rdata/simu_polyester/" 2 | data = lapply(1:9, function(sp){ 3 | da = lapply(1:9 , function(gtfid){ 4 | tp = lapply(1:4, function(mm){ 5 | if(mm == 1){ 6 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData")) 7 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)] 8 | tv = sapply(LRT_simu_res, function(x) x[[1]]$tv) 9 | }else if(mm == 2){ 10 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/cufflinks/sp", 11 | sp, "gtf", gtfid, ".rds")) 12 | }else if(mm == 3){ 13 | tv = 
readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/stringtie/sp", 14 | sp, "gtf", gtfid, ".rds")) 15 | }else{ 16 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/slide/sp", 17 | sp, "gtf", gtfid, ".rds")) 18 | } 19 | tv = unlist(tv) 20 | return(data.frame(tv = tv, method = methods[mm])) 21 | }) 22 | tp = Reduce(rbind, tp) 23 | tp$annotation = gtfid 24 | return(tp) 25 | }) 26 | da = Reduce(rbind, da) 27 | da$sample = sp 28 | return(da) 29 | }) 30 | methods = c("LRT", "Cufflinks", "Stringtie", "SLIDE") 31 | data = lapply(1:9, function(sp){ 32 | da = lapply(1:9 , function(gtfid){ 33 | tp = lapply(1:4, function(mm){ 34 | if(mm == 1){ 35 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData")) 36 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)] 37 | tv = sapply(LRT_simu_res, function(x) x[[1]]$tv) 38 | }else if(mm == 2){ 39 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/cufflinks/sp", 40 | sp, "gtf", gtfid, ".rds")) 41 | }else if(mm == 3){ 42 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/stringtie/sp", 43 | sp, "gtf", gtfid, ".rds")) 44 | }else{ 45 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/slide/sp", 46 | sp, "gtf", gtfid, ".rds")) 47 | } 48 | tv = unlist(tv) 49 | return(data.frame(tv = tv, method = methods[mm])) 50 | }) 51 | tp = Reduce(rbind, tp) 52 | tp$annotation = gtfid 53 | return(tp) 54 | }) 55 | da = Reduce(rbind, da) 56 | da$sample = sp 57 | return(da) 58 | }) 59 | data = lapply(1:8, function(sp){ 60 | da = lapply(1:9 , function(gtfid){ 61 | tp = lapply(1:4, function(mm){ 62 | if(mm == 1){ 63 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData")) 64 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)] 65 | tv = sapply(LRT_simu_res, function(x) x[[1]]$tv) 66 | }else if(mm == 2){ 67 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/cufflinks/sp", 68 | sp, "gtf", gtfid, ".rds")) 69 | 
}else if(mm == 3){ 70 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/stringtie/sp", 71 | sp, "gtf", gtfid, ".rds")) 72 | }else{ 73 | tv = readRDS(paste0("~/Dropbox/Iso Discovery/Codes/compare_abundance/slide/sp", 74 | sp, "gtf", gtfid, ".rds")) 75 | } 76 | tv = unlist(tv) 77 | return(data.frame(tv = tv, method = methods[mm])) 78 | }) 79 | tp = Reduce(rbind, tp) 80 | tp$annotation = gtfid 81 | return(tp) 82 | }) 83 | da = Reduce(rbind, da) 84 | da$sample = sp 85 | return(da) 86 | }) 87 | data = Reduce(rbind, data) 88 | data$annotation = factor(data$annotation, levels = 1:9) 89 | cols = c(Cufflinks = "#619CFF", LRT = "#D53E4F", 90 | Stringtie = "#00BA38", SLIDE = "#B79F00") 91 | ggplot(data, aes(x = annotation, y = tv, fill = method)) + 92 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) + 93 | theme_bw() + 94 | facet_grid(sample ~.) + 95 | scale_fill_manual(values = cols) + 96 | theme(strip.background = element_rect(fill = "white")) 97 | ggsave(paste0(dir, "tv_boxplot.pdf"), width = 6, height = 15) 98 | library(ggplot2) 99 | ggplot(data, aes(x = annotation, y = tv, fill = method)) + 100 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) + 101 | theme_bw() + 102 | facet_grid(sample ~.) 
+ 103 | scale_fill_manual(values = cols) + 104 | theme(strip.background = element_rect(fill = "white")) 105 | ggsave(paste0(dir, "tv_boxplot.pdf"), width = 6, height = 15) 106 | dir = "~/Dropbox/Iso Discovery/Codes/compare_abundance/" 107 | ggsave(paste0(dir, "tv_boxplot.pdf"), width = 6, height = 15) 108 | dir = "~/Dropbox/Iso Discovery/Codes/compare_transcripts/simulation/pr_by_gene/" 109 | methods = c("LRT", "Cufflinks", "Stringtie", "SLIDE") 110 | data = lapply(1:1, function(sp){ 111 | tp2 = lapply(1:9, function(gtfid){ 112 | tp1 = lapply(1:4, function(mm){ 113 | if(mm == 1){ 114 | rdata_dir = "~/Dropbox/Iso Discovery/Codes/discovery/rdata/simu_polyester/" 115 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData")) 116 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)] 117 | pr = t(sapply(LRT_simu_res, function(x) c(x[[1]]$prec, x[[1]]$reca))) 118 | }else if(mm == 2){ 119 | pr = readRDS(paste0(dir, "cufflinks/", sp, "gtf", gtfid, ".rds")) 120 | pr = Reduce(rbind, pr) 121 | }else if(mm == 3){ 122 | pr = readRDS(paste0(dir, "stringtie/", sp, "gtf", gtfid, ".rds")) 123 | pr = Reduce(rbind, pr) 124 | }else{ 125 | pr = readRDS(paste0(dir, "slide/", sp, "gtf", gtfid, ".rds")) 126 | pr = Reduce(rbind, pr) 127 | } 128 | res = data.frame(precision = pr[,1], recall = pr[,2], method = methods[mm]) 129 | return(res) 130 | }) 131 | tp1 = Reduce(rbind, tp1) 132 | tp1$annotation = gtfid 133 | return(tp1) 134 | }) 135 | tp2 = Reduce(rbind, tp2) 136 | tp2$sample = sp 137 | return(tp2) 138 | }) 139 | data = Reduce(rbind, data) 140 | data$annotation = factor(data$annotation, levels = 1:9) 141 | cols = c(Cufflinks = "#619CFF", LRT = "#D53E4F", 142 | Stringtie = "#00BA38", SLIDE = "#B79F00") 143 | ggplot(data, aes(x = annotation, y = precision, fill = method)) + 144 | geom_boxplot() + 145 | theme_bw() + 146 | facet_grid(sample ~.) 
+ 147 | scale_fill_manual(values = cols) + 148 | theme(strip.background = element_rect(fill = "white")) 149 | ggplot(data, aes(x = annotation, y = precision, fill = method)) + 150 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) + 151 | theme_bw() + 152 | facet_grid(sample ~.) + 153 | scale_fill_manual(values = cols) + 154 | theme(strip.background = element_rect(fill = "white")) 155 | ggplot(data, aes(x = annotation, y = recall, fill = method)) + 156 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) + 157 | theme_bw() + 158 | facet_grid(sample ~.) + 159 | scale_fill_manual(values = cols) + 160 | theme(strip.background = element_rect(fill = "white")) 161 | methods = c("LRT", "Cufflinks", "Stringtie", "SLIDE") 162 | data = lapply(1:1, function(sp){ 163 | tp2 = lapply(1:9, function(gtfid){ 164 | tp1 = lapply(1:4, function(mm){ 165 | if(mm == 1){ 166 | rdata_dir = "~/Dropbox/Iso Discovery/Codes/discovery/rdata/simu_polyester/" 167 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData")) 168 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)] 169 | pr = t(sapply(LRT_simu_res, function(x) c(x[[1]]$prec, x[[1]]$reca))) 170 | }else if(mm == 2){ 171 | pr = readRDS(paste0(dir, "cufflinks/", sp, "gtf", gtfid, ".rds")) 172 | pr = Reduce(rbind, pr) 173 | }else if(mm == 3){ 174 | pr = readRDS(paste0(dir, "stringtie/", sp, "gtf", gtfid, ".rds")) 175 | pr = Reduce(rbind, pr) 176 | }else{ 177 | pr = readRDS(paste0(dir, "slide/", sp, "gtf", gtfid, ".rds")) 178 | pr = Reduce(rbind, pr) 179 | } 180 | res = data.frame(precision = pr[,1], recall = pr[,2], method = methods[mm]) 181 | return(res) 182 | }) 183 | tp1 = Reduce(rbind, tp1) 184 | tp1$annotation = gtfid 185 | return(tp1) 186 | }) 187 | tp2 = Reduce(rbind, tp2) 188 | tp2$sample = sp 189 | return(tp2) 190 | }) 191 | data = Reduce(rbind, data) 192 | data$annotation = factor(data$annotation, levels = 1:9) 193 | dir 194 | cols = c(Cufflinks = "#619CFF", LRT = "#D53E4F", 
195 | Stringtie = "#00BA38", SLIDE = "#B79F00") 196 | ggplot(data, aes(x = annotation, y = precision, fill = method)) + 197 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) + 198 | theme_bw() + 199 | facet_grid(sample ~.) + 200 | scale_fill_manual(values = cols) + 201 | theme(strip.background = element_rect(fill = "white")) 202 | ggsave(paste0(dir, "precision_boxplot.pdf"), width = 6, height = 15) 203 | data = lapply(1:8, function(sp){ 204 | print(sp) 205 | tp2 = lapply(1:9, function(gtfid){ 206 | tp1 = lapply(1:4, function(mm){ 207 | if(mm == 1){ 208 | rdata_dir = "~/Dropbox/Iso Discovery/Codes/discovery/rdata/simu_polyester/" 209 | load(paste0(rdata_dir, "sample", sp, "gtf", gtfid, "_p-2.RData")) 210 | LRT_simu_res = LRT_simu_res[!sapply(LRT_simu_res, is.null)] 211 | pr = t(sapply(LRT_simu_res, function(x) c(x[[1]]$prec, x[[1]]$reca))) 212 | }else if(mm == 2){ 213 | pr = readRDS(paste0(dir, "cufflinks/", sp, "gtf", gtfid, ".rds")) 214 | pr = Reduce(rbind, pr) 215 | }else if(mm == 3){ 216 | pr = readRDS(paste0(dir, "stringtie/", sp, "gtf", gtfid, ".rds")) 217 | pr = Reduce(rbind, pr) 218 | }else{ 219 | pr = readRDS(paste0(dir, "slide/", sp, "gtf", gtfid, ".rds")) 220 | pr = Reduce(rbind, pr) 221 | } 222 | res = data.frame(precision = pr[,1], recall = pr[,2], method = methods[mm]) 223 | return(res) 224 | }) 225 | tp1 = Reduce(rbind, tp1) 226 | tp1$annotation = gtfid 227 | return(tp1) 228 | }) 229 | tp2 = Reduce(rbind, tp2) 230 | tp2$sample = sp 231 | return(tp2) 232 | }) 233 | data = Reduce(rbind, data) 234 | data$annotation = factor(data$annotation, levels = 1:9) 235 | cols = c(Cufflinks = "#619CFF", LRT = "#D53E4F", 236 | Stringtie = "#00BA38", SLIDE = "#B79F00") 237 | ggplot(data, aes(x = annotation, y = precision, fill = method)) + 238 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) + 239 | theme_bw() + 240 | facet_grid(sample ~.) 
+ 241 | scale_fill_manual(values = cols) + 242 | theme(strip.background = element_rect(fill = "white")) 243 | ggsave(paste0(dir, "precision_boxplot.pdf"), width = 6, height = 15) 244 | ggplot(data, aes(x = annotation, y = recall, fill = method)) + 245 | geom_boxplot(aes(ymin=..lower.., ymax=..upper..), outlier.size = NA) + 246 | theme_bw() + 247 | facet_grid(sample ~.) + 248 | scale_fill_manual(values = cols) + 249 | theme(strip.background = element_rect(fill = "white")) 250 | ggsave(paste0(dir, "recall_boxplot.pdf"), width = 6, height = 15) 251 | 41/56 252 | class(NULL) 253 | names = read.table("~/Dropbox/Iso Discovery/Codes/melanoma/DP2_names.txt", 254 | stringsAsFactors = FALSE) 255 | names = as.character(names[,1]) 256 | names_split = strsplit(names, split = "-") 257 | pt = sapply(names_split, function(x) x[1]) 258 | pt_uniq = unique(pt) 259 | ### star 260 | commands = lapply(1:length(pt_uniq), function(i){ 261 | ptid = pt_uniq[i] 262 | titles = names_split[pt == ptid] 263 | titles = sapply(titles, function(x) paste0(x[1], "-", x[2], "_", x[3])) 264 | titles = paste0("${Dfastq}", titles) 265 | f1 = paste(paste0(titles, "_R1.fastq"), sep = "", collapse = ",") 266 | f2 = paste(paste0(titles, "_R2.fastq"), sep = "", collapse = ",") 267 | cd = paste("STAR --genomeDir ${GenomeDir} --readFilesIn", 268 | f1, f2, 269 | "--runThreadN 12 --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate", 270 | "--outFileNamePrefix", paste0("~/data/melanoma/", ptid, "-DP2")) 271 | return(cd) 272 | }) 273 | commands = unlist(commands) 274 | write(commands, "~/Dropbox/Iso Discovery/Codes/melanoma/star_align_DP2.sh", sep="\n") 275 | ### samtools index 276 | commands = lapply(pt_uniq, function(pt){ 277 | bam_path = paste0(pt, "-DP2Aligned.sortedByCoord.out.bam") 278 | cd = paste("samtools index", bam_path) 279 | return(cd) 280 | }) 281 | commands = unlist(commands) 282 | write(commands, "~/Dropbox/Iso Discovery/Codes/melanoma/samtools_index_DP2.sh", sep="\n") 283 | names = 
read.table("~/Dropbox/Iso Discovery/Codes/melanoma/DP2_names.txt", 284 | stringsAsFactors = FALSE) 285 | names = as.character(names[,1]) 286 | names_split = strsplit(names, split = "-") 287 | pt = sapply(names_split, function(x) x[1]) 288 | pt_uniq = unique(pt) 289 | ### star 290 | commands = lapply(1:length(pt_uniq), function(i){ 291 | ptid = pt_uniq[i] 292 | titles = names_split[pt == ptid] 293 | titles = sapply(titles, function(x) paste0(x[1], "-", x[2], "_", x[3])) 294 | titles = paste0("${Dfastq}", titles) 295 | f1 = paste(paste0(titles, "_R1.fastq"), sep = "", collapse = ",") 296 | f2 = paste(paste0(titles, "_R2.fastq"), sep = "", collapse = ",") 297 | cd = paste("STAR --genomeDir ${GenomeDir} --readFilesIn", 298 | f1, f2, 299 | "--runThreadN 12 --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate", 300 | "--outFileNamePrefix", paste0("~/data/melanoma/", ptid, "-DP2")) 301 | return(cd) 302 | }) 303 | commands = unlist(commands) 304 | write(commands, "~/Dropbox/Iso Discovery/melanoma/star_align_DP2.sh", sep="\n") 305 | ### samtools index 306 | commands = lapply(pt_uniq, function(pt){ 307 | bam_path = paste0(pt, "-DP2Aligned.sortedByCoord.out.bam") 308 | cd = paste("samtools index", bam_path) 309 | return(cd) 310 | }) 311 | commands = unlist(commands) 312 | write(commands, "~/Dropbox/Iso Discovery/melanoma/samtools_index_DP2.sh", sep="\n") 313 | names = read.table("~/Dropbox/Iso Discovery/melanoma/DP2_names.txt", 314 | stringsAsFactors = FALSE) 315 | names = as.character(names[,1]) 316 | names_split = strsplit(names, split = "-") 317 | pt = sapply(names_split, function(x) x[1]) 318 | pt_uniq = unique(pt) 319 | ### star 320 | commands = lapply(1:length(pt_uniq), function(i){ 321 | ptid = pt_uniq[i] 322 | titles = names_split[pt == ptid] 323 | titles = sapply(titles, function(x) paste0(x[1], "-", x[2], "_", x[3])) 324 | titles = paste0("${Dfastq}", titles) 325 | f1 = paste(paste0(titles, "_R1.fastq"), sep = "", collapse = ",") 326 | f2 = 
paste(paste0(titles, "_R2.fastq"), sep = "", collapse = ",") 327 | cd = paste("STAR --genomeDir ${GenomeDir} --readFilesIn", 328 | f1, f2, 329 | "--runThreadN 12 --outSAMstrandField intronMotif --outSAMtype BAM SortedByCoordinate", 330 | "--outFileNamePrefix", paste0("~/data/melanoma/", ptid, "-DP2")) 331 | return(cd) 332 | }) 333 | commands = unlist(commands) 334 | write(commands, "~/Dropbox/Iso Discovery/melanoma/star_align_DP2.sh", sep="\n") 335 | ### samtools index 336 | commands = lapply(pt_uniq, function(pt){ 337 | bam_path = paste0(pt, "-DP2Aligned.sortedByCoord.out.bam") 338 | cd = paste("samtools index", bam_path) 339 | return(cd) 340 | }) 341 | commands = unlist(commands) 342 | write(commands, "~/Dropbox/Iso Discovery/melanoma/samtools_index_DP2.sh", sep="\n") 343 | names = read.table("~/Dropbox/Iso Discovery/melanoma/DP2_names.txt", 344 | stringsAsFactors = FALSE) 345 | names = as.character(names[,1]) 346 | names_split = strsplit(names, split = "-") 347 | pt = sapply(names_split, function(x) x[1]) 348 | pt_uniq = unique(pt) 349 | pt_uniq 350 | names = read.table("~/Dropbox/Iso Discovery/Codes/melanoma/baseline_names.txt", 351 | stringsAsFactors = FALSE) 352 | names = as.character(names[,1]) 353 | names_split = strsplit(names, split = "-") 354 | pt = sapply(names_split, function(x) x[1]) 355 | pt_uniq = unique(pt) 356 | names = read.table("~/Dropbox/Iso Discovery/melanoma/baseline_names.txt", 357 | stringsAsFactors = FALSE) 358 | names = as.character(names[,1]) 359 | names_split = strsplit(names, split = "-") 360 | pt = sapply(names_split, function(x) x[1]) 361 | pt_uniq = unique(pt) 362 | pt_uniq 363 | names = read.table("~/Dropbox/Iso Discovery/Codes/melanoma/DP1_names.txt", 364 | stringsAsFactors = FALSE) 365 | names = as.character(names[,1]) 366 | names_split = strsplit(names, split = "-") 367 | pt = sapply(names_split, function(x) x[1]) 368 | pt_uniq = unique(pt) 369 | names = read.table("~/Dropbox/Iso Discovery/melanoma/DP1_names.txt", 370 | 
stringsAsFactors = FALSE) 371 | names = as.character(names[,1]) 372 | names_split = strsplit(names, split = "-") 373 | pt = sapply(names_split, function(x) x[1]) 374 | pt_uniq = unique(pt) 375 | pt_uniq 376 | library("ggplot2") 377 | 109 + 289 +199 +109 378 | 706-30 379 | 676 - 20*3 380 | 706-50 381 | 656-60 382 | c(1:6, 8:10, 15:20) 383 | c(1, 3:6, 8:10, 15:17, 20) 384 | intersect(c(1:6, 8:10, 15:20), c(1, 3:6, 8:10, 15:17, 20)) 385 | log10(10) 386 | log10(0.1) 387 | log(1) 388 | log(2) 389 | install.packages("RPEnsemble") 390 | devtools::install_github("AndreaCirilloAC/updateR") 391 | library(updateR) 392 | updateR(admin_password = "l19921020") 393 | R.version 394 | -------------------------------------------------------------------------------- /vignettes/scImpute-vignette.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | Introduction to scImpute 18 | 19 | 20 | 21 | 22 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |

Introduction to scImpute

72 |

Wei Vivian Li, Jingyi Jessica Li

73 |

2018-06-08

74 | 75 | 76 | 77 |

The emerging single-cell RNA sequencing (scRNA-seq) technologies enable the investigation of the transcriptomic landscape at single-cell resolution. However, scRNA-seq analysis is complicated by the excess of zero or near-zero counts in the data, which are the so-called dropouts due to low amounts of mRNA within each individual cell. Consequently, downstream analysis of scRNA-seq data would be severely biased if the dropout events are not properly corrected. scImpute is developed to accurately and efficiently impute the dropout values in scRNA-seq data.

78 |

scImpute can be applied to the raw count data before users perform downstream analyses such as

79 |
    80 |
  • dimension reduction of scRNA-seq data
  • 81 |
  • normalization of scRNA-seq data
  • 82 |
  • clustering of cell populations
  • 83 |
  • differential gene expression analysis
  • 84 |
  • time-series analysis of gene expression dynamics
  • 85 |
86 |
87 |

Quick start

88 |

scImpute can be easily incorporated into an existing scRNA-seq analysis pipeline. Its only input is the raw count matrix with rows representing genes and columns representing cells. It will output an imputed count matrix with the same dimensions. In the simplest case, the imputation task can be done with a single function, scimpute:

89 |
scimpute(# full path to raw count matrix
 90 |          count_path = system.file("extdata", "raw_count.csv", package = "scImpute"), 
 91 |          infile = "csv",           # format of input file
 92 |          outfile = "csv",          # format of output file
 93 |          out_dir = "./",           # full path to output directory
 94 |          labeled = FALSE,          # cell type labels not available
 95 |          drop_thre = 0.5,          # threshold set on dropout probability
 96 |          Kcluster = 2,             # 2 cell subpopulations
 97 |          ncores = 10)              # number of cores used in parallel computation
98 |

This function returns the column indices of outlier cells, and creates a new file scImpute_count.csv in out_dir to store the imputed count matrix.

99 |
100 |
101 |

Step-by-step description

102 |

The input file can be a .csv file, .txt file, or .rds file. In all cases, the first column should give the gene names and the first row should give the cell names. We use the example files in the package as illustration. If the raw counts are stored in a .csv file, and we also hope to output the imputed matrix into a .csv file, then specify this information with

103 |
# full path of the input file
104 | count_path = system.file("extdata", "raw_count.csv", package = "scImpute")
105 | infile = "csv"
106 | outfile = "csv"
107 |

Similarly, if the raw counts are stored in a .txt file, and we also hope to output the imputed matrix into a .txt file, then specify this information with

108 |
# full path of the input file
109 | count_path = system.file("extdata", "raw_count.txt", package = "scImpute")
110 | infile = "txt"
111 | outfile = "txt"
112 |

Next, we need to set up the directory to store all the temporary and final outputs:

113 |
# a '/' sign is necessary at the end of the path
114 | out_dir = "~/output/"
115 |

We highly recommend using parallel computing with scImpute, which will significantly reduce the computation time. Suppose we would like to use 20 cores, then we can run the scImpute function with ncores = 20.

116 |

scImpute has two statistical parameters. The first parameter is Kcluster, which determines the number of initial clusters to help identify candidate neighbors of each cell. The imputation results do not heavily rely on the choice of Kcluster, since scImpute uses a model-based method to select similar cells in a later stage. Kcluster can be specified based on the number of known cell types and users’ biological expertise, and it may also be learned by clustering the raw data and inspecting the clustering results. The second parameter is drop_thre. Only the values that have dropout probability larger than drop_thre are imputed by scImpute. A default threshold drop_thre = 0.5 is sufficient for most scRNA-seq data.

117 |

Now to get the imputed matrix, all we need is the main scimpute function

118 |
Kcluster = 2
119 | drop_thre = 0.5
120 | ncores = 10
121 | scimpute(count_path, infile, outfile, out_dir, labeled = FALSE,  drop_thre, Kcluster, ncores)  
122 |

If outfile = "csv", this function will create a new file scimpute_count.csv in out_dir to store the imputed count matrix; if outfile = "txt", this function will create a new file scimpute_count.txt in out_dir.

123 |

Note that the order of parameters matters in R functions, so we suggest using the format in Quick start to specify parameters and avoid mistakes. If the users would like to apply scImpute on data coming from homogeneous cells, this can be achieved by setting Kcluster = 1 and labeled = FALSE.

124 |
125 |
126 |

Apply scImpute with cell type information

127 |

Sometimes users may have the cell type (or subpopulation) information of the single cells and scimpute can take advantage of this information to impute among each cell type. To do this, we need a character vector labels specifying the cell type of each column in the raw count matrix. In other words, the length of labels equals the number of cells and the order of elements in labels should match the order of columns in the raw count matrix. Then we just need to specify labeled = TRUE in scimpute (default is FALSE) and specify the labels argument. Kcluster is not used when labeled = TRUE.

128 |
labels = readRDS(system.file("extdata", "labels.rds", package = "scImpute"))
129 | labels[1:5]
130 | > [1] "c1" "c1" "c1" "c2" "c2"
131 | 
132 | scimpute(count_path, 
133 |          infile = "csv", 
134 |          outfile = "csv", 
135 |          out_dir = out_dir,
136 |          labeled = TRUE, 
137 |          drop_thre = 0.5,
138 |          labels = labels, 
139 |          ncores = 10)
140 |
141 |
142 |

Apply scImpute to TPM values

143 |

We strongly suggest using scImpute on count matrices. However, if only TPM values are available, users can apply scImpute with gene lengths supplied. scImpute will use the gene lengths (sum of exon lengths) to scale the data, which ensures a good fitting of the mixture models. In this case, users need to specify type = "TPM" (type = "count" by default), and supply a vector genelen of gene lengths. The order of genes in genelen should match the order in the expression matrix. For example:

144 |
> genelen[1:3]
145 | ENSMUSG00000021252 ENSMUSG00000007777 ENSMUSG00000024442
146 |               4235                998               2404
147 | 
148 | scimpute(count_path, 
149 |          infile = "csv", 
150 |          outfile = "csv", 
151 |          out_dir = out_dir,
 152 |          type = "TPM",
153 |          genelen = genelen,
154 |          drop_thre = 0.5,
155 |          ncores = 10)
156 |
157 |
158 |

How to save computation time with scImpute

159 |

scImpute benefits from parallel computation, and each processor does not require heavy memory cost. scimpute completes computation in seconds when applied to a dataset with 10,000 genes and 100 cells, running with 10 cores. The memory requirement for this data set is around 2G. The running time mostly depends on

160 |
    161 |
  • number of processors (ncores)
  • 162 |
  • number of cells in the scRNA-seq data
  • 163 |
164 |

When the number of cells is extremely large, a filtering step on the cells can save the computation time of scImpute.

165 |
166 | 167 | 168 | 169 | 170 | 178 | 179 | 180 | 181 | --------------------------------------------------------------------------------