├── data ├── demo.rda ├── iCN_sim.rda ├── ref.scopeDemo.rda ├── normObj.scopeDemo.rda ├── QCmetric.scopeDemo.rda └── coverageObj.scopeDemo.rda ├── tests ├── testthat.R └── testthat │ ├── test_get_gini.R │ ├── test_initialize_ploidy.R │ ├── test_get_bam_bed.R │ ├── test_normalize_scope.R │ └── test_segment_CBScs.R ├── man ├── Y_sim.Rd ├── ref_sim.Rd ├── normObj.scopeDemo.Rd ├── ref.scopeDemo.Rd ├── QCmetric.scopeDemo.Rd ├── iCN_sim.Rd ├── coverageObj.scopeDemo.Rd ├── get_gini.Rd ├── get_samp_QC.Rd ├── get_gc.Rd ├── normalize_codex2_ns_noK.Rd ├── plot_iCN.Rd ├── initialize_ploidy.Rd ├── get_coverage_scDNA.Rd ├── get_bam_bed.Rd ├── segment_CBScs.Rd ├── get_mapp.Rd ├── initialize_ploidy_group.Rd ├── plot_EM_fit.Rd ├── perform_qc.Rd ├── normalize_scope.Rd ├── normalize_scope_foreach.Rd └── normalize_scope_group.Rd ├── R ├── get_gini.R ├── data.R ├── get_samp_QC.R ├── get_gc.R ├── get_bam_bed.R ├── get_mapp.R ├── initialize_ploidy.R ├── plot_EM_fit.R ├── normalize_codex2_ns_noK.R ├── initialize_ploidy_group.R ├── perform_qc.R ├── get_coverage_scDNA.R ├── plot_iCN.R ├── normalize_scope_foreach.R ├── segment_CBScs.R ├── normalize_scope.R └── normalize_scope_group.R ├── NEWS ├── inst └── docs │ └── split_script.py ├── NAMESPACE ├── DESCRIPTION ├── README.md └── vignettes └── SCOPE_vignette.Rmd /data/demo.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/demo.rda -------------------------------------------------------------------------------- /data/iCN_sim.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/iCN_sim.rda -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(SCOPE) 3 | 4 | test_check("SCOPE") 5 | -------------------------------------------------------------------------------- /data/ref.scopeDemo.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/ref.scopeDemo.rda -------------------------------------------------------------------------------- /data/normObj.scopeDemo.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/normObj.scopeDemo.rda -------------------------------------------------------------------------------- /data/QCmetric.scopeDemo.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/QCmetric.scopeDemo.rda -------------------------------------------------------------------------------- /data/coverageObj.scopeDemo.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/coverageObj.scopeDemo.rda -------------------------------------------------------------------------------- /tests/testthat/test_get_gini.R: -------------------------------------------------------------------------------- 1 | context("getGini") 2 | library(SCOPE) 3 | 4 | test_that("Gini Calculation works", { 5 | Gini <- get_gini(Y_sim) 6 | expect_equal(sum(Gini<=0.12), 2) 7 | }) 8 | -------------------------------------------------------------------------------- /man/Y_sim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{Y_sim} 5 | \alias{Y_sim} 6 | \title{A read count matrix in the toy dataset} 7 | \format{A read count matrix with 1544 bins and 39 cells} 8 | \usage{ 9 | Y_sim 10 | } 11 | \description{ 12 | A read count matrix in the toy dataset 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/ref_sim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{ref_sim} 5 | \alias{ref_sim} 6 | \title{A reference genome in the toy dataset} 7 | \format{A GRanges object with 1544 bins and 1 metadata column of GC content} 8 | \usage{ 9 | ref_sim 10 | } 11 | \description{ 12 | A reference genome in the toy dataset 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/normObj.scopeDemo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{normObj.scopeDemo} 5 | \alias{normObj.scopeDemo} 6 | \title{Pre-stored normObj.scope data for demonstration purposes} 7 | \format{Pre-computed by SCOPE using pre-stored data \code{Y_sim}} 8 | \usage{ 9 | normObj.scopeDemo 10 | } 11 | \description{ 12 | Pre-stored normObj.scope data for demonstration purposes 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/ref.scopeDemo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{ref.scopeDemo} 5 | \alias{ref.scopeDemo} 6 | \title{Pre-stored 500kb-size reference genome for demonstration purposes} 7 | \format{Pre-computed using whole genome sequencing data 8 | with GC content and mappability scores} 9 | \usage{ 10 | ref.scopeDemo 11 | } 12 | \description{ 13 | Pre-stored 500kb-size reference genome for demonstration purposes 14 | } 15 | \keyword{datasets} 16 | -------------------------------------------------------------------------------- /man/QCmetric.scopeDemo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{QCmetric.scopeDemo} 5 | \alias{QCmetric.scopeDemo} 6 | \title{Pre-stored QCmetric data for demonstration purposes} 7 | \format{Pre-computed using whole genome sequencing data of 8 | three single cells from 10X Genomics Single-Cell CNV solution} 9 | \usage{ 10 | QCmetric.scopeDemo 11 | } 12 | \description{ 13 | Pre-stored QCmetric data for demonstration purposes 14 | } 15 | \keyword{datasets} 16 | -------------------------------------------------------------------------------- /man/iCN_sim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{iCN_sim} 5 | \alias{iCN_sim} 6 | \title{A post cross-sample segmentation integer copy number matrix returned by 7 | SCOPE in the demo} 8 | \format{A post cross-sample segmentation integer copy number matrix of 9 | five toy cells returned by SCOPE} 10 | \usage{ 11 | iCN_sim 12 | } 13 | \description{ 14 | A post cross-sample segmentation integer copy number matrix returned by 15 | SCOPE in the demo 16 | } 17 | \keyword{datasets} 18 | -------------------------------------------------------------------------------- /tests/testthat/test_initialize_ploidy.R: -------------------------------------------------------------------------------- 1 | context("PreEst_Ploidy") 2 | library(SCOPE) 3 | 4 | test_that("Ploidy Initialization works", { 5 | Gini <- get_gini(Y_sim) 6 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 7 | gc_qc = ref_sim$gc, 8 | norm_index = which(Gini<=0.12)) 9 | 10 | Yhat.noK.sim <- normObj.sim$Yhat 11 | ploidy.sim <- initialize_ploidy(Y = Y_sim, Yhat = Yhat.noK.sim, ref = ref_sim) 12 | expect_equal(sum(Gini<=0.12), sum(ploidy.sim==2)) 13 | }) 14 | -------------------------------------------------------------------------------- /man/coverageObj.scopeDemo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{coverageObj.scopeDemo} 5 | \alias{coverageObj.scopeDemo} 6 | \title{Pre-stored coverageObj.scope data for demonstration purposes} 7 | \format{Pre-computed using whole genome sequencing data of 8 | three single cells from 10X Genomics Single-Cell CNV solution} 9 | \usage{ 10 | coverageObj.scopeDemo 11 | } 12 | \description{ 13 | Pre-stored coverageObj.scope data for demonstration purposes 14 | } 15 | \keyword{datasets} 16 | -------------------------------------------------------------------------------- /man/get_gini.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_gini.R 3 | \name{get_gini} 4 | \alias{get_gini} 5 | \title{Compute Gini coefficients for single cells} 6 | \usage{ 7 | get_gini(Y) 8 | } 9 | \arguments{ 10 | \item{Y}{raw read depth matrix after quality control procedure} 11 | } 12 | \value{ 13 | \item{Gini}{Vector of Gini coefficients for single cells 14 | from scDNA-seq} 15 | } 16 | \description{ 17 | Gini index is defined as two times the area 18 | between the Lorenz curve and the diagonal. 19 | } 20 | \examples{ 21 | Gini <- get_gini(Y_sim) 22 | 23 | } 24 | \author{ 25 | Rujin Wang \email{rujin@email.unc.edu} 26 | } 27 | -------------------------------------------------------------------------------- /tests/testthat/test_get_bam_bed.R: -------------------------------------------------------------------------------- 1 | context("getbambed") 2 | library(SCOPE) 3 | library(WGSmapp) 4 | 5 | test_that("File preparation works", { 6 | bamfolder <- system.file("extdata", package = "WGSmapp") 7 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 8 | bamdir <- file.path(bamfolder, bamFile) 9 | sampname_raw <- sapply(strsplit(bamFile, ".", fixed = TRUE), "[", 1) 10 | bambedObj <- get_bam_bed(bamdir = bamdir, sampname = sampname_raw) 11 | bamdir <- bambedObj$bamdir 12 | sampname_raw <- bambedObj$sampname 13 | ref_raw <- bambedObj$ref 14 | 15 | expect_equal(length(bamFile), length(sampname_raw)) 16 | expect_gte(length(as.character(unique(seqnames(ref_raw)))), 22) 17 | }) 18 | -------------------------------------------------------------------------------- /R/get_gini.R: -------------------------------------------------------------------------------- 1 | #' @title Compute Gini coefficients for single cells 2 | #' 3 | #' @description Gini index is defined as two times the area 4 | #' between the Lorenz curve and the diagonal. 5 | #' 6 | #' @param Y raw read depth matrix after quality control procedure 7 | #' 8 | #' @return 9 | #' \item{Gini}{Vector of Gini coefficients for single cells 10 | #' from scDNA-seq} 11 | #' 12 | #' @examples 13 | #' Gini <- get_gini(Y_sim) 14 | #' 15 | #' @author Rujin Wang \email{rujin@email.unc.edu} 16 | #' @importFrom DescTools AUC 17 | #' @export 18 | get_gini <- function(Y) { 19 | Gini <- rep(NA, ncol(Y)) 20 | for (i in seq_len(ncol(Y))) { 21 | y <- sort(Y[, i]) 22 | x <- c(0, seq_len(length(y))/length(y)) 23 | y <- c(0, cumsum(y)/sum(y)) 24 | Gini[i] <- 2 * round(0.5 - AUC(x, y), 4) 25 | } 26 | return(Gini) 27 | } 28 | 29 | -------------------------------------------------------------------------------- /tests/testthat/test_normalize_scope.R: -------------------------------------------------------------------------------- 1 | context("normalizescope") 2 | library(SCOPE) 3 | 4 | test_that("basic argument errors thrown", { 5 | Gini <- get_gini(Y_sim) 6 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 7 | gc_qc = ref_sim$gc, 8 | norm_index = which(Gini<=0.12)) 9 | 10 | Yhat.noK.sim <- normObj.sim$Yhat 11 | beta.hat.noK.sim <- normObj.sim$beta.hat 12 | ploidy.sim <- initialize_ploidy(Y = Y_sim, Yhat = Yhat.noK.sim, ref = ref_sim) 13 | expect_error(normalize_scope_foreach(Y_qc = Y_sim, gc_qc = ref_sim$gc, 14 | K = 1:3, ploidyInt = ploidy.sim, 15 | norm_index = which(Gini<=0.12), T = 1:7, 16 | beta0 = beta.hat.noK.sim), "exceed the number") 17 | }) 18 | -------------------------------------------------------------------------------- /tests/testthat/test_segment_CBScs.R: -------------------------------------------------------------------------------- 1 | context("CrossSampleSegmentation") 2 | library(SCOPE) 3 | 4 | test_that("Cross-sample CBS segmentation works", { 5 | chrs <- unique(as.character(seqnames(ref_sim))) 6 | segment_cs <- vector('list',length = length(chrs)) 7 | names(segment_cs) <- chrs 8 | Yhat.sim <- normObj.scopeDemo$Yhat[[which.max(normObj.scopeDemo$BIC)]] 9 | for (chri in chrs) { 10 | message('\n', chri, '\n') 11 | segment_cs[[chri]] <- segment_CBScs(Y = Y_sim, 12 | Yhat = Yhat.sim, 13 | sampname = colnames(Y_sim), 14 | ref = ref_sim, 15 | chr = chri, 16 | mode = "integer", max.ns = 1) 17 | } 18 | iCN <- do.call(rbind, lapply(segment_cs, function(z){z[["iCN"]]})) 19 | 20 | expect_equal(nrow(Y_sim), nrow(iCN)) 21 | expect_equal(ncol(Y_sim), ncol(iCN)) 22 | }) 23 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Changes in version 1.5.2 2 | + remove loading warnings 3 | 4 | Changes in version 1.5.1 5 | + update vignette 6 | 7 | Changes in version 0.99.13 8 | + Integrate mouse genome and update plot_iCN() 9 | 10 | Changes in version 0.99.12 11 | + add algorithms with shared clonal memberships 12 | 13 | Changes in version 0.99.11 14 | + get_gc() dontrun 15 | 16 | Changes in version 0.99.8 17 | + minor edit 18 | 19 | Changes in version 0.99.7 20 | + second-round revision 21 | 22 | Changes in version 0.99.6 23 | + first-round revision 24 | 25 | Changes in version 0.99.5 26 | + Enable user-define bin length and offer SoSplot 27 | 28 | Changes in version 0.99.4 29 | + Add .bed file for hg38 30 | 31 | Changes in version 0.99.3 32 | + Test if SCOPE is linked to WGSmapp 33 | 34 | Changes in version 0.99.2 35 | + Bump new version to trigger a build 36 | 37 | Changes in version 0.99.1 38 | + Add session_info() to the vignette 39 | 40 | Changes in version 0.99.0 41 | + SCOPE getting ready for submission to Bioc 42 | -------------------------------------------------------------------------------- /man/get_samp_QC.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_samp_QC.R 3 | \name{get_samp_QC} 4 | \alias{get_samp_QC} 5 | \title{Get QC metrics for single cells} 6 | \usage{ 7 | get_samp_QC(bambedObj) 8 | } 9 | \arguments{ 10 | \item{bambedObj}{object returned from \code{get_bam_bed}} 11 | } 12 | \value{ 13 | \item{QCmetric}{A matrix containing total number/proportion of reads, 14 | total number/proportion of mapped reads, total number/proportion 15 | of mapped non-duplicate reads, and number/proportion of reads with 16 | mapping quality greater than 20} 17 | } 18 | \description{ 19 | Perform QC step on single cells. 20 | } 21 | \examples{ 22 | library(WGSmapp) 23 | library(BSgenome.Hsapiens.UCSC.hg38) 24 | bamfolder <- system.file('extdata', package = 'WGSmapp') 25 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 26 | bamdir <- file.path(bamfolder, bamFile) 27 | sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1) 28 | bambedObj <- get_bam_bed(bamdir = bamdir, 29 | sampname = sampname_raw, 30 | hgref = "hg38") 31 | QCmetric_raw = get_samp_QC(bambedObj) 32 | 33 | } 34 | \author{ 35 | Rujin Wang \email{rujin@email.unc.edu} 36 | } 37 | -------------------------------------------------------------------------------- /man/get_gc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_gc.R 3 | \name{get_gc} 4 | \alias{get_gc} 5 | \title{Compute GC content} 6 | \usage{ 7 | get_gc(ref, hgref = "hg19") 8 | } 9 | \arguments{ 10 | \item{ref}{GRanges object returned from \code{get_bam_bed}} 11 | 12 | \item{hgref}{reference genome. This should be 'hg19', 'hg38' or 'mm10'. 13 | Default is human genome \code{hg19}.} 14 | } 15 | \value{ 16 | \item{gc}{Vector of GC content for each bin/target} 17 | } 18 | \description{ 19 | Compute GC content for each bin 20 | } 21 | \examples{ 22 | \dontrun{ 23 | library(WGSmapp) 24 | library(BSgenome.Hsapiens.UCSC.hg38) 25 | bamfolder <- system.file('extdata', package = 'WGSmapp') 26 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 27 | bamdir <- file.path(bamfolder, bamFile) 28 | sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1) 29 | bambedObj <- get_bam_bed(bamdir = bamdir, 30 | sampname = sampname_raw, 31 | hgref = "hg38") 32 | bamdir <- bambedObj$bamdir 33 | sampname_raw <- bambedObj$sampname 34 | ref_raw <- bambedObj$ref 35 | 36 | gc <- get_gc(ref_raw, hgref = "hg38") 37 | } 38 | 39 | } 40 | \author{ 41 | Rujin Wang \email{rujin@email.unc.edu} 42 | } 43 | -------------------------------------------------------------------------------- /inst/docs/split_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import fileinput 3 | import csv 4 | with open("/pine/scr/r/u/rujin/10XGenomics/breast_tissue_A_2k/output/breast_tissue_A_2k_per_cell_summary_metrics.csv") as f: 5 | reader = csv.reader(f) 6 | next(reader) # skip header 7 | data = [] 8 | for r in reader: 9 | # get the barcode list from the 1st column 10 | bclist=r[0] 11 | data.append(bclist) 12 | # total count of reads 13 | tcount=0 14 | bcount=0 15 | for line in fileinput.input(): 16 | tcount+=1 17 | line=line.strip() 18 | tags = line.split()[11:] 19 | tags_sort = sorted(tags) 20 | for tag in tags_sort: 21 | tag_split=tag.split(':') 22 | if 'CB' in tag_split: 23 | barcode=tag_split[-1] 24 | if barcode in data: 25 | bcount+=1 26 | print(str(tcount)+" "+str(bcount)) 27 | directory = "../align/"+barcode[:-2] 28 | outfile = directory+"/"+barcode+".sam" 29 | try: 30 | f=open(outfile,"a+") 31 | print >> f,line 32 | f.close() 33 | except: 34 | os.mkdir(directory) 35 | f=open(outfile,"a+") 36 | print >> f,line 37 | f.close() 38 | break 39 | pct=float(bcount)/float(tcount) 40 | print(pct) 41 | 42 | -------------------------------------------------------------------------------- /man/normalize_codex2_ns_noK.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/normalize_codex2_ns_noK.R 3 | \name{normalize_codex2_ns_noK} 4 | \alias{normalize_codex2_ns_noK} 5 | \title{Normalization of read depth without latent factors under 6 | the case-control setting} 7 | \usage{ 8 | normalize_codex2_ns_noK(Y_qc, gc_qc, norm_index) 9 | } 10 | \arguments{ 11 | \item{Y_qc}{read depth matrix after quality control} 12 | 13 | \item{gc_qc}{vector of GC content for each bin after quality control} 14 | 15 | \item{norm_index}{indices of normal/diploid cells} 16 | } 17 | \value{ 18 | A list with components 19 | \item{Yhat}{A list of normalized read depth matrix} 20 | \item{fGC.hat}{A list of estimated GC content bias matrix} 21 | \item{beta.hat}{A list of estimated bin-specific bias vector} 22 | \item{N}{A vector of cell-specific library size factor, 23 | which is computed from the genome-wide read depth data} 24 | } 25 | \description{ 26 | Assuming that all reads are from diploid regions, 27 | fit a Poisson generalized linear model to normalize the 28 | raw read depth data from single-cell DNA sequencing, without 29 | latent factors under the case-control setting. 30 | } 31 | \examples{ 32 | Gini <- get_gini(Y_sim) 33 | # first-pass CODEX2 run with no latent factors 34 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 35 | gc_qc = ref_sim$gc, 36 | norm_index = which(Gini<=0.12)) 37 | 38 | } 39 | \author{ 40 | Rujin Wang \email{rujin@email.unc.edu} 41 | } 42 | -------------------------------------------------------------------------------- /man/plot_iCN.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_iCN.R 3 | \name{plot_iCN} 4 | \alias{plot_iCN} 5 | \title{Plot post-segmentation copy number profiles of integer values} 6 | \usage{ 7 | plot_iCN(iCNmat, ref, Gini, annotation = NULL, 8 | plot.dendrogram = TRUE, show.names = FALSE, filename) 9 | } 10 | \arguments{ 11 | \item{iCNmat}{inferred integer copy-number matrix by SCOPE, 12 | with each column being a cell and each row being a genomic bin} 13 | 14 | \item{ref}{GRanges object after quality control procedure} 15 | 16 | \item{Gini}{vector of Gini coefficients for each cell, 17 | with the same order as that of cells in columns of \code{iCNmat}} 18 | 19 | \item{annotation}{vector of annotation for each cell, 20 | with the same order as that of cells in columns of \code{iCNmat}. 21 | Default is \code{NULL}.} 22 | 23 | \item{plot.dendrogram}{logical, whether to plot the dendrogram. 24 | Default is \code{TRUE}.} 25 | 26 | \item{show.names}{logical, whether to show cell names by y axis. 27 | Default is \code{FALSE}.} 28 | 29 | \item{filename}{name of the output png file} 30 | } 31 | \value{ 32 | png file with integer copy-number profiles across single cells 33 | with specified annotations 34 | } 35 | \description{ 36 | Show heatmap of inferred integer copy-number profiles 37 | by SCOPE with cells clustered by hierarchical clustering 38 | } 39 | \examples{ 40 | Gini <- get_gini(Y_sim) 41 | plot_iCN(iCNmat = iCN_sim, 42 | ref = ref_sim, 43 | Gini = Gini, 44 | filename = 'plot_iCN_demo') 45 | 46 | } 47 | \author{ 48 | Rujin Wang \email{rujin@email.unc.edu} 49 | } 50 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(get_bam_bed) 4 | export(get_coverage_scDNA) 5 | export(get_gc) 6 | export(get_gini) 7 | export(get_mapp) 8 | export(get_samp_QC) 9 | export(initialize_ploidy) 10 | export(initialize_ploidy_group) 11 | export(normalize_codex2_ns_noK) 12 | export(normalize_scope) 13 | export(normalize_scope_foreach) 14 | export(normalize_scope_group) 15 | export(perform_qc) 16 | export(plot_EM_fit) 17 | export(plot_iCN) 18 | export(segment_CBScs) 19 | import(BSgenome.Hsapiens.UCSC.hg19) 20 | import(Rsamtools) 21 | import(doParallel) 22 | import(foreach) 23 | import(grDevices) 24 | import(graphics) 25 | import(parallel) 26 | import(stats) 27 | import(utils) 28 | importFrom(BSgenome,start) 29 | importFrom(BiocGenerics,end) 30 | importFrom(BiocGenerics,start) 31 | importFrom(Biostrings,alphabetFrequency) 32 | importFrom(Biostrings,unmasked) 33 | importFrom(DNAcopy,CNA) 34 | importFrom(DNAcopy,segment) 35 | importFrom(DNAcopy,smooth.CNA) 36 | importFrom(DescTools,AUC) 37 | importFrom(GenomeInfoDb,mapSeqlevels) 38 | importFrom(GenomeInfoDb,seqinfo) 39 | importFrom(GenomeInfoDb,seqlevels) 40 | importFrom(GenomeInfoDb,seqlevelsStyle) 41 | importFrom(GenomeInfoDb,seqnames) 42 | importFrom(GenomicRanges,GRanges) 43 | importFrom(GenomicRanges,pintersect) 44 | importFrom(GenomicRanges,tileGenome) 45 | importFrom(IRanges,IRanges) 46 | importFrom(IRanges,Views) 47 | importFrom(IRanges,countOverlaps) 48 | importFrom(IRanges,end) 49 | importFrom(IRanges,findOverlaps) 50 | importFrom(IRanges,width) 51 | importFrom(RColorBrewer,brewer.pal) 52 | importFrom(S4Vectors,queryHits) 53 | importFrom(S4Vectors,subjectHits) 54 | importFrom(gplots,colorpanel) 55 | importFrom(grDevices,dev.off) 56 | importFrom(grDevices,png) 57 | -------------------------------------------------------------------------------- /man/initialize_ploidy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/initialize_ploidy.R 3 | \name{initialize_ploidy} 4 | \alias{initialize_ploidy} 5 | \title{Ploidy pre-initialization} 6 | \usage{ 7 | initialize_ploidy(Y, Yhat, ref, maxPloidy = 6, minPloidy = 1.5, 8 | minBinWidth = 5, SoS.plot = FALSE) 9 | } 10 | \arguments{ 11 | \item{Y}{raw read depth matrix after quality control procedure} 12 | 13 | \item{Yhat}{normalized read depth matrix} 14 | 15 | \item{ref}{GRanges object after quality control procedure} 16 | 17 | \item{maxPloidy}{maximum ploidy candidate. Defalut is \code{6}} 18 | 19 | \item{minPloidy}{minimum ploidy candidate. Defalut is \code{1.5}} 20 | 21 | \item{minBinWidth}{the minimum number of bins for a changed segment. 22 | Defalut is \code{5}} 23 | 24 | \item{SoS.plot}{logical, whether to generate ploidy pre-estimation 25 | plots. Default is \code{FALSE}.} 26 | } 27 | \value{ 28 | \item{ploidy.SoS}{Vector of pre-estimated ploidies for each cell} 29 | } 30 | \description{ 31 | Pre-estimate ploidies across all cells 32 | } 33 | \examples{ 34 | Gini <- get_gini(Y_sim) 35 | 36 | # first-pass CODEX2 run with no latent factors 37 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 38 | gc_qc = ref_sim$gc, 39 | norm_index = which(Gini<=0.12)) 40 | Yhat.noK.sim <- normObj.sim$Yhat 41 | beta.hat.noK.sim <- normObj.sim$beta.hat 42 | fGC.hat.noK.sim <- normObj.sim$fGC.hat 43 | N.sim <- normObj.sim$N 44 | 45 | # Ploidy initialization 46 | ploidy.sim <- initialize_ploidy(Y = Y_sim, 47 | Yhat = Yhat.noK.sim, 48 | ref = ref_sim) 49 | ploidy.sim 50 | 51 | } 52 | \author{ 53 | Rujin Wang \email{rujin@email.unc.edu} 54 | } 55 | -------------------------------------------------------------------------------- /man/get_coverage_scDNA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_coverage_scDNA.R 3 | \name{get_coverage_scDNA} 4 | \alias{get_coverage_scDNA} 5 | \title{Get read coverage from single-cell DNA sequencing} 6 | \usage{ 7 | get_coverage_scDNA(bambedObj, mapqthres, seq, hgref = "hg19") 8 | } 9 | \arguments{ 10 | \item{bambedObj}{object returned from \code{get_bam_bed}} 11 | 12 | \item{mapqthres}{mapping quality threshold of reads} 13 | 14 | \item{seq}{the sequencing method to be used. This should be either 15 | 'paired-end' or 'single-end'} 16 | 17 | \item{hgref}{reference genome. This should be 'hg19', 'hg38' or 'mm10'. 18 | Default is human genome \code{hg19}.} 19 | } 20 | \value{ 21 | \item{Y}{Read depth matrix} 22 | } 23 | \description{ 24 | Get read coverage for each genomic bin across all single 25 | cells from scDNA-seq. Blacklist regions, such as segmental duplication 26 | regions and gaps near telomeres/centromeres will be masked prior to 27 | getting coverage. 28 | } 29 | \examples{ 30 | library(WGSmapp) 31 | library(BSgenome.Hsapiens.UCSC.hg38) 32 | bamfolder <- system.file('extdata', package = 'WGSmapp') 33 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 34 | bamdir <- file.path(bamfolder, bamFile) 35 | sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1) 36 | bambedObj <- get_bam_bed(bamdir = bamdir, 37 | sampname = sampname_raw, 38 | hgref = "hg38") 39 | 40 | # Getting raw read depth 41 | coverageObj <- get_coverage_scDNA(bambedObj, 42 | mapqthres = 40, 43 | seq = 'paired-end', 44 | hgref = "hg38") 45 | Y_raw <- coverageObj$Y 46 | 47 | } 48 | \author{ 49 | Rujin Wang \email{rujin@email.unc.edu} 50 | } 51 | -------------------------------------------------------------------------------- /man/get_bam_bed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_bam_bed.R 3 | \name{get_bam_bed} 4 | \alias{get_bam_bed} 5 | \title{Get bam file directories, sample names, and whole genomic bins} 6 | \usage{ 7 | get_bam_bed(bamdir, sampname, hgref = "hg19", resolution = 500, 8 | sex = FALSE) 9 | } 10 | \arguments{ 11 | \item{bamdir}{vector of the directory of a bam file. Should be in the same 12 | order as sample names in \code{sampname}.} 13 | 14 | \item{sampname}{vector of sample names. Should be in the same order as bam 15 | directories in \code{bamdir}.} 16 | 17 | \item{hgref}{reference genome. This should be 'hg19', 'hg38' or 'mm10'. 18 | Default is human genome \code{hg19}.} 19 | 20 | \item{resolution}{numeric value of fixed bin-length. Default is \code{500}. 21 | Unit is "kb".} 22 | 23 | \item{sex}{logical, whether to include sex chromosomes. 24 | Default is \code{FALSE}.} 25 | } 26 | \value{ 27 | A list with components 28 | \item{bamdir}{A vector of bam directories} 29 | \item{sampname}{A vector of sample names} 30 | \item{ref}{A GRanges object specifying whole genomic bin positions} 31 | } 32 | \description{ 33 | Get bam file directories, sample names, and whole genomic 34 | bins from .bed file 35 | } 36 | \examples{ 37 | library(WGSmapp) 38 | library(BSgenome.Hsapiens.UCSC.hg38) 39 | bamfolder <- system.file('extdata', package = 'WGSmapp') 40 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 41 | bamdir <- file.path(bamfolder, bamFile) 42 | sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1) 43 | bambedObj <- get_bam_bed(bamdir = bamdir, sampname = sampname_raw, 44 | hgref = "hg38") 45 | bamdir <- bambedObj$bamdir 46 | sampname_raw <- bambedObj$sampname 47 | ref_raw <- bambedObj$ref 48 | 49 | } 50 | \author{ 51 | Rujin Wang \email{rujin@email.unc.edu} 52 | } 53 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' A read count matrix in the toy dataset 2 | #' 3 | #' @docType data 4 | #' 5 | #' @format A read count matrix with 1544 bins and 39 cells 6 | #' 7 | #' @keywords datasets 8 | "Y_sim" 9 | 10 | 11 | #' A reference genome in the toy dataset 12 | #' 13 | #' @docType data 14 | #' 15 | #' @format A GRanges object with 1544 bins and 1 metadata column of GC content 16 | #' 17 | #' @keywords datasets 18 | "ref_sim" 19 | 20 | 21 | #' A post cross-sample segmentation integer copy number matrix returned by 22 | #' SCOPE in the demo 23 | #' 24 | #' @docType data 25 | #' 26 | #' @format A post cross-sample segmentation integer copy number matrix of 27 | #' five toy cells returned by SCOPE 28 | #' 29 | #' @keywords datasets 30 | "iCN_sim" 31 | 32 | 33 | #' Pre-stored normObj.scope data for demonstration purposes 34 | #' 35 | #' @docType data 36 | #' 37 | #' @format Pre-computed by SCOPE using pre-stored data \code{Y_sim} 38 | #' 39 | #' @keywords datasets 40 | "normObj.scopeDemo" 41 | 42 | 43 | #' Pre-stored coverageObj.scope data for demonstration purposes 44 | #' 45 | #' @docType data 46 | #' 47 | #' @format Pre-computed using whole genome sequencing data of 48 | #' three single cells from 10X Genomics Single-Cell CNV solution 49 | #' 50 | #' @keywords datasets 51 | "coverageObj.scopeDemo" 52 | 53 | 54 | #' Pre-stored QCmetric data for demonstration purposes 55 | #' 56 | #' @docType data 57 | #' 58 | #' @format Pre-computed using whole genome sequencing data of 59 | #' three single cells from 10X Genomics Single-Cell CNV solution 60 | #' 61 | #' @keywords datasets 62 | "QCmetric.scopeDemo" 63 | 64 | 65 | 66 | 67 | #' Pre-stored 500kb-size reference genome for demonstration purposes 68 | #' 69 | #' @docType data 70 | #' 71 | #' @format Pre-computed using whole genome sequencing data 72 | #' with GC content and mappability scores 73 | #' 74 | #' @keywords datasets 75 | "ref.scopeDemo" 76 | -------------------------------------------------------------------------------- /man/segment_CBScs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/segment_CBScs.R 3 | \name{segment_CBScs} 4 | \alias{segment_CBScs} 5 | \title{Cross-sample segmentation} 6 | \usage{ 7 | segment_CBScs(Y, Yhat, sampname, ref, chr, 8 | mode = "integer", max.ns) 9 | } 10 | \arguments{ 11 | \item{Y}{raw read depth matrix after quality control procedure} 12 | 13 | \item{Yhat}{normalized read depth matrix} 14 | 15 | \item{sampname}{vector of sample names} 16 | 17 | \item{ref}{GRanges object after quality control procedure} 18 | 19 | \item{chr}{chromosome name. Make sure it is consistent with the 20 | reference genome.} 21 | 22 | \item{mode}{format of returned copy numbers. Only integer mode is 23 | supported for scDNA-seq data.} 24 | 25 | \item{max.ns}{a number specifying how many rounds of nested structure 26 | searching would be performed. Defalut is \code{0}.} 27 | } 28 | \value{ 29 | A list with components 30 | \item{poolcall}{Cross-sample CNV callings indicating 31 | shared breakpoints} 32 | \item{finalcall}{Final cross-sample segmented callset of 33 | CNVs with genotyping results} 34 | \item{image.orig}{A matrix giving logarithm of normalized 35 | z-scores} 36 | \item{image.seg}{A matrix of logarithm of estimated 37 | copy number over 2} 38 | \item{iCN}{A matrix of inferred integer copy number profiles} 39 | } 40 | \description{ 41 | SCOPE offers a cross-sample Poisson likelihood-based 42 | recursive segmentation, enabling shared breakpoints across cells 43 | from the same genetic background. 44 | } 45 | \examples{ 46 | Yhat.sim <- normObj.scopeDemo$Yhat[[which.max(normObj.scopeDemo$BIC)]] 47 | segment_cs_chr1 <- segment_CBScs(Y = Y_sim, Yhat = Yhat.sim, 48 | sampname = colnames(Y_sim), 49 | ref = ref_sim, chr = 'chr1', max.ns = 1) 50 | 51 | } 52 | \author{ 53 | Rujin Wang \email{rujin@email.unc.edu} 54 | } 55 | -------------------------------------------------------------------------------- /man/get_mapp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_mapp.R 3 | \name{get_mapp} 4 | \alias{get_mapp} 5 | \title{Compute mappability} 6 | \usage{ 7 | get_mapp(ref, hgref = "hg19") 8 | } 9 | \arguments{ 10 | \item{ref}{GRanges object returned from \code{get_bam_bed}} 11 | 12 | \item{hgref}{reference genome. This should be 'hg19', 'hg38' or 'mm10'. 13 | Default is human genome \code{hg19}.} 14 | } 15 | \value{ 16 | \item{mapp}{Vector of mappability for each bin/target} 17 | } 18 | \description{ 19 | Compute mappability for each bin. Note that scDNA 20 | sequencing is whole-genome amplification and the mappability 21 | score is essential to determine variable binning method. 22 | Mappability track for 100-mers on the GRCh37/hg19 human 23 | reference genome from ENCODE is pre-saved. Compute the mean 24 | of mappability scores that overlapped reads map to bins, 25 | weighted by the width of mappability tracks on the genome 26 | reference. Use liftOver utility to calculate mappability 27 | for hg38, which is pre-saved as well. For mm10, there are 28 | two workarounds: 1) set all mappability to 1 to avoid extensive 29 | computation; 2) adopt QC procedures based on annotation results, 30 | e.g., filter out bins within black list regions, 31 | which generally have low mappability. 32 | } 33 | \examples{ 34 | \dontrun{ 35 | library(WGSmapp) 36 | library(BSgenome.Hsapiens.UCSC.hg38) 37 | bamfolder <- system.file('extdata', package = 'WGSmapp') 38 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 39 | bamdir <- file.path(bamfolder, bamFile) 40 | sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1) 41 | bambedObj <- get_bam_bed(bamdir = bamdir, 42 | sampname = sampname_raw, 43 | hgref = "hg38") 44 | bamdir <- bambedObj$bamdir 45 | sampname_raw <- bambedObj$sampname 46 | ref_raw <- bambedObj$ref 47 | 48 | mapp <- get_mapp(ref_raw, hgref = "hg38") 49 | } 50 | 51 | } 52 | \author{ 53 | Rujin Wang \email{rujin@email.unc.edu} 54 | } 55 | -------------------------------------------------------------------------------- /man/initialize_ploidy_group.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/initialize_ploidy_group.R 3 | \name{initialize_ploidy_group} 4 | \alias{initialize_ploidy_group} 5 | \title{Group-wise ploidy pre-initialization} 6 | \usage{ 7 | initialize_ploidy_group(Y, Yhat, ref, groups, 8 | maxPloidy = 6, minPloidy = 1.5, 9 | minBinWidth = 5, SoS.plot = FALSE) 10 | } 11 | \arguments{ 12 | \item{Y}{raw read depth matrix after quality control procedure} 13 | 14 | \item{Yhat}{normalized read depth matrix} 15 | 16 | \item{ref}{GRanges object after quality control procedure} 17 | 18 | \item{groups}{clonal membership labels for each cell} 19 | 20 | \item{maxPloidy}{maximum ploidy candidate. Defalut is \code{6}} 21 | 22 | \item{minPloidy}{minimum ploidy candidate. Defalut is \code{1.5}} 23 | 24 | \item{minBinWidth}{the minimum number of bins for a changed segment. 25 | Defalut is \code{5}} 26 | 27 | \item{SoS.plot}{logical, whether to generate ploidy pre-estimation 28 | plots. Default is \code{FALSE}.} 29 | } 30 | \value{ 31 | \item{ploidy.SoS}{Vector of group-wise pre-estimated ploidies 32 | for each cell} 33 | } 34 | \description{ 35 | Pre-estimate ploidies across cells with shared clonal 36 | memberships 37 | } 38 | \examples{ 39 | Gini <- get_gini(Y_sim) 40 | 41 | # first-pass CODEX2 run with no latent factors 42 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 43 | gc_qc = ref_sim$gc, 44 | norm_index = which(Gini<=0.12)) 45 | Yhat.noK.sim <- normObj.sim$Yhat 46 | beta.hat.noK.sim <- normObj.sim$beta.hat 47 | fGC.hat.noK.sim <- normObj.sim$fGC.hat 48 | N.sim <- normObj.sim$N 49 | 50 | # Group-wise ploidy initialization 51 | clones <- c("normal", "tumor1", "normal", "tumor1", "tumor1") 52 | ploidy.sim.group <- initialize_ploidy_group(Y = Y_sim, Yhat = Yhat.noK.sim, 53 | ref = ref_sim, groups = clones) 54 | ploidy.sim.group 55 | 56 | } 57 | \author{ 58 | Rujin Wang \email{rujin@email.unc.edu} 59 | } 60 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: SCOPE 2 | Type: Package 3 | Title: A normalization and copy number estimation method for single-cell DNA sequencing 4 | Version: 1.5.2 5 | Author: Rujin Wang, Danyu Lin, Yuchao Jiang 6 | Maintainer: Rujin Wang 7 | Description: Whole genome single-cell DNA sequencing (scDNA-seq) enables characterization of copy number profiles at the cellular level. This circumvents the averaging effects associated with bulk-tissue sequencing and has increased resolution yet decreased ambiguity in deconvolving cancer subclones and elucidating cancer evolutionary history. ScDNA-seq data is, however, sparse, noisy, and highly variable even within a homogeneous cell population, due to the biases and artifacts that are introduced during the library preparation and sequencing procedure. Here, we propose SCOPE, a normalization and copy number estimation method for scDNA-seq data. The distinguishing features of SCOPE include: (i) utilization of cell-specific Gini coefficients for quality controls and for identification of normal/diploid cells, which are further used as negative control samples in a Poisson latent factor model for normalization; (ii) modeling of GC content bias using an expectation-maximization algorithm embedded in the Poisson generalized linear models, which accounts for the different copy number states along the genome; (iii) a cross-sample iterative segmentation procedure to identify breakpoints that are shared across cells from the same genetic background. 8 | Depends: R (>= 3.6.0), GenomicRanges, IRanges, Rsamtools, GenomeInfoDb, BSgenome.Hsapiens.UCSC.hg19 9 | Imports: stats, grDevices, graphics, utils, DescTools, RColorBrewer, gplots, foreach, parallel, doParallel, DNAcopy, BSgenome, Biostrings, BiocGenerics, S4Vectors 10 | Suggests: 11 | knitr, 12 | rmarkdown, 13 | WGSmapp, 14 | BSgenome.Hsapiens.UCSC.hg38, 15 | BSgenome.Mmusculus.UCSC.mm10, 16 | testthat (>= 2.1.0) 17 | VignetteBuilder: knitr 18 | biocViews: SingleCell, 19 | Normalization, 20 | CopyNumberVariation, 21 | Sequencing, WholeGenome, 22 | Coverage, 23 | Alignment, 24 | QualityControl, 25 | DataImport, 26 | DNASeq 27 | License: GPL-2 28 | LazyData: true 29 | RoxygenNote: 6.1.1 30 | Encoding: UTF-8 31 | -------------------------------------------------------------------------------- /R/get_samp_QC.R: -------------------------------------------------------------------------------- 1 | #' @title Get QC metrics for single cells 2 | #' 3 | #' @description Perform QC step on single cells. 4 | #' 5 | #' @param bambedObj object returned from \code{get_bam_bed} 6 | #' 7 | #' @return 8 | #' \item{QCmetric}{A matrix containing total number/proportion of reads, 9 | #' total number/proportion of mapped reads, total number/proportion 10 | #' of mapped non-duplicate reads, and number/proportion of reads with 11 | #' mapping quality greater than 20} 12 | #' 13 | #' @examples 14 | #' library(WGSmapp) 15 | #' library(BSgenome.Hsapiens.UCSC.hg38) 16 | #' bamfolder <- system.file('extdata', package = 'WGSmapp') 17 | #' bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 18 | #' bamdir <- file.path(bamfolder, bamFile) 19 | #' sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1) 20 | #' bambedObj <- get_bam_bed(bamdir = bamdir, 21 | #' sampname = sampname_raw, 22 | #' hgref = "hg38") 23 | #' QCmetric_raw = get_samp_QC(bambedObj) 24 | #' 25 | #' @author Rujin Wang \email{rujin@email.unc.edu} 26 | #' @import Rsamtools 27 | #' @export 28 | get_samp_QC <- function(bambedObj) { 29 | ref <- bambedObj$ref 30 | bamdir <- bambedObj$bamdir 31 | sampname <- bambedObj$sampname 32 | QCmetric <- matrix(ncol = 8, nrow = length(sampname)) 33 | rownames(QCmetric) <- sampname 34 | colnames(QCmetric) <- c("readlength", "total", "mapped", 35 | "mapped_prop", "non_dup", "non_dup_prop", "mapq20", 36 | "mapq20_prop") 37 | for (i in seq_len(length(sampname))) { 38 | cat("Getting sample QC metric for sample", i, "\n") 39 | what <- c("rname", "pos", "strand", "mapq", "qwidth", "flag") 40 | param <- ScanBamParam(what = what) 41 | aln <- scanBam(bamdir[i], param = param) 42 | aln <- aln[[1]] 43 | temp0 <- round(mean(aln$qwidth, na.rm = TRUE)) 44 | temp1 <- length(aln$mapq) 45 | temp2 <- sum(!is.na(aln$rname)) 46 | temp3 <- sum(!is.na(aln$rname) & aln$flag < 1024) 47 | temp4 <- sum(aln$flag < 1024 & aln$mapq >= 20, na.rm = TRUE) 48 | QCmetric[i, ] <- c(temp0, temp1, temp2, round(temp2/temp1, 3), 49 | temp3, round(temp3/temp1, 3), temp4, round(temp4/temp1, 50 | 3)) 51 | } 52 | return(QCmetric) 53 | } 54 | -------------------------------------------------------------------------------- /man/plot_EM_fit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_EM_fit.R 3 | \name{plot_EM_fit} 4 | \alias{plot_EM_fit} 5 | \title{Visualize EM fitting for each cell.} 6 | \usage{ 7 | plot_EM_fit(Y_qc, gc_qc, norm_index, T, ploidyInt, beta0, 8 | minCountQC = 20, filename) 9 | } 10 | \arguments{ 11 | \item{Y_qc}{read depth matrix across all cells after quality control} 12 | 13 | \item{gc_qc}{vector of GC content for each bin after quality control} 14 | 15 | \item{norm_index}{indices of normal/diploid cells} 16 | 17 | \item{T}{a vector of integers indicating number of CNV groups. 18 | Use BIC to select optimal number of CNV groups. 19 | If \code{T = 1}, assume all reads are from normal regions 20 | so that EM algorithm is not implemented. Otherwise, 21 | we assume there is always a CNV group of heterozygous deletion 22 | and a group of null region. The rest groups are representative 23 | of different duplication states.} 24 | 25 | \item{ploidyInt}{a vector of initialized ploidy return from 26 | \code{initialize_ploidy}} 27 | 28 | \item{beta0}{a vector of initialized bin-specific biases returned 29 | from CODEX2 without latent factors} 30 | 31 | \item{minCountQC}{the minimum read coverage required for EM fitting. 32 | Defalut is \code{20}} 33 | 34 | \item{filename}{the name of output pdf file} 35 | } 36 | \value{ 37 | pdf file with EM fitting results and two plots: 38 | log likelihood, and BIC versus the number of CNV groups. 39 | } 40 | \description{ 41 | A pdf file containing EM fitting results and plots is generated. 42 | } 43 | \examples{ 44 | Gini <- get_gini(Y_sim) 45 | # first-pass CODEX2 run with no latent factors 46 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 47 | gc_qc = ref_sim$gc, 48 | norm_index = which(Gini<=0.12)) 49 | Yhat.noK.sim <- normObj.sim$Yhat 50 | beta.hat.noK.sim <- normObj.sim$beta.hat 51 | fGC.hat.noK.sim <- normObj.sim$fGC.hat 52 | N.sim <- normObj.sim$N 53 | 54 | # Ploidy initialization 55 | ploidy.sim <- initialize_ploidy(Y = Y_sim, 56 | Yhat = Yhat.noK.sim, 57 | ref = ref_sim) 58 | ploidy.sim 59 | 60 | plot_EM_fit(Y_qc = Y_sim, gc_qc = ref_sim$gc, 61 | norm_index = which(Gini<=0.12), T = 1:7, 62 | ploidyInt = ploidy.sim, 63 | beta0 = beta.hat.noK.sim, 64 | filename = 'plot_EM_fit_demo.pdf') 65 | 66 | } 67 | \author{ 68 | Rujin Wang \email{rujin@email.unc.edu} 69 | } 70 | -------------------------------------------------------------------------------- /man/perform_qc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/perform_qc.R 3 | \name{perform_qc} 4 | \alias{perform_qc} 5 | \title{Quality control for cells and bins} 6 | \usage{ 7 | perform_qc(Y_raw, sampname_raw, ref_raw, QCmetric_raw, 8 | cov_thresh = 0, minCountQC = 20, 9 | mapq20_thresh = 0.3, mapp_thresh = 0.9, 10 | gc_thresh = c(20, 80), nMAD = 3) 11 | } 12 | \arguments{ 13 | \item{Y_raw}{raw read count matrix returned 14 | from \code{\link{get_coverage_scDNA}}} 15 | 16 | \item{sampname_raw}{sample names for quality control returned 17 | from \code{\link{get_bam_bed}}} 18 | 19 | \item{ref_raw}{raw GRanges object with corresponding GC content 20 | and mappability for quality control returned from 21 | \code{\link{get_bam_bed}}} 22 | 23 | \item{QCmetric_raw}{a QC metric for single cells returned from 24 | \code{\link{get_samp_QC}}} 25 | 26 | \item{cov_thresh}{scalar variable specifying the lower bound of read count 27 | summation of each cell. Default is \code{0}} 28 | 29 | \item{minCountQC}{the minimum read coverage required for 30 | normalization and EM fitting. Defalut is \code{20}} 31 | 32 | \item{mapq20_thresh}{scalar variable specifying the lower threshold 33 | of proportion of reads with mapping quality greater than 20. 34 | Default is \code{0.3}} 35 | 36 | \item{mapp_thresh}{scalar variable specifying mappability of 37 | each genomic bin. Default is \code{0.9}} 38 | 39 | \item{gc_thresh}{vector specifying the lower and upper bound of 40 | GC content threshold for quality control. Default is \code{20-80}} 41 | 42 | \item{nMAD}{scalar variable specifying the number of MAD from the median 43 | of total read counts adjusted by library size for each cell. 44 | Default is \code{3}} 45 | } 46 | \value{ 47 | A list with components 48 | \item{Y}{read depth matrix after quality control} 49 | \item{sampname}{sample names after quality control} 50 | \item{ref}{A GRanges object specifying whole genomic 51 | bin positions after quality control} 52 | \item{QCmetric}{A data frame of QC metric for single cells 53 | after quality control} 54 | } 55 | \description{ 56 | Perform QC step on single cells and bins. 57 | } 58 | \examples{ 59 | Y_raw <- coverageObj.scopeDemo$Y 60 | sampname_raw <- rownames(QCmetric.scopeDemo) 61 | ref_raw <- ref.scopeDemo 62 | QCmetric_raw <- QCmetric.scopeDemo 63 | qcObj <- perform_qc(Y_raw = Y_raw, sampname_raw = sampname_raw, 64 | ref_raw = ref_raw, QCmetric_raw = QCmetric_raw) 65 | 66 | } 67 | \author{ 68 | Rujin Wang \email{rujin@email.unc.edu} 69 | } 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SCOPE 2 | A normalization and copy number estimation method for single-cell DNA sequencing 3 | 4 | 5 | ## Authors 6 | Rujin Wang, Danyu Lin, and Yuchao Jiang 7 | 8 | 9 | ## Maintainer 10 | Rujin Wang 11 | 12 | 13 | ## Installation 14 | From Bioconductor 15 | ```r 16 | if (!requireNamespace("BiocManager", quietly = TRUE)) 17 | install.packages("BiocManager") 18 | 19 | # The following initializes usage of Bioc devel 20 | BiocManager::install(version='devel') 21 | BiocManager::install("WGSmapp") 22 | BiocManager::install("SCOPE") 23 | ``` 24 | From GitHub 25 | ``` 26 | install.packages('devtools') 27 | devtools::install_github("rujinwang/WGSmapp") 28 | devtools::install_github("rujinwang/SCOPE") 29 | ``` 30 | 31 | ## Description 32 | Whole genome single-cell DNA sequencing (scDNA-seq) enables characterization of copy number profiles at the cellular level. This circumvents the averaging effects associated with bulk-tissue sequencing and has increased resolution yet decreased ambiguity in deconvolving cancer subclones and elucidating cancer evolutionary history. ScDNA-seq data is, however, sparse, noisy, and highly variable even within a homogeneous cell population, due to the biases and artifacts that are introduced during the library preparation and sequencing procedure. Here, we propose SCOPE, a normalization and copy number estimation method for scDNA-seq data. The distinguishing features of SCOPE include: (i) utilization of cell-specific Gini coefficients for quality controls and for identification of normal/diploid cells, which are further used as negative control samples in a Poisson latent factor model for normalization; (ii) modeling of GC content bias using an expectation-maximization algorithm embedded in the Poisson generalized linear models, which accounts for the different copy number states along the genome; (iii) a cross-sample iterative segmentation procedure to identify breakpoints that are shared across cells from the same genetic background. We evaluate performance of SCOPE on real scDNA-seq data sets from cancer genomic studies. Compared to existing methods, SCOPE more accurately estimates subclonal copy number aberrations and is shown to have higher correlation with array-based copy number profiles of purified bulk samples from the same patient. We further demonstrate SCOPE on three recently released data sets using the 10X Genomics single-cell CNV pipeline and show that it can reliably recover 1% of the cancer cells from a background of normal. 33 | 34 | 35 | ## Manuscript 36 | Rujin Wang, Danyu Lin, and Yuchao Jiang. SCOPE: A Normalization and Copy Number Estimation Method for Single-Cell DNA Sequencing. ***Cell Systems***, 2020. ([link](https://www.sciencedirect.com/science/article/abs/pii/S2405471220301113?via%3Dihub)) 37 | 38 | ## Vignettes 39 | [HTML](http://bioconductor.org/packages/devel/bioc/vignettes/SCOPE/inst/doc/SCOPE_vignette.html) 40 | -------------------------------------------------------------------------------- /R/get_gc.R: -------------------------------------------------------------------------------- 1 | if (getRversion() >= "2.15.1") { 2 | utils::globalVariables(c("BSgenome.Hsapiens.UCSC.hg19", 3 | "mapp_hg19", "mapp_hg38")) 4 | } 5 | #' @title Compute GC content 6 | #' @name get_gc 7 | #' 8 | #' @description Compute GC content for each bin 9 | #' 10 | #' @param ref GRanges object returned from \code{get_bam_bed} 11 | #' @param hgref reference genome. This should be 'hg19', 'hg38' or 'mm10'. 12 | #' Default is human genome \code{hg19}. 13 | #' 14 | #' @return 15 | #' \item{gc}{Vector of GC content for each bin/target} 16 | #' 17 | #' @examples 18 | #' \dontrun{ 19 | #' library(WGSmapp) 20 | #' library(BSgenome.Hsapiens.UCSC.hg38) 21 | #' bamfolder <- system.file('extdata', package = 'WGSmapp') 22 | #' bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 23 | #' bamdir <- file.path(bamfolder, bamFile) 24 | #' sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1) 25 | #' bambedObj <- get_bam_bed(bamdir = bamdir, 26 | #' sampname = sampname_raw, 27 | #' hgref = "hg38") 28 | #' bamdir <- bambedObj$bamdir 29 | #' sampname_raw <- bambedObj$sampname 30 | #' ref_raw <- bambedObj$ref 31 | #' 32 | #' gc <- get_gc(ref_raw, hgref = "hg38") 33 | #' } 34 | #' 35 | #' @author Rujin Wang \email{rujin@email.unc.edu} 36 | #' @import BSgenome.Hsapiens.UCSC.hg19 37 | #' @importFrom IRanges IRanges Views 38 | #' @importFrom GenomeInfoDb mapSeqlevels seqnames 39 | #' @importFrom BiocGenerics start end 40 | #' @importFrom Biostrings unmasked alphabetFrequency 41 | #' @export 42 | get_gc <- function (ref, hgref = "hg19"){ 43 | if(!hgref %in% c("hg19", "hg38", "mm10")){ 44 | stop("Reference genome should be hg19, hg38, or mm10.") 45 | } 46 | if(hgref == "hg19") { 47 | genome <- BSgenome.Hsapiens.UCSC.hg19 48 | }else if(hgref == "hg38") { 49 | genome <- BSgenome.Hsapiens.UCSC.hg38 50 | }else if(hgref == "mm10") { 51 | genome <- BSgenome.Mmusculus.UCSC.mm10 52 | } 53 | gc <- rep(NA, length(ref)) 54 | for (chr in unique(seqnames(ref))) { 55 | message("Getting GC content for chr ", chr, sep = "") 56 | chr.index <- which(as.matrix(seqnames(ref)) == chr) 57 | ref.chr <- IRanges(start = start(ref)[chr.index], 58 | end = end(ref)[chr.index]) 59 | if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") { 60 | chrtemp <- "chrX" 61 | } 62 | else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") { 63 | chrtemp <- "chrY" 64 | } 65 | else { 66 | chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1]) 67 | } 68 | if (length(chrtemp) == 0) 69 | message("Chromosome cannot be found in NCBI database. ") 70 | chrm <- unmasked(genome[[chrtemp]]) 71 | seqs <- Views(chrm, ref.chr) 72 | af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE) 73 | gc[chr.index] <- round((af[, "G"] + af[, "C"]) * 100, 2) 74 | } 75 | gc 76 | } 77 | -------------------------------------------------------------------------------- /man/normalize_scope.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/normalize_scope.R 3 | \name{normalize_scope} 4 | \alias{normalize_scope} 5 | \title{Normalization of read depth with latent factors using 6 | Expectation-Maximization algorithm under the case-control setting} 7 | \usage{ 8 | normalize_scope(Y_qc, gc_qc, K, norm_index, T, ploidyInt, 9 | beta0, minCountQC = 20) 10 | } 11 | \arguments{ 12 | \item{Y_qc}{read depth matrix after quality control} 13 | 14 | \item{gc_qc}{vector of GC content for each bin after quality control} 15 | 16 | \item{K}{Number of latent Poisson factors} 17 | 18 | \item{norm_index}{indices of normal/diploid cells} 19 | 20 | \item{T}{a vector of integers indicating number of CNV groups. 21 | Use BIC to select optimal number of CNV groups. If \code{T = 1}, 22 | assume all reads are from normal regions so that EM algorithm is 23 | not implemented. Otherwise, we assume there is always a CNV group 24 | of heterozygous deletion and a group of null region. The rest 25 | groups are representative of different duplication states.} 26 | 27 | \item{ploidyInt}{a vector of initialized ploidy return 28 | from \code{initialize_ploidy}. Users are also allowed to provide 29 | prior-knowledge ploidies as the input and to manually tune a few 30 | cells that have poor fitting} 31 | 32 | \item{beta0}{a vector of initialized bin-specific biases 33 | returned from CODEX2 without latent factors} 34 | 35 | \item{minCountQC}{the minimum read coverage required for 36 | normalization and EM fitting. Defalut is \code{20}} 37 | } 38 | \value{ 39 | A list with components 40 | \item{Yhat}{A list of normalized read depth matrix with EM} 41 | \item{alpha.hat}{A list of absolute copy number matrix} 42 | \item{fGC.hat}{A list of EM estimated GC content bias matrix} 43 | \item{beta.hat}{A list of EM estimated bin-specific bias vector} 44 | \item{g.hat}{A list of estimated Poisson latent factor} 45 | \item{h.hat}{A list of estimated Poisson latent factor} 46 | \item{AIC}{AIC for model selection} 47 | \item{BIC}{BIC for model selection} 48 | \item{RSS}{RSS for model selection} 49 | \item{K}{Number of latent Poisson factors} 50 | } 51 | \description{ 52 | Fit a Poisson generalized linear model to normalize 53 | the raw read depth data from single-cell DNA sequencing, with 54 | latent factors under the case-control setting. Model GC content 55 | bias using an expectation-maximization algorithm, which accounts for 56 | the different copy number states. 57 | } 58 | \examples{ 59 | Gini <- get_gini(Y_sim) 60 | 61 | # first-pass CODEX2 run with no latent factors 62 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 63 | gc_qc = ref_sim$gc, 64 | norm_index = which(Gini<=0.12)) 65 | Yhat.noK.sim <- normObj.sim$Yhat 66 | beta.hat.noK.sim <- normObj.sim$beta.hat 67 | fGC.hat.noK.sim <- normObj.sim$fGC.hat 68 | N.sim <- normObj.sim$N 69 | 70 | # Ploidy initialization 71 | ploidy.sim <- initialize_ploidy(Y = Y_sim, 72 | Yhat = Yhat.noK.sim, 73 | ref = ref_sim) 74 | ploidy.sim 75 | 76 | normObj.scope.sim <- normalize_scope(Y_qc = Y_sim, gc_qc = ref_sim$gc, 77 | K = 1, ploidyInt = ploidy.sim, 78 | norm_index = which(Gini<=0.12), T = 1:5, 79 | beta0 = beta.hat.noK.sim) 80 | Yhat.sim <- normObj.scope.sim$Yhat[[which.max(normObj.scope.sim$BIC)]] 81 | fGC.hat.sim <- normObj.scope.sim$fGC.hat[[which.max(normObj.scope.sim$BIC)]] 82 | 83 | } 84 | \author{ 85 | Rujin Wang \email{rujin@email.unc.edu} 86 | } 87 | -------------------------------------------------------------------------------- /R/get_bam_bed.R: -------------------------------------------------------------------------------- 1 | if (getRversion() >= "2.15.1") { 2 | utils::globalVariables(c("BSgenome.Hsapiens.UCSC.hg19", 3 | "BSgenome.Hsapiens.UCSC.hg38", 4 | "BSgenome.Mmusculus.UCSC.mm10", 5 | "seqlevels<-", "seqlevels")) 6 | } 7 | #' @title Get bam file directories, sample names, and whole genomic bins 8 | #' 9 | #' @description Get bam file directories, sample names, and whole genomic 10 | #' bins from .bed file 11 | #' 12 | #' @usage 13 | #' get_bam_bed(bamdir, sampname, hgref = "hg19", resolution = 500, 14 | #' sex = FALSE) 15 | #' 16 | #' @param bamdir vector of the directory of a bam file. Should be in the same 17 | #' order as sample names in \code{sampname}. 18 | #' @param sampname vector of sample names. Should be in the same order as bam 19 | #' directories in \code{bamdir}. 20 | #' @param hgref reference genome. This should be 'hg19', 'hg38' or 'mm10'. 21 | #' Default is human genome \code{hg19}. 22 | #' @param resolution numeric value of fixed bin-length. Default is \code{500}. 23 | #' Unit is "kb". 24 | #' @param sex logical, whether to include sex chromosomes. 25 | #' Default is \code{FALSE}. 26 | #' 27 | #' @return A list with components 28 | #' \item{bamdir}{A vector of bam directories} 29 | #' \item{sampname}{A vector of sample names} 30 | #' \item{ref}{A GRanges object specifying whole genomic bin positions} 31 | #' 32 | #' @examples 33 | #' library(WGSmapp) 34 | #' library(BSgenome.Hsapiens.UCSC.hg38) 35 | #' bamfolder <- system.file('extdata', package = 'WGSmapp') 36 | #' bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 37 | #' bamdir <- file.path(bamfolder, bamFile) 38 | #' sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1) 39 | #' bambedObj <- get_bam_bed(bamdir = bamdir, sampname = sampname_raw, 40 | #' hgref = "hg38") 41 | #' bamdir <- bambedObj$bamdir 42 | #' sampname_raw <- bambedObj$sampname 43 | #' ref_raw <- bambedObj$ref 44 | #' 45 | #' @author Rujin Wang \email{rujin@email.unc.edu} 46 | #' @import utils 47 | #' @import BSgenome.Hsapiens.UCSC.hg19 48 | #' @importFrom GenomicRanges GRanges tileGenome 49 | #' @importFrom IRanges IRanges 50 | #' @importFrom GenomeInfoDb seqnames seqinfo seqlevels 51 | #' @export 52 | get_bam_bed <- function(bamdir, sampname, hgref = "hg19", resolution = 500, 53 | sex = FALSE){ 54 | if(!hgref %in% c("hg19", "hg38", "mm10")){ 55 | stop("Reference genome should be hg19, hg38 or mm10. ") 56 | } 57 | if(hgref == "hg19") { 58 | genome <- BSgenome.Hsapiens.UCSC.hg19 59 | }else if(hgref == "hg38") { 60 | genome <- BSgenome.Hsapiens.UCSC.hg38 61 | }else if(hgref == "mm10") { 62 | genome <- BSgenome.Mmusculus.UCSC.mm10 63 | } 64 | if(resolution <= 0){ 65 | stop("Invalid fixed bin length. ") 66 | } 67 | bins <- tileGenome(seqinfo(genome), 68 | tilewidth = resolution * 1000, 69 | cut.last.tile.in.chrom = TRUE) 70 | if(hgref != "mm10"){ 71 | autochr = 22 72 | }else{ 73 | autochr = 19 74 | } 75 | if(sex) { 76 | ref <- bins[which(as.character(seqnames(bins)) %in% paste0("chr", 77 | c(seq_len(autochr), "X", "Y")))] 78 | } else { 79 | ref <- bins[which(as.character(seqnames(bins)) %in% paste0("chr", 80 | seq_len(autochr)))] 81 | } 82 | if (!any(grepl("chr", seqlevels(ref)))) { 83 | seqlevels(ref) <- paste(c(seq_len(autochr), "X", "Y"), sep = "") 84 | ref <- sort(ref) 85 | } else { 86 | seqlevels(ref) <- paste("chr", c(seq_len(autochr), "X", "Y"), sep = "") 87 | ref <- sort(ref) 88 | } 89 | list(bamdir = bamdir, sampname = sampname, ref = ref) 90 | } 91 | 92 | -------------------------------------------------------------------------------- /R/get_mapp.R: -------------------------------------------------------------------------------- 1 | if (getRversion() >= "2.15.1") { 2 | utils::globalVariables(c("mapp_hg19", 3 | "mapp_hg38", "seqlevelsStyle<-")) 4 | } 5 | #' @title Compute mappability 6 | #' @name get_mapp 7 | #' 8 | #' @description Compute mappability for each bin. Note that scDNA 9 | #' sequencing is whole-genome amplification and the mappability 10 | #' score is essential to determine variable binning method. 11 | #' Mappability track for 100-mers on the GRCh37/hg19 human 12 | #' reference genome from ENCODE is pre-saved. Compute the mean 13 | #' of mappability scores that overlapped reads map to bins, 14 | #' weighted by the width of mappability tracks on the genome 15 | #' reference. Use liftOver utility to calculate mappability 16 | #' for hg38, which is pre-saved as well. For mm10, there are 17 | #' two workarounds: 1) set all mappability to 1 to avoid extensive 18 | #' computation; 2) adopt QC procedures based on annotation results, 19 | #' e.g., filter out bins within black list regions, 20 | #' which generally have low mappability. 21 | #' 22 | #' @param ref GRanges object returned from \code{get_bam_bed} 23 | #' @param hgref reference genome. This should be 'hg19', 'hg38' or 'mm10'. 24 | #' Default is human genome \code{hg19}. 25 | #' 26 | #' @return 27 | #' \item{mapp}{Vector of mappability for each bin/target} 28 | #' 29 | #' @examples 30 | #' \dontrun{ 31 | #' library(WGSmapp) 32 | #' library(BSgenome.Hsapiens.UCSC.hg38) 33 | #' bamfolder <- system.file('extdata', package = 'WGSmapp') 34 | #' bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 35 | #' bamdir <- file.path(bamfolder, bamFile) 36 | #' sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1) 37 | #' bambedObj <- get_bam_bed(bamdir = bamdir, 38 | #' sampname = sampname_raw, 39 | #' hgref = "hg38") 40 | #' bamdir <- bambedObj$bamdir 41 | #' sampname_raw <- bambedObj$sampname 42 | #' ref_raw <- bambedObj$ref 43 | #' 44 | #' mapp <- get_mapp(ref_raw, hgref = "hg38") 45 | #' } 46 | #' 47 | #' @author Rujin Wang \email{rujin@email.unc.edu} 48 | #' @import utils 49 | #' @importFrom GenomicRanges GRanges pintersect 50 | #' @importFrom IRanges IRanges Views countOverlaps findOverlaps width 51 | #' @importFrom GenomeInfoDb mapSeqlevels seqlevelsStyle seqnames 52 | #' @importFrom S4Vectors queryHits subjectHits 53 | #' @export 54 | get_mapp <- function(ref, hgref = "hg19") { 55 | if(!hgref %in% c("hg19", "hg38", "mm10")){ 56 | stop("Reference genome should be hg19, hg38, or mm10.") 57 | } 58 | if(hgref == "hg19") { 59 | mapp_gref <- mapp_hg19 60 | }else if(hgref == "hg38") { 61 | mapp_gref <- mapp_hg38 62 | } 63 | mapp <- rep(1, length(ref)) 64 | if(hgref != "mm10"){ 65 | seqlevelsStyle(ref) <- "UCSC" 66 | for (chr in as.character(unique(seqnames(ref)))) { 67 | message("Getting mappability for ", chr, sep = "") 68 | chr.index <- which(as.matrix(seqnames(ref)) == chr) 69 | ref.chr <- ref[which(as.character(seqnames(ref)) == chr)] 70 | mapp.chr <- rep(1, length(ref.chr)) 71 | overlap <- as.matrix(findOverlaps(ref.chr, mapp_gref)) 72 | for (i in unique(overlap[, 1])) { 73 | index.temp <- overlap[which(overlap[, 1] == i), 2] 74 | overlap.sub <- findOverlaps(ref.chr[i], mapp_gref[index.temp]) 75 | overlap.intersect <- pintersect(ref.chr[i][queryHits( 76 | overlap.sub)], mapp_gref[index.temp][ 77 | subjectHits(overlap.sub)]) 78 | mapp.chr[i] <- sum((mapp_gref$score[index.temp]) * 79 | (width(overlap.intersect)))/sum( 80 | width(overlap.intersect)) 81 | } 82 | mapp[chr.index] <- mapp.chr 83 | } 84 | } 85 | mapp 86 | } 87 | 88 | -------------------------------------------------------------------------------- /man/normalize_scope_foreach.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/normalize_scope_foreach.R 3 | \name{normalize_scope_foreach} 4 | \alias{normalize_scope_foreach} 5 | \title{Normalization of read depth with latent factors using 6 | Expectation-Maximization algorithm under the case-control 7 | setting in parallel} 8 | \usage{ 9 | normalize_scope_foreach(Y_qc, gc_qc, K, norm_index, T, 10 | ploidyInt, beta0, minCountQC = 20, nCores = NULL) 11 | } 12 | \arguments{ 13 | \item{Y_qc}{read depth matrix after quality control} 14 | 15 | \item{gc_qc}{vector of GC content for each bin after quality control} 16 | 17 | \item{K}{Number of latent Poisson factors} 18 | 19 | \item{norm_index}{indices of normal/diploid cells} 20 | 21 | \item{T}{a vector of integers indicating number of CNV groups. 22 | Use BIC to select optimal number of CNV groups. If \code{T = 1}, 23 | assume all reads are from normal regions so that EM algorithm is 24 | not implemented. Otherwise, we assume there is always a CNV group 25 | of heterozygous deletion and a group of null region. The rest groups 26 | are representative of different duplication states.} 27 | 28 | \item{ploidyInt}{a vector of initialized ploidy return 29 | from \code{initialize_ploidy}. Users are also allowed to provide 30 | prior-knowledge ploidies as the input and to manually tune a few 31 | cells that have poor fitting} 32 | 33 | \item{beta0}{a vector of initialized bin-specific biases returned 34 | from CODEX2 without latent factors} 35 | 36 | \item{minCountQC}{the minimum read coverage required for normalization 37 | and EM fitting. Defalut is \code{20}} 38 | 39 | \item{nCores}{number of cores to use. If \code{NULL}, number of cores 40 | is detected. Default is \code{NULL}.} 41 | } 42 | \value{ 43 | A list with components 44 | \item{Yhat}{A list of normalized read depth matrix with EM} 45 | \item{alpha.hat}{A list of absolute copy number matrix} 46 | \item{fGC.hat}{A list of EM estimated GC content bias matrix} 47 | \item{beta.hat}{A list of EM estimated bin-specific bias vector} 48 | \item{g.hat}{A list of estimated Poisson latent factor} 49 | \item{h.hat}{A list of estimated Poisson latent factor} 50 | \item{AIC}{AIC for model selection} 51 | \item{BIC}{BIC for model selection} 52 | \item{RSS}{RSS for model selection} 53 | \item{K}{Number of latent Poisson factors} 54 | } 55 | \description{ 56 | Fit a Poisson generalized linear model to normalize 57 | the raw read depth data from single-cell DNA sequencing, 58 | with latent factors under the case-control setting. Model GC 59 | content bias using an expectation-maximization algorithm, 60 | which accounts for the different copy number states. 61 | } 62 | \examples{ 63 | Gini <- get_gini(Y_sim) 64 | 65 | # first-pass CODEX2 run with no latent factors 66 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 67 | gc_qc = ref_sim$gc, 68 | norm_index = which(Gini<=0.12)) 69 | Yhat.noK.sim <- normObj.sim$Yhat 70 | beta.hat.noK.sim <- normObj.sim$beta.hat 71 | fGC.hat.noK.sim <- normObj.sim$fGC.hat 72 | N.sim <- normObj.sim$N 73 | 74 | # Ploidy initialization 75 | ploidy.sim <- initialize_ploidy(Y = Y_sim, 76 | Yhat = Yhat.noK.sim, 77 | ref = ref_sim) 78 | ploidy.sim 79 | 80 | # Specify nCores = 2 only for checking examples 81 | normObj.scope.sim <- normalize_scope_foreach(Y_qc = Y_sim, 82 | gc_qc = ref_sim$gc, 83 | K = 1, ploidyInt = ploidy.sim, 84 | norm_index = which(Gini<=0.12), T = 1:5, 85 | beta0 = beta.hat.noK.sim, nCores = 2) 86 | Yhat.sim <- normObj.scope.sim$Yhat[[which.max(normObj.scope.sim$BIC)]] 87 | fGC.hat.sim <- normObj.scope.sim$fGC.hat[[which.max(normObj.scope.sim$BIC)]] 88 | 89 | } 90 | \author{ 91 | Rujin Wang \email{rujin@email.unc.edu} 92 | } 93 | -------------------------------------------------------------------------------- /man/normalize_scope_group.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/normalize_scope_group.R 3 | \name{normalize_scope_group} 4 | \alias{normalize_scope_group} 5 | \title{Group-wise normalization of read depth with latent factors using 6 | Expectation-Maximization algorithm and shared clonal memberships} 7 | \usage{ 8 | normalize_scope_group(Y_qc, gc_qc, K, norm_index, groups, T, 9 | ploidyInt, beta0, minCountQC = 20) 10 | } 11 | \arguments{ 12 | \item{Y_qc}{read depth matrix after quality control} 13 | 14 | \item{gc_qc}{vector of GC content for each bin after quality control} 15 | 16 | \item{K}{Number of latent Poisson factors} 17 | 18 | \item{norm_index}{indices of normal/diploid cells using group/clone 19 | labels} 20 | 21 | \item{groups}{clonal membership labels for each cell} 22 | 23 | \item{T}{a vector of integers indicating number of CNV groups. 24 | Use BIC to select optimal number of CNV groups. If \code{T = 1}, 25 | assume all reads are from normal regions so that EM algorithm is 26 | not implemented. Otherwise, we assume there is always a CNV group 27 | of heterozygous deletion and a group of null region. The rest 28 | groups are representative of different duplication states.} 29 | 30 | \item{ploidyInt}{a vector of group-wise initialized ploidy return 31 | from \code{initialize_ploidy_group}. Users are also allowed to 32 | provide prior-knowledge ploidies as the input and to manually 33 | tune a few cells/clones that have poor fitting} 34 | 35 | \item{beta0}{a vector of initialized bin-specific biases 36 | returned from CODEX2 without latent factors} 37 | 38 | \item{minCountQC}{the minimum read coverage required for 39 | normalization and EM fitting. Defalut is \code{20}} 40 | } 41 | \value{ 42 | A list with components 43 | \item{Yhat}{A list of normalized read depth matrix with EM} 44 | \item{alpha.hat}{A list of absolute copy number matrix} 45 | \item{fGC.hat}{A list of EM estimated GC content bias matrix} 46 | \item{beta.hat}{A list of EM estimated bin-specific bias vector} 47 | \item{g.hat}{A list of estimated Poisson latent factor} 48 | \item{h.hat}{A list of estimated Poisson latent factor} 49 | \item{AIC}{AIC for model selection} 50 | \item{BIC}{BIC for model selection} 51 | \item{RSS}{RSS for model selection} 52 | \item{K}{Number of latent Poisson factors} 53 | } 54 | \description{ 55 | Fit a Poisson generalized linear model to normalize 56 | the raw read depth data from single-cell DNA sequencing, with 57 | latent factors and shared clonal memberships. Model GC content 58 | bias using an expectation-maximization algorithm, which accounts for 59 | clonal specific copy number states. 60 | } 61 | \examples{ 62 | Gini <- get_gini(Y_sim) 63 | 64 | # first-pass CODEX2 run with no latent factors 65 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 66 | gc_qc = ref_sim$gc, 67 | norm_index = which(Gini<=0.12)) 68 | Yhat.noK.sim <- normObj.sim$Yhat 69 | beta.hat.noK.sim <- normObj.sim$beta.hat 70 | fGC.hat.noK.sim <- normObj.sim$fGC.hat 71 | N.sim <- normObj.sim$N 72 | 73 | # Group-wise ploidy initialization 74 | clones <- c("normal", "tumor1", "normal", "tumor1", "tumor1") 75 | ploidy.sim.group <- initialize_ploidy_group(Y = Y_sim, Yhat = Yhat.noK.sim, 76 | ref = ref_sim, groups = clones) 77 | ploidy.sim.group 78 | 79 | normObj.scope.sim.group <- normalize_scope_group(Y_qc = Y_sim, 80 | gc_qc = ref_sim$gc, 81 | K = 1, ploidyInt = ploidy.sim.group, 82 | norm_index = which(clones=="normal"), 83 | groups = clones, 84 | T = 1:5, 85 | beta0 = beta.hat.noK.sim) 86 | Yhat.sim.group <- normObj.scope.sim.group$Yhat[[which.max( 87 | normObj.scope.sim.group$BIC)]] 88 | fGC.hat.sim.group <- normObj.scope.sim.group$fGC.hat[[which.max( 89 | normObj.scope.sim.group$BIC)]] 90 | 91 | } 92 | \author{ 93 | Rujin Wang \email{rujin@email.unc.edu} 94 | } 95 | -------------------------------------------------------------------------------- /R/initialize_ploidy.R: -------------------------------------------------------------------------------- 1 | #' @title Ploidy pre-initialization 2 | #' 3 | #' @description Pre-estimate ploidies across all cells 4 | #' 5 | #' @usage 6 | #' initialize_ploidy(Y, Yhat, ref, maxPloidy = 6, minPloidy = 1.5, 7 | #' minBinWidth = 5, SoS.plot = FALSE) 8 | #' @param Y raw read depth matrix after quality control procedure 9 | #' @param Yhat normalized read depth matrix 10 | #' @param ref GRanges object after quality control procedure 11 | #' @param maxPloidy maximum ploidy candidate. Defalut is \code{6} 12 | #' @param minPloidy minimum ploidy candidate. Defalut is \code{1.5} 13 | #' @param minBinWidth the minimum number of bins for a changed segment. 14 | #' Defalut is \code{5} 15 | #' @param SoS.plot logical, whether to generate ploidy pre-estimation 16 | #' plots. Default is \code{FALSE}. 17 | #' 18 | #' @return 19 | #' \item{ploidy.SoS}{Vector of pre-estimated ploidies for each cell} 20 | #' 21 | #' @examples 22 | #' Gini <- get_gini(Y_sim) 23 | #' 24 | #' # first-pass CODEX2 run with no latent factors 25 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 26 | #' gc_qc = ref_sim$gc, 27 | #' norm_index = which(Gini<=0.12)) 28 | #' Yhat.noK.sim <- normObj.sim$Yhat 29 | #' beta.hat.noK.sim <- normObj.sim$beta.hat 30 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat 31 | #' N.sim <- normObj.sim$N 32 | #' 33 | #' # Ploidy initialization 34 | #' ploidy.sim <- initialize_ploidy(Y = Y_sim, 35 | #' Yhat = Yhat.noK.sim, 36 | #' ref = ref_sim) 37 | #' ploidy.sim 38 | #' 39 | #' @author Rujin Wang \email{rujin@email.unc.edu} 40 | #' @importFrom DNAcopy CNA smooth.CNA segment 41 | #' @importFrom GenomeInfoDb seqnames 42 | #' @importFrom BSgenome start 43 | #' @importFrom IRanges end 44 | #' @export 45 | initialize_ploidy <- function(Y, Yhat, ref, maxPloidy = 6, 46 | minPloidy = 1.5, minBinWidth = 5, SoS.plot = FALSE) { 47 | ploidy.SoS <- rep(NA, ncol(Y)) 48 | 49 | breaks <- matrix(0, nrow(Y), ncol(Y)) 50 | RCNP <- matrix(0, nrow(Y), ncol(Y)) 51 | final <- matrix(0, nrow(Y), ncol(Y)) 52 | X <- seq(minPloidy, maxPloidy, by = 0.05) 53 | n_ploidy <- length(X) 54 | SoS <- matrix(0, n_ploidy, ncol(Y)) 55 | 56 | normal <- (Y + 1)/(Yhat + 1) 57 | 58 | for (k in seq_len(ncol(Y))) { 59 | if (k%%5 == 1) { 60 | cat("Initializing ploidy for cell ", k, "\t") 61 | } 62 | 63 | lr <- log(normal[, k]) 64 | loc <- data.frame(seq = as.character(seqnames(ref)), 65 | start = start(ref), end = end(ref)) 66 | CNA.object <- CNA(genomdat = lr, chrom = loc[, 1], 67 | maploc = as.numeric(loc[, 2]), data.type = "logratio") 68 | CNA.smoothed <- smooth.CNA(CNA.object) 69 | segs <- segment(CNA.smoothed, verbose = 0, 70 | min.width = minBinWidth) 71 | frag <- segs$output[, 2:3] 72 | len <- dim(frag)[1] 73 | bps <- array(0, len) 74 | for (j in seq_len(len)) { 75 | bps[j] <- which((loc[, 1] == frag[j, 1]) & 76 | (as.numeric(loc[, 2]) == frag[j, 2])) 77 | } 78 | bps <- sort(bps) 79 | bps[(len = len + 1)] <- nrow(Y) 80 | breaks[bps, k] <- 1 81 | RCNP[, k][seq_len(bps[2])] <- median(normal[, 82 | k][seq_len(bps[2])]) 83 | for (i in 2:(len - 1)) { 84 | RCNP[, k][bps[i]:(bps[i + 1] - 1)] <- median(normal[, 85 | k][bps[i]:(bps[i + 1] - 1)]) 86 | } 87 | RCNP[, k] <- RCNP[, k]/mean(RCNP[, k]) 88 | 89 | SCNP <- RCNP[, k] %o% X 90 | FSCP <- round(SCNP) 91 | Diff2 <- (SCNP - FSCP)^2 92 | SoS[, k] <- colSums(Diff2, na.rm = FALSE, dims = 1) 93 | ploidy.SoS[k] <- X[which.min(SoS[, k])] 94 | 95 | 96 | if(SoS.plot){ 97 | par(mfrow = c(1,2)) 98 | par(mar = c(5,4,4,2)) 99 | hist(Y[,k], 100, main = 'Read depth distribution', 100 | xlab = 'Coverage per bin') 101 | plot(X, SoS[,k], xlab = "ploidy", ylab = "Sum of squared errors", 102 | main = "First-pass estimation of ploidy", pch = 16) 103 | abline(v = X[which.min(SoS[,k])], lty = 2) 104 | } 105 | } 106 | return(ploidy.SoS) 107 | } 108 | -------------------------------------------------------------------------------- /R/plot_EM_fit.R: -------------------------------------------------------------------------------- 1 | #' @title Visualize EM fitting for each cell. 2 | #' 3 | #' @description A pdf file containing EM fitting results and plots is generated. 4 | #' 5 | #' @usage plot_EM_fit(Y_qc, gc_qc, norm_index, T, ploidyInt, beta0, 6 | #' minCountQC = 20, filename) 7 | #' @param Y_qc read depth matrix across all cells after quality control 8 | #' @param gc_qc vector of GC content for each bin after quality control 9 | #' @param norm_index indices of normal/diploid cells 10 | #' @param T a vector of integers indicating number of CNV groups. 11 | #' Use BIC to select optimal number of CNV groups. 12 | #' If \code{T = 1}, assume all reads are from normal regions 13 | #' so that EM algorithm is not implemented. Otherwise, 14 | #' we assume there is always a CNV group of heterozygous deletion 15 | #' and a group of null region. The rest groups are representative 16 | #' of different duplication states. 17 | #' @param ploidyInt a vector of initialized ploidy return from 18 | #' \code{initialize_ploidy} 19 | #' @param beta0 a vector of initialized bin-specific biases returned 20 | #' from CODEX2 without latent factors 21 | #' @param minCountQC the minimum read coverage required for EM fitting. 22 | #' Defalut is \code{20} 23 | #' @param filename the name of output pdf file 24 | #' 25 | #' @return pdf file with EM fitting results and two plots: 26 | #' log likelihood, and BIC versus the number of CNV groups. 27 | #' 28 | #' @examples 29 | #' Gini <- get_gini(Y_sim) 30 | #' # first-pass CODEX2 run with no latent factors 31 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 32 | #' gc_qc = ref_sim$gc, 33 | #' norm_index = which(Gini<=0.12)) 34 | #' Yhat.noK.sim <- normObj.sim$Yhat 35 | #' beta.hat.noK.sim <- normObj.sim$beta.hat 36 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat 37 | #' N.sim <- normObj.sim$N 38 | #' 39 | #' # Ploidy initialization 40 | #' ploidy.sim <- initialize_ploidy(Y = Y_sim, 41 | #' Yhat = Yhat.noK.sim, 42 | #' ref = ref_sim) 43 | #' ploidy.sim 44 | #' 45 | #' plot_EM_fit(Y_qc = Y_sim, gc_qc = ref_sim$gc, 46 | #' norm_index = which(Gini<=0.12), T = 1:7, 47 | #' ploidyInt = ploidy.sim, 48 | #' beta0 = beta.hat.noK.sim, 49 | #' filename = 'plot_EM_fit_demo.pdf') 50 | #' 51 | #' @author Rujin Wang \email{rujin@email.unc.edu} 52 | #' @import grDevices stats 53 | #' @export 54 | plot_EM_fit <- function(Y_qc, gc_qc, norm_index, T, ploidyInt, beta0, 55 | minCountQC = 20, filename) { 56 | Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) { 57 | !any(x == 0) 58 | }), , drop = FALSE] 59 | if(dim(Y.nonzero)[1] <= 10){ 60 | message("Adopt arithmetic mean instead of geometric mean") 61 | pseudo.sample <- apply(Y_qc, 1, mean) 62 | Ntotal <- apply(apply(Y_qc, 2, function(x) { 63 | x/pseudo.sample 64 | }), 2, median, na.rm = TRUE) 65 | } else{ 66 | pseudo.sample <- apply(Y.nonzero, 1, function(x) { 67 | exp(sum(log(x))/length(x)) 68 | }) 69 | Ntotal <- apply(apply(Y.nonzero, 2, function(x) { 70 | x/pseudo.sample 71 | }), 2, median) 72 | } 73 | N <- round(Ntotal/median(Ntotal) * median(colSums(Y_qc))) 74 | Nmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = N, byrow = TRUE) 75 | 76 | # Get initialization 77 | gcfit.temp <- Y_qc/Nmat/beta0 78 | alpha0 <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc)) 79 | for (j in seq_len(ncol(alpha0))) { 80 | loe.fit <- loess(gcfit.temp[, j] ~ gc_qc) 81 | gcfit.null <- loe.fit$fitted/(ploidyInt[j]/2) 82 | alpha0[, j] <- gcfit.temp[, j]/gcfit.null * 2 83 | } 84 | 85 | offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 86 | data = beta0, byrow = FALSE) 87 | 88 | pdf(file = filename, width = 8, height = 10) 89 | for (j in seq_len(ncol(Y_qc))) { 90 | cat(j, "\t") 91 | if (j %in% norm_index) { 92 | fGCj <- getfGCj(gcfit.tempj = gcfit.temp[, j], gctemp = gc_qc, 93 | Yj = Y_qc[, j], offsetj = offset[, j], 94 | T = 1, draw.plot = TRUE, alphaj = alpha0[, j], minCountQC) 95 | } else { 96 | fGCj <- getfGCj(gcfit.tempj = gcfit.temp[, j], gctemp = gc_qc, 97 | Yj = Y_qc[, j], offsetj = offset[, j], 98 | T = T, draw.plot = TRUE, alphaj = alpha0[, j], minCountQC) 99 | } 100 | } 101 | dev.off() 102 | } 103 | -------------------------------------------------------------------------------- /R/normalize_codex2_ns_noK.R: -------------------------------------------------------------------------------- 1 | #' @title Normalization of read depth without latent factors under 2 | #' the case-control setting 3 | #' 4 | #' @description Assuming that all reads are from diploid regions, 5 | #' fit a Poisson generalized linear model to normalize the 6 | #' raw read depth data from single-cell DNA sequencing, without 7 | #' latent factors under the case-control setting. 8 | #' 9 | #' @param Y_qc read depth matrix after quality control 10 | #' @param gc_qc vector of GC content for each bin after quality control 11 | #' @param norm_index indices of normal/diploid cells 12 | #' 13 | #' @return A list with components 14 | #' \item{Yhat}{A list of normalized read depth matrix} 15 | #' \item{fGC.hat}{A list of estimated GC content bias matrix} 16 | #' \item{beta.hat}{A list of estimated bin-specific bias vector} 17 | #' \item{N}{A vector of cell-specific library size factor, 18 | #' which is computed from the genome-wide read depth data} 19 | #' 20 | #' @examples 21 | #' Gini <- get_gini(Y_sim) 22 | #' # first-pass CODEX2 run with no latent factors 23 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 24 | #' gc_qc = ref_sim$gc, 25 | #' norm_index = which(Gini<=0.12)) 26 | #' 27 | #' @author Rujin Wang \email{rujin@email.unc.edu} 28 | #' @import stats 29 | #' @export 30 | normalize_codex2_ns_noK <- function(Y_qc, gc_qc, norm_index) { 31 | Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) { 32 | !any(x == 0) 33 | }), , drop = FALSE] 34 | if(dim(Y.nonzero)[1] <= 10){ 35 | message("Adopt arithmetic mean instead of geometric mean") 36 | pseudo.sample <- apply(Y_qc, 1, mean) 37 | Ntotal <- apply(apply(Y_qc, 2, function(x) { 38 | x/pseudo.sample 39 | }), 2, median, na.rm = TRUE) 40 | } else{ 41 | pseudo.sample <- apply(Y.nonzero, 1, function(x) { 42 | exp(sum(log(x))/length(x)) 43 | }) 44 | Ntotal <- apply(apply(Y.nonzero, 2, function(x) { 45 | x/pseudo.sample 46 | }), 2, median) 47 | } 48 | N <- round(Ntotal/median(Ntotal) * median(colSums(Y_qc))) 49 | Nmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = N, 50 | byrow = TRUE) 51 | 52 | message("Computing normalization with no latent factors") 53 | maxiter <- 10 54 | maxhiter <- 50 55 | BHTHRESH <- 1e-04 56 | HHTHRESH <- 1e-05 57 | iter <- 1 58 | fhat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = 0) 59 | fhatnew <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc)) 60 | betahat <- rep(1, nrow(Y_qc)) 61 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 62 | data = betahat, byrow = FALSE) 63 | bhdiff <- rep(Inf, maxiter) 64 | fhdiff <- rep(Inf, maxiter) 65 | betahatlist <- list(length = maxiter) 66 | fhatlist <- list(length = maxiter) 67 | while (iter <= maxiter) { 68 | gcfit <- Y_qc/Nmat/betahatmat 69 | fhatnew <- apply(gcfit, 2, function(z) { 70 | spl <- smooth.spline(gc_qc, z) 71 | temp <- predict(spl, gc_qc)$y 72 | temp[temp <= 0] <- min(temp[temp > 0]) 73 | temp 74 | }) 75 | fhatnew[fhatnew < quantile(fhatnew, 0.005)] <- quantile(fhatnew, 0.005) 76 | betahatnew <- apply((Y_qc/(fhatnew * Nmat))[, 77 | norm_index, drop = FALSE], 1, median) 78 | betahatnew[betahatnew <= 0] <- min(betahatnew[betahatnew > 0]) 79 | bhdiff[iter] <- sum((betahatnew - betahat)^2)/length(betahat) 80 | fhdiff[iter] <- sum((fhatnew - fhat)^2)/length(fhat) 81 | if (fhdiff[iter] > min(fhdiff)) 82 | break 83 | message("Iteration ", iter, "\t", "beta diff =", 84 | signif(bhdiff[iter], 3), 85 | "\t", "f(GC) diff =", signif(fhdiff[iter], 3)) 86 | fhat <- fhatnew 87 | betahat <- betahatnew 88 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 89 | data = betahat, byrow = FALSE) 90 | fhatlist[[iter]] <- fhat 91 | betahatlist[[iter]] <- betahat 92 | if (bhdiff[iter] < BHTHRESH) 93 | break 94 | if (iter > 5 & bhdiff[iter] > 1) 95 | break 96 | iter <- iter + 1 97 | } 98 | optIter <- which.min(fhdiff) 99 | message(paste("Stop at Iteration ", optIter, ".", sep = "")) 100 | fhat <- fhatlist[[optIter]] 101 | betahat <- betahatlist[[optIter]] 102 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 103 | data = betahat, byrow = FALSE) 104 | Yhat <- pmax(round(fhat * Nmat * betahatmat, 0), 1) 105 | fGC.hat <- signif(fhat, 3) 106 | beta.hat <- signif(betahat, 3) 107 | list(Yhat = Yhat, fGC.hat = fGC.hat, beta.hat = beta.hat, N = N) 108 | } 109 | -------------------------------------------------------------------------------- /R/initialize_ploidy_group.R: -------------------------------------------------------------------------------- 1 | #' @title Group-wise ploidy pre-initialization 2 | #' 3 | #' @description Pre-estimate ploidies across cells with shared clonal 4 | #' memberships 5 | #' 6 | #' @usage 7 | #' initialize_ploidy_group(Y, Yhat, ref, groups, 8 | #' maxPloidy = 6, minPloidy = 1.5, 9 | #' minBinWidth = 5, SoS.plot = FALSE) 10 | #' @param Y raw read depth matrix after quality control procedure 11 | #' @param Yhat normalized read depth matrix 12 | #' @param ref GRanges object after quality control procedure 13 | #' @param groups clonal membership labels for each cell 14 | #' @param maxPloidy maximum ploidy candidate. Defalut is \code{6} 15 | #' @param minPloidy minimum ploidy candidate. Defalut is \code{1.5} 16 | #' @param minBinWidth the minimum number of bins for a changed segment. 17 | #' Defalut is \code{5} 18 | #' @param SoS.plot logical, whether to generate ploidy pre-estimation 19 | #' plots. Default is \code{FALSE}. 20 | #' 21 | #' @return 22 | #' \item{ploidy.SoS}{Vector of group-wise pre-estimated ploidies 23 | #' for each cell} 24 | #' 25 | #' @examples 26 | #' Gini <- get_gini(Y_sim) 27 | #' 28 | #' # first-pass CODEX2 run with no latent factors 29 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 30 | #' gc_qc = ref_sim$gc, 31 | #' norm_index = which(Gini<=0.12)) 32 | #' Yhat.noK.sim <- normObj.sim$Yhat 33 | #' beta.hat.noK.sim <- normObj.sim$beta.hat 34 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat 35 | #' N.sim <- normObj.sim$N 36 | #' 37 | #' # Group-wise ploidy initialization 38 | #' clones <- c("normal", "tumor1", "normal", "tumor1", "tumor1") 39 | #' ploidy.sim.group <- initialize_ploidy_group(Y = Y_sim, Yhat = Yhat.noK.sim, 40 | #' ref = ref_sim, groups = clones) 41 | #' ploidy.sim.group 42 | #' 43 | #' @author Rujin Wang \email{rujin@email.unc.edu} 44 | #' @importFrom DNAcopy CNA smooth.CNA segment 45 | #' @importFrom GenomeInfoDb seqnames 46 | #' @importFrom BSgenome start 47 | #' @importFrom IRanges end 48 | #' @export 49 | initialize_ploidy_group <- function(Y, Yhat, ref, groups, maxPloidy = 6, 50 | minPloidy = 1.5, minBinWidth = 5, 51 | SoS.plot = FALSE) { 52 | ploidy.SoS <- rep(NA, ncol(Y)) 53 | breaks <- matrix(0, nrow(Y), ncol(Y)) 54 | RCNP <- matrix(0, nrow(Y), ncol(Y)) 55 | final <- matrix(0, nrow(Y), ncol(Y)) 56 | X <- seq(minPloidy, maxPloidy, by = 0.05) 57 | n_ploidy <- length(X) 58 | SoS <- matrix(0, n_ploidy, ncol(Y)) 59 | normal <- (Y + 1)/(Yhat + 1) 60 | for (k in seq_len(ncol(Y))) { 61 | if (k%%5 == 1) { 62 | cat("Initializing ploidy for cell ", k, "\t") 63 | } 64 | 65 | lr <- log(normal[, k]) 66 | loc <- data.frame(seq = as.character(seqnames(ref)), 67 | start = start(ref), end = end(ref)) 68 | CNA.object <- CNA(genomdat = lr, chrom = loc[, 1], 69 | maploc = as.numeric(loc[, 2]), data.type = "logratio") 70 | CNA.smoothed <- smooth.CNA(CNA.object) 71 | segs <- segment(CNA.smoothed, verbose = 0, min.width = minBinWidth) 72 | frag <- segs$output[, 2:3] 73 | len <- dim(frag)[1] 74 | bps <- array(0, len) 75 | for (j in seq_len(len)) { 76 | bps[j] <- which((loc[, 1] == frag[j, 1]) & 77 | (as.numeric(loc[, 2]) == frag[j, 2])) 78 | } 79 | bps <- sort(bps) 80 | bps[(len = len + 1)] <- nrow(Y) 81 | breaks[bps, k] <- 1 82 | RCNP[, k][seq_len(bps[2])] <- median(normal[, 83 | k][seq_len(bps[2])]) 84 | for (i in 2:(len - 1)) { 85 | RCNP[, k][bps[i]:(bps[i + 1] - 1)] <- median(normal[, 86 | k][bps[i]:(bps[i + 1] - 1)]) 87 | } 88 | RCNP[, k] <- RCNP[, k]/mean(RCNP[, k]) 89 | 90 | SCNP <- RCNP[, k] %o% X 91 | FSCP <- round(SCNP) 92 | Diff2 <- (SCNP - FSCP)^2 93 | SoS[, k] <- colSums(Diff2, na.rm = FALSE, dims = 1) 94 | } 95 | 96 | cat("\n", "Initialize ploidy by group", "\n") 97 | for (G in unique(groups)) { 98 | cat(G, "\t") 99 | g <- which(G == groups) 100 | ploidy.SoS[g] <- X[which.min(apply(SoS[, g, drop = FALSE], 1, sum))] 101 | 102 | if(SoS.plot){ 103 | par(mfrow = c(1,2)) 104 | par(mar = c(5,4,4,2)) 105 | hist(apply(Y[, g, drop = FALSE], 1, median), 100, 106 | main = 'Read depth distribution', 107 | xlab = 'Group-wise Median Coverage per bin') 108 | plot(X, apply(SoS[, g, drop = FALSE], 1, sum), 109 | xlab = "ploidy", ylab = "Group-wise sum of squared errors", 110 | main = "First-pass estimation of ploidy", pch = 16) 111 | abline(v = X[which.min(apply(SoS[, g, drop = FALSE], 1, sum))], 112 | lty = 2) 113 | } 114 | } 115 | return(ploidy.SoS) 116 | } -------------------------------------------------------------------------------- /R/perform_qc.R: -------------------------------------------------------------------------------- 1 | #' @title Quality control for cells and bins 2 | #' 3 | #' @description Perform QC step on single cells and bins. 4 | #' 5 | #' @usage 6 | #' perform_qc(Y_raw, sampname_raw, ref_raw, QCmetric_raw, 7 | #' cov_thresh = 0, minCountQC = 20, 8 | #' mapq20_thresh = 0.3, mapp_thresh = 0.9, 9 | #' gc_thresh = c(20, 80), nMAD = 3) 10 | #' 11 | #' @param Y_raw raw read count matrix returned 12 | #' from \code{\link{get_coverage_scDNA}} 13 | #' @param sampname_raw sample names for quality control returned 14 | #' from \code{\link{get_bam_bed}} 15 | #' @param ref_raw raw GRanges object with corresponding GC content 16 | #' and mappability for quality control returned from 17 | #' \code{\link{get_bam_bed}} 18 | #' @param QCmetric_raw a QC metric for single cells returned from 19 | #' \code{\link{get_samp_QC}} 20 | #' @param cov_thresh scalar variable specifying the lower bound of read count 21 | #' summation of each cell. Default is \code{0} 22 | #' @param minCountQC the minimum read coverage required for 23 | #' normalization and EM fitting. Defalut is \code{20} 24 | #' @param mapq20_thresh scalar variable specifying the lower threshold 25 | #' of proportion of reads with mapping quality greater than 20. 26 | #' Default is \code{0.3} 27 | #' @param mapp_thresh scalar variable specifying mappability of 28 | #' each genomic bin. Default is \code{0.9} 29 | #' @param gc_thresh vector specifying the lower and upper bound of 30 | #' GC content threshold for quality control. Default is \code{20-80} 31 | #' @param nMAD scalar variable specifying the number of MAD from the median 32 | #' of total read counts adjusted by library size for each cell. 33 | #' Default is \code{3} 34 | #' 35 | #' @return A list with components 36 | #' \item{Y}{read depth matrix after quality control} 37 | #' \item{sampname}{sample names after quality control} 38 | #' \item{ref}{A GRanges object specifying whole genomic 39 | #' bin positions after quality control} 40 | #' \item{QCmetric}{A data frame of QC metric for single cells 41 | #' after quality control} 42 | #' 43 | #' @examples 44 | #' Y_raw <- coverageObj.scopeDemo$Y 45 | #' sampname_raw <- rownames(QCmetric.scopeDemo) 46 | #' ref_raw <- ref.scopeDemo 47 | #' QCmetric_raw <- QCmetric.scopeDemo 48 | #' qcObj <- perform_qc(Y_raw = Y_raw, sampname_raw = sampname_raw, 49 | #' ref_raw = ref_raw, QCmetric_raw = QCmetric_raw) 50 | #' 51 | #' @author Rujin Wang \email{rujin@email.unc.edu} 52 | #' @import stats 53 | #' @export 54 | perform_qc <- function(Y_raw, sampname_raw, ref_raw, QCmetric_raw, 55 | cov_thresh = 0, minCountQC = 20, 56 | mapq20_thresh = 0.3, mapp_thresh = 0.9, 57 | gc_thresh = c(20, 80), nMAD = 3) { 58 | if (length(ref_raw) != nrow(Y_raw)) { 59 | stop("Invalid inputs: length of ref and # of rows 60 | in read count matrix must be the same") 61 | } 62 | if (length(sampname_raw) != ncol(Y_raw)) { 63 | stop("Invalid inputs: length of sample names and # of cols in 64 | read count matrix must be the same") 65 | } 66 | if (nrow(QCmetric_raw) != ncol(Y_raw)) { 67 | stop("Invalid inputs: # of rows in QC metric and # of cols in 68 | read count matrix must be the same") 69 | } 70 | mapp <- ref_raw$mapp 71 | gc <- ref_raw$gc 72 | sampfilter1 <- (apply(Y_raw, 2, sum) <= cov_thresh) 73 | message("Removed ", sum(sampfilter1), 74 | " samples due to failed library preparation.") 75 | sampfilter2 <- (apply(Y_raw, 2, mean) <= minCountQC) 76 | message("Removed ", sum(sampfilter2), 77 | " samples due to failure to meet min coverage requirement.") 78 | sampfilter3 <- (QCmetric_raw[, "mapq20_prop"] < mapq20_thresh) 79 | message("Removed ", sum(sampfilter3), 80 | " samples due to low proportion of mapped reads.") 81 | if (sum(sampfilter1 | sampfilter2 | sampfilter3) != 0) { 82 | Y <- Y_raw[, !(sampfilter1 | sampfilter2 | sampfilter3)] 83 | sampname <- sampname_raw[!(sampfilter1 | sampfilter2 84 | | sampfilter3)] 85 | QCmetric <- QCmetric_raw[!(sampfilter1 | sampfilter2 86 | | sampfilter3), ] 87 | } else { 88 | Y <- Y_raw 89 | sampname <- sampname_raw 90 | QCmetric <- QCmetric_raw 91 | } 92 | binfilter1 <- (gc < gc_thresh[1] | gc > gc_thresh[2]) 93 | message("Excluded ", sum(binfilter1), 94 | " bins due to extreme GC content.") 95 | binfilter2 <- (mapp < mapp_thresh) 96 | message("Excluded ", sum(binfilter2), 97 | " bins due to low mappability.") 98 | if (sum(binfilter1 | binfilter2) != 0) { 99 | ref <- ref_raw[!(binfilter1 | binfilter2)] 100 | Y <- Y[!(binfilter1 | binfilter2), ] 101 | } else { 102 | ref <- ref_raw 103 | Y <- Y 104 | } 105 | Y.nonzero <- Y[apply(Y, 1, function(x) { 106 | !any(x == 0) 107 | }), , drop = FALSE] 108 | if(dim(Y.nonzero)[1] <= 10){ 109 | message("Adopt arithmetic mean instead of geometric mean") 110 | pseudo.sample <- apply(Y, 1, mean) 111 | N <- apply(apply(Y, 2, function(x) { 112 | x/pseudo.sample 113 | }), 2, median, na.rm = TRUE) 114 | } else{ 115 | pseudo.sample <- apply(Y.nonzero, 1, function(x) { 116 | exp(sum(log(x))/length(x)) 117 | }) 118 | N <- apply(apply(Y.nonzero, 2, function(x) { 119 | x/pseudo.sample 120 | }), 2, median) 121 | } 122 | sampfilter3 <- (N == 0) 123 | message("Removed ", sum(sampfilter3), 124 | " samples due to excessive zero read counts in 125 | library size calculation.") 126 | if (sum(sampfilter3) != 0) { 127 | Y <- Y[, !(sampfilter3)] 128 | sampname <- sampname[!(sampfilter3)] 129 | QCmetric <- QCmetric[!(sampfilter3), ] 130 | N <- N[!(sampfilter3)] 131 | } 132 | Nmat <- matrix(nrow = nrow(Y), ncol = ncol(Y), data = N, 133 | byrow = TRUE) 134 | bin.sum <- apply(Y/Nmat, 1, sum) 135 | binfilter3 <- (bin.sum >= (median(bin.sum) - 136 | nMAD * mad(bin.sum))) & (bin.sum <= (median(bin.sum) + 137 | nMAD * mad(bin.sum))) 138 | Y <- Y[binfilter3, ] 139 | ref <- ref[binfilter3] 140 | QCmetric <- as.data.frame(QCmetric) 141 | message("There are ", ncol(Y), " samples and ", 142 | nrow(Y), " bins after QC step. ") 143 | list(Y = Y, sampname = sampname, ref = ref, QCmetric = QCmetric) 144 | } 145 | -------------------------------------------------------------------------------- /R/get_coverage_scDNA.R: -------------------------------------------------------------------------------- 1 | #' @title Get read coverage from single-cell DNA sequencing 2 | #' 3 | #' @description Get read coverage for each genomic bin across all single 4 | #' cells from scDNA-seq. Blacklist regions, such as segmental duplication 5 | #' regions and gaps near telomeres/centromeres will be masked prior to 6 | #' getting coverage. 7 | #' 8 | #' @param bambedObj object returned from \code{get_bam_bed} 9 | #' @param mapqthres mapping quality threshold of reads 10 | #' @param seq the sequencing method to be used. This should be either 11 | #' 'paired-end' or 'single-end' 12 | #' @param hgref reference genome. This should be 'hg19', 'hg38' or 'mm10'. 13 | #' Default is human genome \code{hg19}. 14 | #' @return 15 | #' \item{Y}{Read depth matrix} 16 | #' 17 | #' @examples 18 | #' library(WGSmapp) 19 | #' library(BSgenome.Hsapiens.UCSC.hg38) 20 | #' bamfolder <- system.file('extdata', package = 'WGSmapp') 21 | #' bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 22 | #' bamdir <- file.path(bamfolder, bamFile) 23 | #' sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1) 24 | #' bambedObj <- get_bam_bed(bamdir = bamdir, 25 | #' sampname = sampname_raw, 26 | #' hgref = "hg38") 27 | #' 28 | #' # Getting raw read depth 29 | #' coverageObj <- get_coverage_scDNA(bambedObj, 30 | #' mapqthres = 40, 31 | #' seq = 'paired-end', 32 | #' hgref = "hg38") 33 | #' Y_raw <- coverageObj$Y 34 | #' 35 | #' @author Rujin Wang \email{rujin@email.unc.edu} 36 | #' @import Rsamtools 37 | #' @importFrom GenomicRanges GRanges 38 | #' @importFrom IRanges IRanges Views countOverlaps 39 | #' @importFrom GenomeInfoDb seqnames 40 | #' @export 41 | get_coverage_scDNA <- function(bambedObj, mapqthres, seq, hgref = "hg19") { 42 | if(!hgref %in% c("hg19", "hg38", "mm10")){ 43 | stop("Reference genome should be hg19, hg38 or mm10.") 44 | } 45 | ref <- bambedObj$ref 46 | bamdir <- bambedObj$bamdir 47 | sampname <- bambedObj$sampname 48 | 49 | mask.ref <- get_masked_ref(hgref = hgref) 50 | 51 | Y <- matrix(nrow = length(ref), ncol = length(sampname)) 52 | rownames(Y) <- paste(seqnames(ref), ":", start(ref), "-", 53 | end(ref), sep = "") 54 | colnames(Y) <- sampname 55 | for (i in seq_len(length(sampname))) { 56 | bamurl <- bamdir[i] 57 | what <- c("rname", "pos", "mapq", "qwidth") 58 | if (seq == "paired-end") { 59 | flag <- scanBamFlag(isPaired = TRUE, isDuplicate = FALSE, 60 | isUnmappedQuery = FALSE, isNotPassingQualityControls = FALSE, 61 | isFirstMateRead = TRUE) 62 | param <- ScanBamParam(what = what, flag = flag) 63 | bam <- scanBam(bamurl, param = param)[[1]] 64 | } else if (seq == "single-end") { 65 | flag <- scanBamFlag(isPaired = FALSE, isDuplicate = FALSE, 66 | isUnmappedQuery = FALSE, isNotPassingQualityControls = FALSE) 67 | param <- ScanBamParam(what = what, flag = flag) 68 | bam <- scanBam(bamurl, param = param)[[1]] 69 | } 70 | message("Getting coverage for sample ", i, ": ", 71 | sampname[i], "...", sep = "") 72 | if (length(bam$rname) == 0) { 73 | Y[, i] <- 0 # Failed library preparation 74 | } else { 75 | if (any(grepl("chr", bam$rname) == TRUE)) { 76 | bam.ref <- GRanges(seqnames = bam$rname, ranges = 77 | IRanges(start = bam[["pos"]], width = bam[["qwidth"]])) 78 | } else { 79 | bam.ref <- GRanges(seqnames = paste0("chr", bam$rname), ranges = 80 | IRanges(start = bam[["pos"]], width = bam[["qwidth"]])) 81 | } 82 | bam.ref <- bam.ref[bam$mapq >= mapqthres] 83 | bam.ref <- suppressWarnings(bam.ref[countOverlaps(bam.ref, 84 | mask.ref) == 0]) 85 | Y[, i] <- countOverlaps(ref, bam.ref) 86 | } 87 | } 88 | list(Y = Y) 89 | } 90 | 91 | 92 | get_masked_ref <- function(hgref){ 93 | if(hgref == "hg19"){ 94 | # Get segmental duplication regions 95 | seg.dup <- read.table(system.file("extdata", 96 | "GRCh37GenomicSuperDup.tab", 97 | package = "WGSmapp"), header = TRUE) 98 | # Get hg19 gaps 99 | gaps <- read.table(system.file("extdata", "hg19gaps.txt", 100 | package = "WGSmapp"), 101 | header = TRUE) 102 | } else if(hgref == "hg38"){ 103 | # Get segmental duplication regions 104 | seg.dup <- read.table(system.file("extdata", 105 | "GRCh38GenomicSuperDup.tab", 106 | package = "WGSmapp")) 107 | # Get hg19 gaps 108 | gaps <- read.table(system.file("extdata", "hg38gaps.txt", 109 | package = "WGSmapp")) 110 | } else if (hgref == "mm10"){ 111 | black.list <- read.table(system.file("extdata", 112 | "mm10-blacklist.v2.bed", 113 | package = "WGSmapp"), header = FALSE, 114 | sep = '\t') 115 | } 116 | if(hgref != "mm10"){ 117 | seg.dup <- seg.dup[!is.na(match(seg.dup[,1], 118 | paste('chr', c(seq_len(22), 'X', 'Y'), 119 | sep = ''))),] 120 | seg.dup <- GRanges(seqnames = as.character(seg.dup[,1]), 121 | ranges = IRanges(start = seg.dup[,2], 122 | end = seg.dup[,3])) 123 | gaps <- gaps[!is.na(match(gaps[,2], 124 | paste('chr', c(seq_len(22), 'X', 'Y'), 125 | sep = ''))),] 126 | gaps <- GRanges(seqnames = as.character(gaps[,2]), 127 | ranges = IRanges(start = gaps[,3], 128 | end = gaps[,4])) 129 | # Generate mask region 130 | mask.ref <- sort(c(seg.dup, gaps)) 131 | } else{ 132 | black.list <- black.list[!is.na(match(black.list[,1], 133 | paste('chr', c(seq_len(19), 'X', 'Y'), sep = ''))),] 134 | black.list <- GRanges(seqnames = black.list[,1], 135 | ranges = IRanges(start = black.list[,2], 136 | end = black.list[,3])) 137 | mask.ref <- black.list 138 | } 139 | return(mask.ref) 140 | } 141 | 142 | 143 | -------------------------------------------------------------------------------- /R/plot_iCN.R: -------------------------------------------------------------------------------- 1 | #' @title Plot post-segmentation copy number profiles of integer values 2 | #' 3 | #' @description Show heatmap of inferred integer copy-number profiles 4 | #' by SCOPE with cells clustered by hierarchical clustering 5 | #' 6 | #' @usage 7 | #' plot_iCN(iCNmat, ref, Gini, annotation = NULL, 8 | #' plot.dendrogram = TRUE, show.names = FALSE, filename) 9 | #' @param iCNmat inferred integer copy-number matrix by SCOPE, 10 | #' with each column being a cell and each row being a genomic bin 11 | #' @param ref GRanges object after quality control procedure 12 | #' @param Gini vector of Gini coefficients for each cell, 13 | #' with the same order as that of cells in columns of \code{iCNmat} 14 | #' @param annotation vector of annotation for each cell, 15 | #' with the same order as that of cells in columns of \code{iCNmat}. 16 | #' Default is \code{NULL}. 17 | #' @param plot.dendrogram logical, whether to plot the dendrogram. 18 | #' Default is \code{TRUE}. 19 | #' @param show.names logical, whether to show cell names by y axis. 20 | #' Default is \code{FALSE}. 21 | #' @param filename name of the output png file 22 | #' 23 | #' @return png file with integer copy-number profiles across single cells 24 | #' with specified annotations 25 | #' 26 | #' @examples 27 | #' Gini <- get_gini(Y_sim) 28 | #' plot_iCN(iCNmat = iCN_sim, 29 | #' ref = ref_sim, 30 | #' Gini = Gini, 31 | #' filename = 'plot_iCN_demo') 32 | #' 33 | #' @author Rujin Wang \email{rujin@email.unc.edu} 34 | #' @import graphics stats 35 | #' @importFrom GenomeInfoDb seqnames 36 | #' @importFrom gplots colorpanel 37 | #' @importFrom RColorBrewer brewer.pal 38 | #' @importFrom grDevices png dev.off 39 | #' @export 40 | plot_iCN <- function(iCNmat, ref, Gini, annotation = NULL, 41 | plot.dendrogram = TRUE, show.names = FALSE, filename) { 42 | smart_image <- function(mat, ...) { 43 | image(t(mat[rev(seq(nrow(mat))), ]), ...) 44 | } 45 | hm_col <- c("#2166AC", "#92C5DE", "#FDFDFD", "#FDDBC7", 46 | "#F4A582", "#D6604D", "#B2182B", "#67001F") 47 | 48 | if (!is.matrix(iCNmat)) { 49 | stop("Invalid plot object: must be an integer matrix. \n") 50 | } 51 | if (length(ref) != nrow(iCNmat)) { 52 | stop("Invalid GRanges object: length of ref and # of 53 | rows in iCNmat must be the same") 54 | } 55 | if (!is.null(annotation)) { 56 | if (!is.null(dim(annotation))) { 57 | stop("Invalid annotation object: has to be a vector or 58 | factor with the same # of cells as that of iCNmat") 59 | } 60 | if (length(annotation) != ncol(iCNmat)) { 61 | stop("Invalid annotation object: length of annotation and 62 | # of cells in iCNmat must be the same") 63 | } 64 | } 65 | if (show.names){ 66 | if (is.null(colnames(iCNmat))){ 67 | stop("Invalid plot object: cell names cannot be NULL") 68 | } 69 | } 70 | if (length(Gini) != ncol(iCNmat)) { 71 | stop("Invalid Gini object: length of Gini coefficient and 72 | # of cells in iCNmat must be the same") 73 | } 74 | 75 | # page setup 76 | if (is.null(annotation)) { 77 | if (plot.dendrogram) { 78 | if (show.names) { 79 | mm <- matrix(c(0, 0, 4, 0, 0, 80 | 2, 3, 1, 7, 5, 81 | 2, 3, 1, 7, 0, 82 | 2, 3, 1, 7, 6), nrow = 4, byrow = TRUE) 83 | mh <- c(2, 20, 20, 20) 84 | mh <- mh/sum(mh) 85 | mw <- c(0.25, 0.1, 5, 0.2, 0.5) 86 | mw <- mw/sum(mw) 87 | } else { 88 | mm <- matrix(c(0, 0, 4, 0, 89 | 2, 3, 1, 5, 90 | 2, 3, 1, 0, 91 | 2, 3, 1, 6), nrow = 4, byrow = TRUE) 92 | mh <- c(2, 20, 20, 20) 93 | mh <- mh/sum(mh) 94 | mw <- c(0.25, 0.1, 5, 0.5) 95 | mw <- mw/sum(mw) 96 | } 97 | } else { 98 | if (show.names) { 99 | mm <- matrix(c(0, 0, 3, 0, 0, 100 | 0, 2, 1, 6, 4, 101 | 0, 2, 1, 6, 0, 102 | 0, 2, 1, 6, 5), nrow = 4, byrow = TRUE) 103 | mh <- c(2, 20, 20, 20) 104 | mh <- mh/sum(mh) 105 | mw <- c(0.25, 0.1, 5, 0.2, 0.5) 106 | mw <- mw/sum(mw) 107 | } else { 108 | mm <- matrix(c(0, 0, 3, 0, 109 | 0, 2, 1, 4, 110 | 0, 2, 1, 0, 111 | 0, 2, 1, 5), nrow = 4, byrow = TRUE) 112 | mh <- c(2, 20, 20, 20) 113 | mh <- mh/sum(mh) 114 | mw <- c(0.25, 0.1, 5, 0.5) 115 | mw <- mw/sum(mw) 116 | } 117 | } 118 | } else { 119 | if (plot.dendrogram) { 120 | if (show.names) { 121 | mm <- matrix(c(0, 0, 0, 5, 0, 0, 122 | 2, 3, 4, 1, 9, 6, 123 | 2, 3, 4, 1, 9, 7, 124 | 2, 3, 4, 1, 9, 8), nrow = 4, byrow = TRUE) 125 | mh <- c(2, 20, 20, 20) 126 | mh <- mh/sum(mh) 127 | mw <- c(0.25, 0.1, 0.1, 5, 0.2, 0.5) 128 | mw <- mw/sum(mw) 129 | } else { 130 | mm <- matrix(c(0, 0, 0, 5, 0, 131 | 2, 3, 4, 1, 6, 132 | 2, 3, 4, 1, 7, 133 | 2, 3, 4, 1, 8), nrow = 4, byrow = TRUE) 134 | mh <- c(2, 20, 20, 20) 135 | mh <- mh/sum(mh) 136 | mw <- c(0.25, 0.1, 0.1, 5, 0.5) 137 | mw <- mw/sum(mw) 138 | } 139 | } else { 140 | if (show.names) { 141 | mm <- matrix(c(0, 0, 0, 4, 0, 0, 142 | 0, 2, 3, 1, 8, 5, 143 | 0, 2, 3, 1, 8, 6, 144 | 0, 2, 3, 1, 8, 7), nrow = 4, byrow = TRUE) 145 | mh <- c(2, 20, 20, 20) 146 | mh <- mh/sum(mh) 147 | mw <- c(0.25, 0.1, 0.1, 5, 0.2, 0.5) 148 | mw <- mw/sum(mw) 149 | } else { 150 | mm <- matrix(c(0, 0, 0, 4, 0, 151 | 0, 2, 3, 1, 5, 152 | 0, 2, 3, 1, 6, 153 | 0, 2, 3, 1, 7), nrow = 4, byrow = TRUE) 154 | mh <- c(2, 20, 20, 20) 155 | mh <- mh/sum(mh) 156 | mw <- c(0.25, 0.1, 0.1, 5, 0.5) 157 | mw <- mw/sum(mw) 158 | } 159 | } 160 | } 161 | png(paste0(filename, ".png"), width = 2500, 162 | height = 1600, pointsize = 25) 163 | layout(mm, widths = mw, heights = mh) 164 | par(mar = rep(0, 4)) 165 | 166 | iCNmat <- round(iCNmat) 167 | if (!is.null(annotation)) { 168 | annotation <- as.factor(annotation) 169 | } 170 | 171 | chr.pos <- rep(NA, length(unique(seqnames(ref)))) 172 | for (chri in seq_len(length(chr.pos))) { 173 | chr.pos[chri] <- length(ref[which(as.character( 174 | seqnames(ref)) == as.character(unique( 175 | seqnames(ref)))[chri])]) 176 | 177 | } 178 | chr.pos <- cumsum(chr.pos) 179 | xpos <- round(c(0, chr.pos[seq_len(length(chr.pos)-1)]) + 180 | (chr.pos - c(0, chr.pos[seq_len(length(chr.pos)-1)]))/2) 181 | 182 | # 1) iCN heatmap 183 | dat <- t(iCNmat) 184 | dat[dat >= 7] <- 7 185 | dat[dat <= 0] <- 0 186 | iCNtab <- as.numeric(names(table(dat))) 187 | rclust <- hclust(dist(dat)) 188 | dat <- dat[rclust$order, ] 189 | 190 | smart_image(dat, col = hm_col[iCNtab + 1], xaxs = "i", 191 | yaxs = "i", axes = FALSE) 192 | abline(v = 0, lwd = 2) 193 | for (i in seq_len(length(chr.pos))) { 194 | abline(v = chr.pos[i]/length(ref), lwd = 2) 195 | } 196 | 197 | # 2) hclust 198 | if (plot.dendrogram) { 199 | plot(rev(as.dendrogram(rclust)), leaflab = "none", 200 | horiz = TRUE, axes = FALSE, yaxs = "i") 201 | } 202 | 203 | # 3) Gini annotation 204 | anno.Gini <- matrix(Gini[rclust$order], nrow = nrow(dat), 205 | ncol = 1) 206 | col.Gini <- gplots::colorpanel(50, "#F7FBFF", "#084594") 207 | smart_image(anno.Gini, col = col.Gini, xaxs = "i", 208 | yaxs = "i", axes = FALSE) 209 | 210 | if (!is.null(annotation)) { 211 | # 4) Customized annotation 212 | anno.level <- levels(annotation) 213 | anno.mat <- matrix(match(annotation[rclust$order], anno.level), 214 | nrow = nrow(dat), ncol = 1) 215 | col.anno <- brewer.pal(n = 12, name = "Set3")[ 216 | sort(unique(match(annotation[rclust$order], anno.level)))] 217 | smart_image(anno.mat, col = col.anno, xaxs = "i", 218 | yaxs = "i", axes = FALSE) 219 | } 220 | 221 | # 5) chromosome 222 | anno.chrom <- NULL 223 | for (i in seq_len(length(chr.pos))) { 224 | if (i%%2 == 1) { 225 | temp <- matrix(rep(1, length(which(as.character( 226 | seqnames(ref)) == as.character(unique( 227 | seqnames(ref)))[i]))), nrow = 1) 228 | anno.chrom <- cbind(anno.chrom, temp) 229 | } else { 230 | temp <- matrix(rep(2, length(which(as.character( 231 | seqnames(ref)) == as.character(unique( 232 | seqnames(ref)))[i]))), nrow = 1) 233 | anno.chrom <- cbind(anno.chrom, temp) 234 | } 235 | } 236 | image(t(anno.chrom), col = c("gray", "black"), 237 | xaxs = "i", yaxs = "i", axes = FALSE) 238 | pos.text <- xpos/length(ref) 239 | chr.noprint <- as.character(unique(seqnames(ref))) 240 | chr.print <- substr(chr.noprint, 4, nchar(chr.noprint)) 241 | text(pos.text, 0.2, chr.print, col = c("black", "grey"), 242 | cex = 1.5) 243 | 244 | # 6) Gini legend 245 | par(mar = c(2, 2, 2, 4)) 246 | image(1, seq_len(length(brewer.pal(n = 8, name = "Blues"))), 247 | t(as.matrix(seq_len(length(brewer.pal(n = 8, name = "Blues"))))), 248 | col = brewer.pal(n = 8, name = "Blues"), xlab = "", ylab = "", 249 | xaxt = "n", yaxt = "n", bty = "n") 250 | axis(4, at = c(1, length(brewer.pal(n = 8, name = "Blues"))), 251 | labels = round(c(min(Gini), max(Gini)), 2), col.ticks = "white", 252 | col = NA, lwd.ticks = 0, cex.axis = 1.5, las = 2, font = 2) 253 | title("Gini", cex.main = 1.5) 254 | 255 | if (!is.null(annotation)) { 256 | plot(0, 0, type = "n", axes = FALSE) 257 | legend("center", legend = sort(unique(annotation)), 258 | col = col.anno, pch = 15, bty = "n", cex = 1.5) 259 | } 260 | 261 | # 8) iCN legend 262 | par(mar = c(2, 2, 2, 4)) 263 | image(1, seq_len(length(hm_col[iCNtab + 1])), 264 | t(as.matrix(seq_len(length(hm_col[iCNtab + 1])))), 265 | col = hm_col[iCNtab + 1], xlab = "", ylab = "", 266 | xaxt = "n", yaxt = "n", bty = "n") 267 | axis(4, at = seq_len(length(hm_col[iCNtab + 1])), 268 | labels = c(0:7)[iCNtab + 1], col.ticks = "white", col = NA, 269 | lwd.ticks = 0, cex.axis = 1.5, las = 2, font = 2) 270 | title("integer CN", cex.main = 1.5) 271 | 272 | # 9) cell names 273 | if (show.names) { 274 | par(mar = rep(0, 4)) 275 | image(1, seq_len(ncol(iCNmat)), 276 | t(as.matrix(seq_len(ncol(iCNmat)))), 277 | col = "white", xlab = "", ylab = "", 278 | xaxt = "n", yaxt = "n", bty = "n") 279 | text(1, rev(seq_len(ncol(iCNmat))), 280 | labels = colnames(iCNmat)[rclust$order], las = 2, cex = 1.5, font = 2) 281 | } 282 | dev.off() 283 | } 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | -------------------------------------------------------------------------------- /vignettes/SCOPE_vignette.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "SCOPE: Single-cell Copy Number Estimation" 3 | author: "Rujin Wang, Danyu Lin, Yuchao Jiang" 4 | date: "`r format(Sys.Date())`" 5 | output: 6 | html_document: 7 | highlight: pygments 8 | toc: true 9 | vignette: > 10 | %\VignetteIndexEntry{SCOPE: Single-cell Copy Number Estimation} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | \usepackage[utf8]{inputenc} 13 | --- 14 | 15 | # 1. Overview of analysis pipeline 16 | ## 1.1 Introduction 17 | SCOPE is a statistical framework designed for calling 18 | copy number variants (CNVs) from whole-genome single-cell 19 | DNA sequencing read depths. The distinguishing features 20 | of SCOPE include: 21 | 22 | 1. Utilizes cell-specific Gini coefficients for quality 23 | controls and for identification of normal/diploid cells. 24 | In most single-cell cancer genomics studies, diploid cells 25 | are inevitably picked up from adjacent normal tissues for 26 | sequencing and can thus serve as normal controls for 27 | read depth normalization. However, not all 28 | platforms/experiments allow or adopt flow-sorting 29 | based techniques before scDNA-seq and thus cell ploidy 30 | and case-control labeling are not always readily 31 | available. Gini coefficient is able to index diploid 32 | cells out of the entire cell populations and serves 33 | as good proxies to identify cell outliers. 34 | 35 | 2. Employs an EM algorithm to model GC content bias, 36 | which accounts for the different copy number states 37 | along the genome. SCOPE is based on a Poisson latent 38 | factor model for cross-sample normalization, 39 | borrowing information both across regions and 40 | across samples to estimate the bias terms. 41 | 42 | 3. Incorporates multi-sample segmentation procedure 43 | to identify breakpoints that are shared across cells 44 | from the same genetic background 45 | 46 | 47 | ## 1.2 Bioinformatic pre-processing 48 | 49 | We demonstrate here how the pre-processing bioinformatic 50 | pipeline works. The 51 | [Picard toolkit](https://broadinstitute.github.io/picard/) 52 | is open-source and free for all uses. The `split_script.py` 53 | python script is pre-stored in the package for demultiplexing. 54 | 55 | There are two types of scDNA-seq data sources: public 56 | data from NCBI Sequence Read Archive and data from 57 | 10X Genomics. For the NCBI SRA data, start with the 58 | SRA files. Fastq-dump to obtain FASTQ files. Align 59 | FASTQ sequences to NCBI hg19 reference genome and 60 | convert to BAM files. For the 10X Genomic datasets, 61 | process from the original integrated BAM file. 62 | Error-corrected chromium cellular barcode information 63 | for each read is stored as CB tag fields. Only 64 | reads that contain CB tags and are in the list 65 | of barcode of interest are demultiplexed via 66 | a Python script. Sort, add read group, and 67 | dedup on aligned/demultiplexed BAMs. Use 68 | deduped BAM files as the input. 69 | 70 | ```{bash, eval = FALSE} 71 | # public data from NCBI Sequence Read Archive 72 | SRR=SRRXXXXXXX 73 | kim=/pine/scr/r/u/rujin/Kim_Navin_et_al_Cell_2018 74 | fastq_dir=$kim/fastq 75 | align_dir=$kim/align 76 | 77 | # Align FASTQ sequences to NCBI hg19 reference genome 78 | # (Single-end sequenced cells have only 1 FASTQ file; 79 | # paired-end sequencing would generate two FASTQ files, 80 | # with suffix "_1" and "_2") 81 | cd $fastq_dir 82 | bwa mem -M -t 16 \ 83 | ucsc.hg19.fasta `ls | grep "$SRR" | tr '\n' ' '` > $align_dir/"$SRR".sam 84 | 85 | # Convert .sam to .bam 86 | cd $align_dir 87 | samtools view -bS "$SRR".sam > "$SRR".bam 88 | 89 | # Sort 90 | java -Xmx30G -jar /proj/yuchaojlab/bin/picard.jar SortSam \ 91 | INPUT="$SRR".bam OUTPUT="$SRR".sorted.bam \ 92 | SORT_ORDER=coordinate 93 | 94 | # Add read group 95 | java -Xmx40G -jar /proj/yuchaojlab/bin/picard.jar AddOrReplaceReadGroups \ 96 | I="$SRR".sorted.bam O="$SRR".sorted.rg.bam RGID="$SRR" \ 97 | RGLB=Chung_Et_Al RGPL=ILLUMINA RGPU=machine RGSM="$SRR" 98 | samtools index "$SRR".sorted.rg.bam 99 | 100 | # Dedup 101 | java -Xmx40G -jar /proj/yuchaojlab/bin/picard.jar MarkDuplicates \ 102 | REMOVE_DUPLICATES=true \ 103 | I="$SRR".sorted.rg.bam O="$SRR".sorted.rg.dedup.bam \ 104 | METRICS_FILE="$SRR".sorted.rg.dedup.metrics.txt \ 105 | PROGRAM_RECORD_ID= MarkDuplicates PROGRAM_GROUP_VERSION=null \ 106 | PROGRAM_GROUP_NAME=MarkDuplicates 107 | java -jar /proj/yuchaojlab/bin/picard.jar BuildBamIndex \ 108 | I="$SRR".sorted.rg.dedup.bam 109 | 110 | # 10X Genomics 111 | XGenomics=/pine/scr/r/u/rujin/10XGenomics 112 | dataset=breast_tissue_A_2k 113 | output_dir=$XGenomics/$dataset/output 114 | align_dir=$XGenomics/$dataset/align 115 | 116 | # Demultiplex 117 | cd $output_dir 118 | samtools view ${dataset}_possorted_bam.bam | python $XGenomics/split_script.py 119 | 120 | # Add header to demultiplexed bam files for further processing 121 | cd $XGenomics 122 | samtools view -H $dataset/output/${dataset}_possorted_bam.bam > \ 123 | $dataset/header.txt 124 | barcode=AAAGATGGTGTAAAGT 125 | cat header.txt $align_dir/$barcode/$barcode-1.sam > \ 126 | $align_dir/$barcode/$barcode-1.header.sam 127 | 128 | # Convert .sam to .bam 129 | cd $align_dir/$barcode 130 | samtools view -bS "$barcode"-1.header.sam > "$barcode".bam 131 | ``` 132 | 133 | 134 | # 2. Pre-computation and Quality Control 135 | ## 2.1 Pre-preparation 136 | SCOPE enables reconstruction of user-defined genome-wide 137 | consecutive bins with fixed interval length prior to downstream 138 | analysis by specifying arguments `genome` and `resolution` in 139 | `get_bam_bed()` function. Make sure that all chromosomes are 140 | named consistently and be concordant with `.bam` files. SCOPE 141 | processes the entire genome altogether. Use function `get_bam_bed()` 142 | to finish the pre-preparation step. By default, SCOPE is designed 143 | for hg19 with fixed bin-length = 500kb. 144 | ```{r, eval=TRUE, message=FALSE} 145 | library(SCOPE) 146 | library(WGSmapp) 147 | library(BSgenome.Hsapiens.UCSC.hg38) 148 | bamfolder <- system.file("extdata", package = "WGSmapp") 149 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$') 150 | bamdir <- file.path(bamfolder, bamFile) 151 | sampname_raw <- sapply(strsplit(bamFile, ".", fixed = TRUE), "[", 1) 152 | bambedObj <- get_bam_bed(bamdir = bamdir, sampname = sampname_raw, 153 | hgref = "hg38") 154 | ref_raw <- bambedObj$ref 155 | ``` 156 | 157 | ## 2.2 Getting GC content and mappability 158 | Compute GC content and mappability for each bin. 159 | By default, SCOPE is intended for hg19 reference genome. 160 | To compute mappability for hg19, we employed the 100-mers 161 | mappability track from the ENCODE Project 162 | (`wgEncodeCrgMapabilityAlign100mer.bigwig` from [link]( 163 | http://rohsdb.cmb.usc.edu/GBshape/cgi-bin/hgFileUi? 164 | db=hg19&g=wgEncodeMapability)) 165 | and computed weighted average of the mappability 166 | scores if multiple ENCODE regions overlap with 167 | the same bin. For SCOPE, the whole-genome mappability 168 | track on human hg19 assembly is stored as part of the package. 169 | 170 | 171 | The whole-genome mappability track on human 172 | hg38 assembly is also stored in SCOPE package. 173 | For more details on mappability calculation, 174 | please refer to [CODEX2 for hg38]( 175 | https://github.com/yuchaojiang/CODEX2/blob/master/README.md). 176 | Load the hg38 reference package, specify argument `hgref = "hg38"` 177 | in `get_mapp()` and `get_gc()`. By default, `hg19` are used. 178 | 179 | ```{r, eval=TRUE, message=FALSE} 180 | mapp <- get_mapp(ref_raw, hgref = "hg38") 181 | head(mapp) 182 | gc <- get_gc(ref_raw, hgref = "hg38") 183 | values(ref_raw) <- cbind(values(ref_raw), DataFrame(gc, mapp)) 184 | ref_raw 185 | ``` 186 | 187 | Note that SCOPE can also be adapted to 188 | the mouse genome (mm10) in a similar way 189 | (see [CODEX2 for mouse genome](https://github.com/yuchaojiang/CODEX2/blob/master/README.md)). 190 | Specify argument `hgref = "mm10"` in `get_bam_bed()`, `get_mapp()`, 191 | `get_gc()` and `get_coverage_scDNA()`. Calculation of GC content 192 | and mappability needs to be modified from the default (hg19). 193 | For mm10, there are two workarounds: 1) set all mappability to 1 194 | to avoid extensive computation; 2) adopt QC procedures based on 195 | annotation results, e.g., filter out bins within "blacklist" regions, 196 | which generally have low mappability. To be specific, we obtained 197 | and pre-stored mouse genome "blacklist" regions from 198 | [Amemiya et al., Scientific Reports, 2019](https://github.com/Boyle-Lab/Blacklist/tree/master/lists/mm10-blacklist.v2.bed.gz). 199 | ```{r, eval=FALSE} 200 | library(BSgenome.Mmusculus.UCSC.mm10) 201 | mapp <- get_mapp(ref_raw, hgref = "mm10") 202 | gc <- get_gc(ref_raw, hgref = "mm10") 203 | ``` 204 | 205 | 206 | For unknown reference assembly without 207 | pre-calculated mappability track, 208 | refer to [CODEX2: mappability pre-calculation]( 209 | https://github.com/yuchaojiang/CODEX2/blob/master/mouse/mapp.R). 210 | 211 | ## 2.3 Getting coverage 212 | Obtain either single-end or paired-end sequencing 213 | read depth matrix. SCOPE, by default, adopts a 214 | fixed binning method to compute the depth of 215 | coverage while removing reads that are mapped 216 | to multiple genomic locations and to "blacklist" 217 | regions. This is followed by an additional step 218 | of quality control to remove bins with extreme 219 | mappability to avoid erroneous detections. 220 | Specifically, the "blacklist" bins, including 221 | [segmental duplication regions](http://humanparalogy. 222 | gs.washington.edu/build37/data/GRCh37GenomicSuperDup.tab) 223 | and [gaps in reference assembly](https://gist.github.com/leipzig/6123703) 224 | from telomere, centromere, and/or 225 | heterochromatin regions. 226 | ```{r, eval=TRUE} 227 | # Getting raw read depth 228 | coverageObj <- get_coverage_scDNA(bambedObj, mapqthres = 40, 229 | seq = 'paired-end', hgref = "hg38") 230 | Y_raw <- coverageObj$Y 231 | ``` 232 | 233 | ## 2.4 Quality control 234 | `get_samp_QC()` is used to perform QC step on 235 | single cells, where total number/proportion 236 | of reads, total number/proportion of mapped 237 | reads, total number/proportion of mapped 238 | non-duplicate reads, and number/proportion 239 | of reads with mapping quality greater than 20 240 | will be returned. Use `perform_qc()` to further 241 | remove samples/cells with low proportion of 242 | mapped reads, bins that have extreme GC content 243 | (less than 20% and greater than 80%) and low 244 | mappability (less than 0.9) to reduce artifacts. 245 | ```{r, eval=TRUE} 246 | QCmetric_raw <- get_samp_QC(bambedObj) 247 | qcObj <- perform_qc(Y_raw = Y_raw, 248 | sampname_raw = sampname_raw, ref_raw = ref_raw, 249 | QCmetric_raw = QCmetric_raw) 250 | Y <- qcObj$Y 251 | sampname <- qcObj$sampname 252 | ref <- qcObj$ref 253 | QCmetric <- qcObj$QCmetric 254 | ``` 255 | 256 | 257 | 258 | # 3. Running SCOPE 259 | ## 3.1 Gini coefficient 260 | One feature of SCOPE is to identify normal/diploid 261 | cells using Gini index. Gini coefficient is 262 | calculated for each cell as 2 times the area 263 | between the Lorenz curve and the diagonal. 264 | The value of the Gini index varies between 265 | 0 and 1, where 0 is the most uniform and 1 266 | is the most extreme. Cells with extremely 267 | high Gini coefficients(greater than 0.5) 268 | are recommended to be excluded. Set up a 269 | Gini threshold for identification of 270 | diploid/normal cells (for example, 271 | Gini less than 0.12). We demonstrate 272 | the pre-stored toy dataset as follows. 273 | ```{r, eval=TRUE, message=FALSE} 274 | # get gini coefficient for each cell 275 | Gini <- get_gini(Y_sim) 276 | ``` 277 | 278 | ## 3.2 Running SCOPE with negative control samples 279 | 280 | Normal cell index is determined either by Gini coefficients 281 | or prior knowledge. SCOPE utilizes ploidy estimates from a 282 | first-pass normalization run to ensure fast convergence 283 | and to avoid local optima. Specify `SoS.plot = TRUE` in 284 | `initialize_ploidy()` to visualize ploidy pre-estimations. 285 | The normalization procedure includes an expectation-maximization 286 | algorithm in the Poisson generalized linear model. Note that, 287 | by setting `ploidyInt` in the normalization function, SCOPE allows 288 | users to exploit prior-knowledge ploidies as the input and to 289 | manually tune a few cells that have poor fitting. 290 | ```{r, eval=TRUE, message=TRUE} 291 | # first-pass CODEX2 run with no latent factors 292 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 293 | gc_qc = ref_sim$gc, 294 | norm_index = which(Gini<=0.12)) 295 | 296 | # Ploidy initialization 297 | ploidy.sim <- initialize_ploidy(Y = Y_sim, Yhat = normObj.sim$Yhat, ref = ref_sim) 298 | 299 | # If using high performance clusters, parallel computing is 300 | # easy and improves computational efficiency. Simply use 301 | # normalize_scope_foreach() instead of normalize_scope(). 302 | # All parameters are identical. 303 | normObj.scope.sim <- normalize_scope_foreach(Y_qc = Y_sim, gc_qc = ref_sim$gc, 304 | K = 1, ploidyInt = ploidy.sim, 305 | norm_index = which(Gini<=0.12), T = 1:5, 306 | beta0 = normObj.sim$beta.hat, nCores = 2) 307 | # normObj.scope.sim <- normalize_scope(Y_qc = Y_sim, gc_qc = ref_sim$gc, 308 | # K = 1, ploidyInt = ploidy.sim, 309 | # norm_index = which(Gini<=0.12), T = 1:5, 310 | # beta0 = beta.hat.noK.sim) 311 | Yhat.sim <- normObj.scope.sim$Yhat[[which.max(normObj.scope.sim$BIC)]] 312 | fGC.hat.sim <- normObj.scope.sim$fGC.hat[[which.max(normObj.scope.sim$BIC)]] 313 | ``` 314 | 315 | Visualize selection results for each individual cell. 316 | By default, BIC is used to choose optimal CNV group. 317 | ```{r, eval=FALSE} 318 | plot_EM_fit(Y_qc = Y_sim, gc_qc = ref_sim$gc, norm_index = which(Gini<=0.12), 319 | T = 1:5, 320 | ploidyInt = ploidy.sim, beta0 = normObj.sim$beta.hat, 321 | filename = "plot_EM_fit_demo.pdf") 322 | ``` 323 | 324 | Upon completion of SCOPE's default normalization and segmentation, 325 | SCOPE includes the option to cluster cells based on the matrix of 326 | normalized z-scores, estimated copy numbers, or estimated changepoints. 327 | Given the inferred subclones, SCOPE can opt to perform a second round of 328 | group-wise ploidy initialization and normalization 329 | ```{r, eval=FALSE} 330 | # Group-wise ploidy initialization 331 | clones <- c("normal", "tumor1", "normal", "tumor1", "tumor1") 332 | ploidy.sim.group <- initialize_ploidy_group(Y = Y_sim, Yhat = Yhat.noK.sim, 333 | ref = ref_sim, groups = clones) 334 | ploidy.sim.group 335 | 336 | # Group-wise normalization 337 | normObj.scope.sim.group <- normalize_scope_group(Y_qc = Y_sim, 338 | gc_qc = ref_sim$gc, 339 | K = 1, ploidyInt = ploidy.sim.group, 340 | norm_index = which(clones=="normal"), 341 | groups = clones, 342 | T = 1:5, 343 | beta0 = beta.hat.noK.sim) 344 | Yhat.sim.group <- normObj.scope.sim.group$Yhat[[which.max( 345 | normObj.scope.sim.group$BIC)]] 346 | fGC.hat.sim.group <- normObj.scope.sim.group$fGC.hat[[which.max( 347 | normObj.scope.sim.group$BIC)]] 348 | ``` 349 | 350 | ## 3.3 Cross-sample segmentation by SCOPE 351 | 352 | SCOPE provides the cross-sample segmentation, 353 | which outputs shared breakpoints 354 | across cells from the same clone. This step 355 | processes the entire genome chromosome by 356 | chromosome. Shared breakpoints and integer copy-number 357 | profiles will be returned. 358 | ```{r, eval=TRUE, message=FALSE} 359 | chrs <- unique(as.character(seqnames(ref_sim))) 360 | segment_cs <- vector('list',length = length(chrs)) 361 | names(segment_cs) <- chrs 362 | for (chri in chrs) { 363 | message('\n', chri, '\n') 364 | segment_cs[[chri]] <- segment_CBScs(Y = Y_sim, 365 | Yhat = Yhat.sim, 366 | sampname = colnames(Y_sim), 367 | ref = ref_sim, 368 | chr = chri, 369 | mode = "integer", max.ns = 1) 370 | } 371 | iCN_sim <- do.call(rbind, lapply(segment_cs, function(z){z[["iCN"]]})) 372 | ``` 373 | 374 | ## 3.4 Visualization 375 | 376 | SCOPE offers heatmap of inferred integer copy-number 377 | profiles with cells clustered by hierarchical clustering. 378 | ```{r, eval=FALSE} 379 | plot_iCN(iCNmat = iCN_sim, ref = ref_sim, Gini = Gini, 380 | filename = "plot_iCN_demo") 381 | ``` 382 | 383 | 384 | ## Session information 385 | 386 | ```{r} 387 | sessionInfo() 388 | ``` -------------------------------------------------------------------------------- /R/normalize_scope_foreach.R: -------------------------------------------------------------------------------- 1 | #' @title Normalization of read depth with latent factors using 2 | #' Expectation-Maximization algorithm under the case-control 3 | #' setting in parallel 4 | #' 5 | #' @description Fit a Poisson generalized linear model to normalize 6 | #' the raw read depth data from single-cell DNA sequencing, 7 | #' with latent factors under the case-control setting. Model GC 8 | #' content bias using an expectation-maximization algorithm, 9 | #' which accounts for the different copy number states. 10 | #' 11 | #' @usage 12 | #' normalize_scope_foreach(Y_qc, gc_qc, K, norm_index, T, 13 | #' ploidyInt, beta0, minCountQC = 20, nCores = NULL) 14 | #' @param Y_qc read depth matrix after quality control 15 | #' @param gc_qc vector of GC content for each bin after quality control 16 | #' @param K Number of latent Poisson factors 17 | #' @param norm_index indices of normal/diploid cells 18 | #' @param T a vector of integers indicating number of CNV groups. 19 | #' Use BIC to select optimal number of CNV groups. If \code{T = 1}, 20 | #' assume all reads are from normal regions so that EM algorithm is 21 | #' not implemented. Otherwise, we assume there is always a CNV group 22 | #' of heterozygous deletion and a group of null region. The rest groups 23 | #' are representative of different duplication states. 24 | #' @param ploidyInt a vector of initialized ploidy return 25 | #' from \code{initialize_ploidy}. Users are also allowed to provide 26 | #' prior-knowledge ploidies as the input and to manually tune a few 27 | #' cells that have poor fitting 28 | #' @param beta0 a vector of initialized bin-specific biases returned 29 | #' from CODEX2 without latent factors 30 | #' @param minCountQC the minimum read coverage required for normalization 31 | #' and EM fitting. Defalut is \code{20} 32 | #' @param nCores number of cores to use. If \code{NULL}, number of cores 33 | #' is detected. Default is \code{NULL}. 34 | #' 35 | #' @return A list with components 36 | #' \item{Yhat}{A list of normalized read depth matrix with EM} 37 | #' \item{alpha.hat}{A list of absolute copy number matrix} 38 | #' \item{fGC.hat}{A list of EM estimated GC content bias matrix} 39 | #' \item{beta.hat}{A list of EM estimated bin-specific bias vector} 40 | #' \item{g.hat}{A list of estimated Poisson latent factor} 41 | #' \item{h.hat}{A list of estimated Poisson latent factor} 42 | #' \item{AIC}{AIC for model selection} 43 | #' \item{BIC}{BIC for model selection} 44 | #' \item{RSS}{RSS for model selection} 45 | #' \item{K}{Number of latent Poisson factors} 46 | #' 47 | #' @examples 48 | #' Gini <- get_gini(Y_sim) 49 | #' 50 | #' # first-pass CODEX2 run with no latent factors 51 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 52 | #' gc_qc = ref_sim$gc, 53 | #' norm_index = which(Gini<=0.12)) 54 | #' Yhat.noK.sim <- normObj.sim$Yhat 55 | #' beta.hat.noK.sim <- normObj.sim$beta.hat 56 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat 57 | #' N.sim <- normObj.sim$N 58 | #' 59 | #' # Ploidy initialization 60 | #' ploidy.sim <- initialize_ploidy(Y = Y_sim, 61 | #' Yhat = Yhat.noK.sim, 62 | #' ref = ref_sim) 63 | #' ploidy.sim 64 | #' 65 | #' # Specify nCores = 2 only for checking examples 66 | #' normObj.scope.sim <- normalize_scope_foreach(Y_qc = Y_sim, 67 | #' gc_qc = ref_sim$gc, 68 | #' K = 1, ploidyInt = ploidy.sim, 69 | #' norm_index = which(Gini<=0.12), T = 1:5, 70 | #' beta0 = beta.hat.noK.sim, nCores = 2) 71 | #' Yhat.sim <- normObj.scope.sim$Yhat[[which.max(normObj.scope.sim$BIC)]] 72 | #' fGC.hat.sim <- normObj.scope.sim$fGC.hat[[which.max(normObj.scope.sim$BIC)]] 73 | #' 74 | #' @author Rujin Wang \email{rujin@email.unc.edu} 75 | #' @import stats foreach parallel doParallel 76 | #' @export 77 | normalize_scope_foreach <- function(Y_qc, gc_qc, K, norm_index, T, 78 | ploidyInt, beta0, minCountQC = 20, nCores = NULL) { 79 | if (max(K) > length(norm_index)) 80 | stop("Number of latent Poisson factors K cannot 81 | exceed the number of normal samples. ") 82 | Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) { 83 | !any(x == 0) 84 | }), , drop = FALSE] 85 | if(dim(Y.nonzero)[1] <= 10){ 86 | message("Adopt arithmetic mean instead of geometric mean") 87 | pseudo.sample <- apply(Y_qc, 1, mean) 88 | Ntotal <- apply(apply(Y_qc, 2, function(x) { 89 | x/pseudo.sample 90 | }), 2, median, na.rm = TRUE) 91 | } else{ 92 | pseudo.sample <- apply(Y.nonzero, 1, function(x) { 93 | exp(sum(log(x))/length(x)) 94 | }) 95 | Ntotal <- apply(apply(Y.nonzero, 2, function(x) { 96 | x/pseudo.sample 97 | }), 2, median) 98 | } 99 | N <- round(Ntotal/median(Ntotal) * median(colSums(Y_qc))) 100 | Nmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 101 | data = N, byrow = TRUE) 102 | 103 | # Get initialization 104 | gcfit.temp <- Y_qc/Nmat/beta0 105 | alpha0 <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc)) 106 | for (j in seq_len(ncol(alpha0))) { 107 | loe.fit <- loess(gcfit.temp[, j] ~ gc_qc) 108 | gcfit.null <- loe.fit$fitted/(ploidyInt[j]/2) 109 | alpha0[, j] <- gcfit.temp[, j]/gcfit.null * 2 110 | } 111 | 112 | Yhat <- vector("list", length(K)) 113 | fGC.hat <- vector("list", length(K)) 114 | alpha.hat <- vector("list", length(K)) 115 | beta.hat <- vector("list", length(K)) 116 | g.hat <- vector("list", length(K)) 117 | h.hat <- vector("list", length(K)) 118 | AIC <- rep(NA, length = length(K)) 119 | BIC <- rep(NA, length = length(K)) 120 | RSS <- rep(NA, length = length(K)) 121 | 122 | # Initialization 123 | message("Initialization ...") 124 | gcfit.temp <- Y_qc/Nmat/beta0 125 | offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 126 | data = beta0, byrow = FALSE) 127 | fhat.temp <- getfGC_foreach(gcfit.temp = gcfit.temp, gctemp = gc_qc, 128 | Y = Y_qc, norm_index = norm_index, offset = offset, 129 | T = T, alpha = alpha0, minCountQC = minCountQC, nCores = nCores) 130 | fhat0 <- fhat.temp$fGC.hat 131 | alpha0 <- fhat.temp$alpha 132 | 133 | for (ki in seq_len(length(K))) { 134 | k <- K[ki] 135 | message("Computing normalization with k = ", k, 136 | " latent factors ...", sep = "") 137 | message("k = ", k) 138 | maxiter <- 10 139 | maxhiter <- 50 140 | BHTHRESH <- 1e-04 141 | HHTHRESH <- 1e-05 142 | iter <- 1 143 | fhat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = 0) 144 | betahat <- beta0 145 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 146 | data = betahat, byrow = FALSE) 147 | ghat <- matrix(0, nrow = nrow(Y_qc), ncol = k) 148 | hhat <- matrix(0, nrow = ncol(Y_qc), ncol = k) 149 | bhdiff <- rep(Inf, maxiter) 150 | fhdiff <- rep(Inf, maxiter) 151 | 152 | betahatlist <- vector("list", maxiter) 153 | fhatlist <- vector("list", maxiter) 154 | ghatlist <- vector("list", maxiter) 155 | hhatlist <- vector("list", maxiter) 156 | alphahatlist <- vector("list", maxiter) 157 | 158 | while (iter <= maxiter) { 159 | if (iter == 1) { 160 | fhatnew <- fhat0 161 | alpha <- alpha0 162 | } 163 | if (iter > 1) { 164 | gcfit.temp <- Y_qc/Nmat/betahat/exp(ghat %*% t(hhat)) 165 | offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 166 | data = betahat, byrow = FALSE) * exp(ghat %*% t(hhat)) 167 | fhat.temp <- getfGC_foreach(gcfit.temp = gcfit.temp, 168 | gctemp = gc_qc, 169 | Y = Y_qc, norm_index = norm_index, 170 | offset = offset, T = T, alpha = alpha0, 171 | minCountQC = minCountQC, 172 | nCores = nCores) 173 | fhatnew <- fhat.temp$fGC.hat 174 | alpha <- fhat.temp$alpha 175 | } 176 | fhatnew[fhatnew < quantile(fhatnew, 0.005)] <- quantile( 177 | fhatnew, 0.005) 178 | betahatnew <- apply((Y_qc/(fhatnew * Nmat * 179 | exp(ghat %*% t(hhat))))[, norm_index, drop = FALSE], 180 | 1, median) 181 | betahatnew[betahatnew <= 0] <- min(betahatnew[betahatnew > 0]) 182 | bhdiff[iter] <- sum((betahatnew - betahat)^2)/length(betahat) 183 | fhdiff[iter] <- sum((fhatnew - fhat)^2)/length(fhat) 184 | if (fhdiff[iter] > min(fhdiff)) 185 | break 186 | message("Iteration ", iter, "\t", "beta diff =", 187 | signif(bhdiff[iter], 3), "\t", "f(GC) diff =", 188 | signif(fhdiff[iter], 189 | 3)) 190 | fhat <- fhatnew 191 | betahat <- betahatnew 192 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 193 | data = betahat, byrow = FALSE) 194 | L <- log(Nmat * fhat * betahatmat * alpha/2) 195 | logmat <- log(pmax(Y_qc, 1)) - L 196 | logmat <- logmat - matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 197 | data = apply(logmat, 1, mean), byrow = FALSE) 198 | hhat <- svd(logmat, nu = k, nv = k)$v 199 | hhatnew <- hhat 200 | hiter <- 1 201 | hhdiff <- rep(Inf, maxhiter) 202 | while (hiter <= maxhiter) { 203 | for (s in seq_len(nrow(Y_qc))) { 204 | temp <- try(glm(formula = Y_qc[s, norm_index] ~ 205 | hhat[norm_index, ] - 206 | 1, offset = L[s, norm_index], 207 | family = poisson)$coefficients, silent = TRUE) 208 | if (is.character(temp)) { 209 | temp <- lm(log(pmax(Y_qc[s, norm_index], 210 | 1)) ~ hhat[norm_index, ] - 1, 211 | offset = log(L[s, norm_index]))$coefficients 212 | } 213 | ghat[s, ] <- temp 214 | } 215 | # avoid overflow or underflow of the g latent factors 216 | ghat[is.na(ghat)] <- 0 217 | if (max(ghat) >= 30) { 218 | ghat <- apply(ghat, 2, function(z) { 219 | z[z > quantile(z, 0.995)] = min(quantile(z, 0.995), 30) 220 | z 221 | }) 222 | } 223 | if (min(ghat) <= -30) { 224 | ghat <- apply(ghat, 2, function(z) { 225 | z[z < quantile(z, 0.005)] = max(quantile(z, 0.005), -30) 226 | z 227 | }) 228 | } 229 | for (t in seq_len(ncol(Y_qc))) { 230 | hhatnew[t, ] <- glm(formula = Y_qc[, t] ~ ghat - 1, 231 | offset = L[, t], family = poisson)$coefficients 232 | } 233 | gh <- ghat %*% t(hhatnew) 234 | gh <- scale(gh, center = TRUE, scale = FALSE) 235 | hhatnew <- svd(gh, nu = k, nv = k)$v 236 | hhdiff[hiter] <- sum((hhatnew - hhat)^2)/length(hhat) 237 | message("\t\t\t", "hhat diff =", signif(hhdiff[hiter], 3)) 238 | hhat <- hhatnew 239 | if (hhdiff[hiter] < HHTHRESH) 240 | break 241 | if (hiter > 10 & (rank(hhdiff))[hiter] <= 3) 242 | break 243 | hiter <- hiter + 1 244 | } 245 | alphahatlist[[iter]] <- alpha 246 | fhatlist[[iter]] <- fhat 247 | betahatlist[[iter]] <- betahat 248 | ghatlist[[iter]] <- ghat 249 | hhatlist[[iter]] <- hhat 250 | if (bhdiff[iter] < BHTHRESH) 251 | break 252 | if (iter > 5 & bhdiff[iter] > 1) 253 | break 254 | iter <- iter + 1 255 | } 256 | optIter <- which.min(fhdiff) 257 | message(paste("Stop at Iteration ", optIter, ".", sep = "")) 258 | alpha <- alphahatlist[[optIter]] 259 | fhat <- fhatlist[[optIter]] 260 | betahat <- betahatlist[[optIter]] 261 | ghat <- ghatlist[[optIter]] 262 | hhat <- hhatlist[[optIter]] 263 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 264 | data = betahat, byrow = FALSE) 265 | Yhat[[ki]] <- pmax(round(fhat * Nmat * betahatmat * 266 | exp(ghat %*% t(hhat)), 0), 1) 267 | alpha.hat[[ki]] <- alpha 268 | fGC.hat[[ki]] <- signif(fhat, 3) 269 | beta.hat[[ki]] <- signif(betahat, 3) 270 | h.hat[[ki]] <- signif(hhat, 3) 271 | g.hat[[ki]] <- signif(ghat, 3) 272 | Yhat.temp <- Yhat[[ki]] * alpha/2 273 | AIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) - Yhat.temp) - 274 | 2 * (length(ghat) + length(hhat)) 275 | BIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) - Yhat.temp) - 276 | (length(ghat) + length(hhat)) * log(length(Y_qc)) 277 | RSS[ki] <- sum((Y_qc - Yhat.temp)^2/length(Y_qc)) 278 | message("AIC", k, " = ", round(AIC[ki], 3)) 279 | message("BIC", k, " = ", round(BIC[ki], 3)) 280 | message("RSS", k, " = ", round(RSS[ki], 3), "\n") 281 | } 282 | list(Yhat = Yhat, alpha.hat = alpha.hat, fGC.hat = fGC.hat, 283 | beta.hat = beta.hat, g.hat = g.hat, h.hat = h.hat, 284 | AIC = AIC, BIC = BIC, RSS = RSS, K = K) 285 | } 286 | 287 | 288 | 289 | getfGCj_foreach <- function(gcfit.tempj, gctemp, Yj, offsetj, T, 290 | draw.plot = NULL, alphaj, minCountQC) { 291 | alphaj <- pmax(1, round(alphaj)) 292 | if (is.null(draw.plot)) { 293 | draw.plot <- FALSE 294 | } 295 | fGCi <- fitGC(gctemp, gcfit.tempj) 296 | resid <- abs(gcfit.tempj - fGCi) 297 | 298 | bin.filter <- which(resid > (median(resid) + 299 | 5 * mad(resid)) | Yj < minCountQC) 300 | if (length(bin.filter) == 0) { 301 | bin.filter <- which.max(gcfit.tempj) 302 | } 303 | Ti <- T 304 | if (Ti == 1) { 305 | Z <- matrix(nrow = length(gcfit.tempj), ncol = Ti, data = 1/Ti) 306 | vec_pi <- 1 307 | loe.fit.temp <- loess(gcfit.tempj[-bin.filter] ~ gctemp[-bin.filter]) 308 | fGCi <- predict(loe.fit.temp, newdata = gctemp, se = TRUE)$fit 309 | temp <- min(fGCi[!is.na(fGCi) & fGCi > 0]) 310 | fGCi[fGCi <= 0 | is.na(fGCi)] <- temp 311 | } 312 | if (Ti >= 2) { 313 | Z <- matrix(nrow = length(gcfit.tempj), ncol = Ti, data = 0) 314 | mintemp <- pmin(Ti, alphaj) 315 | Z[cbind(seq_len(nrow(Z)), mintemp)] <- 1 316 | vec_pi <- colSums(Z)/nrow(Z) 317 | 318 | loe.fit.temp <- loess((gcfit.tempj/(Z %*% 319 | as.matrix(seq_len(Ti)/2)))[-bin.filter] ~ gctemp[-bin.filter]) 320 | fGCi <- predict(loe.fit.temp, newdata = gctemp, se = TRUE)$fit 321 | temp <- min(fGCi[!is.na(fGCi) & fGCi > 0]) 322 | fGCi[fGCi <= 0 | is.na(fGCi)] <- temp 323 | 324 | diff.GC <- Inf 325 | diff.Z <- Inf 326 | iter <- 1 327 | while (iter <= 3 | diff.GC > 5e-06 | diff.Z > 0.005) { 328 | Mtemp <- Mstep(Z, gcfit.tempj, gctemp) 329 | vec_pi.new <- Mtemp$vec_pi 330 | fGCi.new <- Mtemp$fGCi 331 | Z.new <- Estep(fGCi.new, vec_pi.new, Yj, offsetj) 332 | diff.GC <- sum((fGCi - fGCi.new)^2)/length(fGCi) 333 | diff.Z <- sum((Z.new - Z)^2)/length(Z) 334 | vec_pi <- vec_pi.new 335 | Z <- Z.new 336 | fGCi <- fGCi.new 337 | iter <- iter + 1 338 | if (iter >= 50) 339 | break 340 | } 341 | } 342 | 343 | if (Ti == 1) { 344 | loe.fit.plot <- loess(gcfit.tempj ~ gctemp) 345 | fGCi.plot <- loe.fit.plot$fitted 346 | temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0]) 347 | fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp 348 | df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df 349 | 350 | loglik <- sum(dpois(Yj[-bin.filter], 351 | lambda = (offsetj * fGCi)[-bin.filter], log = TRUE)) 352 | BIC <- 2 * loglik - (length(gcfit.tempj) - df) * 353 | log(length(gcfit.tempj)) 354 | } else { 355 | loe.fit.plot <- loess((gcfit.tempj/(Z %*% 356 | as.matrix(seq_len(Ti)/2))) ~ gctemp) 357 | fGCi.plot <- loe.fit.plot$fitted 358 | temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0]) 359 | fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp 360 | df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df 361 | 362 | loglik <- sum(dpois(Yj[-bin.filter], 363 | lambda = (offsetj * fGCi * (Z %*% 364 | as.matrix(seq_len(Ti)/2)))[-bin.filter], 365 | log = TRUE)) 366 | BIC <- 2 * loglik - (length(gcfit.tempj) - df + Ti - 1) * 367 | log(length(gcfit.tempj)) 368 | } 369 | 370 | if (draw.plot) { 371 | smoothScatter(gctemp[-bin.filter], gcfit.tempj[-bin.filter], 372 | xlab = "GC content", ylab = "Y/beta/N/exp(gxh)", 373 | nrpoints = 0, main = paste("T =", Ti)) 374 | if (Ti == 1) { 375 | points(gctemp[order(gctemp)], fGCi.plot[order(gctemp)], 376 | lty = 2, col = 2, type = "l", lwd = 2) 377 | points(gctemp, gcfit.tempj, cex = 0.4, col = 2, pch = 16) 378 | } else { 379 | for (k in seq_len(Ti)) { 380 | points(gctemp[order(gctemp)], fGCi.plot[order(gctemp)] * 381 | k/2, lty = 2, col = k, type = "l", lwd = 2) 382 | points(gctemp[which((round(Z))[, k] == 1)], 383 | (gcfit.tempj)[which((round(Z))[, k] == 1)], cex = 0.4, 384 | col = k, pch = 16) 385 | } 386 | } 387 | } 388 | 389 | fGCi.obj <- fGCi 390 | Z.obj <- Z 391 | vec_pi.obj <- vec_pi 392 | return(list(fGCi.obj = fGCi.obj, Z.obj = Z.obj, 393 | vec_pi.obj = vec_pi.obj, bin.filter = bin.filter, 394 | loglik = loglik, BIC = BIC)) 395 | } 396 | 397 | 398 | 399 | if (getRversion() >= "2.15.1") { 400 | utils::globalVariables(c("Ti")) 401 | } 402 | getfGC_foreach <- function(gcfit.temp, gctemp, Y, norm_index, offset, T, 403 | alpha, minCountQC, nCores) { 404 | fGC.hat <- matrix(ncol = ncol(Y), nrow = nrow(Y)) 405 | for (j in seq_len(ncol(Y))) { 406 | cat(j, "\t") 407 | if (j %in% norm_index) { 408 | alpha[, j] <- 2 409 | loe.fit <- loess(gcfit.temp[, j] ~ gctemp) 410 | fGC.hat[, j] <- loe.fit$fitted 411 | } else { 412 | # begin foreach 413 | if (is.null(nCores)) { 414 | nCores <- detectCores() - 1 415 | } 416 | registerDoParallel(nCores) 417 | 418 | TList <- foreach(Ti = T, .export = c("fitGC", "Estep", "Mstep", 419 | "getfGCj_foreach")) %dopar% { 420 | getfGCj_foreach(gcfit.tempj = gcfit.temp[, j], gctemp = gctemp, 421 | Yj = Y[, j], offsetj = offset[,j], T = Ti, 422 | draw.plot = FALSE, 423 | alphaj = alpha[, j], minCountQC = minCountQC) 424 | } 425 | 426 | # When you're done, clean up the cluster 427 | stopImplicitCluster() 428 | # end foreach 429 | 430 | fGCi.obj <- lapply(TList, function(x) x[["fGCi.obj"]]) 431 | Z.obj <- lapply(TList, function(x) x[["Z.obj"]]) 432 | vec_pi.obj <- lapply(TList, function(x) x[["vec_pi.obj"]]) 433 | loglik <- vapply(TList, function(x) x[["loglik"]], numeric(1)) 434 | BIC <- vapply(TList, function(x) x[["BIC"]], numeric(1)) 435 | 436 | fGCj <- list(fGCi.obj = fGCi.obj, Z.obj = Z.obj, 437 | vec_pi.obj = vec_pi.obj, loglik = loglik, BIC = BIC) 438 | if (which.max(fGCj$BIC) == 1) { 439 | alpha[, j] <- 2 440 | } else { 441 | alpha[, j] <- apply(fGCj$Z.obj[[which.max(fGCj$BIC)]], 442 | 1, which.max) 443 | } 444 | fGC.hat[, j] <- fGCj$fGCi.obj[[which.max(fGCj$BIC)]] 445 | } 446 | } 447 | return(list(fGC.hat = fGC.hat, alpha = alpha)) 448 | } 449 | -------------------------------------------------------------------------------- /R/segment_CBScs.R: -------------------------------------------------------------------------------- 1 | #' @title Cross-sample segmentation 2 | #' 3 | #' @description SCOPE offers a cross-sample Poisson likelihood-based 4 | #' recursive segmentation, enabling shared breakpoints across cells 5 | #' from the same genetic background. 6 | #' 7 | #' @usage segment_CBScs(Y, Yhat, sampname, ref, chr, 8 | #' mode = "integer", max.ns) 9 | #' 10 | #' @param Y raw read depth matrix after quality control procedure 11 | #' @param Yhat normalized read depth matrix 12 | #' @param sampname vector of sample names 13 | #' @param ref GRanges object after quality control procedure 14 | #' @param chr chromosome name. Make sure it is consistent with the 15 | #' reference genome. 16 | #' @param mode format of returned copy numbers. Only integer mode is 17 | #' supported for scDNA-seq data. 18 | #' @param max.ns a number specifying how many rounds of nested structure 19 | #' searching would be performed. Defalut is \code{0}. 20 | #' 21 | #' @return A list with components 22 | #' \item{poolcall}{Cross-sample CNV callings indicating 23 | #' shared breakpoints} 24 | #' \item{finalcall}{Final cross-sample segmented callset of 25 | #' CNVs with genotyping results} 26 | #' \item{image.orig}{A matrix giving logarithm of normalized 27 | #' z-scores} 28 | #' \item{image.seg}{A matrix of logarithm of estimated 29 | #' copy number over 2} 30 | #' \item{iCN}{A matrix of inferred integer copy number profiles} 31 | #' 32 | #' @examples 33 | #' Yhat.sim <- normObj.scopeDemo$Yhat[[which.max(normObj.scopeDemo$BIC)]] 34 | #' segment_cs_chr1 <- segment_CBScs(Y = Y_sim, Yhat = Yhat.sim, 35 | #' sampname = colnames(Y_sim), 36 | #' ref = ref_sim, chr = 'chr1', max.ns = 1) 37 | #' 38 | #' @author Rujin Wang \email{rujin@email.unc.edu} 39 | #' @importFrom GenomicRanges GRanges 40 | #' @importFrom IRanges IRanges Views countOverlaps 41 | #' @importFrom GenomeInfoDb seqnames 42 | #' @export 43 | segment_CBScs <- function(Y, Yhat, sampname, ref, chr, mode = "integer", 44 | max.ns = 0) { 45 | if (is.na(match(chr, unique(as.character(seqnames(ref)))))) { 46 | stop("Chromosome not found in the reference genome. Make sure that 47 | all chromosomes are named consistently. \n") 48 | } 49 | 50 | stbin.flag <- min(which(as.character(seqnames(ref)) == chr)) 51 | Y <- Y[which(as.character(seqnames(ref)) == chr), ] 52 | Yhat <- Yhat[which(as.character(seqnames(ref)) == chr), ] 53 | ref <- ref[which(as.character(seqnames(ref)) == chr)] 54 | 55 | poolcall <- NULL 56 | message("Cross-sample segmenting for ", ncol(Y), " samples.") 57 | 58 | chpts0 <- c(1, nrow(Y)) 59 | cs.scan <- compute_cs_lratio(Y, Yhat, sampname, chpts0) 60 | i <- cs.scan$i 61 | j <- cs.scan$j 62 | Z <- cs.scan$Z 63 | init.cs.finalmat <- cs.scan$finalmat 64 | 65 | if (!is.null(init.cs.finalmat) && nrow(init.cs.finalmat) > 0) { 66 | # Further cross-sample nested searching 67 | chpts <- init.cs.finalmat[, c(1, 2)] 68 | if (is.null(dim(chpts))) { 69 | chpts <- t(as.matrix(chpts)) 70 | } 71 | 72 | keep_going <- 1 73 | # number of nested searching 74 | num_ns <- 1 75 | while (!all(keep_going == 0) & num_ns <= max.ns) { 76 | nested.output <- search_cs_nested(Y, Yhat, sampname, chpts) 77 | keep_going <- vapply(nested.output, 78 | function(z) z$is.nested, numeric(1)) 79 | 80 | newchpts <- NULL 81 | for (r in seq_len(length(keep_going))) { 82 | if (keep_going[r]) { 83 | newchpts <- rbind(newchpts, 84 | nested.output[[r]]$finalmat[, c(1, 2)]) 85 | } else { 86 | newchpts <- rbind(newchpts, chpts[r, ]) 87 | } 88 | } 89 | chpts <- newchpts 90 | num_ns <- num_ns + 1 91 | } 92 | 93 | # backward compute Z after nested searching 94 | temp <- NULL 95 | for (r in seq_len(nrow(chpts))) { 96 | idx <- which(i == chpts[r, 1] & j == chpts[r, 2]) 97 | if (length(idx) != 0) { 98 | temp <- c(temp, Z[idx]) 99 | } else { 100 | temp <- c(temp, rep(NA, 4)) 101 | } 102 | } 103 | cs.finalmat <- cbind(chpts, temp) 104 | cs.finalmat <- cs.finalmat[!is.na(cs.finalmat[, 3]), , 105 | drop = FALSE] 106 | cs.finalmat <- cs.finalmat[order(-cs.finalmat[, 3]), , 107 | drop = FALSE] 108 | } else { 109 | cs.finalmat <- matrix(data = c(chpts0, 0), 110 | nrow = 1, ncol = 3, byrow = TRUE) 111 | } 112 | 113 | cs.loglikeij <- rep(NA, nrow(cs.finalmat)) 114 | cs.mBIC <- rep(NA, nrow(cs.finalmat)) 115 | 116 | kappa1 <- 3/2 117 | kappa2 <- 2.27 118 | N <- ncol(Y) 119 | T <- nrow(Y) 120 | 121 | for (s in seq_len(nrow(cs.finalmat))) { 122 | tau <- sort(unique(c(as.vector(cs.finalmat[seq_len(s), 123 | c(1, 2)]), 1, nrow(Y)))) 124 | m <- length(tau) - 2 125 | if (m > 0) { 126 | J <- matrix(data = NA, nrow = m + 2, ncol = N) 127 | 128 | Y0 <- Y 129 | Yhat0 <- Yhat 130 | Y0[Y0 <= 20] <- 20 131 | Yhat0[Yhat0 <= 20] <- 20 132 | 133 | muhat <- matrix(data = NA, nrow = m + 1, ncol = N) 134 | rhat <- matrix(data = NA, nrow = m + 1, ncol = N) 135 | muhat[1, ] <- round(apply(Y0[seq_len(tau[2]), , 136 | drop = FALSE], 2, sum)/tau[2]) 137 | rhat[1, ] <- round(2 * (apply(Y0[seq_len(tau[2]), , 138 | drop = FALSE], 2, sum)/apply(Yhat0[seq_len(tau[2]), 139 | , drop = FALSE], 2, sum))) 140 | for (r in seq_len(m)) { 141 | muhat[r + 1, ] <- round(apply( 142 | Yhat0[(tau[r + 1] + 1):tau[r + 2], , 143 | drop = FALSE], 2, sum)/(tau[r + 2] - tau[r + 1] + 1)) 144 | rhat[r + 1, ] <- round((apply(Y0[ 145 | (tau[r + 1] + 1):tau[r + 2], , 146 | drop = FALSE], 2, sum)/apply(Yhat0[ 147 | (tau[r + 1] + 1):tau[r + 2], , 148 | drop = FALSE], 2, sum))) 149 | } 150 | carriershat <- rhat[2:(m + 1), ] - rhat[seq_len(m), ] 151 | carriershat[carriershat != 0] <- 1 152 | J[2:(m + 1), ] <- carriershat 153 | deltahat <- muhat[2:(m + 1), ] - muhat[seq_len(m), ] 154 | deltahatJ <- deltahat * J[2:(m + 1), ] 155 | if (is.null(dim(deltahatJ))) { 156 | deltahatJ <- matrix(data = deltahat, nrow = 1, 157 | ncol = length(deltahat)) 158 | } 159 | deltahatJ.sq.sum <- apply(deltahatJ^2, 1, sum) 160 | deltahatJ.sq.sum <- max(deltahatJ.sq.sum, 1) 161 | 162 | # M and pi 163 | M <- sum(carriershat) 164 | pihat <- M/(N * m) 165 | 166 | cs.temp <- create_chptsmat(cs.finalmat[seq_len(s), c(1, 2), 167 | drop = FALSE], chpts0) 168 | L <- matrix(data = NA, ncol = N, nrow = nrow(cs.temp)) 169 | for (r in seq_len(nrow(L))) { 170 | yact.temp <- apply(Y[cs.temp[r, 1]:cs.temp[r, 2], , 171 | drop = FALSE], 2, sum) 172 | lambda.temp <- apply(Yhat[cs.temp[r, 1]:cs.temp[r, 2], , 173 | drop = FALSE], 2, sum) 174 | yact.temp[lambda.temp < 20] <- 20 175 | lambda.temp[lambda.temp < 20] <- 20 176 | L[r, ] <- (1 - round(2 * yact.temp/lambda.temp)/2) * 177 | lambda.temp + log((round(2 * (yact.temp/lambda.temp)) + 178 | 1e-04)/2.0001) * yact.temp 179 | } 180 | cs.loglikeij[s] <- sum(L) 181 | 182 | term1 <- cs.loglikeij[s] 183 | if (M == 0) { 184 | term2 <- 0 185 | } else { 186 | term2 <- -M/2 * log(2 * cs.loglikeij[s]/M) 187 | } 188 | term3 <- -log(choose(T, m)) 189 | # term3 <- - m * log(choose(T, m)) 190 | term4 <- -M/2 191 | term5 <- -sum(log(deltahatJ.sq.sum)) 192 | term6 <- -m * (kappa1 - kappa2) 193 | if (pihat == 0 || pihat == 1) { 194 | term7 <- 0 195 | } else { 196 | term7 <- (M * log(pihat) + (N * m - M) * log(1 - pihat)) 197 | } 198 | mbic <- term1 + term2 + term3 + term4 + term5 + term6 + term7 199 | cs.mBIC[s] <- mbic 200 | } else { 201 | cs.mBIC[s] <- 0 202 | } 203 | } 204 | cs.mBIC <- round(cs.mBIC, digits = 3) 205 | 206 | cs.finalmat <- (cbind(cs.finalmat, cs.mBIC)[seq_len(which.max(cs.mBIC)), , 207 | drop = FALSE]) 208 | poolcall <- create_chptsmat(cs.finalmat[, c(1, 2), drop = FALSE], chpts0) 209 | colnames(poolcall) <- c("st", "end") 210 | 211 | finalcall <- NULL 212 | image.orig <- log(pmax(0.001, Y)/Yhat) 213 | image.orig[image.orig <= -2] <- -2 214 | image.orig[image.orig > 2] <- 2 215 | 216 | image.seg <- matrix(data = NA, nrow = nrow(image.orig), 217 | ncol = ncol(image.orig)) 218 | iCN <- matrix(data = NA, nrow = nrow(image.orig), 219 | ncol = ncol(image.orig)) 220 | colnames(image.seg) <- colnames(Y) 221 | colnames(iCN) <- colnames(Y) 222 | 223 | for (i in seq_len(nrow(poolcall))) { 224 | st_bin <- poolcall[i, "st"] 225 | ed_bin <- poolcall[i, "end"] 226 | 227 | yact.ind <- colSums(Y[st_bin:ed_bin, , drop = FALSE]) 228 | lambda.ind <- colSums(Yhat[st_bin:ed_bin, , drop = FALSE]) 229 | if (mode == "integer") { 230 | chat.ind <- round(2 * (yact.ind/lambda.ind)) 231 | } else if (mode == "fraction") { 232 | chat.ind <- 2 * (yact.ind/lambda.ind) 233 | } 234 | chat.ind[chat.ind > 14] <- 14 235 | 236 | image.seg[poolcall[i, 1]:poolcall[i, 2], ] <- 237 | matrix(nrow = (poolcall[i, 2] - poolcall[i, 1] + 1), 238 | ncol = ncol(image.seg), 239 | data = log(chat.ind/2), byrow = TRUE) 240 | iCN[poolcall[i, 1]:poolcall[i, 2], ] <- 241 | matrix(nrow = (poolcall[i, 2] - poolcall[i, 1] + 1), 242 | ncol = ncol(iCN), 243 | data = chat.ind, byrow = TRUE) 244 | 245 | temp <- cbind(colnames(Y), rep(st_bin, ncol(Y)), 246 | rep(ed_bin, ncol(Y)), chat.ind) 247 | temp <- temp[chat.ind != 2, ] 248 | finalcall <- rbind(finalcall, temp) 249 | rownames(finalcall) <- NULL 250 | colnames(finalcall) <- c("sample_name", "st_bin", 251 | "ed_bin", "cnv_no") 252 | } 253 | 254 | finalcall <- as.data.frame(finalcall) 255 | 256 | finalcall$st_bin <- as.numeric(paste(finalcall$st_bin)) 257 | finalcall$ed_bin <- as.numeric(paste(finalcall$ed_bin)) 258 | finalcall$cnv_no <- as.numeric(paste(finalcall$cnv_no)) 259 | 260 | 261 | poolcall[, "st"] <- poolcall[, "st"] + stbin.flag - 1 262 | poolcall[, "end"] <- poolcall[, "end"] + stbin.flag - 1 263 | finalcall[, "st_bin"] <- finalcall[, "st_bin"] + stbin.flag - 1 264 | finalcall[, "ed_bin"] <- finalcall[, "ed_bin"] + stbin.flag - 1 265 | return(list(poolcall = poolcall, finalcall = finalcall, 266 | image.orig = image.orig, image.seg = image.seg, iCN = iCN)) 267 | } 268 | 269 | 270 | 271 | create_chptsmat <- function(mat1, st_end) { 272 | st <- st_end[1] 273 | end <- st_end[2] 274 | mat1 <- mat1[order(mat1[, 1]), , drop = FALSE] 275 | if (mat1[1, 1] != st) { 276 | newchptsmat <- matrix(data = c(st, mat1[1, 1] - 1, mat1[1, ]), 277 | ncol = 2, byrow = TRUE) 278 | } else { 279 | newchptsmat <- t(as.matrix(mat1[1, ])) 280 | } 281 | if (nrow(mat1) > 1) { 282 | for (r in 2:nrow(mat1)) { 283 | if (mat1[r, 1] != mat1[r - 1, 2] + 1) { 284 | newchptsmat <- rbind(newchptsmat, matrix(data = 285 | c(mat1[r - 1, 2] + 1, mat1[r, 1] - 1, mat1[r, ]), 286 | ncol = 2, byrow = TRUE)) 287 | } else { 288 | newchptsmat <- rbind(newchptsmat, matrix(data = mat1[r, ], 289 | ncol = 2, byrow = TRUE)) 290 | } 291 | } 292 | } 293 | if (mat1[nrow(mat1), 2] != end) { 294 | newchptsmat <- rbind(newchptsmat, 295 | matrix(data = c(mat1[nrow(mat1), 2] + 1, end), nrow = 1, ncol = 2)) 296 | } 297 | return(newchptsmat) 298 | } 299 | 300 | 301 | 302 | compute_cs_lratio <- function(Y, Yhat, sampname, this.chpts, msgprint = TRUE) { 303 | Z0 <- NULL 304 | if (this.chpts[2] - this.chpts[1] < 2) { 305 | return(list(i = NA, j = NA, Z = NA, 306 | finalmat = t(as.matrix(this.chpts)), is.nested = 0)) 307 | } else { 308 | lmax <- nrow(Y) - 1 309 | for (sampno in seq_len(ncol(Y))) { 310 | if (msgprint) { 311 | message("Calculating scan statistic for sample ", 312 | sampno, ": ", sampname[sampno], ".") 313 | } 314 | 315 | y <- Y[, sampno] 316 | yhat <- Yhat[, sampno] 317 | 318 | if (any(yhat < 10)) { 319 | if (any(y[yhat < 10] < 10)) { 320 | y[yhat < 10] <- 10 321 | } 322 | yhat[yhat < 10] <- 10 323 | } 324 | 325 | ysum <- sum(y) 326 | yhatsum <- sum(yhat) 327 | num <- length(y) 328 | y <- c(y, rep(0, lmax)) 329 | yhat <- c(yhat, rep(0, lmax)) 330 | i <- rep(seq_len(num), rep(lmax, num)) 331 | j <- rep(seq_len(lmax), num) + i 332 | yact <- rep(0, length(i)) 333 | lambda <- rep(0, length(i)) 334 | for (k in seq_len(num)) { 335 | yact[(lmax * k - (lmax - 1)):(lmax * k)] <- cumsum( 336 | y[k:(k + lmax)])[-1] 337 | lambda[(lmax * k - (lmax - 1)):(lmax * k)] <- cumsum( 338 | yhat[k:(k + lmax)])[-1] 339 | } 340 | i <- i[j <= num] 341 | yact <- yact[j <= num] 342 | lambda <- lambda[j <= num] 343 | j <- j[j <= num] 344 | 345 | # integer mode 346 | chat <- round(2 * (yact/lambda)) 347 | idx.noLeft <- which(i == this.chpts[1]) 348 | idx.noRight <- which(j == this.chpts[2]) 349 | chat.L <- rep(NA, length(chat)) 350 | chat.R <- rep(NA, length(chat)) 351 | yact.L <- rep(NA, length(chat)) 352 | yact.R <- rep(NA, length(chat)) 353 | lambda.L <- rep(NA, length(chat)) 354 | lambda.R <- rep(NA, length(chat)) 355 | for (x in seq_len(length(chat))) { 356 | if (x %in% idx.noLeft) { 357 | yact.L[x] <- 0 358 | lambda.L[x] <- 0 359 | } else { 360 | yact.L[x] <- sum(y[seq_len((i[x] - 1))]) 361 | lambda.L[x] <- sum(yhat[seq_len((i[x] - 1))]) 362 | } 363 | 364 | if (x %in% idx.noRight) { 365 | yact.R[x] <- 0 366 | lambda.R[x] <- 0 367 | } else { 368 | yact.R[x] <- sum(y[(j[x] + 1):length(y)]) 369 | lambda.R[x] <- sum(yhat[(j[x] + 1):length(y)]) 370 | } 371 | } 372 | chat.L <- round(2 * (yact.L/lambda.L)) 373 | chat.R <- round(2 * (yact.R/lambda.R)) 374 | chat.L[which(is.na(chat.L))] <- 2 375 | chat.R[which(is.na(chat.R))] <- 2 376 | 377 | lratio.C <- (1 - chat/2) * lambda + 378 | log((chat + 1e-04)/2.0001) * yact 379 | lratio.L <- (1 - chat.L/2) * lambda.L + 380 | log((chat.L + 1e-04)/2.0001) * yact.L 381 | lratio.R <- (1 - chat.R/2) * lambda.R + 382 | log((chat.R + 1e-04)/2.0001) * yact.R 383 | lratio <- lratio.C + lratio.L + lratio.R 384 | chat[chat > 14] <- 14 385 | 386 | Z0 <- cbind(Z0, lratio) 387 | } 388 | Z <- rowSums(Z0) 389 | 390 | winlag <- this.chpts[1] - 1 391 | i <- i + winlag 392 | j <- j + winlag 393 | 394 | if (sum(Z > 0) > 0) { 395 | if (sum(Z > 0) >= 2) { 396 | finalmat <- (cbind(i, j, Z))[Z > 0, ] 397 | finalmat <- finalmat[order(-finalmat[, 3]), ] 398 | s <- 1 399 | while (s <= (nrow(finalmat))) { 400 | rowstart <- finalmat[s, 1] 401 | rowend <- finalmat[s, 2] 402 | rowsel <- (finalmat[, 1] <= rowend & 403 | finalmat[, 2] >= rowstart) 404 | rowsel[s] <- FALSE 405 | finalmat <- finalmat[!rowsel, ] 406 | if (is.vector(finalmat)) { 407 | finalmat <- t(as.matrix(finalmat)) 408 | } 409 | s <- s + 1 410 | } 411 | } 412 | if (sum(Z > 0) == 1) { 413 | finalmat <- (cbind(i, j, Z))[Z > 0, ] 414 | finalmat <- t(as.matrix(finalmat)) 415 | } 416 | finalmat <- finalmat[finalmat[, 2] - finalmat[, 1] > 1, , 417 | drop = FALSE] 418 | finalmat <- finalmat[finalmat[, 3] > 10, , drop = FALSE] 419 | finalmat <- round(finalmat, digits = 3) 420 | 421 | if (nrow(finalmat) == 0) { 422 | is.nested <- 0 423 | } else if (nrow(finalmat) == 1 & finalmat[1, 1] == 424 | this.chpts[1] & finalmat[1, 2] == this.chpts[2]) { 425 | is.nested <- 0 426 | } else if (nrow(finalmat) > 2 & 427 | length(unique(finalmat[, 3])) == 1) { 428 | is.nested <- 0 429 | } else if (max(finalmat[, 2] - finalmat[, 1]) <= 5) { 430 | is.nested <- 0 431 | } else if (nrow(finalmat) == 1 & (this.chpts[2] - 432 | this.chpts[1]) - 433 | (finalmat[1, 2] - finalmat[1, 1]) == 1) { 434 | is.nested <- 0 435 | } else { 436 | is.nested <- 1 437 | } 438 | } else { 439 | finalmat <- NULL 440 | is.nested <- 0 441 | } 442 | 443 | # Avoid losing the boundaries 444 | if (!is.null(finalmat) && nrow(finalmat) > 0) { 445 | min.st <- min(finalmat[, c(1, 2)]) 446 | if (this.chpts[1] < min.st) { 447 | idx <- which(i == this.chpts[1] & j == (min.st - 1)) 448 | idx2 <- which(finalmat[, 1] == min.st) 449 | if (length(idx) != 0) { 450 | Z.temp <- Z[idx] 451 | if (Z.temp > 0) { 452 | finalmat[idx2, 1] <- this.chpts[1] 453 | } else { 454 | finalmat <- rbind(c(this.chpts[1], min.st - 1, 0), 455 | finalmat) 456 | } 457 | } else { 458 | finalmat[idx2, 1] <- this.chpts[1] 459 | } 460 | } 461 | max.ed <- max(finalmat[, c(1, 2)]) 462 | if (this.chpts[2] > max.ed) { 463 | idx <- which(i == (max.ed + 1) & j == this.chpts[2]) 464 | idx2 <- which(finalmat[, 2] == max.ed) 465 | if (length(idx) != 0) { 466 | Z.temp <- Z[idx] 467 | if (Z.temp > 0) { 468 | finalmat[idx2, 2] <- this.chpts[2] 469 | } else { 470 | finalmat <- rbind(finalmat, c(max.ed + 1, 471 | this.chpts[2], 0)) 472 | } 473 | } else { 474 | finalmat[idx2, 2] <- this.chpts[2] 475 | } 476 | } 477 | } 478 | 479 | if (!is.null(finalmat) && nrow(finalmat) > 1) { 480 | for (r in seq_len(nrow(finalmat))) { 481 | singlepoint <- finalmat[r, 1] - 1 482 | sg.idx <- which(finalmat[, 2] == singlepoint - 1) 483 | if (length(sg.idx) != 0 && r < sg.idx) { 484 | finalmat[sg.idx, 2] <- singlepoint 485 | } 486 | } 487 | for (r in seq_len(nrow(finalmat))) { 488 | singlepoint <- finalmat[r, 2] + 1 489 | sg.idx <- which(finalmat[, 1] == singlepoint + 1) 490 | if (length(sg.idx) != 0 && r < sg.idx) { 491 | finalmat[sg.idx, 1] <- singlepoint 492 | } 493 | } 494 | } 495 | 496 | if (!is.null(finalmat) && nrow(finalmat) > 1) { 497 | finalmat <- create_chptsmat(finalmat[, c(1, 2), 498 | drop = FALSE], this.chpts) 499 | } 500 | return(list(i = i, j = j, Z = Z, finalmat = finalmat, 501 | is.nested = is.nested)) 502 | } 503 | } 504 | 505 | 506 | 507 | search_cs_nested <- function(Y, Yhat, sampname, chptsmat) { 508 | message("Performing cross-sample nested search... \n") 509 | nest.list <- vector("list", nrow(chptsmat)) 510 | for (r in seq_len(nrow(chptsmat))) { 511 | nest.list[[r]] <- compute_cs_lratio(Y[chptsmat[r, 1]:chptsmat[r, 2], , 512 | drop = FALSE], Yhat[chptsmat[r, 1]:chptsmat[r, 513 | 2], , drop = FALSE], sampname, chptsmat[r, ], msgprint = FALSE) 514 | } 515 | return(nest.list) 516 | } 517 | 518 | 519 | 520 | 521 | -------------------------------------------------------------------------------- /R/normalize_scope.R: -------------------------------------------------------------------------------- 1 | #' @title Normalization of read depth with latent factors using 2 | #' Expectation-Maximization algorithm under the case-control setting 3 | #' 4 | #' @description Fit a Poisson generalized linear model to normalize 5 | #' the raw read depth data from single-cell DNA sequencing, with 6 | #' latent factors under the case-control setting. Model GC content 7 | #' bias using an expectation-maximization algorithm, which accounts for 8 | #' the different copy number states. 9 | #' 10 | #' @usage 11 | #' normalize_scope(Y_qc, gc_qc, K, norm_index, T, ploidyInt, 12 | #' beta0, minCountQC = 20) 13 | #' @param Y_qc read depth matrix after quality control 14 | #' @param gc_qc vector of GC content for each bin after quality control 15 | #' @param K Number of latent Poisson factors 16 | #' @param norm_index indices of normal/diploid cells 17 | #' @param T a vector of integers indicating number of CNV groups. 18 | #' Use BIC to select optimal number of CNV groups. If \code{T = 1}, 19 | #' assume all reads are from normal regions so that EM algorithm is 20 | #' not implemented. Otherwise, we assume there is always a CNV group 21 | #' of heterozygous deletion and a group of null region. The rest 22 | #' groups are representative of different duplication states. 23 | #' @param ploidyInt a vector of initialized ploidy return 24 | #' from \code{initialize_ploidy}. Users are also allowed to provide 25 | #' prior-knowledge ploidies as the input and to manually tune a few 26 | #' cells that have poor fitting 27 | #' @param beta0 a vector of initialized bin-specific biases 28 | #' returned from CODEX2 without latent factors 29 | #' @param minCountQC the minimum read coverage required for 30 | #' normalization and EM fitting. Defalut is \code{20} 31 | #' 32 | #' @return A list with components 33 | #' \item{Yhat}{A list of normalized read depth matrix with EM} 34 | #' \item{alpha.hat}{A list of absolute copy number matrix} 35 | #' \item{fGC.hat}{A list of EM estimated GC content bias matrix} 36 | #' \item{beta.hat}{A list of EM estimated bin-specific bias vector} 37 | #' \item{g.hat}{A list of estimated Poisson latent factor} 38 | #' \item{h.hat}{A list of estimated Poisson latent factor} 39 | #' \item{AIC}{AIC for model selection} 40 | #' \item{BIC}{BIC for model selection} 41 | #' \item{RSS}{RSS for model selection} 42 | #' \item{K}{Number of latent Poisson factors} 43 | #' 44 | #' @examples 45 | #' Gini <- get_gini(Y_sim) 46 | #' 47 | #' # first-pass CODEX2 run with no latent factors 48 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 49 | #' gc_qc = ref_sim$gc, 50 | #' norm_index = which(Gini<=0.12)) 51 | #' Yhat.noK.sim <- normObj.sim$Yhat 52 | #' beta.hat.noK.sim <- normObj.sim$beta.hat 53 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat 54 | #' N.sim <- normObj.sim$N 55 | #' 56 | #' # Ploidy initialization 57 | #' ploidy.sim <- initialize_ploidy(Y = Y_sim, 58 | #' Yhat = Yhat.noK.sim, 59 | #' ref = ref_sim) 60 | #' ploidy.sim 61 | #' 62 | #' normObj.scope.sim <- normalize_scope(Y_qc = Y_sim, gc_qc = ref_sim$gc, 63 | #' K = 1, ploidyInt = ploidy.sim, 64 | #' norm_index = which(Gini<=0.12), T = 1:5, 65 | #' beta0 = beta.hat.noK.sim) 66 | #' Yhat.sim <- normObj.scope.sim$Yhat[[which.max(normObj.scope.sim$BIC)]] 67 | #' fGC.hat.sim <- normObj.scope.sim$fGC.hat[[which.max(normObj.scope.sim$BIC)]] 68 | #' 69 | #' @author Rujin Wang \email{rujin@email.unc.edu} 70 | #' @import stats 71 | #' @export 72 | normalize_scope <- function(Y_qc, gc_qc, K, norm_index, T, ploidyInt, 73 | beta0, minCountQC = 20) { 74 | if (max(K) > length(norm_index)) 75 | stop("Number of latent Poisson factors K cannot exceed the number of 76 | normal samples. ") 77 | Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) { 78 | !any(x == 0) 79 | }), , drop = FALSE] 80 | if(dim(Y.nonzero)[1] <= 10){ 81 | message("Adopt arithmetic mean instead of geometric mean") 82 | pseudo.sample <- apply(Y_qc, 1, mean) 83 | Ntotal <- apply(apply(Y_qc, 2, function(x) { 84 | x/pseudo.sample 85 | }), 2, median, na.rm = TRUE) 86 | } else{ 87 | pseudo.sample <- apply(Y.nonzero, 1, function(x) { 88 | exp(sum(log(x))/length(x)) 89 | }) 90 | Ntotal <- apply(apply(Y.nonzero, 2, function(x) { 91 | x/pseudo.sample 92 | }), 2, median) 93 | } 94 | N <- round(Ntotal/median(Ntotal) * median(colSums(Y_qc))) 95 | Nmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = N, 96 | byrow = TRUE) 97 | 98 | # Get initialization 99 | gcfit.temp <- Y_qc/Nmat/beta0 100 | alpha0 <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc)) 101 | for (j in seq_len(ncol(alpha0))) { 102 | loe.fit <- loess(gcfit.temp[, j] ~ gc_qc) 103 | gcfit.null <- loe.fit$fitted/(ploidyInt[j]/2) 104 | alpha0[, j] <- gcfit.temp[, j]/gcfit.null * 2 105 | } 106 | 107 | 108 | Yhat <- vector("list", length(K)) 109 | fGC.hat <- vector("list", length(K)) 110 | alpha.hat <- vector("list", length(K)) 111 | beta.hat <- vector("list", length(K)) 112 | g.hat <- vector("list", length(K)) 113 | h.hat <- vector("list", length(K)) 114 | AIC <- rep(NA, length = length(K)) 115 | BIC <- rep(NA, length = length(K)) 116 | RSS <- rep(NA, length = length(K)) 117 | 118 | # Initialization 119 | message("Initialization ...") 120 | gcfit.temp <- Y_qc/Nmat/beta0 121 | offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 122 | data = beta0, byrow = FALSE) 123 | fhat.temp <- getfGC(gcfit.temp = gcfit.temp, gctemp = gc_qc, 124 | Y = Y_qc, norm_index = norm_index, offset = offset, 125 | T = T, alpha = alpha0, minCountQC = minCountQC) 126 | fhat0 <- fhat.temp$fGC.hat 127 | alpha0 <- fhat.temp$alpha 128 | 129 | for (ki in seq_len(length(K))) { 130 | k <- K[ki] 131 | message("Computing normalization with k = ", k, 132 | " latent factors ...", sep = "") 133 | message("k = ", k) 134 | maxiter <- 10 135 | maxhiter <- 50 136 | BHTHRESH <- 1e-04 137 | HHTHRESH <- 1e-05 138 | iter <- 1 139 | fhat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = 0) 140 | betahat <- beta0 141 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 142 | data = betahat, byrow = FALSE) 143 | ghat <- matrix(0, nrow = nrow(Y_qc), ncol = k) 144 | hhat <- matrix(0, nrow = ncol(Y_qc), ncol = k) 145 | bhdiff <- rep(Inf, maxiter) 146 | fhdiff <- rep(Inf, maxiter) 147 | 148 | betahatlist <- vector("list", maxiter) 149 | fhatlist <- vector("list", maxiter) 150 | ghatlist <- vector("list", maxiter) 151 | hhatlist <- vector("list", maxiter) 152 | alphahatlist <- vector("list", maxiter) 153 | 154 | while (iter <= maxiter) { 155 | if (iter == 1) { 156 | fhatnew <- fhat0 157 | alpha <- alpha0 158 | } 159 | if (iter > 1) { 160 | gcfit.temp <- Y_qc/Nmat/betahat/exp(ghat %*% t(hhat)) 161 | offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 162 | data = betahat, byrow = FALSE) * exp(ghat %*% t(hhat)) 163 | fhat.temp <- getfGC(gcfit.temp = gcfit.temp, gctemp = gc_qc, 164 | Y = Y_qc, norm_index = norm_index, 165 | offset = offset, T = T, alpha = alpha0, 166 | minCountQC = minCountQC) 167 | fhatnew <- fhat.temp$fGC.hat 168 | alpha <- fhat.temp$alpha 169 | } 170 | fhatnew[fhatnew < quantile(fhatnew, 0.005)] <- quantile( 171 | fhatnew, 0.005) 172 | betahatnew <- apply((Y_qc/(fhatnew * Nmat * exp(ghat %*% 173 | t(hhat))))[, norm_index, drop = FALSE], 1, median) 174 | betahatnew[betahatnew <= 0] <- min(betahatnew[betahatnew > 0]) 175 | bhdiff[iter] <- sum((betahatnew - betahat)^2)/length(betahat) 176 | fhdiff[iter] <- sum((fhatnew - fhat)^2)/length(fhat) 177 | if (fhdiff[iter] > min(fhdiff)) 178 | break 179 | message("Iteration ", iter, "\t", "beta diff =", 180 | signif(bhdiff[iter], 3), "\t", "f(GC) diff =", 181 | signif(fhdiff[iter], 3)) 182 | fhat <- fhatnew 183 | betahat <- betahatnew 184 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 185 | data = betahat, byrow = FALSE) 186 | L <- log(Nmat * fhat * betahatmat * alpha/2) 187 | logmat <- log(pmax(Y_qc, 1)) - L 188 | logmat <- logmat - matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 189 | data = apply(logmat, 1, mean), byrow = FALSE) 190 | hhat <- svd(logmat, nu = k, nv = k)$v 191 | hhatnew <- hhat 192 | hiter <- 1 193 | hhdiff <- rep(Inf, maxhiter) 194 | while (hiter <= maxhiter) { 195 | for (s in seq_len(nrow(Y_qc))) { 196 | temp <- try(glm(formula = Y_qc[s, norm_index] ~ 197 | hhat[norm_index, ] - 198 | 1, offset = L[s, norm_index], 199 | family = poisson)$coefficients, 200 | silent = TRUE) 201 | if (is.character(temp)) { 202 | temp <- lm(log(pmax(Y_qc[s, norm_index], 1)) ~ 203 | hhat[norm_index, ] - 204 | 1, offset = log(L[s, norm_index]))$coefficients 205 | } 206 | ghat[s, ] <- temp 207 | } 208 | # avoid overflow or underflow of the g latent factors 209 | ghat[is.na(ghat)] <- 0 210 | if (max(ghat) >= 30) { 211 | ghat <- apply(ghat, 2, function(z) { 212 | z[z > quantile(z, 0.995)] = min(quantile(z, 213 | 0.995), 30) 214 | z 215 | }) 216 | } 217 | if (min(ghat) <= -30) { 218 | ghat <- apply(ghat, 2, function(z) { 219 | z[z < quantile(z, 0.005)] = max(quantile(z, 220 | 0.005), -30) 221 | z 222 | }) 223 | } 224 | for (t in seq_len(ncol(Y_qc))) { 225 | hhatnew[t, ] <- glm(formula = Y_qc[, t] ~ ghat - 1, 226 | offset = L[, t], family = poisson)$coefficients 227 | } 228 | gh <- ghat %*% t(hhatnew) 229 | gh <- scale(gh, center = TRUE, scale = FALSE) 230 | hhatnew <- svd(gh, nu = k, nv = k)$v 231 | hhdiff[hiter] <- sum((hhatnew - hhat)^2)/length(hhat) 232 | message("\t\t\t", "hhat diff =", 233 | signif(hhdiff[hiter], 3)) 234 | hhat <- hhatnew 235 | if (hhdiff[hiter] < HHTHRESH) 236 | break 237 | if (hiter > 10 & (rank(hhdiff))[hiter] <= 3) 238 | break 239 | hiter <- hiter + 1 240 | } 241 | alphahatlist[[iter]] <- alpha 242 | fhatlist[[iter]] <- fhat 243 | betahatlist[[iter]] <- betahat 244 | ghatlist[[iter]] <- ghat 245 | hhatlist[[iter]] <- hhat 246 | if (bhdiff[iter] < BHTHRESH) 247 | break 248 | if (iter > 5 & bhdiff[iter] > 1) 249 | break 250 | iter <- iter + 1 251 | } 252 | optIter <- which.min(fhdiff) 253 | message(paste("Stop at Iteration ", optIter, ".", sep = "")) 254 | alpha <- alphahatlist[[optIter]] 255 | fhat <- fhatlist[[optIter]] 256 | betahat <- betahatlist[[optIter]] 257 | ghat <- ghatlist[[optIter]] 258 | hhat <- hhatlist[[optIter]] 259 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 260 | data = betahat, byrow = FALSE) 261 | Yhat[[ki]] <- pmax(round(fhat * Nmat * betahatmat * 262 | exp(ghat %*% t(hhat)), 0), 1) 263 | alpha.hat[[ki]] <- alpha 264 | fGC.hat[[ki]] <- signif(fhat, 3) 265 | beta.hat[[ki]] <- signif(betahat, 3) 266 | h.hat[[ki]] <- signif(hhat, 3) 267 | g.hat[[ki]] <- signif(ghat, 3) 268 | Yhat.temp <- Yhat[[ki]] * alpha/2 269 | AIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) - 270 | Yhat.temp) - 2 * (length(ghat) + length(hhat)) 271 | BIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) - 272 | Yhat.temp) - (length(ghat) + length(hhat)) * 273 | log(length(Y_qc)) 274 | RSS[ki] <- sum((Y_qc - Yhat.temp)^2/length(Y_qc)) 275 | message("AIC", k, " = ", round(AIC[ki], 3)) 276 | message("BIC", k, " = ", round(BIC[ki], 3)) 277 | message("RSS", k, " = ", round(RSS[ki], 3), "\n") 278 | } 279 | list(Yhat = Yhat, alpha.hat = alpha.hat, fGC.hat = fGC.hat, 280 | beta.hat = beta.hat, g.hat = g.hat, h.hat = h.hat, 281 | AIC = AIC, BIC = BIC, RSS = RSS, K = K) 282 | } 283 | 284 | 285 | 286 | getfGCj <- function(gcfit.tempj, gctemp, Yj, offsetj, T, draw.plot = NULL, 287 | alphaj, minCountQC) { 288 | alphaj <- pmax(1, round(alphaj)) 289 | if (is.null(draw.plot)) { 290 | draw.plot <- FALSE 291 | } 292 | loglik = BIC = rep(NA, length(T)) 293 | 294 | fGCi.obj <- vector("list", length(T)) 295 | Z.obj <- vector("list", length(T)) 296 | vec_pi.obj <- vector("list", length(T)) 297 | fGCi <- fitGC(gctemp, gcfit.tempj) 298 | resid <- abs(gcfit.tempj - fGCi) 299 | 300 | bin.filter <- which(resid > (median(resid) + 301 | 5 * mad(resid)) | Yj < minCountQC) 302 | if (length(bin.filter) == 0) { 303 | bin.filter <- which.max(gcfit.tempj) 304 | } 305 | if (draw.plot) { 306 | par(mfrow = c(5, 2)) 307 | smoothScatter(gctemp[-bin.filter], gcfit.tempj[-bin.filter], 308 | main = "Original", xlab = "GC content", 309 | ylab = "Y/beta/N/exp(gxh)") 310 | } 311 | for (Ti in T) { 312 | if (Ti == 1) { 313 | Z <- matrix(nrow = length(gcfit.tempj), ncol = Ti, 314 | data = 1/Ti) 315 | vec_pi <- 1 316 | loe.fit.temp <- loess(gcfit.tempj[-bin.filter] ~ 317 | gctemp[-bin.filter]) 318 | fGCi <- predict(loe.fit.temp, newdata = gctemp, 319 | se = TRUE)$fit 320 | temp <- min(fGCi[!is.na(fGCi) & fGCi > 0]) 321 | fGCi[fGCi <= 0 | is.na(fGCi)] <- temp 322 | } 323 | if (Ti >= 2) { 324 | Z <- matrix(nrow = length(gcfit.tempj), ncol = Ti, data = 0) 325 | mintemp <- pmin(Ti, alphaj) 326 | Z[cbind(seq_len(nrow(Z)), mintemp)] <- 1 327 | vec_pi <- colSums(Z)/nrow(Z) 328 | 329 | loe.fit.temp <- loess((gcfit.tempj/(Z %*% 330 | as.matrix(seq_len(Ti)/2)))[-bin.filter] ~ 331 | gctemp[-bin.filter]) 332 | fGCi <- predict(loe.fit.temp, newdata = gctemp, se = TRUE)$fit 333 | temp <- min(fGCi[!is.na(fGCi) & fGCi > 0]) 334 | fGCi[fGCi <= 0 | is.na(fGCi)] <- temp 335 | 336 | diff.GC <- Inf 337 | diff.Z <- Inf 338 | iter <- 1 339 | while (iter <= 3 | diff.GC > 5e-06 | diff.Z > 0.005) { 340 | Mtemp <- Mstep(Z, gcfit.tempj, gctemp) 341 | vec_pi.new <- Mtemp$vec_pi 342 | fGCi.new <- Mtemp$fGCi 343 | Z.new <- Estep(fGCi.new, vec_pi.new, Yj, offsetj) 344 | diff.GC <- sum((fGCi - fGCi.new)^2)/length(fGCi) 345 | diff.Z <- sum((Z.new - Z)^2)/length(Z) 346 | vec_pi <- vec_pi.new 347 | Z <- Z.new 348 | fGCi <- fGCi.new 349 | iter <- iter + 1 350 | if (iter >= 50) 351 | break 352 | } 353 | } 354 | 355 | if (Ti == 1) { 356 | loe.fit.plot <- loess(gcfit.tempj ~ gctemp) 357 | fGCi.plot <- loe.fit.plot$fitted 358 | temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0]) 359 | fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp 360 | df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df 361 | 362 | loglik[which(T == Ti)] <- sum(dpois(Yj[-bin.filter], 363 | lambda = (offsetj * fGCi)[-bin.filter], log = TRUE)) 364 | BIC[which(T == Ti)] <- 2 * loglik[which(T == Ti)] - 365 | (length(gcfit.tempj) - df) * log(length(gcfit.tempj)) 366 | } else { 367 | loe.fit.plot <- loess((gcfit.tempj/(Z %*% 368 | as.matrix(seq_len(Ti)/2))) ~ gctemp) 369 | fGCi.plot <- loe.fit.plot$fitted 370 | temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0]) 371 | fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp 372 | df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df 373 | 374 | loglik[which(T == Ti)] <- sum(dpois(Yj[-bin.filter], 375 | lambda = (offsetj * fGCi * 376 | (Z %*% as.matrix(seq_len(Ti)/2)))[-bin.filter], 377 | log = TRUE)) 378 | BIC[which(T == Ti)] <- 2 * loglik[which(T == Ti)] - 379 | (length(gcfit.tempj) - df + Ti - 1) * 380 | log(length(gcfit.tempj)) 381 | } 382 | 383 | if (draw.plot) { 384 | smoothScatter(gctemp[-bin.filter], gcfit.tempj[-bin.filter], 385 | xlab = "GC content", ylab = "Y/beta/N/exp(gxh)", 386 | nrpoints = 0, main = paste("T =", Ti)) 387 | if (Ti == 1) { 388 | points(gctemp[order(gctemp)], fGCi.plot[order(gctemp)], 389 | lty = 2, col = 2, type = "l", lwd = 2) 390 | points(gctemp, gcfit.tempj, cex = 0.4, 391 | col = 2, pch = 16) 392 | } else { 393 | for (k in seq_len(Ti)) { 394 | points(gctemp[order(gctemp)], 395 | fGCi.plot[order(gctemp)] * k/2, 396 | lty = 2, col = k, type = "l", lwd = 2) 397 | points(gctemp[which((round(Z))[, k] == 1)], 398 | (gcfit.tempj)[which((round(Z))[, k] == 1)], 399 | cex = 0.4, col = k, pch = 16) 400 | } 401 | } 402 | } 403 | 404 | fGCi.obj[[which(T == Ti)]] <- fGCi 405 | Z.obj[[which(T == Ti)]] <- Z 406 | vec_pi.obj[[which(T == Ti)]] <- vec_pi 407 | } 408 | if (draw.plot) { 409 | plot(T, loglik, type = "b", xlab = "T", ylab = "loglik", 410 | main = "Log likelihood") 411 | abline(v = which.max(loglik), lty = 2) 412 | plot(T, BIC, type = "b", xlab = "T", ylab = "BIC", main = "BIC") 413 | abline(v = which.max(BIC), lty = 2) 414 | par(mfrow = c(1, 1)) 415 | } 416 | return(list(fGCi.obj = fGCi.obj, Z.obj = Z.obj, 417 | vec_pi.obj = vec_pi.obj, bin.filter = bin.filter, 418 | loglik = loglik, BIC = BIC)) 419 | } 420 | 421 | 422 | 423 | getfGC <- function(gcfit.temp, gctemp, Y, norm_index, offset, T, 424 | alpha, minCountQC) { 425 | fGC.hat <- matrix(ncol = ncol(Y), nrow = nrow(Y)) 426 | for (j in seq_len(ncol(Y))) { 427 | cat(j, "\t") 428 | if (j %in% norm_index) { 429 | alpha[, j] <- 2 430 | loe.fit <- loess(gcfit.temp[, j] ~ gctemp) 431 | fGC.hat[, j] <- loe.fit$fitted 432 | } else { 433 | fGCj <- getfGCj(gcfit.tempj = gcfit.temp[, j], 434 | gctemp = gctemp, Yj = Y[, j], offsetj = offset[, j], 435 | T = T, draw.plot = FALSE, alphaj = alpha[, j], 436 | minCountQC = minCountQC) 437 | if (which.max(fGCj$BIC) == 1) { 438 | alpha[, j] <- 2 439 | } else { 440 | alpha[, j] <- apply(fGCj$Z.obj[[which.max(fGCj$BIC)]], 441 | 1, which.max) 442 | } 443 | fGC.hat[, j] <- fGCj$fGCi.obj[[which.max(fGCj$BIC)]] 444 | } 445 | } 446 | return(list(fGC.hat = fGC.hat, alpha = alpha)) 447 | } 448 | 449 | 450 | Estep <- function(fGCi, vec_pi, Yj, offsetj) { 451 | P <- matrix(nrow = length(fGCi), ncol = length(vec_pi)) 452 | vec_pi[vec_pi == 0] <- 1e-100 453 | lambda <- matrix(nrow = nrow(P), ncol = ncol(P), 454 | data = offsetj * fGCi) * matrix(nrow = nrow(P), ncol = ncol(P), 455 | data = seq_len(length(vec_pi))/2, byrow = TRUE) 456 | P <- dpois(matrix(nrow = nrow(P), ncol = ncol(P), data = Yj), 457 | lambda = lambda, log = TRUE) + matrix(nrow = nrow(P), 458 | ncol = ncol(P), data = log(vec_pi), byrow = TRUE) 459 | Z <- matrix(nrow = length(fGCi), ncol = length(vec_pi)) 460 | for (k in seq_len(length(vec_pi))) { 461 | Z[, k] <- 1/(1 + apply(exp(apply(P, 2, function(x) { 462 | x - P[, k] 463 | })[, -k, drop = FALSE]), 1, sum)) 464 | } 465 | return(Z) 466 | } 467 | 468 | 469 | 470 | Mstep <- function(Z, gcfit.tempj, gctemp) { 471 | gcfit.temp <- gcfit.tempj/(Z %*% as.matrix(seq_len(ncol(Z))/2)) 472 | fGCi <- fitGC(gctemp, gcfit.temp) 473 | vec_pi <- colSums(Z)/nrow(Z) 474 | return(list(vec_pi = vec_pi, fGCi = fGCi)) 475 | } 476 | 477 | 478 | 479 | fitGC <- function(gctemp, gcfit.temp) { 480 | loe.fit <- loess(gcfit.temp ~ gctemp) 481 | fGCi <- loe.fit$fitted 482 | temp <- min(fGCi[fGCi > 0]) 483 | fGCi[fGCi <= 0] <- temp 484 | return(fGCi) 485 | } 486 | -------------------------------------------------------------------------------- /R/normalize_scope_group.R: -------------------------------------------------------------------------------- 1 | #' @title Group-wise normalization of read depth with latent factors using 2 | #' Expectation-Maximization algorithm and shared clonal memberships 3 | #' 4 | #' @description Fit a Poisson generalized linear model to normalize 5 | #' the raw read depth data from single-cell DNA sequencing, with 6 | #' latent factors and shared clonal memberships. Model GC content 7 | #' bias using an expectation-maximization algorithm, which accounts for 8 | #' clonal specific copy number states. 9 | #' 10 | #' @usage 11 | #' normalize_scope_group(Y_qc, gc_qc, K, norm_index, groups, T, 12 | #' ploidyInt, beta0, minCountQC = 20) 13 | #' @param Y_qc read depth matrix after quality control 14 | #' @param gc_qc vector of GC content for each bin after quality control 15 | #' @param K Number of latent Poisson factors 16 | #' @param norm_index indices of normal/diploid cells using group/clone 17 | #' labels 18 | #' @param groups clonal membership labels for each cell 19 | #' @param T a vector of integers indicating number of CNV groups. 20 | #' Use BIC to select optimal number of CNV groups. If \code{T = 1}, 21 | #' assume all reads are from normal regions so that EM algorithm is 22 | #' not implemented. Otherwise, we assume there is always a CNV group 23 | #' of heterozygous deletion and a group of null region. The rest 24 | #' groups are representative of different duplication states. 25 | #' @param ploidyInt a vector of group-wise initialized ploidy return 26 | #' from \code{initialize_ploidy_group}. Users are also allowed to 27 | #' provide prior-knowledge ploidies as the input and to manually 28 | #' tune a few cells/clones that have poor fitting 29 | #' @param beta0 a vector of initialized bin-specific biases 30 | #' returned from CODEX2 without latent factors 31 | #' @param minCountQC the minimum read coverage required for 32 | #' normalization and EM fitting. Defalut is \code{20} 33 | #' 34 | #' @return A list with components 35 | #' \item{Yhat}{A list of normalized read depth matrix with EM} 36 | #' \item{alpha.hat}{A list of absolute copy number matrix} 37 | #' \item{fGC.hat}{A list of EM estimated GC content bias matrix} 38 | #' \item{beta.hat}{A list of EM estimated bin-specific bias vector} 39 | #' \item{g.hat}{A list of estimated Poisson latent factor} 40 | #' \item{h.hat}{A list of estimated Poisson latent factor} 41 | #' \item{AIC}{AIC for model selection} 42 | #' \item{BIC}{BIC for model selection} 43 | #' \item{RSS}{RSS for model selection} 44 | #' \item{K}{Number of latent Poisson factors} 45 | #' 46 | #' @examples 47 | #' Gini <- get_gini(Y_sim) 48 | #' 49 | #' # first-pass CODEX2 run with no latent factors 50 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim, 51 | #' gc_qc = ref_sim$gc, 52 | #' norm_index = which(Gini<=0.12)) 53 | #' Yhat.noK.sim <- normObj.sim$Yhat 54 | #' beta.hat.noK.sim <- normObj.sim$beta.hat 55 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat 56 | #' N.sim <- normObj.sim$N 57 | #' 58 | #' # Group-wise ploidy initialization 59 | #' clones <- c("normal", "tumor1", "normal", "tumor1", "tumor1") 60 | #' ploidy.sim.group <- initialize_ploidy_group(Y = Y_sim, Yhat = Yhat.noK.sim, 61 | #' ref = ref_sim, groups = clones) 62 | #' ploidy.sim.group 63 | #' 64 | #' normObj.scope.sim.group <- normalize_scope_group(Y_qc = Y_sim, 65 | #' gc_qc = ref_sim$gc, 66 | #' K = 1, ploidyInt = ploidy.sim.group, 67 | #' norm_index = which(clones=="normal"), 68 | #' groups = clones, 69 | #' T = 1:5, 70 | #' beta0 = beta.hat.noK.sim) 71 | #' Yhat.sim.group <- normObj.scope.sim.group$Yhat[[which.max( 72 | #' normObj.scope.sim.group$BIC)]] 73 | #' fGC.hat.sim.group <- normObj.scope.sim.group$fGC.hat[[which.max( 74 | #' normObj.scope.sim.group$BIC)]] 75 | #' 76 | #' @author Rujin Wang \email{rujin@email.unc.edu} 77 | #' @import stats 78 | #' @export 79 | normalize_scope_group <- function(Y_qc, gc_qc, K, norm_index, groups, T, 80 | ploidyInt, beta0, minCountQC = 20) { 81 | if (max(K) > length(norm_index)) 82 | stop("Number of latent Poisson factors K cannot exceed the number of 83 | normal samples. ") 84 | Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) { 85 | !any(x == 0) 86 | }), , drop = FALSE] 87 | if(dim(Y.nonzero)[1] <= 10){ 88 | message("Adopt arithmetic mean instead of geometric mean") 89 | pseudo.sample <- apply(Y_qc, 1, mean) 90 | Ntotal <- apply(apply(Y_qc, 2, function(x) { 91 | x/pseudo.sample 92 | }), 2, median, na.rm = TRUE) 93 | } else{ 94 | pseudo.sample <- apply(Y.nonzero, 1, function(x) { 95 | exp(sum(log(x))/length(x)) 96 | }) 97 | Ntotal <- apply(apply(Y.nonzero, 2, function(x) { 98 | x/pseudo.sample 99 | }), 2, median) 100 | } 101 | N <- round(Ntotal/median(Ntotal) * median(colSums(Y_qc))) 102 | Nmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = N, 103 | byrow = TRUE) 104 | 105 | # Get initialization 106 | gcfit.temp <- Y_qc/Nmat/beta0 107 | alpha0 <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc)) 108 | for (j in seq_len(ncol(alpha0))) { 109 | loe.fit <- loess(gcfit.temp[, j] ~ gc_qc) 110 | gcfit.null <- loe.fit$fitted/(ploidyInt[j]/2) 111 | alpha0[, j] <- gcfit.temp[, j]/gcfit.null * 2 112 | } 113 | 114 | Yhat <- vector("list", length(K)) 115 | fGC.hat <- vector("list", length(K)) 116 | alpha.hat <- vector("list", length(K)) 117 | beta.hat <- vector("list", length(K)) 118 | g.hat <- vector("list", length(K)) 119 | h.hat <- vector("list", length(K)) 120 | AIC <- rep(NA, length = length(K)) 121 | BIC <- rep(NA, length = length(K)) 122 | RSS <- rep(NA, length = length(K)) 123 | 124 | # Initialization 125 | message("Initialization ...") 126 | gcfit.temp <- Y_qc/Nmat/beta0 127 | offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 128 | data = beta0, byrow = FALSE) 129 | 130 | fhat.temp <- getfGC2(gcfit.temp = gcfit.temp, gctemp = gc_qc, 131 | Y = Y_qc, norm_index = norm_index, groups = groups, 132 | offset = offset, 133 | T = T, alpha = alpha0, minCountQC = minCountQC) 134 | fhat0 <- fhat.temp$fGC.hat 135 | alpha0 <- fhat.temp$alpha 136 | 137 | for (ki in seq_len(length(K))) { 138 | k <- K[ki] 139 | message("Computing normalization with k = ", k, 140 | " latent factors ...", sep = "") 141 | message("k = ", k) 142 | maxiter <- 10 143 | maxhiter <- 50 144 | BHTHRESH <- 1e-04 145 | HHTHRESH <- 1e-05 146 | iter <- 1 147 | fhat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = 0) 148 | betahat <- beta0 149 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 150 | data = betahat, byrow = FALSE) 151 | ghat <- matrix(0, nrow = nrow(Y_qc), ncol = k) 152 | hhat <- matrix(0, nrow = ncol(Y_qc), ncol = k) 153 | bhdiff <- rep(Inf, maxiter) 154 | fhdiff <- rep(Inf, maxiter) 155 | 156 | betahatlist <- vector("list", maxiter) 157 | fhatlist <- vector("list", maxiter) 158 | ghatlist <- vector("list", maxiter) 159 | hhatlist <- vector("list", maxiter) 160 | alphahatlist <- vector("list", maxiter) 161 | 162 | while (iter <= maxiter) { 163 | if (iter == 1) { 164 | fhatnew <- fhat0 165 | alpha <- alpha0 166 | } 167 | if (iter > 1) { 168 | gcfit.temp <- Y_qc/Nmat/betahat/exp(ghat %*% t(hhat)) 169 | offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 170 | data = betahat, byrow = FALSE) * exp(ghat %*% t(hhat)) 171 | fhat.temp <- getfGC2(gcfit.temp = gcfit.temp, gctemp = gc_qc, 172 | Y = Y_qc, norm_index = norm_index, groups = groups, 173 | offset = offset, T = T, alpha = alpha0, 174 | minCountQC = minCountQC) 175 | fhatnew <- fhat.temp$fGC.hat 176 | alpha <- fhat.temp$alpha 177 | } 178 | fhatnew[fhatnew < quantile(fhatnew, 0.005)] <- quantile( 179 | fhatnew, 0.005) 180 | betahatnew <- apply((Y_qc/(fhatnew * Nmat * exp(ghat %*% 181 | t(hhat))))[, norm_index, drop = FALSE], 1, median) 182 | betahatnew[betahatnew <= 0] <- min(betahatnew[betahatnew > 0]) 183 | bhdiff[iter] <- sum((betahatnew - betahat)^2)/length(betahat) 184 | fhdiff[iter] <- sum((fhatnew - fhat)^2)/length(fhat) 185 | if (fhdiff[iter] > min(fhdiff)) 186 | break 187 | message("Iteration ", iter, "\t", "beta diff =", 188 | signif(bhdiff[iter], 3), "\t", "f(GC) diff =", 189 | signif(fhdiff[iter], 3)) 190 | fhat <- fhatnew 191 | betahat <- betahatnew 192 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 193 | data = betahat, byrow = FALSE) 194 | L <- log(Nmat * fhat * betahatmat * alpha/2) 195 | logmat <- log(pmax(Y_qc, 1)) - L 196 | logmat <- logmat - matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 197 | data = apply(logmat, 1, mean), byrow = FALSE) 198 | hhat <- svd(logmat, nu = k, nv = k)$v 199 | hhatnew <- hhat 200 | hiter <- 1 201 | hhdiff <- rep(Inf, maxhiter) 202 | while (hiter <= maxhiter) { 203 | for (s in seq_len(nrow(Y_qc))) { 204 | temp <- try(glm(formula = Y_qc[s, norm_index] ~ 205 | hhat[norm_index, ] - 206 | 1, offset = L[s, norm_index], 207 | family = poisson)$coefficients, 208 | silent = TRUE) 209 | if (is.character(temp)) { 210 | temp <- lm(log(pmax(Y_qc[s, norm_index], 1)) ~ 211 | hhat[norm_index, ] - 212 | 1, offset = log(L[s, norm_index]))$coefficients 213 | } 214 | ghat[s, ] <- temp 215 | } 216 | # avoid overflow or underflow of the g latent factors 217 | ghat[is.na(ghat)] <- 0 218 | if (max(ghat) >= 30) { 219 | ghat <- apply(ghat, 2, function(z) { 220 | z[z > quantile(z, 0.995)] = min(quantile(z, 0.995), 30) 221 | z 222 | }) 223 | } 224 | if (min(ghat) <= -30) { 225 | ghat <- apply(ghat, 2, function(z) { 226 | z[z < quantile(z, 0.005)] = max(quantile(z, 227 | 0.005), -30) 228 | z 229 | }) 230 | } 231 | for (t in seq_len(ncol(Y_qc))) { 232 | hhatnew[t, ] <- glm(formula = Y_qc[, t] ~ ghat - 1, 233 | offset = L[, t], family = poisson)$coefficients 234 | } 235 | gh <- ghat %*% t(hhatnew) 236 | gh <- scale(gh, center = TRUE, scale = FALSE) 237 | hhatnew <- svd(gh, nu = k, nv = k)$v 238 | hhdiff[hiter] <- sum((hhatnew - hhat)^2)/length(hhat) 239 | message("\t\t\t", "hhat diff =", 240 | signif(hhdiff[hiter], 3)) 241 | hhat <- hhatnew 242 | if (hhdiff[hiter] < HHTHRESH) 243 | break 244 | if (hiter > 10 & (rank(hhdiff))[hiter] <= 3) 245 | break 246 | hiter <- hiter + 1 247 | } 248 | 249 | alphahatlist[[iter]] <- alpha 250 | fhatlist[[iter]] <- fhat 251 | betahatlist[[iter]] <- betahat 252 | ghatlist[[iter]] <- ghat 253 | hhatlist[[iter]] <- hhat 254 | if (bhdiff[iter] < BHTHRESH) 255 | break 256 | if (iter > 5 & bhdiff[iter] > 1) 257 | break 258 | iter <- iter + 1 259 | } 260 | optIter <- which.min(fhdiff) 261 | message(paste("Stop at Iteration ", optIter, ".", sep = "")) 262 | alpha <- alphahatlist[[optIter]] 263 | fhat <- fhatlist[[optIter]] 264 | betahat <- betahatlist[[optIter]] 265 | ghat <- ghatlist[[optIter]] 266 | hhat <- hhatlist[[optIter]] 267 | betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), 268 | data = betahat, byrow = FALSE) 269 | Yhat[[ki]] <- pmax(round(fhat * Nmat * betahatmat * 270 | exp(ghat %*% t(hhat)), 0), 1) 271 | alpha.hat[[ki]] <- alpha 272 | fGC.hat[[ki]] <- signif(fhat, 3) 273 | beta.hat[[ki]] <- signif(betahat, 3) 274 | h.hat[[ki]] <- signif(hhat, 3) 275 | g.hat[[ki]] <- signif(ghat, 3) 276 | Yhat.temp <- Yhat[[ki]] * alpha/2 277 | AIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) - 278 | Yhat.temp) - 2 * (length(ghat) + length(hhat)) 279 | BIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) - 280 | Yhat.temp) - (length(ghat) + length(hhat)) * 281 | log(length(Y_qc)) 282 | RSS[ki] <- sum((Y_qc - Yhat.temp)^2/length(Y_qc)) 283 | message("AIC", k, " = ", round(AIC[ki], 3)) 284 | message("BIC", k, " = ", round(BIC[ki], 3)) 285 | message("RSS", k, " = ", round(RSS[ki], 3), "\n") 286 | } 287 | list(Yhat = Yhat, alpha.hat = alpha.hat, fGC.hat = fGC.hat, 288 | beta.hat = beta.hat, g.hat = g.hat, h.hat = h.hat, 289 | AIC = AIC, BIC = BIC, RSS = RSS, K = K) 290 | } 291 | 292 | getfGCG <- function(gcfit.tempg, gctemp, Yg, offsetg, T, draw.plot = NULL, 293 | alphag, minCountQC) { 294 | alphag <- pmax(1, round(alphag)) 295 | if (is.null(draw.plot)) { 296 | draw.plot <- FALSE 297 | } 298 | loglik <- rep(NA, length(T)) 299 | BIC <- rep(NA, length(T)) 300 | fGCi.obj <- vector("list", length(T)) 301 | fGCg.plot <- vector("list", length(T)) 302 | Z.obj <- vector("list", length(T)) 303 | vec_pi.obj <- vector("list", length(T)) 304 | 305 | bin.filter.list = vector("list", length = ncol(gcfit.tempg)) 306 | for (j in seq_len(ncol(gcfit.tempg))) { 307 | fGCi <- fitGC(gctemp, gcfit.tempg[,j]) 308 | resid <- abs(gcfit.tempg[,j] - fGCi) 309 | bin.filter.list[[j]] <- which(resid > (median(resid) + 5 * 310 | mad(resid)) | Yg[,j] < minCountQC) 311 | if (length(bin.filter.list[[j]]) == 0) { 312 | bin.filter.list[[j]] <- which.max(gcfit.tempg[,j]) 313 | } 314 | } 315 | bin.filter = sort(unique(unlist(bin.filter.list))) 316 | 317 | for (Ti in T) { 318 | cat(Ti, "\t") 319 | if (Ti == 1) { 320 | fGCg <- matrix(NA, ncol = ncol(gcfit.tempg), 321 | nrow = nrow(gcfit.tempg)) 322 | Z <- matrix(nrow = nrow(gcfit.tempg), ncol = Ti, 323 | data = 1/Ti) 324 | vec_pi <- 1 325 | for (j in seq_len(ncol(gcfit.tempg))) { 326 | loe.fit.temp <- loess(gcfit.tempg[-bin.filter,j] ~ 327 | gctemp[-bin.filter]) 328 | fGCi <- predict(loe.fit.temp, newdata = gctemp, 329 | se = TRUE)$fit 330 | temp <- min(fGCi[!is.na(fGCi) & fGCi > 0]) 331 | fGCi[fGCi <= 0 | is.na(fGCi)] <- temp 332 | fGCg[,j] = fGCi 333 | } 334 | } 335 | if (Ti >= 2) { 336 | Z <- matrix(nrow = nrow(gcfit.tempg), ncol = Ti, data = 0) 337 | mintemp <- pmin(Ti, alphag) 338 | Z[cbind(seq_len(nrow(Z)), mintemp)] <- 1 339 | vec_pi <- colSums(Z)/nrow(Z) 340 | 341 | fGCg <- matrix(NA, ncol = ncol(gcfit.tempg), 342 | nrow = nrow(gcfit.tempg)) 343 | for (j in seq_len(ncol(gcfit.tempg))) { 344 | loe.fit.temp <- loess((gcfit.tempg[,j]/(Z %*% 345 | as.matrix(seq_len(Ti)/2)))[-bin.filter] ~ 346 | gctemp[-bin.filter]) 347 | fGCi <- predict(loe.fit.temp, newdata = gctemp, se = TRUE)$fit 348 | temp <- min(fGCi[!is.na(fGCi) & fGCi > 0]) 349 | fGCi[fGCi <= 0 | is.na(fGCi)] <- temp 350 | fGCg[,j] <- fGCi 351 | } 352 | 353 | diff.GC <- Inf 354 | diff.Z <- Inf 355 | iter <- 1 356 | while (iter <= 3 | diff.GC > 5e-06 | diff.Z > 0.005) { 357 | Mtemp <- MstepG(Z, gcfit.tempg, gctemp) 358 | vec_pi.new <- Mtemp$vec_pig 359 | fGCi.new <- Mtemp$fGCg 360 | Z.new <- EstepG(fGCi.new, vec_pi.new, Yg, offsetg) 361 | diff.GC <- sum((fGCg - fGCi.new)^2)/length(fGCg) 362 | diff.Z <- sum((Z.new - Z)^2)/length(Z) 363 | vec_pi <- vec_pi.new 364 | Z <- Z.new 365 | fGCg <- fGCi.new 366 | iter <- iter + 1 367 | if (iter >= 50) 368 | break 369 | } 370 | } 371 | 372 | fGCg.plot[[which(T == Ti)]] <- matrix(NA, ncol = ncol(gcfit.tempg), 373 | nrow = nrow(gcfit.tempg)) 374 | if (Ti == 1) { 375 | loglik.temp <- rep(NA, ncol(gcfit.tempg)) 376 | BIC.temp <- rep(NA, ncol(gcfit.tempg)) 377 | for (j in seq_len(ncol(gcfit.tempg))) { 378 | loe.fit.plot <- loess(gcfit.tempg[,j] ~ gctemp) 379 | fGCi.plot <- loe.fit.plot$fitted 380 | temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0]) 381 | fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp 382 | df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df 383 | fGCg.plot[[which(T == Ti)]][,j] = fGCi.plot 384 | 385 | loglik.temp[j] <- sum(dpois(Yg[-bin.filter,j], 386 | lambda = (offsetg[,j] * fGCg[,j])[-bin.filter], 387 | log = TRUE)) 388 | BIC.temp[j] <- 2 * loglik.temp[j] - (nrow(gcfit.tempg) - df) * 389 | log(nrow(gcfit.tempg)) 390 | } 391 | loglik[which(T == Ti)] <- sum(loglik.temp) 392 | BIC[which(T == Ti)] <- sum(BIC.temp) 393 | } else { 394 | loglik.temp <- rep(NA, ncol(gcfit.tempg)) 395 | BIC.temp <- rep(NA, ncol(gcfit.tempg)) 396 | for (j in seq_len(ncol(gcfit.tempg))) { 397 | loe.fit.plot <- loess((gcfit.tempg[,j]/(Z %*% 398 | as.matrix(seq_len(Ti)/2))) ~ gctemp) 399 | fGCi.plot <- loe.fit.plot$fitted 400 | temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0]) 401 | fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp 402 | df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df 403 | fGCg.plot[[which(T == Ti)]][,j] <- fGCi.plot 404 | 405 | loglik.temp[j] <- sum(dpois(Yg[-bin.filter,j], lambda = 406 | (offsetg[,j] * fGCg[,j] * (Z %*% 407 | as.matrix(seq_len(Ti)/2)))[-bin.filter], 408 | log = TRUE)) 409 | BIC.temp[j] <- 2 * loglik.temp[j] - (nrow(gcfit.tempg) - 410 | df + Ti - 1) * log(nrow(gcfit.tempg)) 411 | } 412 | loglik[which(T == Ti)] <- sum(loglik.temp) 413 | BIC[which(T == Ti)] <- sum(BIC.temp) 414 | } 415 | 416 | fGCi.obj[[which(T == Ti)]] <- fGCg 417 | Z.obj[[which(T == Ti)]] <- Z 418 | vec_pi.obj[[which(T == Ti)]] <- vec_pi 419 | } 420 | 421 | 422 | for (j in seq_len(ncol(gcfit.tempg))) { 423 | if (draw.plot) { 424 | par(mfrow = c(5, 2)) 425 | smoothScatter(gctemp[-bin.filter], gcfit.tempg[-bin.filter,j], 426 | main = "Original", xlab = "GC content", 427 | ylab = "Y/beta/N/exp(gxh)") 428 | } 429 | 430 | for (Ti in T) { 431 | if (draw.plot) { 432 | smoothScatter(gctemp[-bin.filter], gcfit.tempg[-bin.filter,j], 433 | xlab = "GC content", ylab = "Y/beta/N/exp(gxh)", 434 | nrpoints = 0, main = paste("T =", Ti)) 435 | if (Ti == 1) { 436 | points(gctemp[order(gctemp)], fGCg.plot[[which(T == Ti)]][ 437 | order(gctemp),j], 438 | lty = 2, col = 2, type = "l", lwd = 2) 439 | points(gctemp, gcfit.tempg[,j], cex = 0.4, 440 | col = 2, pch = 16) 441 | } else { 442 | for (k in seq_len(Ti)) { 443 | points(gctemp[order(gctemp)], 444 | fGCg.plot[[which(T == Ti)]][order(gctemp),j] * k/2, 445 | lty = 2, col = k, type = "l", lwd = 2) 446 | points(gctemp[which((round(Z.obj[[which(T == Ti) 447 | ]]))[, k] == 1)], 448 | (gcfit.tempg[,j])[which((round(Z.obj[[which(T == Ti) 449 | ]]))[, k] == 1)], 450 | cex = 0.4, col = k, pch = 16) 451 | } 452 | } 453 | } 454 | } 455 | 456 | if (draw.plot) { 457 | plot(T, loglik, type = "b", xlab = "T", ylab = "loglik", 458 | main = "Log likelihood") 459 | abline(v = which.max(loglik), lty = 2) 460 | plot(T, BIC, type = "b", xlab = "T", ylab = "BIC", main = "BIC") 461 | abline(v = which.max(BIC), lty = 2) 462 | par(mfrow = c(1, 1)) 463 | } 464 | } 465 | return(list(fGCi.obj = fGCi.obj, Z.obj = Z.obj, 466 | vec_pi.obj = vec_pi.obj, bin.filter = bin.filter, 467 | loglik = loglik, BIC = BIC)) 468 | } 469 | 470 | getfGC2 <- function(gcfit.temp, gctemp, Y, norm_index, groups, offset, T, 471 | alpha, minCountQC) { 472 | fGC.hat <- matrix(ncol = ncol(Y), nrow = nrow(Y)) 473 | for (G in unique(groups)) { 474 | cat(G, "\t") 475 | g <- which(groups == G) 476 | if (!any(is.na(match(norm_index, g))) & 477 | !any(is.na(match(g, norm_index)))) { 478 | alpha[, g] <- 2 479 | for (j in g) { 480 | loe.fit <- loess(gcfit.temp[, j] ~ gctemp) 481 | fGC.hat[, j] <- loe.fit$fitted 482 | } 483 | } else { 484 | fGCg <- getfGCG(gcfit.tempg = gcfit.temp[, g, drop = FALSE], 485 | gctemp = gctemp, Yg = Y[, g, drop = FALSE], 486 | offsetg = offset[, g, drop = FALSE], 487 | T = T, draw.plot = FALSE, 488 | alphag = apply(alpha[, g, drop = FALSE], 1, median), 489 | minCountQC = minCountQC) 490 | if (which.max(fGCg$BIC) == 1) { 491 | alpha[, g] <- 2 492 | } else { 493 | alpha[, g] <- apply(fGCg$Z.obj[[which.max(fGCg$BIC)]], 494 | 1, which.max) 495 | } 496 | fGC.hat[, g] <- fGCg$fGCi.obj[[which.max(fGCg$BIC)]] 497 | } 498 | } 499 | return(list(fGC.hat = fGC.hat, alpha = alpha)) 500 | } 501 | 502 | 503 | EstepG <- function(fGCg, vec_pig, Yg, offsetg) { 504 | P <- matrix(nrow = nrow(fGCg), ncol = length(vec_pig)) 505 | vec_pig[vec_pig == 0] <- 1e-100 506 | pPoisson <- vector("list", length = ncol(fGCg)) 507 | for (j in seq_len(ncol(Yg))) { 508 | lambdaj <- matrix(nrow = nrow(P), ncol = ncol(P), 509 | data = offsetg[, j, drop = FALSE] * fGCg[, j, drop = FALSE]) * 510 | matrix(nrow = nrow(P), ncol = ncol(P), 511 | data = seq_len(length(vec_pig))/2, byrow = TRUE) 512 | pPoisson[[j]] <- dpois(matrix(nrow = nrow(P), ncol = ncol(P), 513 | data = Yg[, j, drop = FALSE]), 514 | lambda = lambdaj, log = TRUE) 515 | } 516 | P <- apply(simplify2array(pPoisson), c(1,2), sum) + matrix(nrow = nrow(P), 517 | ncol = ncol(P), data = log(vec_pig), byrow = TRUE) 518 | Zg <- matrix(nrow = nrow(fGCg), ncol = length(vec_pig)) 519 | for (k in seq_len(length(vec_pig))) { 520 | Zg[, k] <- 1/(1 + apply(exp(apply(P, 2, function(x) { 521 | x - P[, k] 522 | })[, -k, drop = FALSE]), 1, sum)) 523 | } 524 | return(Zg) 525 | } 526 | 527 | 528 | 529 | MstepG <- function(Zg, gcfit.tempg, gctemp) { 530 | fGCg <- matrix(NA, nrow = nrow(gcfit.tempg), ncol = ncol(gcfit.tempg)) 531 | for (j in seq_len(ncol(gcfit.tempg))) { 532 | gcfit.temp <- gcfit.tempg[,j]/(Zg %*% as.matrix(seq_len(ncol(Zg))/2)) 533 | fGCi <- fitGC(gctemp, gcfit.temp) 534 | fGCg[,j] <- fGCi 535 | } 536 | vec_pig <- colSums(Zg)/nrow(Zg) 537 | return(list(vec_pig = vec_pig, fGCg = fGCg)) 538 | } 539 | 540 | 541 | 542 | fitGC <- function(gctemp, gcfit.temp) { 543 | loe.fit <- loess(gcfit.temp ~ gctemp) 544 | fGCi <- loe.fit$fitted 545 | temp <- min(fGCi[fGCi > 0]) 546 | fGCi[fGCi <= 0] <- temp 547 | return(fGCi) 548 | } 549 | 550 | 551 | --------------------------------------------------------------------------------