├── data
    ├── demo.rda
    ├── iCN_sim.rda
    ├── ref.scopeDemo.rda
    ├── normObj.scopeDemo.rda
    ├── QCmetric.scopeDemo.rda
    └── coverageObj.scopeDemo.rda
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test_get_gini.R
    │   ├── test_initialize_ploidy.R
    │   ├── test_get_bam_bed.R
    │   ├── test_normalize_scope.R
    │   └── test_segment_CBScs.R
├── man
    ├── Y_sim.Rd
    ├── ref_sim.Rd
    ├── normObj.scopeDemo.Rd
    ├── ref.scopeDemo.Rd
    ├── QCmetric.scopeDemo.Rd
    ├── iCN_sim.Rd
    ├── coverageObj.scopeDemo.Rd
    ├── get_gini.Rd
    ├── get_samp_QC.Rd
    ├── get_gc.Rd
    ├── normalize_codex2_ns_noK.Rd
    ├── plot_iCN.Rd
    ├── initialize_ploidy.Rd
    ├── get_coverage_scDNA.Rd
    ├── get_bam_bed.Rd
    ├── segment_CBScs.Rd
    ├── get_mapp.Rd
    ├── initialize_ploidy_group.Rd
    ├── plot_EM_fit.Rd
    ├── perform_qc.Rd
    ├── normalize_scope.Rd
    ├── normalize_scope_foreach.Rd
    └── normalize_scope_group.Rd
├── R
    ├── get_gini.R
    ├── data.R
    ├── get_samp_QC.R
    ├── get_gc.R
    ├── get_bam_bed.R
    ├── get_mapp.R
    ├── initialize_ploidy.R
    ├── plot_EM_fit.R
    ├── normalize_codex2_ns_noK.R
    ├── initialize_ploidy_group.R
    ├── perform_qc.R
    ├── get_coverage_scDNA.R
    ├── plot_iCN.R
    ├── normalize_scope_foreach.R
    ├── segment_CBScs.R
    ├── normalize_scope.R
    └── normalize_scope_group.R
├── NEWS
├── inst
    └── docs
    │   └── split_script.py
├── NAMESPACE
├── DESCRIPTION
├── README.md
└── vignettes
    └── SCOPE_vignette.Rmd


/data/demo.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/demo.rda


--------------------------------------------------------------------------------
/data/iCN_sim.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/iCN_sim.rda


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(SCOPE)
3 | 
4 | test_check("SCOPE")
5 | 


--------------------------------------------------------------------------------
/data/ref.scopeDemo.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/ref.scopeDemo.rda


--------------------------------------------------------------------------------
/data/normObj.scopeDemo.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/normObj.scopeDemo.rda


--------------------------------------------------------------------------------
/data/QCmetric.scopeDemo.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/QCmetric.scopeDemo.rda


--------------------------------------------------------------------------------
/data/coverageObj.scopeDemo.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rujinwang/SCOPE/HEAD/data/coverageObj.scopeDemo.rda


--------------------------------------------------------------------------------
/tests/testthat/test_get_gini.R:
--------------------------------------------------------------------------------
1 | context("getGini")
2 | library(SCOPE)
3 | 
4 | test_that("Gini Calculation works", {
5 |   Gini <- get_gini(Y_sim)
6 |   expect_equal(sum(Gini<=0.12), 2)
7 | })
8 | 


--------------------------------------------------------------------------------
/man/Y_sim.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{Y_sim}
 5 | \alias{Y_sim}
 6 | \title{A read count matrix in the toy dataset}
 7 | \format{A read count matrix with 1544 bins and 39 cells}
 8 | \usage{
 9 | Y_sim
10 | }
11 | \description{
12 | A read count matrix in the toy dataset
13 | }
14 | \keyword{datasets}
15 | 


--------------------------------------------------------------------------------
/man/ref_sim.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{ref_sim}
 5 | \alias{ref_sim}
 6 | \title{A reference genome in the toy dataset}
 7 | \format{A GRanges object with 1544 bins and 1 metadata column of GC content}
 8 | \usage{
 9 | ref_sim
10 | }
11 | \description{
12 | A reference genome in the toy dataset
13 | }
14 | \keyword{datasets}
15 | 


--------------------------------------------------------------------------------
/man/normObj.scopeDemo.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{normObj.scopeDemo}
 5 | \alias{normObj.scopeDemo}
 6 | \title{Pre-stored normObj.scope data for demonstration purposes}
 7 | \format{Pre-computed by SCOPE using pre-stored data \code{Y_sim}}
 8 | \usage{
 9 | normObj.scopeDemo
10 | }
11 | \description{
12 | Pre-stored normObj.scope data for demonstration purposes
13 | }
14 | \keyword{datasets}
15 | 


--------------------------------------------------------------------------------
/man/ref.scopeDemo.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{ref.scopeDemo}
 5 | \alias{ref.scopeDemo}
 6 | \title{Pre-stored 500kb-size reference genome for demonstration purposes}
 7 | \format{Pre-computed using whole genome sequencing data
 8 | with GC content and mappability scores}
 9 | \usage{
10 | ref.scopeDemo
11 | }
12 | \description{
13 | Pre-stored 500kb-size reference genome for demonstration purposes
14 | }
15 | \keyword{datasets}
16 | 


--------------------------------------------------------------------------------
/man/QCmetric.scopeDemo.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{QCmetric.scopeDemo}
 5 | \alias{QCmetric.scopeDemo}
 6 | \title{Pre-stored QCmetric data for demonstration purposes}
 7 | \format{Pre-computed using whole genome sequencing data of
 8 | three single cells from 10X Genomics Single-Cell CNV solution}
 9 | \usage{
10 | QCmetric.scopeDemo
11 | }
12 | \description{
13 | Pre-stored QCmetric data for demonstration purposes
14 | }
15 | \keyword{datasets}
16 | 


--------------------------------------------------------------------------------
/man/iCN_sim.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{iCN_sim}
 5 | \alias{iCN_sim}
 6 | \title{A post cross-sample segmentation integer copy number matrix returned by
 7 | SCOPE in the demo}
 8 | \format{A post cross-sample segmentation integer copy number matrix of
 9 | five toy cells returned by SCOPE}
10 | \usage{
11 | iCN_sim
12 | }
13 | \description{
14 | A post cross-sample segmentation integer copy number matrix returned by
15 | SCOPE in the demo
16 | }
17 | \keyword{datasets}
18 | 


--------------------------------------------------------------------------------
/tests/testthat/test_initialize_ploidy.R:
--------------------------------------------------------------------------------
 1 | context("PreEst_Ploidy")
 2 | library(SCOPE)
 3 | 
 4 | test_that("Ploidy Initialization works", {
 5 |   Gini <- get_gini(Y_sim)
 6 |   normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
 7 |                                          gc_qc = ref_sim$gc,
 8 |                                          norm_index = which(Gini<=0.12))
 9 | 
10 |   Yhat.noK.sim <- normObj.sim$Yhat
11 |   ploidy.sim <- initialize_ploidy(Y = Y_sim, Yhat = Yhat.noK.sim, ref = ref_sim)
12 |   expect_equal(sum(Gini<=0.12), sum(ploidy.sim==2))
13 | })
14 | 


--------------------------------------------------------------------------------
/man/coverageObj.scopeDemo.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{coverageObj.scopeDemo}
 5 | \alias{coverageObj.scopeDemo}
 6 | \title{Pre-stored coverageObj.scope data for demonstration purposes}
 7 | \format{Pre-computed using whole genome sequencing data of
 8 | three single cells from 10X Genomics Single-Cell CNV solution}
 9 | \usage{
10 | coverageObj.scopeDemo
11 | }
12 | \description{
13 | Pre-stored coverageObj.scope data for demonstration purposes
14 | }
15 | \keyword{datasets}
16 | 


--------------------------------------------------------------------------------
/man/get_gini.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_gini.R
 3 | \name{get_gini}
 4 | \alias{get_gini}
 5 | \title{Compute Gini coefficients for single cells}
 6 | \usage{
 7 | get_gini(Y)
 8 | }
 9 | \arguments{
10 | \item{Y}{raw read depth matrix after quality control procedure}
11 | }
12 | \value{
13 | \item{Gini}{Vector of Gini coefficients for single cells
14 |     from scDNA-seq}
15 | }
16 | \description{
17 | Gini index is defined as two times the area
18 | between the Lorenz curve and the diagonal.
19 | }
20 | \examples{
21 | Gini <- get_gini(Y_sim)
22 | 
23 | }
24 | \author{
25 | Rujin Wang \email{rujin@email.unc.edu}
26 | }
27 | 


--------------------------------------------------------------------------------
/tests/testthat/test_get_bam_bed.R:
--------------------------------------------------------------------------------
 1 | context("getbambed")
 2 | library(SCOPE)
 3 | library(WGSmapp)
 4 | 
 5 | test_that("File preparation works", {
 6 |   bamfolder <- system.file("extdata", package = "WGSmapp")
 7 |   bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
 8 |   bamdir <- file.path(bamfolder, bamFile)
 9 |   sampname_raw <- sapply(strsplit(bamFile, ".", fixed = TRUE), "[", 1)
10 |   bambedObj <- get_bam_bed(bamdir = bamdir, sampname = sampname_raw)
11 |   bamdir <- bambedObj$bamdir
12 |   sampname_raw <- bambedObj$sampname
13 |   ref_raw <- bambedObj$ref
14 | 
15 |   expect_equal(length(bamFile), length(sampname_raw))
16 |   expect_gte(length(as.character(unique(seqnames(ref_raw)))), 22)
17 | })
18 | 


--------------------------------------------------------------------------------
/R/get_gini.R:
--------------------------------------------------------------------------------
 1 | #' @title Compute Gini coefficients for single cells
 2 | #'
 3 | #' @description Gini index is defined as two times the area
 4 | #' between the Lorenz curve and the diagonal.
 5 | #'
 6 | #' @param Y raw read depth matrix after quality control procedure
 7 | #'
 8 | #' @return
 9 | #'   \item{Gini}{Vector of Gini coefficients for single cells
10 | #'     from scDNA-seq}
11 | #'
12 | #' @examples
13 | #' Gini <- get_gini(Y_sim)
14 | #'
15 | #' @author Rujin Wang \email{rujin@email.unc.edu}
16 | #' @importFrom DescTools AUC
17 | #' @export
18 | get_gini <- function(Y) {
19 |     Gini <- rep(NA, ncol(Y))
20 |     for (i in seq_len(ncol(Y))) {
21 |         y <- sort(Y[, i])
22 |         x <- c(0, seq_len(length(y))/length(y))
23 |         y <- c(0, cumsum(y)/sum(y))
24 |         Gini[i] <- 2 * round(0.5 - AUC(x, y), 4)
25 |     }
26 |     return(Gini)
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/tests/testthat/test_normalize_scope.R:
--------------------------------------------------------------------------------
 1 | context("normalizescope")
 2 | library(SCOPE)
 3 | 
 4 | test_that("basic argument errors thrown", {
 5 |   Gini <- get_gini(Y_sim)
 6 |   normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
 7 |                                          gc_qc = ref_sim$gc,
 8 |                                          norm_index = which(Gini<=0.12))
 9 | 
10 |   Yhat.noK.sim <- normObj.sim$Yhat
11 |   beta.hat.noK.sim <- normObj.sim$beta.hat
12 |   ploidy.sim <- initialize_ploidy(Y = Y_sim, Yhat = Yhat.noK.sim, ref = ref_sim)
13 |   expect_error(normalize_scope_foreach(Y_qc = Y_sim, gc_qc = ref_sim$gc,
14 |                                        K = 1:3, ploidyInt = ploidy.sim,
15 |                                        norm_index = which(Gini<=0.12), T = 1:7,
16 |                                        beta0 = beta.hat.noK.sim), "exceed the number")
17 | })
18 | 


--------------------------------------------------------------------------------
/tests/testthat/test_segment_CBScs.R:
--------------------------------------------------------------------------------
 1 | context("CrossSampleSegmentation")
 2 | library(SCOPE)
 3 | 
 4 | test_that("Cross-sample CBS segmentation works", {
 5 |   chrs <- unique(as.character(seqnames(ref_sim)))
 6 |   segment_cs <- vector('list',length = length(chrs))
 7 |   names(segment_cs) <- chrs
 8 |   Yhat.sim <- normObj.scopeDemo$Yhat[[which.max(normObj.scopeDemo$BIC)]]
 9 |   for (chri in chrs) {
10 |     message('\n', chri, '\n')
11 |     segment_cs[[chri]] <- segment_CBScs(Y = Y_sim,
12 |                                       Yhat = Yhat.sim,
13 |                                       sampname = colnames(Y_sim),
14 |                                       ref = ref_sim,
15 |                                       chr = chri,
16 |                                       mode = "integer", max.ns = 1)
17 |   }
18 |   iCN <- do.call(rbind, lapply(segment_cs, function(z){z[["iCN"]]}))
19 | 
20 |   expect_equal(nrow(Y_sim), nrow(iCN))
21 |   expect_equal(ncol(Y_sim), ncol(iCN))
22 | })
23 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
 1 | Changes in version 1.5.2
 2 | + remove loading warnings
 3 | 
 4 | Changes in version 1.5.1
 5 | + update vignette
 6 | 
 7 | Changes in version 0.99.13
 8 | + Integrate mouse genome and update plot_iCN()
 9 | 
10 | Changes in version 0.99.12
11 | + add algorithms with shared clonal memberships
12 | 
13 | Changes in version 0.99.11
14 | + get_gc() dontrun
15 | 
16 | Changes in version 0.99.8
17 | + minor edit
18 | 
19 | Changes in version 0.99.7
20 | + second-round revision
21 | 
22 | Changes in version 0.99.6
23 | + first-round revision
24 | 
25 | Changes in version 0.99.5
26 | + Enable user-define bin length and offer SoSplot
27 | 
28 | Changes in version 0.99.4
29 | + Add .bed file for hg38
30 | 
31 | Changes in version 0.99.3
32 | + Test if SCOPE is linked to WGSmapp
33 | 
34 | Changes in version 0.99.2
35 | + Bump new version to trigger a build
36 | 
37 | Changes in version 0.99.1
38 | + Add session_info() to the vignette
39 | 
40 | Changes in version 0.99.0
41 | + SCOPE getting ready for submission to Bioc
42 | 


--------------------------------------------------------------------------------
/man/get_samp_QC.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_samp_QC.R
 3 | \name{get_samp_QC}
 4 | \alias{get_samp_QC}
 5 | \title{Get QC metrics for single cells}
 6 | \usage{
 7 | get_samp_QC(bambedObj)
 8 | }
 9 | \arguments{
10 | \item{bambedObj}{object returned from \code{get_bam_bed}}
11 | }
12 | \value{
13 | \item{QCmetric}{A matrix containing total number/proportion of reads,
14 |     total number/proportion of mapped reads, total number/proportion
15 |     of mapped non-duplicate reads, and number/proportion of reads with
16 |     mapping quality greater than 20}
17 | }
18 | \description{
19 | Perform QC step on single cells.
20 | }
21 | \examples{
22 | library(WGSmapp)
23 | library(BSgenome.Hsapiens.UCSC.hg38)
24 | bamfolder <- system.file('extdata', package = 'WGSmapp')
25 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
26 | bamdir <- file.path(bamfolder, bamFile)
27 | sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1)
28 | bambedObj <- get_bam_bed(bamdir = bamdir,
29 |                             sampname = sampname_raw, 
30 |                             hgref = "hg38")
31 | QCmetric_raw = get_samp_QC(bambedObj)
32 | 
33 | }
34 | \author{
35 | Rujin Wang \email{rujin@email.unc.edu}
36 | }
37 | 


--------------------------------------------------------------------------------
/man/get_gc.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_gc.R
 3 | \name{get_gc}
 4 | \alias{get_gc}
 5 | \title{Compute GC content}
 6 | \usage{
 7 | get_gc(ref, hgref = "hg19")
 8 | }
 9 | \arguments{
10 | \item{ref}{GRanges object returned from \code{get_bam_bed}}
11 | 
12 | \item{hgref}{reference genome. This should be 'hg19', 'hg38' or 'mm10'.
13 | Default is human genome \code{hg19}.}
14 | }
15 | \value{
16 | \item{gc}{Vector of GC content for each bin/target}
17 | }
18 | \description{
19 | Compute GC content for each bin
20 | }
21 | \examples{
22 | \dontrun{
23 | library(WGSmapp)
24 | library(BSgenome.Hsapiens.UCSC.hg38)
25 | bamfolder <- system.file('extdata', package = 'WGSmapp')
26 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
27 | bamdir <- file.path(bamfolder, bamFile)
28 | sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1)
29 | bambedObj <- get_bam_bed(bamdir = bamdir,
30 |                             sampname = sampname_raw, 
31 |                             hgref = "hg38")
32 | bamdir <- bambedObj$bamdir
33 | sampname_raw <- bambedObj$sampname
34 | ref_raw <- bambedObj$ref
35 | 
36 | gc <- get_gc(ref_raw, hgref = "hg38")
37 | }
38 | 
39 | }
40 | \author{
41 | Rujin Wang \email{rujin@email.unc.edu}
42 | }
43 | 


--------------------------------------------------------------------------------
/inst/docs/split_script.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import fileinput
 3 | import csv
 4 | with open("/pine/scr/r/u/rujin/10XGenomics/breast_tissue_A_2k/output/breast_tissue_A_2k_per_cell_summary_metrics.csv") as f:
 5 |     reader = csv.reader(f)
 6 |     next(reader) # skip header
 7 |     data = []
 8 |     for r in reader:
 9 |         # get the barcode list from the 1st column
10 |         bclist=r[0]
11 |         data.append(bclist)
12 | # total count of reads
13 | tcount=0
14 | bcount=0
15 | for line in fileinput.input():
16 |     tcount+=1
17 |     line=line.strip()
18 |     tags = line.split()[11:]
19 |     tags_sort = sorted(tags)
20 |     for tag in tags_sort:
21 | 	tag_split=tag.split(':')
22 |         if 'CB' in tag_split:
23 | 	    barcode=tag_split[-1]
24 | 	    if barcode in data:
25 |             	bcount+=1
26 |             	print(str(tcount)+" "+str(bcount))
27 | 	    	directory = "../align/"+barcode[:-2]
28 |             	outfile = directory+"/"+barcode+".sam"
29 |             	try:
30 |                     f=open(outfile,"a+")
31 | 		    print >> f,line
32 |                     f.close()
33 |             	except:
34 |                     os.mkdir(directory)
35 |                     f=open(outfile,"a+")
36 | 		    print >> f,line
37 |                     f.close()
38 | 	    	break
39 | pct=float(bcount)/float(tcount)
40 | print(pct)
41 | 
42 | 


--------------------------------------------------------------------------------
/man/normalize_codex2_ns_noK.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/normalize_codex2_ns_noK.R
 3 | \name{normalize_codex2_ns_noK}
 4 | \alias{normalize_codex2_ns_noK}
 5 | \title{Normalization of read depth without latent factors under
 6 | the case-control setting}
 7 | \usage{
 8 | normalize_codex2_ns_noK(Y_qc, gc_qc, norm_index)
 9 | }
10 | \arguments{
11 | \item{Y_qc}{read depth matrix after quality control}
12 | 
13 | \item{gc_qc}{vector of GC content for each bin after quality control}
14 | 
15 | \item{norm_index}{indices of normal/diploid cells}
16 | }
17 | \value{
18 | A list with components
19 |     \item{Yhat}{A list of normalized read depth matrix}
20 |     \item{fGC.hat}{A list of estimated GC content bias matrix}
21 |     \item{beta.hat}{A list of estimated bin-specific bias vector}
22 |     \item{N}{A vector of cell-specific library size factor,
23 |         which is computed from the genome-wide read depth data}
24 | }
25 | \description{
26 | Assuming that all reads are from diploid regions,
27 | fit a Poisson generalized linear model to normalize the
28 | raw read depth data from single-cell DNA sequencing, without
29 | latent factors under the case-control setting.
30 | }
31 | \examples{
32 | Gini <- get_gini(Y_sim)
33 | # first-pass CODEX2 run with no latent factors
34 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
35 |                                         gc_qc = ref_sim$gc,
36 |                                         norm_index = which(Gini<=0.12))
37 | 
38 | }
39 | \author{
40 | Rujin Wang \email{rujin@email.unc.edu}
41 | }
42 | 


--------------------------------------------------------------------------------
/man/plot_iCN.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot_iCN.R
 3 | \name{plot_iCN}
 4 | \alias{plot_iCN}
 5 | \title{Plot post-segmentation copy number profiles of integer values}
 6 | \usage{
 7 | plot_iCN(iCNmat, ref, Gini, annotation = NULL,
 8 |         plot.dendrogram = TRUE, show.names = FALSE, filename)
 9 | }
10 | \arguments{
11 | \item{iCNmat}{inferred integer copy-number matrix by SCOPE,
12 | with each column being a cell and each row being a genomic bin}
13 | 
14 | \item{ref}{GRanges object after quality control procedure}
15 | 
16 | \item{Gini}{vector of Gini coefficients for each cell,
17 | with the same order as that of cells in columns of \code{iCNmat}}
18 | 
19 | \item{annotation}{vector of annotation for each cell,
20 | with the same order as that of cells in columns of \code{iCNmat}.
21 | Default is \code{NULL}.}
22 | 
23 | \item{plot.dendrogram}{logical, whether to plot the dendrogram.
24 | Default is \code{TRUE}.}
25 | 
26 | \item{show.names}{logical, whether to show cell names by y axis. 
27 | Default is \code{FALSE}.}
28 | 
29 | \item{filename}{name of the output png file}
30 | }
31 | \value{
32 | png file with integer copy-number profiles across single cells
33 | with specified annotations
34 | }
35 | \description{
36 | Show heatmap of inferred integer copy-number profiles
37 | by SCOPE with cells clustered by hierarchical clustering
38 | }
39 | \examples{
40 | Gini <- get_gini(Y_sim)
41 | plot_iCN(iCNmat = iCN_sim,
42 |         ref = ref_sim,
43 |         Gini = Gini,
44 |         filename = 'plot_iCN_demo')
45 | 
46 | }
47 | \author{
48 | Rujin Wang \email{rujin@email.unc.edu}
49 | }
50 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(get_bam_bed)
 4 | export(get_coverage_scDNA)
 5 | export(get_gc)
 6 | export(get_gini)
 7 | export(get_mapp)
 8 | export(get_samp_QC)
 9 | export(initialize_ploidy)
10 | export(initialize_ploidy_group)
11 | export(normalize_codex2_ns_noK)
12 | export(normalize_scope)
13 | export(normalize_scope_foreach)
14 | export(normalize_scope_group)
15 | export(perform_qc)
16 | export(plot_EM_fit)
17 | export(plot_iCN)
18 | export(segment_CBScs)
19 | import(BSgenome.Hsapiens.UCSC.hg19)
20 | import(Rsamtools)
21 | import(doParallel)
22 | import(foreach)
23 | import(grDevices)
24 | import(graphics)
25 | import(parallel)
26 | import(stats)
27 | import(utils)
28 | importFrom(BSgenome,start)
29 | importFrom(BiocGenerics,end)
30 | importFrom(BiocGenerics,start)
31 | importFrom(Biostrings,alphabetFrequency)
32 | importFrom(Biostrings,unmasked)
33 | importFrom(DNAcopy,CNA)
34 | importFrom(DNAcopy,segment)
35 | importFrom(DNAcopy,smooth.CNA)
36 | importFrom(DescTools,AUC)
37 | importFrom(GenomeInfoDb,mapSeqlevels)
38 | importFrom(GenomeInfoDb,seqinfo)
39 | importFrom(GenomeInfoDb,seqlevels)
40 | importFrom(GenomeInfoDb,seqlevelsStyle)
41 | importFrom(GenomeInfoDb,seqnames)
42 | importFrom(GenomicRanges,GRanges)
43 | importFrom(GenomicRanges,pintersect)
44 | importFrom(GenomicRanges,tileGenome)
45 | importFrom(IRanges,IRanges)
46 | importFrom(IRanges,Views)
47 | importFrom(IRanges,countOverlaps)
48 | importFrom(IRanges,end)
49 | importFrom(IRanges,findOverlaps)
50 | importFrom(IRanges,width)
51 | importFrom(RColorBrewer,brewer.pal)
52 | importFrom(S4Vectors,queryHits)
53 | importFrom(S4Vectors,subjectHits)
54 | importFrom(gplots,colorpanel)
55 | importFrom(grDevices,dev.off)
56 | importFrom(grDevices,png)
57 | 


--------------------------------------------------------------------------------
/man/initialize_ploidy.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/initialize_ploidy.R
 3 | \name{initialize_ploidy}
 4 | \alias{initialize_ploidy}
 5 | \title{Ploidy pre-initialization}
 6 | \usage{
 7 | initialize_ploidy(Y, Yhat, ref, maxPloidy = 6, minPloidy = 1.5,
 8 |                 minBinWidth = 5, SoS.plot = FALSE)
 9 | }
10 | \arguments{
11 | \item{Y}{raw read depth matrix after quality control procedure}
12 | 
13 | \item{Yhat}{normalized read depth matrix}
14 | 
15 | \item{ref}{GRanges object after quality control procedure}
16 | 
17 | \item{maxPloidy}{maximum ploidy candidate. Defalut is \code{6}}
18 | 
19 | \item{minPloidy}{minimum ploidy candidate. Defalut is \code{1.5}}
20 | 
21 | \item{minBinWidth}{the minimum number of bins for a changed segment.
22 | Defalut is \code{5}}
23 | 
24 | \item{SoS.plot}{logical, whether to generate ploidy pre-estimation 
25 | plots. Default is \code{FALSE}.}
26 | }
27 | \value{
28 | \item{ploidy.SoS}{Vector of pre-estimated ploidies for each cell}
29 | }
30 | \description{
31 | Pre-estimate ploidies across all cells
32 | }
33 | \examples{
34 | Gini <- get_gini(Y_sim)
35 | 
36 | # first-pass CODEX2 run with no latent factors
37 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
38 |                                         gc_qc = ref_sim$gc,
39 |                                         norm_index = which(Gini<=0.12))
40 | Yhat.noK.sim <- normObj.sim$Yhat
41 | beta.hat.noK.sim <- normObj.sim$beta.hat
42 | fGC.hat.noK.sim <- normObj.sim$fGC.hat
43 | N.sim <- normObj.sim$N
44 | 
45 | # Ploidy initialization
46 | ploidy.sim <- initialize_ploidy(Y = Y_sim, 
47 |                                 Yhat = Yhat.noK.sim, 
48 |                                 ref = ref_sim)
49 | ploidy.sim
50 | 
51 | }
52 | \author{
53 | Rujin Wang \email{rujin@email.unc.edu}
54 | }
55 | 


--------------------------------------------------------------------------------
/man/get_coverage_scDNA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_coverage_scDNA.R
 3 | \name{get_coverage_scDNA}
 4 | \alias{get_coverage_scDNA}
 5 | \title{Get read coverage from single-cell DNA sequencing}
 6 | \usage{
 7 | get_coverage_scDNA(bambedObj, mapqthres, seq, hgref = "hg19")
 8 | }
 9 | \arguments{
10 | \item{bambedObj}{object returned from \code{get_bam_bed}}
11 | 
12 | \item{mapqthres}{mapping quality threshold of reads}
13 | 
14 | \item{seq}{the sequencing method to be used. This should be either
15 | 'paired-end' or 'single-end'}
16 | 
17 | \item{hgref}{reference genome. This should be 'hg19', 'hg38' or 'mm10'.
18 | Default is human genome \code{hg19}.}
19 | }
20 | \value{
21 | \item{Y}{Read depth matrix}
22 | }
23 | \description{
24 | Get read coverage for each genomic bin across all single
25 |     cells from scDNA-seq. Blacklist regions, such as segmental duplication 
26 |     regions and gaps near telomeres/centromeres will be masked prior to
27 |     getting coverage.
28 | }
29 | \examples{
30 | library(WGSmapp)
31 | library(BSgenome.Hsapiens.UCSC.hg38)
32 | bamfolder <- system.file('extdata', package = 'WGSmapp')
33 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
34 | bamdir <- file.path(bamfolder, bamFile)
35 | sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1)
36 | bambedObj <- get_bam_bed(bamdir = bamdir,
37 |                             sampname = sampname_raw, 
38 |                             hgref = "hg38")
39 | 
40 | # Getting raw read depth
41 | coverageObj <- get_coverage_scDNA(bambedObj,
42 |                                 mapqthres = 40,
43 |                                 seq = 'paired-end', 
44 |                                 hgref = "hg38")
45 | Y_raw <- coverageObj$Y
46 | 
47 | }
48 | \author{
49 | Rujin Wang \email{rujin@email.unc.edu}
50 | }
51 | 


--------------------------------------------------------------------------------
/man/get_bam_bed.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_bam_bed.R
 3 | \name{get_bam_bed}
 4 | \alias{get_bam_bed}
 5 | \title{Get bam file directories, sample names, and whole genomic bins}
 6 | \usage{
 7 | get_bam_bed(bamdir, sampname, hgref = "hg19", resolution = 500, 
 8 |             sex = FALSE)
 9 | }
10 | \arguments{
11 | \item{bamdir}{vector of the directory of a bam file. Should be in the same
12 | order as sample names in \code{sampname}.}
13 | 
14 | \item{sampname}{vector of sample names. Should be in the same order as bam
15 | directories in \code{bamdir}.}
16 | 
17 | \item{hgref}{reference genome. This should be 'hg19', 'hg38' or 'mm10'. 
18 | Default is human genome \code{hg19}.}
19 | 
20 | \item{resolution}{numeric value of fixed bin-length. Default is \code{500}.
21 | Unit is "kb".}
22 | 
23 | \item{sex}{logical, whether to include sex chromosomes. 
24 | Default is \code{FALSE}.}
25 | }
26 | \value{
27 | A list with components
28 |     \item{bamdir}{A vector of bam directories}
29 |     \item{sampname}{A vector of sample names}
30 |     \item{ref}{A GRanges object specifying whole genomic bin positions}
31 | }
32 | \description{
33 | Get bam file directories, sample names, and whole genomic
34 | bins from .bed file
35 | }
36 | \examples{
37 | library(WGSmapp)
38 | library(BSgenome.Hsapiens.UCSC.hg38)
39 | bamfolder <- system.file('extdata', package = 'WGSmapp')
40 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
41 | bamdir <- file.path(bamfolder, bamFile)
42 | sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1)
43 | bambedObj <- get_bam_bed(bamdir = bamdir, sampname = sampname_raw, 
44 |                         hgref = "hg38")
45 | bamdir <- bambedObj$bamdir
46 | sampname_raw <- bambedObj$sampname
47 | ref_raw <- bambedObj$ref
48 | 
49 | }
50 | \author{
51 | Rujin Wang \email{rujin@email.unc.edu}
52 | }
53 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' A read count matrix in the toy dataset
 2 | #'
 3 | #' @docType data
 4 | #'
 5 | #' @format A read count matrix with 1544 bins and 39 cells
 6 | #'
 7 | #' @keywords datasets
 8 | "Y_sim"
 9 | 
10 | 
11 | #' A reference genome in the toy dataset
12 | #'
13 | #' @docType data
14 | #'
15 | #' @format A GRanges object with 1544 bins and 1 metadata column of GC content
16 | #'
17 | #' @keywords datasets
18 | "ref_sim"
19 | 
20 | 
21 | #' A post cross-sample segmentation integer copy number matrix returned by
22 | #' SCOPE in the demo
23 | #'
24 | #' @docType data
25 | #'
26 | #' @format A post cross-sample segmentation integer copy number matrix of
27 | #' five toy cells returned by SCOPE
28 | #'
29 | #' @keywords datasets
30 | "iCN_sim"
31 | 
32 | 
33 | #' Pre-stored normObj.scope data for demonstration purposes
34 | #'
35 | #' @docType data
36 | #'
37 | #' @format Pre-computed by SCOPE using pre-stored data \code{Y_sim}
38 | #'
39 | #' @keywords datasets
40 | "normObj.scopeDemo"
41 | 
42 | 
43 | #' Pre-stored coverageObj.scope data for demonstration purposes
44 | #'
45 | #' @docType data
46 | #'
47 | #' @format Pre-computed using whole genome sequencing data of
48 | #' three single cells from 10X Genomics Single-Cell CNV solution
49 | #'
50 | #' @keywords datasets
51 | "coverageObj.scopeDemo"
52 | 
53 | 
54 | #' Pre-stored QCmetric data for demonstration purposes
55 | #'
56 | #' @docType data
57 | #'
58 | #' @format Pre-computed using whole genome sequencing data of
59 | #' three single cells from 10X Genomics Single-Cell CNV solution
60 | #'
61 | #' @keywords datasets
62 | "QCmetric.scopeDemo"
63 | 
64 | 
65 | 
66 | 
67 | #' Pre-stored 500kb-size reference genome for demonstration purposes
68 | #'
69 | #' @docType data
70 | #'
71 | #' @format Pre-computed using whole genome sequencing data
72 | #' with GC content and mappability scores
73 | #'
74 | #' @keywords datasets
75 | "ref.scopeDemo"
76 | 


--------------------------------------------------------------------------------
/man/segment_CBScs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/segment_CBScs.R
 3 | \name{segment_CBScs}
 4 | \alias{segment_CBScs}
 5 | \title{Cross-sample segmentation}
 6 | \usage{
 7 | segment_CBScs(Y, Yhat, sampname, ref, chr, 
 8 |                     mode = "integer", max.ns)
 9 | }
10 | \arguments{
11 | \item{Y}{raw read depth matrix after quality control procedure}
12 | 
13 | \item{Yhat}{normalized read depth matrix}
14 | 
15 | \item{sampname}{vector of sample names}
16 | 
17 | \item{ref}{GRanges object after quality control procedure}
18 | 
19 | \item{chr}{chromosome name. Make sure it is consistent with the
20 | reference genome.}
21 | 
22 | \item{mode}{format of returned copy numbers. Only integer mode is
23 | supported for scDNA-seq data.}
24 | 
25 | \item{max.ns}{a number specifying how many rounds of nested structure
26 | searching would be performed. Defalut is \code{0}.}
27 | }
28 | \value{
29 | A list with components
30 |     \item{poolcall}{Cross-sample CNV callings indicating
31 |         shared breakpoints}
32 |     \item{finalcall}{Final cross-sample segmented callset of
33 |         CNVs with genotyping results}
34 |     \item{image.orig}{A matrix giving logarithm of normalized
35 |         z-scores}
36 |     \item{image.seg}{A matrix of logarithm of estimated
37 |         copy number over 2}
38 |     \item{iCN}{A matrix of inferred integer copy number profiles}
39 | }
40 | \description{
41 | SCOPE offers a cross-sample Poisson likelihood-based
42 | recursive segmentation, enabling shared breakpoints across cells
43 | from the same genetic background.
44 | }
45 | \examples{
46 | Yhat.sim <- normObj.scopeDemo$Yhat[[which.max(normObj.scopeDemo$BIC)]]
47 | segment_cs_chr1 <- segment_CBScs(Y = Y_sim, Yhat = Yhat.sim,
48 |                         sampname = colnames(Y_sim),
49 |                         ref = ref_sim, chr = 'chr1', max.ns = 1)
50 | 
51 | }
52 | \author{
53 | Rujin Wang \email{rujin@email.unc.edu}
54 | }
55 | 


--------------------------------------------------------------------------------
/man/get_mapp.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_mapp.R
 3 | \name{get_mapp}
 4 | \alias{get_mapp}
 5 | \title{Compute mappability}
 6 | \usage{
 7 | get_mapp(ref, hgref = "hg19")
 8 | }
 9 | \arguments{
10 | \item{ref}{GRanges object returned from \code{get_bam_bed}}
11 | 
12 | \item{hgref}{reference genome. This should be 'hg19', 'hg38' or 'mm10'.
13 | Default is human genome \code{hg19}.}
14 | }
15 | \value{
16 | \item{mapp}{Vector of mappability for each bin/target}
17 | }
18 | \description{
19 | Compute mappability for each bin. Note that scDNA
20 | sequencing is whole-genome amplification and the mappability
21 | score is essential to determine variable binning method.
22 | Mappability track for 100-mers on the GRCh37/hg19 human
23 | reference genome from ENCODE is pre-saved. Compute the mean
24 | of mappability scores that overlapped reads map to bins,
25 | weighted by the width of mappability tracks on the genome
26 | reference. Use liftOver utility to calculate mappability
27 | for hg38, which is pre-saved as well. For mm10, there are
28 | two workarounds: 1) set all mappability to 1 to avoid extensive
29 | computation; 2) adopt QC procedures based on annotation results, 
30 | e.g., filter out bins within black list regions, 
31 | which generally have low mappability.
32 | }
33 | \examples{
34 | \dontrun{
35 | library(WGSmapp)
36 | library(BSgenome.Hsapiens.UCSC.hg38)
37 | bamfolder <- system.file('extdata', package = 'WGSmapp')
38 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
39 | bamdir <- file.path(bamfolder, bamFile)
40 | sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1)
41 | bambedObj <- get_bam_bed(bamdir = bamdir,
42 |                             sampname = sampname_raw, 
43 |                             hgref = "hg38")
44 | bamdir <- bambedObj$bamdir
45 | sampname_raw <- bambedObj$sampname
46 | ref_raw <- bambedObj$ref
47 | 
48 | mapp <- get_mapp(ref_raw, hgref = "hg38")
49 | }
50 | 
51 | }
52 | \author{
53 | Rujin Wang \email{rujin@email.unc.edu}
54 | }
55 | 


--------------------------------------------------------------------------------
/man/initialize_ploidy_group.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/initialize_ploidy_group.R
 3 | \name{initialize_ploidy_group}
 4 | \alias{initialize_ploidy_group}
 5 | \title{Group-wise ploidy pre-initialization}
 6 | \usage{
 7 | initialize_ploidy_group(Y, Yhat, ref, groups, 
 8 |                         maxPloidy = 6, minPloidy = 1.5,
 9 |                         minBinWidth = 5, SoS.plot = FALSE)
10 | }
11 | \arguments{
12 | \item{Y}{raw read depth matrix after quality control procedure}
13 | 
14 | \item{Yhat}{normalized read depth matrix}
15 | 
16 | \item{ref}{GRanges object after quality control procedure}
17 | 
18 | \item{groups}{clonal membership labels for each cell}
19 | 
20 | \item{maxPloidy}{maximum ploidy candidate. Defalut is \code{6}}
21 | 
22 | \item{minPloidy}{minimum ploidy candidate. Defalut is \code{1.5}}
23 | 
24 | \item{minBinWidth}{the minimum number of bins for a changed segment.
25 | Defalut is \code{5}}
26 | 
27 | \item{SoS.plot}{logical, whether to generate ploidy pre-estimation 
28 | plots. Default is \code{FALSE}.}
29 | }
30 | \value{
31 | \item{ploidy.SoS}{Vector of group-wise pre-estimated ploidies 
32 |             for each cell}
33 | }
34 | \description{
35 | Pre-estimate ploidies across cells with shared clonal 
36 | memberships
37 | }
38 | \examples{
39 | Gini <- get_gini(Y_sim)
40 | 
41 | # first-pass CODEX2 run with no latent factors
42 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
43 |                                         gc_qc = ref_sim$gc,
44 |                                         norm_index = which(Gini<=0.12))
45 | Yhat.noK.sim <- normObj.sim$Yhat
46 | beta.hat.noK.sim <- normObj.sim$beta.hat
47 | fGC.hat.noK.sim <- normObj.sim$fGC.hat
48 | N.sim <- normObj.sim$N
49 | 
50 | # Group-wise ploidy initialization
51 | clones <- c("normal", "tumor1", "normal", "tumor1", "tumor1")
52 | ploidy.sim.group <- initialize_ploidy_group(Y = Y_sim, Yhat = Yhat.noK.sim, 
53 |                                 ref = ref_sim, groups = clones)
54 | ploidy.sim.group
55 | 
56 | }
57 | \author{
58 | Rujin Wang \email{rujin@email.unc.edu}
59 | }
60 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: SCOPE
 2 | Type: Package
 3 | Title: A normalization and copy number estimation method for single-cell DNA sequencing
 4 | Version: 1.5.2
 5 | Author: Rujin Wang, Danyu Lin, Yuchao Jiang
 6 | Maintainer: Rujin Wang <rujin@email.unc.edu>
 7 | Description: Whole genome single-cell DNA sequencing (scDNA-seq) enables characterization of copy number profiles at the cellular level. This circumvents the averaging effects associated with bulk-tissue sequencing and has increased resolution yet decreased ambiguity in deconvolving cancer subclones and elucidating cancer evolutionary history. ScDNA-seq data is, however, sparse, noisy, and highly variable even within a homogeneous cell population, due to the biases and artifacts that are introduced during the library preparation and sequencing procedure. Here, we propose SCOPE, a normalization and copy number estimation method for scDNA-seq data. The distinguishing features of SCOPE include: (i) utilization of cell-specific Gini coefficients for quality controls and for identification of normal/diploid cells, which are further used as negative control samples in a Poisson latent factor model for normalization; (ii) modeling of GC content bias using an expectation-maximization algorithm embedded in the Poisson generalized linear models, which accounts for the different copy number states along the genome; (iii) a cross-sample iterative segmentation procedure to identify breakpoints that are shared across cells from the same genetic background. 
 8 | Depends: R (>= 3.6.0), GenomicRanges, IRanges, Rsamtools, GenomeInfoDb, BSgenome.Hsapiens.UCSC.hg19
 9 | Imports: stats, grDevices, graphics, utils, DescTools, RColorBrewer, gplots, foreach, parallel, doParallel, DNAcopy, BSgenome, Biostrings, BiocGenerics, S4Vectors
10 | Suggests:
11 |     knitr,
12 |     rmarkdown,
13 |     WGSmapp,
14 |     BSgenome.Hsapiens.UCSC.hg38, 
15 |     BSgenome.Mmusculus.UCSC.mm10, 
16 |     testthat (>= 2.1.0)
17 | VignetteBuilder: knitr
18 | biocViews: SingleCell, 
19 |     Normalization, 
20 |     CopyNumberVariation, 
21 |     Sequencing, WholeGenome, 
22 |     Coverage, 
23 |     Alignment, 
24 |     QualityControl, 
25 |     DataImport,
26 |     DNASeq
27 | License: GPL-2
28 | LazyData: true
29 | RoxygenNote: 6.1.1
30 | Encoding: UTF-8
31 | 


--------------------------------------------------------------------------------
/R/get_samp_QC.R:
--------------------------------------------------------------------------------
 1 | #' @title Get QC metrics for single cells
 2 | #'
 3 | #' @description Perform QC step on single cells.
 4 | #'
 5 | #' @param bambedObj object returned from \code{get_bam_bed}
 6 | #'
 7 | #' @return
 8 | #'   \item{QCmetric}{A matrix containing total number/proportion of reads,
 9 | #'     total number/proportion of mapped reads, total number/proportion
10 | #'     of mapped non-duplicate reads, and number/proportion of reads with
11 | #'     mapping quality greater than 20}
12 | #'
13 | #' @examples
14 | #' library(WGSmapp)
15 | #' library(BSgenome.Hsapiens.UCSC.hg38)
16 | #' bamfolder <- system.file('extdata', package = 'WGSmapp')
17 | #' bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
18 | #' bamdir <- file.path(bamfolder, bamFile)
19 | #' sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1)
20 | #' bambedObj <- get_bam_bed(bamdir = bamdir,
21 | #'                             sampname = sampname_raw, 
22 | #'                             hgref = "hg38")
23 | #' QCmetric_raw = get_samp_QC(bambedObj)
24 | #'
25 | #' @author Rujin Wang \email{rujin@email.unc.edu}
26 | #' @import Rsamtools
27 | #' @export
28 | get_samp_QC <- function(bambedObj) {
29 |     ref <- bambedObj$ref
30 |     bamdir <- bambedObj$bamdir
31 |     sampname <- bambedObj$sampname
32 |     QCmetric <- matrix(ncol = 8, nrow = length(sampname))
33 |     rownames(QCmetric) <- sampname
34 |     colnames(QCmetric) <- c("readlength", "total", "mapped",
35 |         "mapped_prop", "non_dup", "non_dup_prop", "mapq20",
36 |         "mapq20_prop")
37 |     for (i in seq_len(length(sampname))) {
38 |         cat("Getting sample QC metric for sample", i, "\n")
39 |         what <- c("rname", "pos", "strand", "mapq", "qwidth", "flag")
40 |         param <- ScanBamParam(what = what)
41 |         aln <- scanBam(bamdir[i], param = param)
42 |         aln <- aln[[1]]
43 |         temp0 <- round(mean(aln$qwidth, na.rm = TRUE))
44 |         temp1 <- length(aln$mapq)
45 |         temp2 <- sum(!is.na(aln$rname))
46 |         temp3 <- sum(!is.na(aln$rname) & aln$flag < 1024)
47 |         temp4 <- sum(aln$flag < 1024 & aln$mapq >= 20, na.rm = TRUE)
48 |         QCmetric[i, ] <- c(temp0, temp1, temp2, round(temp2/temp1, 3),
49 |             temp3, round(temp3/temp1, 3), temp4, round(temp4/temp1,
50 |             3))
51 |     }
52 |     return(QCmetric)
53 | }
54 | 


--------------------------------------------------------------------------------
/man/plot_EM_fit.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot_EM_fit.R
 3 | \name{plot_EM_fit}
 4 | \alias{plot_EM_fit}
 5 | \title{Visualize EM fitting for each cell.}
 6 | \usage{
 7 | plot_EM_fit(Y_qc, gc_qc, norm_index, T, ploidyInt, beta0,
 8 |                 minCountQC = 20, filename)
 9 | }
10 | \arguments{
11 | \item{Y_qc}{read depth matrix across all cells after quality control}
12 | 
13 | \item{gc_qc}{vector of GC content for each bin after quality control}
14 | 
15 | \item{norm_index}{indices of normal/diploid cells}
16 | 
17 | \item{T}{a vector of integers indicating number of CNV groups.
18 | Use BIC to select optimal number of CNV groups.
19 | If \code{T = 1}, assume all reads are from normal regions
20 | so that EM algorithm is not implemented. Otherwise,
21 | we assume there is always a CNV group of heterozygous deletion
22 | and a group of null region. The rest groups are representative
23 | of different duplication states.}
24 | 
25 | \item{ploidyInt}{a vector of initialized ploidy return from
26 | \code{initialize_ploidy}}
27 | 
28 | \item{beta0}{a vector of initialized bin-specific biases returned
29 | from CODEX2 without latent factors}
30 | 
31 | \item{minCountQC}{the minimum read coverage required for EM fitting.
32 | Defalut is \code{20}}
33 | 
34 | \item{filename}{the name of output pdf file}
35 | }
36 | \value{
37 | pdf file with EM fitting results and two plots:
38 | log likelihood, and BIC versus the number of CNV groups.
39 | }
40 | \description{
41 | A pdf file containing EM fitting results and plots is generated.
42 | }
43 | \examples{
44 | Gini <- get_gini(Y_sim)
45 | # first-pass CODEX2 run with no latent factors
46 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
47 |                                         gc_qc = ref_sim$gc,
48 |                                         norm_index = which(Gini<=0.12))
49 | Yhat.noK.sim <- normObj.sim$Yhat
50 | beta.hat.noK.sim <- normObj.sim$beta.hat
51 | fGC.hat.noK.sim <- normObj.sim$fGC.hat
52 | N.sim <- normObj.sim$N
53 | 
54 | # Ploidy initialization
55 | ploidy.sim <- initialize_ploidy(Y = Y_sim, 
56 |                                 Yhat = Yhat.noK.sim, 
57 |                                 ref = ref_sim)
58 | ploidy.sim
59 | 
60 | plot_EM_fit(Y_qc = Y_sim, gc_qc = ref_sim$gc,
61 |         norm_index = which(Gini<=0.12), T = 1:7,
62 |         ploidyInt = ploidy.sim,
63 |         beta0 = beta.hat.noK.sim,
64 |         filename = 'plot_EM_fit_demo.pdf')
65 | 
66 | }
67 | \author{
68 | Rujin Wang \email{rujin@email.unc.edu}
69 | }
70 | 


--------------------------------------------------------------------------------
/man/perform_qc.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/perform_qc.R
 3 | \name{perform_qc}
 4 | \alias{perform_qc}
 5 | \title{Quality control for cells and bins}
 6 | \usage{
 7 | perform_qc(Y_raw, sampname_raw, ref_raw, QCmetric_raw,
 8 |         cov_thresh = 0, minCountQC = 20, 
 9 |         mapq20_thresh = 0.3, mapp_thresh = 0.9,
10 |         gc_thresh = c(20, 80), nMAD = 3)
11 | }
12 | \arguments{
13 | \item{Y_raw}{raw read count matrix returned
14 | from \code{\link{get_coverage_scDNA}}}
15 | 
16 | \item{sampname_raw}{sample names for quality control returned
17 | from \code{\link{get_bam_bed}}}
18 | 
19 | \item{ref_raw}{raw GRanges object with corresponding GC content
20 | and mappability for quality control returned from
21 | \code{\link{get_bam_bed}}}
22 | 
23 | \item{QCmetric_raw}{a QC metric for single cells returned from
24 | \code{\link{get_samp_QC}}}
25 | 
26 | \item{cov_thresh}{scalar variable specifying the lower bound of read count
27 | summation of each cell. Default is \code{0}}
28 | 
29 | \item{minCountQC}{the minimum read coverage required for
30 | normalization and EM fitting. Defalut is \code{20}}
31 | 
32 | \item{mapq20_thresh}{scalar variable specifying the lower threshold
33 | of proportion of reads with mapping quality greater than 20.
34 | Default is \code{0.3}}
35 | 
36 | \item{mapp_thresh}{scalar variable specifying mappability of
37 | each genomic bin. Default is \code{0.9}}
38 | 
39 | \item{gc_thresh}{vector specifying the lower and upper bound of
40 | GC content threshold for quality control. Default is \code{20-80}}
41 | 
42 | \item{nMAD}{scalar variable specifying the number of MAD from the median
43 | of total read counts adjusted by library size for each cell.
44 | Default is \code{3}}
45 | }
46 | \value{
47 | A list with components
48 |     \item{Y}{read depth matrix after quality control}
49 |     \item{sampname}{sample names after quality control}
50 |     \item{ref}{A GRanges object specifying whole genomic
51 |         bin positions after quality control}
52 |     \item{QCmetric}{A data frame of QC metric for single cells
53 |         after quality control}
54 | }
55 | \description{
56 | Perform QC step on single cells and bins.
57 | }
58 | \examples{
59 | Y_raw <- coverageObj.scopeDemo$Y
60 | sampname_raw <- rownames(QCmetric.scopeDemo)
61 | ref_raw <- ref.scopeDemo
62 | QCmetric_raw <- QCmetric.scopeDemo
63 | qcObj <- perform_qc(Y_raw = Y_raw, sampname_raw = sampname_raw,
64 |                 ref_raw = ref_raw, QCmetric_raw = QCmetric_raw)
65 | 
66 | }
67 | \author{
68 | Rujin Wang \email{rujin@email.unc.edu}
69 | }
70 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # SCOPE
 2 | A normalization and copy number estimation method for single-cell DNA sequencing
 3 | 
 4 | 
 5 | ## Authors
 6 | Rujin Wang, Danyu Lin, and Yuchao Jiang
 7 | 
 8 | 
 9 | ## Maintainer
10 | Rujin Wang <rujinwang1124@gmail.com>
11 | 
12 | 
13 | ## Installation
14 | From Bioconductor
15 | ```r
16 | if (!requireNamespace("BiocManager", quietly = TRUE))
17 |     install.packages("BiocManager")
18 | 
19 | # The following initializes usage of Bioc devel
20 | BiocManager::install(version='devel')
21 | BiocManager::install("WGSmapp")
22 | BiocManager::install("SCOPE")
23 | ```
24 | From GitHub
25 | ```
26 | install.packages('devtools')
27 | devtools::install_github("rujinwang/WGSmapp")
28 | devtools::install_github("rujinwang/SCOPE")
29 | ```
30 | 
31 | ## Description
32 | Whole genome single-cell DNA sequencing (scDNA-seq) enables characterization of copy number profiles at the cellular level. This circumvents the averaging effects associated with bulk-tissue sequencing and has increased resolution yet decreased ambiguity in deconvolving cancer subclones and elucidating cancer evolutionary history. ScDNA-seq data is, however, sparse, noisy, and highly variable even within a homogeneous cell population, due to the biases and artifacts that are introduced during the library preparation and sequencing procedure. Here, we propose SCOPE, a normalization and copy number estimation method for scDNA-seq data. The distinguishing features of SCOPE include: (i) utilization of cell-specific Gini coefficients for quality controls and for identification of normal/diploid cells, which are further used as negative control samples in a Poisson latent factor model for normalization; (ii) modeling of GC content bias using an expectation-maximization algorithm embedded in the Poisson generalized linear models, which accounts for the different copy number states along the genome; (iii) a cross-sample iterative segmentation procedure to identify breakpoints that are shared across cells from the same genetic background. We evaluate performance of SCOPE on real scDNA-seq data sets from cancer genomic studies. Compared to existing methods, SCOPE more accurately estimates subclonal copy number aberrations and is shown to have higher correlation with array-based copy number profiles of purified bulk samples from the same patient. We further demonstrate SCOPE on three recently released data sets using the 10X Genomics single-cell CNV pipeline and show that it can reliably recover 1% of the cancer cells from a background of normal.
33 | 
34 | 
35 | ## Manuscript
36 | Rujin Wang, Danyu Lin, and Yuchao Jiang. SCOPE: A Normalization and Copy Number Estimation Method for Single-Cell DNA Sequencing. ***Cell Systems***, 2020. ([link](https://www.sciencedirect.com/science/article/abs/pii/S2405471220301113?via%3Dihub))
37 | 
38 | ## Vignettes
39 | [HTML](http://bioconductor.org/packages/devel/bioc/vignettes/SCOPE/inst/doc/SCOPE_vignette.html)
40 | 


--------------------------------------------------------------------------------
/R/get_gc.R:
--------------------------------------------------------------------------------
 1 | if (getRversion() >= "2.15.1") {
 2 |     utils::globalVariables(c("BSgenome.Hsapiens.UCSC.hg19",
 3 |                             "mapp_hg19", "mapp_hg38"))
 4 | }
 5 | #' @title Compute GC content
 6 | #' @name get_gc
 7 | #'
 8 | #' @description Compute GC content for each bin
 9 | #'
10 | #' @param ref GRanges object returned from \code{get_bam_bed}
11 | #' @param hgref reference genome. This should be 'hg19', 'hg38' or 'mm10'.
12 | #' Default is human genome \code{hg19}. 
13 | #'
14 | #' @return
15 | #'   \item{gc}{Vector of GC content for each bin/target}
16 | #'
17 | #' @examples
18 | #' \dontrun{
19 | #' library(WGSmapp)
20 | #' library(BSgenome.Hsapiens.UCSC.hg38)
21 | #' bamfolder <- system.file('extdata', package = 'WGSmapp')
22 | #' bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
23 | #' bamdir <- file.path(bamfolder, bamFile)
24 | #' sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1)
25 | #' bambedObj <- get_bam_bed(bamdir = bamdir,
26 | #'                             sampname = sampname_raw, 
27 | #'                             hgref = "hg38")
28 | #' bamdir <- bambedObj$bamdir
29 | #' sampname_raw <- bambedObj$sampname
30 | #' ref_raw <- bambedObj$ref
31 | #'
32 | #' gc <- get_gc(ref_raw, hgref = "hg38")
33 | #' }
34 | #'
35 | #' @author Rujin Wang \email{rujin@email.unc.edu}
36 | #' @import BSgenome.Hsapiens.UCSC.hg19
37 | #' @importFrom IRanges IRanges Views
38 | #' @importFrom GenomeInfoDb mapSeqlevels seqnames
39 | #' @importFrom BiocGenerics start end
40 | #' @importFrom Biostrings unmasked alphabetFrequency
41 | #' @export
42 | get_gc <- function (ref, hgref = "hg19"){
43 |     if(!hgref %in% c("hg19", "hg38", "mm10")){
44 |         stop("Reference genome should be hg19, hg38, or mm10.")
45 |     }
46 |     if(hgref == "hg19") {
47 |         genome <- BSgenome.Hsapiens.UCSC.hg19
48 |     }else if(hgref == "hg38") {
49 |         genome <- BSgenome.Hsapiens.UCSC.hg38
50 |     }else if(hgref == "mm10") {
51 |         genome <- BSgenome.Mmusculus.UCSC.mm10
52 |     }
53 |     gc <- rep(NA, length(ref))
54 |     for (chr in unique(seqnames(ref))) {
55 |         message("Getting GC content for chr ", chr, sep = "")
56 |         chr.index <- which(as.matrix(seqnames(ref)) == chr)
57 |         ref.chr <- IRanges(start = start(ref)[chr.index],
58 |             end = end(ref)[chr.index])
59 |         if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") {
60 |             chrtemp <- "chrX"
61 |         }
62 |         else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") {
63 |             chrtemp <- "chrY"
64 |         }
65 |         else {
66 |             chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1])
67 |         }
68 |         if (length(chrtemp) == 0)
69 |             message("Chromosome cannot be found in NCBI database. ")
70 |             chrm <- unmasked(genome[[chrtemp]])
71 |             seqs <- Views(chrm, ref.chr)
72 |             af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE)
73 |             gc[chr.index] <- round((af[, "G"] + af[, "C"]) * 100, 2)
74 |     }
75 |     gc
76 | }
77 | 


--------------------------------------------------------------------------------
/man/normalize_scope.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/normalize_scope.R
 3 | \name{normalize_scope}
 4 | \alias{normalize_scope}
 5 | \title{Normalization of read depth with latent factors using
 6 | Expectation-Maximization algorithm under the case-control setting}
 7 | \usage{
 8 | normalize_scope(Y_qc, gc_qc, K, norm_index, T, ploidyInt,
 9 |                 beta0, minCountQC = 20)
10 | }
11 | \arguments{
12 | \item{Y_qc}{read depth matrix after quality control}
13 | 
14 | \item{gc_qc}{vector of GC content for each bin after quality control}
15 | 
16 | \item{K}{Number of latent Poisson factors}
17 | 
18 | \item{norm_index}{indices of normal/diploid cells}
19 | 
20 | \item{T}{a vector of integers indicating number of CNV groups.
21 | Use BIC to select optimal number of CNV groups. If \code{T = 1},
22 | assume all reads are from normal regions so that EM algorithm is
23 | not implemented. Otherwise, we assume there is always a CNV group
24 | of heterozygous deletion and a group of null region. The rest
25 | groups are representative of different duplication states.}
26 | 
27 | \item{ploidyInt}{a vector of initialized ploidy return
28 | from \code{initialize_ploidy}. Users are also allowed to provide
29 | prior-knowledge ploidies as the input and to manually tune a few 
30 | cells that have poor fitting}
31 | 
32 | \item{beta0}{a vector of initialized bin-specific biases
33 | returned from CODEX2 without latent factors}
34 | 
35 | \item{minCountQC}{the minimum read coverage required for
36 | normalization and EM fitting. Defalut is \code{20}}
37 | }
38 | \value{
39 | A list with components
40 |     \item{Yhat}{A list of normalized read depth matrix with EM}
41 |     \item{alpha.hat}{A list of absolute copy number matrix}
42 |     \item{fGC.hat}{A list of EM estimated GC content bias matrix}
43 |     \item{beta.hat}{A list of EM estimated bin-specific bias vector}
44 |     \item{g.hat}{A list of estimated Poisson latent factor}
45 |     \item{h.hat}{A list of estimated Poisson latent factor}
46 |     \item{AIC}{AIC for model selection}
47 |     \item{BIC}{BIC for model selection}
48 |     \item{RSS}{RSS for model selection}
49 |     \item{K}{Number of latent Poisson factors}
50 | }
51 | \description{
52 | Fit a Poisson generalized linear model to normalize
53 | the raw read depth data from single-cell DNA sequencing, with
54 | latent factors under the case-control setting. Model GC content
55 | bias using an expectation-maximization algorithm, which accounts for
56 | the different copy number states.
57 | }
58 | \examples{
59 | Gini <- get_gini(Y_sim)
60 | 
61 | # first-pass CODEX2 run with no latent factors
62 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
63 |                                         gc_qc = ref_sim$gc,
64 |                                         norm_index = which(Gini<=0.12))
65 | Yhat.noK.sim <- normObj.sim$Yhat
66 | beta.hat.noK.sim <- normObj.sim$beta.hat
67 | fGC.hat.noK.sim <- normObj.sim$fGC.hat
68 | N.sim <- normObj.sim$N
69 | 
70 | # Ploidy initialization
71 | ploidy.sim <- initialize_ploidy(Y = Y_sim, 
72 |                                 Yhat = Yhat.noK.sim, 
73 |                                 ref = ref_sim)
74 | ploidy.sim
75 | 
76 | normObj.scope.sim <- normalize_scope(Y_qc = Y_sim, gc_qc = ref_sim$gc,
77 |                                     K = 1, ploidyInt = ploidy.sim,
78 |                                     norm_index = which(Gini<=0.12), T = 1:5,
79 |                                     beta0 = beta.hat.noK.sim)
80 | Yhat.sim <- normObj.scope.sim$Yhat[[which.max(normObj.scope.sim$BIC)]]
81 | fGC.hat.sim <- normObj.scope.sim$fGC.hat[[which.max(normObj.scope.sim$BIC)]]
82 | 
83 | }
84 | \author{
85 | Rujin Wang \email{rujin@email.unc.edu}
86 | }
87 | 


--------------------------------------------------------------------------------
/R/get_bam_bed.R:
--------------------------------------------------------------------------------
 1 | if (getRversion() >= "2.15.1") {
 2 |     utils::globalVariables(c("BSgenome.Hsapiens.UCSC.hg19",
 3 |                             "BSgenome.Hsapiens.UCSC.hg38", 
 4 |                             "BSgenome.Mmusculus.UCSC.mm10",
 5 |                             "seqlevels<-", "seqlevels"))
 6 | }
 7 | #' @title Get bam file directories, sample names, and whole genomic bins
 8 | #'
 9 | #' @description Get bam file directories, sample names, and whole genomic
10 | #' bins from .bed file
11 | #'
12 | #' @usage
13 | #' get_bam_bed(bamdir, sampname, hgref = "hg19", resolution = 500, 
14 | #'             sex = FALSE)
15 | #'                 
16 | #' @param bamdir vector of the directory of a bam file. Should be in the same
17 | #'  order as sample names in \code{sampname}.
18 | #' @param sampname vector of sample names. Should be in the same order as bam
19 | #'  directories in \code{bamdir}.
20 | #' @param hgref reference genome. This should be 'hg19', 'hg38' or 'mm10'. 
21 | #'  Default is human genome \code{hg19}. 
22 | #' @param resolution numeric value of fixed bin-length. Default is \code{500}.
23 | #'  Unit is "kb". 
24 | #' @param sex logical, whether to include sex chromosomes. 
25 | #'  Default is \code{FALSE}.
26 | #'
27 | #' @return A list with components
28 | #'     \item{bamdir}{A vector of bam directories}
29 | #'     \item{sampname}{A vector of sample names}
30 | #'     \item{ref}{A GRanges object specifying whole genomic bin positions}
31 | #'
32 | #' @examples
33 | #' library(WGSmapp)
34 | #' library(BSgenome.Hsapiens.UCSC.hg38)
35 | #' bamfolder <- system.file('extdata', package = 'WGSmapp')
36 | #' bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
37 | #' bamdir <- file.path(bamfolder, bamFile)
38 | #' sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1)
39 | #' bambedObj <- get_bam_bed(bamdir = bamdir, sampname = sampname_raw, 
40 | #'                         hgref = "hg38")
41 | #' bamdir <- bambedObj$bamdir
42 | #' sampname_raw <- bambedObj$sampname
43 | #' ref_raw <- bambedObj$ref
44 | #'
45 | #' @author Rujin Wang \email{rujin@email.unc.edu}
46 | #' @import utils
47 | #' @import BSgenome.Hsapiens.UCSC.hg19
48 | #' @importFrom GenomicRanges GRanges tileGenome
49 | #' @importFrom IRanges IRanges
50 | #' @importFrom GenomeInfoDb seqnames seqinfo seqlevels
51 | #' @export
52 | get_bam_bed <- function(bamdir, sampname, hgref = "hg19", resolution = 500, 
53 |                         sex = FALSE){
54 |     if(!hgref %in% c("hg19", "hg38", "mm10")){
55 |         stop("Reference genome should be hg19, hg38 or mm10. ")
56 |     }
57 |     if(hgref == "hg19") {
58 |         genome <- BSgenome.Hsapiens.UCSC.hg19
59 |     }else if(hgref == "hg38") {
60 |         genome <- BSgenome.Hsapiens.UCSC.hg38
61 |     }else if(hgref == "mm10") {
62 |         genome <- BSgenome.Mmusculus.UCSC.mm10
63 |     }
64 |     if(resolution <= 0){
65 |         stop("Invalid fixed bin length. ")
66 |     }
67 |     bins <- tileGenome(seqinfo(genome), 
68 |                     tilewidth = resolution * 1000, 
69 |                     cut.last.tile.in.chrom = TRUE)
70 |     if(hgref != "mm10"){
71 |         autochr = 22
72 |     }else{
73 |         autochr = 19
74 |     }
75 |     if(sex) {
76 |         ref <- bins[which(as.character(seqnames(bins)) %in% paste0("chr", 
77 |                     c(seq_len(autochr), "X", "Y")))]
78 |     } else {
79 |         ref <- bins[which(as.character(seqnames(bins)) %in% paste0("chr", 
80 |                     seq_len(autochr)))]
81 |     }
82 |     if (!any(grepl("chr", seqlevels(ref)))) {
83 |         seqlevels(ref) <- paste(c(seq_len(autochr), "X", "Y"), sep = "")
84 |         ref <- sort(ref)
85 |     } else {
86 |         seqlevels(ref) <- paste("chr", c(seq_len(autochr), "X", "Y"), sep = "")
87 |         ref <- sort(ref)
88 |     }
89 |     list(bamdir = bamdir, sampname = sampname, ref = ref)
90 | }
91 | 
92 | 


--------------------------------------------------------------------------------
/R/get_mapp.R:
--------------------------------------------------------------------------------
 1 | if (getRversion() >= "2.15.1") {
 2 |     utils::globalVariables(c("mapp_hg19", 
 3 |                             "mapp_hg38", "seqlevelsStyle<-"))
 4 | }
 5 | #' @title Compute mappability
 6 | #' @name get_mapp
 7 | #'
 8 | #' @description Compute mappability for each bin. Note that scDNA
 9 | #' sequencing is whole-genome amplification and the mappability
10 | #' score is essential to determine variable binning method.
11 | #' Mappability track for 100-mers on the GRCh37/hg19 human
12 | #' reference genome from ENCODE is pre-saved. Compute the mean
13 | #' of mappability scores that overlapped reads map to bins,
14 | #' weighted by the width of mappability tracks on the genome
15 | #' reference. Use liftOver utility to calculate mappability
16 | #' for hg38, which is pre-saved as well. For mm10, there are
17 | #' two workarounds: 1) set all mappability to 1 to avoid extensive
18 | #' computation; 2) adopt QC procedures based on annotation results, 
19 | #' e.g., filter out bins within black list regions, 
20 | #' which generally have low mappability.
21 | #'
22 | #' @param ref GRanges object returned from \code{get_bam_bed}
23 | #' @param hgref reference genome. This should be 'hg19', 'hg38' or 'mm10'.
24 | #' Default is human genome \code{hg19}. 
25 | #'
26 | #' @return
27 | #'   \item{mapp}{Vector of mappability for each bin/target}
28 | #'
29 | #' @examples
30 | #' \dontrun{
31 | #' library(WGSmapp)
32 | #' library(BSgenome.Hsapiens.UCSC.hg38)
33 | #' bamfolder <- system.file('extdata', package = 'WGSmapp')
34 | #' bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
35 | #' bamdir <- file.path(bamfolder, bamFile)
36 | #' sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1)
37 | #' bambedObj <- get_bam_bed(bamdir = bamdir,
38 | #'                             sampname = sampname_raw, 
39 | #'                             hgref = "hg38")
40 | #' bamdir <- bambedObj$bamdir
41 | #' sampname_raw <- bambedObj$sampname
42 | #' ref_raw <- bambedObj$ref
43 | #' 
44 | #' mapp <- get_mapp(ref_raw, hgref = "hg38")
45 | #' }
46 | #'
47 | #' @author Rujin Wang \email{rujin@email.unc.edu}
48 | #' @import utils
49 | #' @importFrom GenomicRanges GRanges pintersect
50 | #' @importFrom IRanges IRanges Views countOverlaps findOverlaps width
51 | #' @importFrom GenomeInfoDb mapSeqlevels seqlevelsStyle seqnames
52 | #' @importFrom S4Vectors queryHits subjectHits
53 | #' @export
54 | get_mapp <- function(ref, hgref = "hg19") {
55 |     if(!hgref %in% c("hg19", "hg38", "mm10")){
56 |         stop("Reference genome should be hg19, hg38, or mm10.")
57 |     }
58 |     if(hgref == "hg19") {
59 |         mapp_gref <- mapp_hg19
60 |     }else if(hgref == "hg38") {
61 |         mapp_gref <- mapp_hg38
62 |     }
63 |     mapp <- rep(1, length(ref))
64 |     if(hgref != "mm10"){
65 |         seqlevelsStyle(ref) <- "UCSC"
66 |         for (chr in as.character(unique(seqnames(ref)))) {
67 |             message("Getting mappability for ", chr, sep = "")
68 |             chr.index <- which(as.matrix(seqnames(ref)) == chr)
69 |             ref.chr <- ref[which(as.character(seqnames(ref)) == chr)]
70 |             mapp.chr <- rep(1, length(ref.chr))
71 |             overlap <- as.matrix(findOverlaps(ref.chr, mapp_gref))
72 |             for (i in unique(overlap[, 1])) {
73 |                 index.temp <- overlap[which(overlap[, 1] == i), 2]
74 |                 overlap.sub <- findOverlaps(ref.chr[i], mapp_gref[index.temp])
75 |                 overlap.intersect <- pintersect(ref.chr[i][queryHits(
76 |                     overlap.sub)], mapp_gref[index.temp][
77 |                     subjectHits(overlap.sub)])
78 |                 mapp.chr[i] <- sum((mapp_gref$score[index.temp]) * 
79 |                                 (width(overlap.intersect)))/sum(
80 |                                 width(overlap.intersect))
81 |             }
82 |             mapp[chr.index] <- mapp.chr
83 |         }
84 |     }
85 |     mapp
86 | }
87 | 
88 | 


--------------------------------------------------------------------------------
/man/normalize_scope_foreach.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/normalize_scope_foreach.R
 3 | \name{normalize_scope_foreach}
 4 | \alias{normalize_scope_foreach}
 5 | \title{Normalization of read depth with latent factors using
 6 | Expectation-Maximization algorithm under the case-control
 7 | setting in parallel}
 8 | \usage{
 9 | normalize_scope_foreach(Y_qc, gc_qc, K, norm_index, T,
10 |     ploidyInt, beta0, minCountQC = 20, nCores = NULL)
11 | }
12 | \arguments{
13 | \item{Y_qc}{read depth matrix after quality control}
14 | 
15 | \item{gc_qc}{vector of GC content for each bin after quality control}
16 | 
17 | \item{K}{Number of latent Poisson factors}
18 | 
19 | \item{norm_index}{indices of normal/diploid cells}
20 | 
21 | \item{T}{a vector of integers indicating number of CNV groups.
22 | Use BIC to select optimal number of CNV groups. If \code{T = 1},
23 | assume all reads are from normal regions so that EM algorithm is
24 | not implemented. Otherwise, we assume there is always a CNV group
25 | of heterozygous deletion and a group of null region. The rest groups
26 | are representative of different duplication states.}
27 | 
28 | \item{ploidyInt}{a vector of initialized ploidy return
29 | from \code{initialize_ploidy}. Users are also allowed to provide
30 | prior-knowledge ploidies as the input and to manually tune a few 
31 | cells that have poor fitting}
32 | 
33 | \item{beta0}{a vector of initialized bin-specific biases returned
34 | from CODEX2 without latent factors}
35 | 
36 | \item{minCountQC}{the minimum read coverage required for normalization
37 | and EM fitting. Defalut is \code{20}}
38 | 
39 | \item{nCores}{number of cores to use. If \code{NULL}, number of cores
40 | is detected. Default is \code{NULL}.}
41 | }
42 | \value{
43 | A list with components
44 |     \item{Yhat}{A list of normalized read depth matrix with EM}
45 |     \item{alpha.hat}{A list of absolute copy number matrix}
46 |     \item{fGC.hat}{A list of EM estimated GC content bias matrix}
47 |     \item{beta.hat}{A list of EM estimated bin-specific bias vector}
48 |     \item{g.hat}{A list of estimated Poisson latent factor}
49 |     \item{h.hat}{A list of estimated Poisson latent factor}
50 |     \item{AIC}{AIC for model selection}
51 |     \item{BIC}{BIC for model selection}
52 |     \item{RSS}{RSS for model selection}
53 |     \item{K}{Number of latent Poisson factors}
54 | }
55 | \description{
56 | Fit a Poisson generalized linear model to normalize
57 | the raw read depth data from single-cell DNA sequencing,
58 | with latent factors under the case-control setting. Model GC
59 | content bias using an expectation-maximization algorithm,
60 | which accounts for the different copy number states.
61 | }
62 | \examples{
63 | Gini <- get_gini(Y_sim)
64 | 
65 | # first-pass CODEX2 run with no latent factors
66 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
67 |                                         gc_qc = ref_sim$gc,
68 |                                         norm_index = which(Gini<=0.12))
69 | Yhat.noK.sim <- normObj.sim$Yhat
70 | beta.hat.noK.sim <- normObj.sim$beta.hat
71 | fGC.hat.noK.sim <- normObj.sim$fGC.hat
72 | N.sim <- normObj.sim$N
73 | 
74 | # Ploidy initialization
75 | ploidy.sim <- initialize_ploidy(Y = Y_sim,
76 |                             Yhat = Yhat.noK.sim,
77 |                             ref = ref_sim)
78 | ploidy.sim
79 | 
80 | # Specify nCores = 2 only for checking examples
81 | normObj.scope.sim <- normalize_scope_foreach(Y_qc = Y_sim, 
82 |                         gc_qc = ref_sim$gc,
83 |                         K = 1, ploidyInt = ploidy.sim,
84 |                         norm_index = which(Gini<=0.12), T = 1:5,
85 |                         beta0 = beta.hat.noK.sim, nCores = 2)
86 | Yhat.sim <- normObj.scope.sim$Yhat[[which.max(normObj.scope.sim$BIC)]]
87 | fGC.hat.sim <- normObj.scope.sim$fGC.hat[[which.max(normObj.scope.sim$BIC)]]
88 | 
89 | }
90 | \author{
91 | Rujin Wang \email{rujin@email.unc.edu}
92 | }
93 | 


--------------------------------------------------------------------------------
/man/normalize_scope_group.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/normalize_scope_group.R
 3 | \name{normalize_scope_group}
 4 | \alias{normalize_scope_group}
 5 | \title{Group-wise normalization of read depth with latent factors using
 6 | Expectation-Maximization algorithm and shared clonal memberships}
 7 | \usage{
 8 | normalize_scope_group(Y_qc, gc_qc, K, norm_index, groups, T, 
 9 |                         ploidyInt, beta0, minCountQC = 20)
10 | }
11 | \arguments{
12 | \item{Y_qc}{read depth matrix after quality control}
13 | 
14 | \item{gc_qc}{vector of GC content for each bin after quality control}
15 | 
16 | \item{K}{Number of latent Poisson factors}
17 | 
18 | \item{norm_index}{indices of normal/diploid cells using group/clone
19 | labels}
20 | 
21 | \item{groups}{clonal membership labels for each cell}
22 | 
23 | \item{T}{a vector of integers indicating number of CNV groups.
24 | Use BIC to select optimal number of CNV groups. If \code{T = 1},
25 | assume all reads are from normal regions so that EM algorithm is
26 | not implemented. Otherwise, we assume there is always a CNV group
27 | of heterozygous deletion and a group of null region. The rest
28 | groups are representative of different duplication states.}
29 | 
30 | \item{ploidyInt}{a vector of group-wise initialized ploidy return
31 | from \code{initialize_ploidy_group}. Users are also allowed to 
32 | provide prior-knowledge ploidies as the input and to manually 
33 | tune a few cells/clones that have poor fitting}
34 | 
35 | \item{beta0}{a vector of initialized bin-specific biases
36 | returned from CODEX2 without latent factors}
37 | 
38 | \item{minCountQC}{the minimum read coverage required for
39 | normalization and EM fitting. Defalut is \code{20}}
40 | }
41 | \value{
42 | A list with components
43 |     \item{Yhat}{A list of normalized read depth matrix with EM}
44 |     \item{alpha.hat}{A list of absolute copy number matrix}
45 |     \item{fGC.hat}{A list of EM estimated GC content bias matrix}
46 |     \item{beta.hat}{A list of EM estimated bin-specific bias vector}
47 |     \item{g.hat}{A list of estimated Poisson latent factor}
48 |     \item{h.hat}{A list of estimated Poisson latent factor}
49 |     \item{AIC}{AIC for model selection}
50 |     \item{BIC}{BIC for model selection}
51 |     \item{RSS}{RSS for model selection}
52 |     \item{K}{Number of latent Poisson factors}
53 | }
54 | \description{
55 | Fit a Poisson generalized linear model to normalize
56 | the raw read depth data from single-cell DNA sequencing, with
57 | latent factors and shared clonal memberships. Model GC content
58 | bias using an expectation-maximization algorithm, which accounts for
59 | clonal specific copy number states.
60 | }
61 | \examples{
62 | Gini <- get_gini(Y_sim)
63 | 
64 | # first-pass CODEX2 run with no latent factors
65 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
66 |                                         gc_qc = ref_sim$gc,
67 |                                         norm_index = which(Gini<=0.12))
68 | Yhat.noK.sim <- normObj.sim$Yhat
69 | beta.hat.noK.sim <- normObj.sim$beta.hat
70 | fGC.hat.noK.sim <- normObj.sim$fGC.hat
71 | N.sim <- normObj.sim$N
72 | 
73 | # Group-wise ploidy initialization
74 | clones <- c("normal", "tumor1", "normal", "tumor1", "tumor1")
75 | ploidy.sim.group <- initialize_ploidy_group(Y = Y_sim, Yhat = Yhat.noK.sim, 
76 |                                 ref = ref_sim, groups = clones)
77 | ploidy.sim.group
78 | 
79 | normObj.scope.sim.group <- normalize_scope_group(Y_qc = Y_sim, 
80 |                                     gc_qc = ref_sim$gc,
81 |                                     K = 1, ploidyInt = ploidy.sim.group,
82 |                                     norm_index = which(clones=="normal"), 
83 |                                     groups = clones, 
84 |                                     T = 1:5,
85 |                                     beta0 = beta.hat.noK.sim)
86 | Yhat.sim.group <- normObj.scope.sim.group$Yhat[[which.max(
87 |                                     normObj.scope.sim.group$BIC)]]
88 | fGC.hat.sim.group <- normObj.scope.sim.group$fGC.hat[[which.max(
89 |                                     normObj.scope.sim.group$BIC)]]
90 | 
91 | }
92 | \author{
93 | Rujin Wang \email{rujin@email.unc.edu}
94 | }
95 | 


--------------------------------------------------------------------------------
/R/initialize_ploidy.R:
--------------------------------------------------------------------------------
  1 | #' @title Ploidy pre-initialization
  2 | #'
  3 | #' @description Pre-estimate ploidies across all cells
  4 | #'
  5 | #' @usage
  6 | #' initialize_ploidy(Y, Yhat, ref, maxPloidy = 6, minPloidy = 1.5,
  7 | #'                 minBinWidth = 5, SoS.plot = FALSE)
  8 | #' @param Y raw read depth matrix after quality control procedure
  9 | #' @param Yhat normalized read depth matrix
 10 | #' @param ref GRanges object after quality control procedure
 11 | #' @param maxPloidy maximum ploidy candidate. Defalut is \code{6}
 12 | #' @param minPloidy minimum ploidy candidate. Defalut is \code{1.5}
 13 | #' @param minBinWidth the minimum number of bins for a changed segment.
 14 | #'  Defalut is \code{5}
 15 | #' @param SoS.plot logical, whether to generate ploidy pre-estimation 
 16 | #'  plots. Default is \code{FALSE}. 
 17 | #'
 18 | #' @return
 19 | #'     \item{ploidy.SoS}{Vector of pre-estimated ploidies for each cell}
 20 | #'
 21 | #' @examples
 22 | #' Gini <- get_gini(Y_sim)
 23 | #'
 24 | #' # first-pass CODEX2 run with no latent factors
 25 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
 26 | #'                                         gc_qc = ref_sim$gc,
 27 | #'                                         norm_index = which(Gini<=0.12))
 28 | #' Yhat.noK.sim <- normObj.sim$Yhat
 29 | #' beta.hat.noK.sim <- normObj.sim$beta.hat
 30 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat
 31 | #' N.sim <- normObj.sim$N
 32 | #'
 33 | #' # Ploidy initialization
 34 | #' ploidy.sim <- initialize_ploidy(Y = Y_sim, 
 35 | #'                                 Yhat = Yhat.noK.sim, 
 36 | #'                                 ref = ref_sim)
 37 | #' ploidy.sim
 38 | #'
 39 | #' @author Rujin Wang \email{rujin@email.unc.edu}
 40 | #' @importFrom DNAcopy CNA smooth.CNA segment
 41 | #' @importFrom GenomeInfoDb seqnames
 42 | #' @importFrom BSgenome start
 43 | #' @importFrom IRanges end
 44 | #' @export
 45 | initialize_ploidy <- function(Y, Yhat, ref, maxPloidy = 6,
 46 |     minPloidy = 1.5, minBinWidth = 5, SoS.plot = FALSE) {
 47 |     ploidy.SoS <- rep(NA, ncol(Y))
 48 | 
 49 |     breaks <- matrix(0, nrow(Y), ncol(Y))
 50 |     RCNP <- matrix(0, nrow(Y), ncol(Y))
 51 |     final <- matrix(0, nrow(Y), ncol(Y))
 52 |     X <- seq(minPloidy, maxPloidy, by = 0.05)
 53 |     n_ploidy <- length(X)
 54 |     SoS <- matrix(0, n_ploidy, ncol(Y))
 55 | 
 56 |     normal <- (Y + 1)/(Yhat + 1)
 57 | 
 58 |     for (k in seq_len(ncol(Y))) {
 59 |         if (k%%5 == 1) {
 60 |             cat("Initializing ploidy for cell ", k, "\t")
 61 |         }
 62 | 
 63 |         lr <- log(normal[, k])
 64 |         loc <- data.frame(seq = as.character(seqnames(ref)),
 65 |             start = start(ref), end = end(ref))
 66 |         CNA.object <- CNA(genomdat = lr, chrom = loc[, 1],
 67 |             maploc = as.numeric(loc[, 2]), data.type = "logratio")
 68 |         CNA.smoothed <- smooth.CNA(CNA.object)
 69 |         segs <- segment(CNA.smoothed, verbose = 0,
 70 |             min.width = minBinWidth)
 71 |         frag <- segs$output[, 2:3]
 72 |         len <- dim(frag)[1]
 73 |         bps <- array(0, len)
 74 |         for (j in seq_len(len)) {
 75 |             bps[j] <- which((loc[, 1] == frag[j, 1]) &
 76 |                 (as.numeric(loc[, 2]) == frag[j, 2]))
 77 |         }
 78 |         bps <- sort(bps)
 79 |         bps[(len = len + 1)] <- nrow(Y)
 80 |         breaks[bps, k] <- 1
 81 |         RCNP[, k][seq_len(bps[2])] <- median(normal[,
 82 |             k][seq_len(bps[2])])
 83 |         for (i in 2:(len - 1)) {
 84 |             RCNP[, k][bps[i]:(bps[i + 1] - 1)] <- median(normal[,
 85 |                 k][bps[i]:(bps[i + 1] - 1)])
 86 |         }
 87 |         RCNP[, k] <- RCNP[, k]/mean(RCNP[, k])
 88 | 
 89 |         SCNP <- RCNP[, k] %o% X
 90 |         FSCP <- round(SCNP)
 91 |         Diff2 <- (SCNP - FSCP)^2
 92 |         SoS[, k] <- colSums(Diff2, na.rm = FALSE, dims = 1)
 93 |         ploidy.SoS[k] <- X[which.min(SoS[, k])]
 94 |         
 95 |         
 96 |         if(SoS.plot){
 97 |             par(mfrow = c(1,2))
 98 |             par(mar = c(5,4,4,2))
 99 |             hist(Y[,k], 100, main = 'Read depth distribution', 
100 |                 xlab = 'Coverage per bin')
101 |             plot(X, SoS[,k], xlab = "ploidy", ylab = "Sum of squared errors", 
102 |                 main = "First-pass estimation of ploidy", pch = 16)
103 |             abline(v = X[which.min(SoS[,k])], lty = 2)
104 |         }
105 |     }
106 |     return(ploidy.SoS)
107 | }
108 | 


--------------------------------------------------------------------------------
/R/plot_EM_fit.R:
--------------------------------------------------------------------------------
  1 | #' @title Visualize EM fitting for each cell.
  2 | #'
  3 | #' @description A pdf file containing EM fitting results and plots is generated.
  4 | #'
  5 | #' @usage plot_EM_fit(Y_qc, gc_qc, norm_index, T, ploidyInt, beta0,
  6 | #'                 minCountQC = 20, filename)
  7 | #' @param Y_qc read depth matrix across all cells after quality control
  8 | #' @param gc_qc vector of GC content for each bin after quality control
  9 | #' @param norm_index indices of normal/diploid cells
 10 | #' @param T a vector of integers indicating number of CNV groups.
 11 | #'  Use BIC to select optimal number of CNV groups.
 12 | #'  If \code{T = 1}, assume all reads are from normal regions
 13 | #'  so that EM algorithm is not implemented. Otherwise,
 14 | #'  we assume there is always a CNV group of heterozygous deletion
 15 | #'  and a group of null region. The rest groups are representative
 16 | #'  of different duplication states.
 17 | #' @param ploidyInt a vector of initialized ploidy return from
 18 | #'  \code{initialize_ploidy}
 19 | #' @param beta0 a vector of initialized bin-specific biases returned
 20 | #'  from CODEX2 without latent factors
 21 | #' @param minCountQC the minimum read coverage required for EM fitting.
 22 | #'  Defalut is \code{20}
 23 | #' @param filename the name of output pdf file
 24 | #'
 25 | #' @return pdf file with EM fitting results and two plots:
 26 | #' log likelihood, and BIC versus the number of CNV groups.
 27 | #'
 28 | #' @examples
 29 | #' Gini <- get_gini(Y_sim)
 30 | #' # first-pass CODEX2 run with no latent factors
 31 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
 32 | #'                                         gc_qc = ref_sim$gc,
 33 | #'                                         norm_index = which(Gini<=0.12))
 34 | #' Yhat.noK.sim <- normObj.sim$Yhat
 35 | #' beta.hat.noK.sim <- normObj.sim$beta.hat
 36 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat
 37 | #' N.sim <- normObj.sim$N
 38 | #'
 39 | #' # Ploidy initialization
 40 | #' ploidy.sim <- initialize_ploidy(Y = Y_sim, 
 41 | #'                                 Yhat = Yhat.noK.sim, 
 42 | #'                                 ref = ref_sim)
 43 | #' ploidy.sim
 44 | #'
 45 | #' plot_EM_fit(Y_qc = Y_sim, gc_qc = ref_sim$gc,
 46 | #'         norm_index = which(Gini<=0.12), T = 1:7,
 47 | #'         ploidyInt = ploidy.sim,
 48 | #'         beta0 = beta.hat.noK.sim,
 49 | #'         filename = 'plot_EM_fit_demo.pdf')
 50 | #'
 51 | #' @author Rujin Wang \email{rujin@email.unc.edu}
 52 | #' @import grDevices stats
 53 | #' @export
 54 | plot_EM_fit <- function(Y_qc, gc_qc, norm_index, T, ploidyInt, beta0,
 55 |     minCountQC = 20, filename) {
 56 |     Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) {
 57 |         !any(x == 0)
 58 |     }), , drop = FALSE]
 59 |     if(dim(Y.nonzero)[1] <= 10){
 60 |         message("Adopt arithmetic mean instead of geometric mean")
 61 |         pseudo.sample <- apply(Y_qc, 1, mean)
 62 |         Ntotal <- apply(apply(Y_qc, 2, function(x) {
 63 |             x/pseudo.sample
 64 |         }), 2, median, na.rm = TRUE)
 65 |     } else{
 66 |         pseudo.sample <- apply(Y.nonzero, 1, function(x) {
 67 |             exp(sum(log(x))/length(x))
 68 |         })
 69 |         Ntotal <- apply(apply(Y.nonzero, 2, function(x) {
 70 |             x/pseudo.sample
 71 |         }), 2, median)
 72 |     }
 73 |     N <- round(Ntotal/median(Ntotal) * median(colSums(Y_qc)))
 74 |     Nmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = N, byrow = TRUE)
 75 | 
 76 |     # Get initialization
 77 |     gcfit.temp <- Y_qc/Nmat/beta0
 78 |     alpha0 <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc))
 79 |     for (j in seq_len(ncol(alpha0))) {
 80 |         loe.fit <- loess(gcfit.temp[, j] ~ gc_qc)
 81 |         gcfit.null <- loe.fit$fitted/(ploidyInt[j]/2)
 82 |         alpha0[, j] <- gcfit.temp[, j]/gcfit.null * 2
 83 |     }
 84 | 
 85 |     offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
 86 |         data = beta0, byrow = FALSE)
 87 | 
 88 |     pdf(file = filename, width = 8, height = 10)
 89 |     for (j in seq_len(ncol(Y_qc))) {
 90 |         cat(j, "\t")
 91 |         if (j %in% norm_index) {
 92 |             fGCj <- getfGCj(gcfit.tempj = gcfit.temp[, j], gctemp = gc_qc,
 93 |                 Yj = Y_qc[, j], offsetj = offset[, j],
 94 |                 T = 1, draw.plot = TRUE, alphaj = alpha0[, j], minCountQC)
 95 |         } else {
 96 |             fGCj <- getfGCj(gcfit.tempj = gcfit.temp[, j], gctemp = gc_qc,
 97 |                 Yj = Y_qc[, j], offsetj = offset[, j],
 98 |                 T = T, draw.plot = TRUE, alphaj = alpha0[, j], minCountQC)
 99 |         }
100 |     }
101 |     dev.off()
102 | }
103 | 


--------------------------------------------------------------------------------
/R/normalize_codex2_ns_noK.R:
--------------------------------------------------------------------------------
  1 | #' @title Normalization of read depth without latent factors under
  2 | #' the case-control setting
  3 | #'
  4 | #' @description Assuming that all reads are from diploid regions,
  5 | #' fit a Poisson generalized linear model to normalize the
  6 | #' raw read depth data from single-cell DNA sequencing, without
  7 | #' latent factors under the case-control setting.
  8 | #'
  9 | #' @param Y_qc read depth matrix after quality control
 10 | #' @param gc_qc vector of GC content for each bin after quality control
 11 | #' @param norm_index indices of normal/diploid cells
 12 | #'
 13 | #' @return A list with components
 14 | #'     \item{Yhat}{A list of normalized read depth matrix}
 15 | #'     \item{fGC.hat}{A list of estimated GC content bias matrix}
 16 | #'     \item{beta.hat}{A list of estimated bin-specific bias vector}
 17 | #'     \item{N}{A vector of cell-specific library size factor,
 18 | #'         which is computed from the genome-wide read depth data}
 19 | #'
 20 | #' @examples
 21 | #' Gini <- get_gini(Y_sim)
 22 | #' # first-pass CODEX2 run with no latent factors
 23 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
 24 | #'                                         gc_qc = ref_sim$gc,
 25 | #'                                         norm_index = which(Gini<=0.12))
 26 | #'
 27 | #' @author Rujin Wang \email{rujin@email.unc.edu}
 28 | #' @import stats
 29 | #' @export
 30 | normalize_codex2_ns_noK <- function(Y_qc, gc_qc, norm_index) {
 31 |     Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) {
 32 |         !any(x == 0)
 33 |         }), , drop = FALSE]
 34 |     if(dim(Y.nonzero)[1] <= 10){
 35 |         message("Adopt arithmetic mean instead of geometric mean")
 36 |         pseudo.sample <- apply(Y_qc, 1, mean)
 37 |         Ntotal <- apply(apply(Y_qc, 2, function(x) {
 38 |             x/pseudo.sample
 39 |         }), 2, median, na.rm = TRUE)
 40 |     } else{
 41 |         pseudo.sample <- apply(Y.nonzero, 1, function(x) {
 42 |             exp(sum(log(x))/length(x))
 43 |             })
 44 |         Ntotal <- apply(apply(Y.nonzero, 2, function(x) {
 45 |             x/pseudo.sample
 46 |         }), 2, median)
 47 |     }
 48 |     N <- round(Ntotal/median(Ntotal) * median(colSums(Y_qc)))
 49 |     Nmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = N,
 50 |         byrow = TRUE)
 51 | 
 52 |     message("Computing normalization with no latent factors")
 53 |     maxiter <- 10
 54 |     maxhiter <- 50
 55 |     BHTHRESH <- 1e-04
 56 |     HHTHRESH <- 1e-05
 57 |     iter <- 1
 58 |     fhat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = 0)
 59 |     fhatnew <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc))
 60 |     betahat <- rep(1, nrow(Y_qc))
 61 |     betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
 62 |         data = betahat, byrow = FALSE)
 63 |     bhdiff <- rep(Inf, maxiter)
 64 |     fhdiff <- rep(Inf, maxiter)
 65 |     betahatlist <- list(length = maxiter)
 66 |     fhatlist <- list(length = maxiter)
 67 |     while (iter <= maxiter) {
 68 |         gcfit <- Y_qc/Nmat/betahatmat
 69 |         fhatnew <- apply(gcfit, 2, function(z) {
 70 |             spl <- smooth.spline(gc_qc, z)
 71 |             temp <- predict(spl, gc_qc)$y
 72 |             temp[temp <= 0] <- min(temp[temp > 0])
 73 |             temp
 74 |         })
 75 |         fhatnew[fhatnew < quantile(fhatnew, 0.005)] <- quantile(fhatnew, 0.005)
 76 |         betahatnew <- apply((Y_qc/(fhatnew * Nmat))[, 
 77 |                             norm_index, drop = FALSE], 1, median)
 78 |         betahatnew[betahatnew <= 0] <- min(betahatnew[betahatnew > 0])
 79 |         bhdiff[iter] <- sum((betahatnew - betahat)^2)/length(betahat)
 80 |         fhdiff[iter] <- sum((fhatnew - fhat)^2)/length(fhat)
 81 |         if (fhdiff[iter] > min(fhdiff))
 82 |             break
 83 |         message("Iteration ", iter, "\t", "beta diff =",
 84 |             signif(bhdiff[iter], 3),
 85 |             "\t", "f(GC) diff =", signif(fhdiff[iter], 3))
 86 |         fhat <- fhatnew
 87 |         betahat <- betahatnew
 88 |         betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
 89 |             data = betahat, byrow = FALSE)
 90 |         fhatlist[[iter]] <- fhat
 91 |         betahatlist[[iter]] <- betahat
 92 |         if (bhdiff[iter] < BHTHRESH)
 93 |             break
 94 |         if (iter > 5 & bhdiff[iter] > 1)
 95 |             break
 96 |         iter <- iter + 1
 97 |     }
 98 |     optIter <- which.min(fhdiff)
 99 |     message(paste("Stop at Iteration ", optIter, ".", sep = ""))
100 |     fhat <- fhatlist[[optIter]]
101 |     betahat <- betahatlist[[optIter]]
102 |     betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
103 |         data = betahat, byrow = FALSE)
104 |     Yhat <- pmax(round(fhat * Nmat * betahatmat, 0), 1)
105 |     fGC.hat <- signif(fhat, 3)
106 |     beta.hat <- signif(betahat, 3)
107 |     list(Yhat = Yhat, fGC.hat = fGC.hat, beta.hat = beta.hat, N = N)
108 | }
109 | 


--------------------------------------------------------------------------------
/R/initialize_ploidy_group.R:
--------------------------------------------------------------------------------
  1 | #' @title Group-wise ploidy pre-initialization
  2 | #'
  3 | #' @description Pre-estimate ploidies across cells with shared clonal 
  4 | #' memberships
  5 | #'
  6 | #' @usage
  7 | #' initialize_ploidy_group(Y, Yhat, ref, groups, 
  8 | #'                         maxPloidy = 6, minPloidy = 1.5,
  9 | #'                         minBinWidth = 5, SoS.plot = FALSE)
 10 | #' @param Y raw read depth matrix after quality control procedure
 11 | #' @param Yhat normalized read depth matrix
 12 | #' @param ref GRanges object after quality control procedure
 13 | #' @param groups clonal membership labels for each cell
 14 | #' @param maxPloidy maximum ploidy candidate. Defalut is \code{6}
 15 | #' @param minPloidy minimum ploidy candidate. Defalut is \code{1.5}
 16 | #' @param minBinWidth the minimum number of bins for a changed segment.
 17 | #'  Defalut is \code{5}
 18 | #' @param SoS.plot logical, whether to generate ploidy pre-estimation 
 19 | #'  plots. Default is \code{FALSE}. 
 20 | #'
 21 | #' @return
 22 | #'     \item{ploidy.SoS}{Vector of group-wise pre-estimated ploidies 
 23 | #'             for each cell}
 24 | #'
 25 | #' @examples
 26 | #' Gini <- get_gini(Y_sim)
 27 | #'
 28 | #' # first-pass CODEX2 run with no latent factors
 29 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
 30 | #'                                         gc_qc = ref_sim$gc,
 31 | #'                                         norm_index = which(Gini<=0.12))
 32 | #' Yhat.noK.sim <- normObj.sim$Yhat
 33 | #' beta.hat.noK.sim <- normObj.sim$beta.hat
 34 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat
 35 | #' N.sim <- normObj.sim$N
 36 | #'
 37 | #' # Group-wise ploidy initialization
 38 | #' clones <- c("normal", "tumor1", "normal", "tumor1", "tumor1")
 39 | #' ploidy.sim.group <- initialize_ploidy_group(Y = Y_sim, Yhat = Yhat.noK.sim, 
 40 | #'                                 ref = ref_sim, groups = clones)
 41 | #' ploidy.sim.group
 42 | #'
 43 | #' @author Rujin Wang \email{rujin@email.unc.edu}
 44 | #' @importFrom DNAcopy CNA smooth.CNA segment
 45 | #' @importFrom GenomeInfoDb seqnames
 46 | #' @importFrom BSgenome start
 47 | #' @importFrom IRanges end
 48 | #' @export
 49 | initialize_ploidy_group <- function(Y, Yhat, ref, groups, maxPloidy = 6,
 50 |                                     minPloidy = 1.5, minBinWidth = 5, 
 51 |                                     SoS.plot = FALSE) {
 52 |     ploidy.SoS <- rep(NA, ncol(Y))
 53 |     breaks <- matrix(0, nrow(Y), ncol(Y))
 54 |     RCNP <- matrix(0, nrow(Y), ncol(Y))
 55 |     final <- matrix(0, nrow(Y), ncol(Y))
 56 |     X <- seq(minPloidy, maxPloidy, by = 0.05)
 57 |     n_ploidy <- length(X)
 58 |     SoS <- matrix(0, n_ploidy, ncol(Y))
 59 |     normal <- (Y + 1)/(Yhat + 1)
 60 |     for (k in seq_len(ncol(Y))) {
 61 |         if (k%%5 == 1) {
 62 |             cat("Initializing ploidy for cell ", k, "\t")
 63 |         }
 64 |     
 65 |         lr <- log(normal[, k])
 66 |         loc <- data.frame(seq = as.character(seqnames(ref)), 
 67 |                         start = start(ref), end = end(ref))
 68 |         CNA.object <- CNA(genomdat = lr, chrom = loc[, 1],
 69 |                     maploc = as.numeric(loc[, 2]), data.type = "logratio")
 70 |         CNA.smoothed <- smooth.CNA(CNA.object)
 71 |         segs <- segment(CNA.smoothed, verbose = 0, min.width = minBinWidth)
 72 |         frag <- segs$output[, 2:3]
 73 |         len <- dim(frag)[1]
 74 |         bps <- array(0, len)
 75 |         for (j in seq_len(len)) {
 76 |             bps[j] <- which((loc[, 1] == frag[j, 1]) &
 77 |                             (as.numeric(loc[, 2]) == frag[j, 2]))
 78 |         }
 79 |         bps <- sort(bps)
 80 |         bps[(len = len + 1)] <- nrow(Y)
 81 |         breaks[bps, k] <- 1
 82 |         RCNP[, k][seq_len(bps[2])] <- median(normal[,
 83 |                             k][seq_len(bps[2])])
 84 |         for (i in 2:(len - 1)) {
 85 |             RCNP[, k][bps[i]:(bps[i + 1] - 1)] <- median(normal[,
 86 |                                     k][bps[i]:(bps[i + 1] - 1)])
 87 |         }
 88 |         RCNP[, k] <- RCNP[, k]/mean(RCNP[, k])
 89 |     
 90 |         SCNP <- RCNP[, k] %o% X
 91 |         FSCP <- round(SCNP)
 92 |         Diff2 <- (SCNP - FSCP)^2
 93 |         SoS[, k] <- colSums(Diff2, na.rm = FALSE, dims = 1)
 94 |     }
 95 | 
 96 |     cat("\n", "Initialize ploidy by group", "\n")
 97 |     for (G in unique(groups)) {
 98 |         cat(G, "\t")
 99 |         g <- which(G == groups)
100 |         ploidy.SoS[g] <- X[which.min(apply(SoS[, g, drop = FALSE], 1, sum))]
101 |     
102 |         if(SoS.plot){
103 |             par(mfrow = c(1,2))
104 |             par(mar = c(5,4,4,2))
105 |             hist(apply(Y[, g, drop = FALSE], 1, median), 100, 
106 |                 main = 'Read depth distribution', 
107 |                 xlab = 'Group-wise Median Coverage per bin')
108 |             plot(X, apply(SoS[, g, drop = FALSE], 1, sum), 
109 |                 xlab = "ploidy", ylab = "Group-wise sum of squared errors", 
110 |                 main = "First-pass estimation of ploidy", pch = 16)
111 |             abline(v = X[which.min(apply(SoS[, g, drop = FALSE], 1, sum))], 
112 |                 lty = 2)
113 |         }
114 |     }
115 |     return(ploidy.SoS)
116 | }


--------------------------------------------------------------------------------
/R/perform_qc.R:
--------------------------------------------------------------------------------
  1 | #' @title Quality control for cells and bins
  2 | #'
  3 | #' @description Perform QC step on single cells and bins.
  4 | #'
  5 | #' @usage
  6 | #' perform_qc(Y_raw, sampname_raw, ref_raw, QCmetric_raw,
  7 | #'         cov_thresh = 0, minCountQC = 20, 
  8 | #'         mapq20_thresh = 0.3, mapp_thresh = 0.9,
  9 | #'         gc_thresh = c(20, 80), nMAD = 3)
 10 | #'
 11 | #' @param Y_raw raw read count matrix returned
 12 | #'  from \code{\link{get_coverage_scDNA}}
 13 | #' @param sampname_raw sample names for quality control returned
 14 | #'  from \code{\link{get_bam_bed}}
 15 | #' @param ref_raw raw GRanges object with corresponding GC content
 16 | #'  and mappability for quality control returned from
 17 | #'  \code{\link{get_bam_bed}}
 18 | #' @param QCmetric_raw a QC metric for single cells returned from
 19 | #'  \code{\link{get_samp_QC}}
 20 | #' @param cov_thresh scalar variable specifying the lower bound of read count
 21 | #'  summation of each cell. Default is \code{0}
 22 | #' @param minCountQC the minimum read coverage required for
 23 | #'  normalization and EM fitting. Defalut is \code{20}
 24 | #' @param mapq20_thresh scalar variable specifying the lower threshold
 25 | #'  of proportion of reads with mapping quality greater than 20.
 26 | #'  Default is \code{0.3}
 27 | #' @param mapp_thresh scalar variable specifying mappability of
 28 | #'  each genomic bin. Default is \code{0.9}
 29 | #' @param gc_thresh vector specifying the lower and upper bound of
 30 | #'  GC content threshold for quality control. Default is \code{20-80}
 31 | #' @param nMAD scalar variable specifying the number of MAD from the median
 32 | #'  of total read counts adjusted by library size for each cell.
 33 | #'  Default is \code{3}
 34 | #'
 35 | #' @return A list with components
 36 | #'     \item{Y}{read depth matrix after quality control}
 37 | #'     \item{sampname}{sample names after quality control}
 38 | #'     \item{ref}{A GRanges object specifying whole genomic
 39 | #'         bin positions after quality control}
 40 | #'     \item{QCmetric}{A data frame of QC metric for single cells
 41 | #'         after quality control}
 42 | #'
 43 | #' @examples
 44 | #' Y_raw <- coverageObj.scopeDemo$Y
 45 | #' sampname_raw <- rownames(QCmetric.scopeDemo)
 46 | #' ref_raw <- ref.scopeDemo
 47 | #' QCmetric_raw <- QCmetric.scopeDemo
 48 | #' qcObj <- perform_qc(Y_raw = Y_raw, sampname_raw = sampname_raw,
 49 | #'                 ref_raw = ref_raw, QCmetric_raw = QCmetric_raw)
 50 | #'
 51 | #' @author Rujin Wang \email{rujin@email.unc.edu}
 52 | #' @import stats
 53 | #' @export
 54 | perform_qc <- function(Y_raw, sampname_raw, ref_raw, QCmetric_raw,
 55 |     cov_thresh = 0, minCountQC = 20, 
 56 |     mapq20_thresh = 0.3, mapp_thresh = 0.9,
 57 |     gc_thresh = c(20, 80), nMAD = 3) {
 58 |     if (length(ref_raw) != nrow(Y_raw)) {
 59 |         stop("Invalid inputs: length of ref and # of rows
 60 |             in read count matrix must be the same")
 61 |     }
 62 |     if (length(sampname_raw) != ncol(Y_raw)) {
 63 |         stop("Invalid inputs: length of sample names and # of cols in
 64 |             read count matrix must be the same")
 65 |     }
 66 |     if (nrow(QCmetric_raw) != ncol(Y_raw)) {
 67 |         stop("Invalid inputs: # of rows in QC metric and # of cols in
 68 |             read count matrix must be the same")
 69 |     }
 70 |     mapp <- ref_raw$mapp
 71 |     gc <- ref_raw$gc
 72 |     sampfilter1 <- (apply(Y_raw, 2, sum) <= cov_thresh)
 73 |     message("Removed ", sum(sampfilter1),
 74 |         " samples due to failed library preparation.")
 75 |     sampfilter2 <- (apply(Y_raw, 2, mean) <= minCountQC)
 76 |     message("Removed ", sum(sampfilter2),
 77 |         " samples due to failure to meet min coverage requirement.")    
 78 |     sampfilter3 <- (QCmetric_raw[, "mapq20_prop"] < mapq20_thresh)
 79 |     message("Removed ", sum(sampfilter3),
 80 |         " samples due to low proportion of mapped reads.")
 81 |     if (sum(sampfilter1 | sampfilter2 | sampfilter3) != 0) {
 82 |         Y <- Y_raw[, !(sampfilter1 | sampfilter2 | sampfilter3)]
 83 |         sampname <- sampname_raw[!(sampfilter1 | sampfilter2 
 84 |                                 | sampfilter3)]
 85 |         QCmetric <- QCmetric_raw[!(sampfilter1 | sampfilter2 
 86 |                                 | sampfilter3), ]
 87 |     } else {
 88 |         Y <- Y_raw
 89 |         sampname <- sampname_raw
 90 |         QCmetric <- QCmetric_raw
 91 |     }
 92 |     binfilter1 <- (gc < gc_thresh[1] | gc > gc_thresh[2])
 93 |     message("Excluded ", sum(binfilter1),
 94 |         " bins due to extreme GC content.")
 95 |     binfilter2 <- (mapp < mapp_thresh)
 96 |     message("Excluded ", sum(binfilter2),
 97 |         " bins due to low mappability.")
 98 |     if (sum(binfilter1 | binfilter2) != 0) {
 99 |         ref <- ref_raw[!(binfilter1 | binfilter2)]
100 |         Y <- Y[!(binfilter1 | binfilter2), ]
101 |     } else {
102 |         ref <- ref_raw
103 |         Y <- Y
104 |     }
105 |     Y.nonzero <- Y[apply(Y, 1, function(x) {
106 |         !any(x == 0)
107 |         }), , drop = FALSE]
108 |     if(dim(Y.nonzero)[1] <= 10){
109 |         message("Adopt arithmetic mean instead of geometric mean")
110 |         pseudo.sample <- apply(Y, 1, mean)
111 |         N <- apply(apply(Y, 2, function(x) {
112 |             x/pseudo.sample
113 |         }), 2, median, na.rm = TRUE)
114 |     } else{
115 |         pseudo.sample <- apply(Y.nonzero, 1, function(x) {
116 |             exp(sum(log(x))/length(x))
117 |         })
118 |         N <- apply(apply(Y.nonzero, 2, function(x) {
119 |             x/pseudo.sample
120 |         }), 2, median)
121 |     }
122 |     sampfilter3 <- (N == 0)
123 |     message("Removed ", sum(sampfilter3),
124 |             " samples due to excessive zero read counts in 
125 |             library size calculation.")
126 |     if (sum(sampfilter3) != 0) {
127 |         Y <- Y[, !(sampfilter3)]
128 |         sampname <- sampname[!(sampfilter3)]
129 |         QCmetric <- QCmetric[!(sampfilter3), ]
130 |         N <- N[!(sampfilter3)]
131 |     }
132 |     Nmat <- matrix(nrow = nrow(Y), ncol = ncol(Y), data = N,
133 |         byrow = TRUE)
134 |     bin.sum <- apply(Y/Nmat, 1, sum)
135 |     binfilter3 <- (bin.sum >= (median(bin.sum) -
136 |         nMAD * mad(bin.sum))) & (bin.sum <= (median(bin.sum) +
137 |         nMAD * mad(bin.sum)))
138 |     Y <- Y[binfilter3, ]
139 |     ref <- ref[binfilter3]
140 |     QCmetric <- as.data.frame(QCmetric)
141 |     message("There are ", ncol(Y), " samples and ",
142 |         nrow(Y), " bins after QC step. ")
143 |     list(Y = Y, sampname = sampname, ref = ref, QCmetric = QCmetric)
144 | }
145 | 


--------------------------------------------------------------------------------
/R/get_coverage_scDNA.R:
--------------------------------------------------------------------------------
  1 | #' @title Get read coverage from single-cell DNA sequencing
  2 | #'
  3 | #' @description Get read coverage for each genomic bin across all single
  4 | #'     cells from scDNA-seq. Blacklist regions, such as segmental duplication 
  5 | #'     regions and gaps near telomeres/centromeres will be masked prior to
  6 | #'     getting coverage. 
  7 | #'
  8 | #' @param bambedObj object returned from \code{get_bam_bed}
  9 | #' @param mapqthres mapping quality threshold of reads
 10 | #' @param seq the sequencing method to be used. This should be either
 11 | #' 'paired-end' or 'single-end'
 12 | #' @param hgref reference genome. This should be 'hg19', 'hg38' or 'mm10'.
 13 | #' Default is human genome \code{hg19}. 
 14 | #' @return
 15 | #'   \item{Y}{Read depth matrix}
 16 | #'
 17 | #' @examples
 18 | #' library(WGSmapp)
 19 | #' library(BSgenome.Hsapiens.UCSC.hg38)
 20 | #' bamfolder <- system.file('extdata', package = 'WGSmapp')
 21 | #' bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
 22 | #' bamdir <- file.path(bamfolder, bamFile)
 23 | #' sampname_raw <- sapply(strsplit(bamFile, '.', fixed = TRUE), '[', 1)
 24 | #' bambedObj <- get_bam_bed(bamdir = bamdir,
 25 | #'                             sampname = sampname_raw, 
 26 | #'                             hgref = "hg38")
 27 | #'
 28 | #' # Getting raw read depth
 29 | #' coverageObj <- get_coverage_scDNA(bambedObj,
 30 | #'                                 mapqthres = 40,
 31 | #'                                 seq = 'paired-end', 
 32 | #'                                 hgref = "hg38")
 33 | #' Y_raw <- coverageObj$Y
 34 | #'
 35 | #' @author Rujin Wang \email{rujin@email.unc.edu}
 36 | #' @import Rsamtools
 37 | #' @importFrom GenomicRanges GRanges
 38 | #' @importFrom IRanges IRanges Views countOverlaps
 39 | #' @importFrom GenomeInfoDb seqnames
 40 | #' @export
 41 | get_coverage_scDNA <- function(bambedObj, mapqthres, seq, hgref = "hg19") {
 42 |     if(!hgref %in% c("hg19", "hg38", "mm10")){
 43 |         stop("Reference genome should be hg19, hg38 or mm10.")
 44 |     } 
 45 |     ref <- bambedObj$ref
 46 |     bamdir <- bambedObj$bamdir
 47 |     sampname <- bambedObj$sampname
 48 |     
 49 |     mask.ref <- get_masked_ref(hgref = hgref)
 50 |     
 51 |     Y <- matrix(nrow = length(ref), ncol = length(sampname))
 52 |     rownames(Y) <- paste(seqnames(ref), ":", start(ref), "-", 
 53 |                         end(ref), sep = "")
 54 |     colnames(Y) <- sampname
 55 |     for (i in seq_len(length(sampname))) {
 56 |         bamurl <- bamdir[i]
 57 |         what <- c("rname", "pos", "mapq", "qwidth")
 58 |         if (seq == "paired-end") {
 59 |             flag <- scanBamFlag(isPaired = TRUE, isDuplicate = FALSE,
 60 |                 isUnmappedQuery = FALSE, isNotPassingQualityControls = FALSE,
 61 |                 isFirstMateRead = TRUE)
 62 |             param <- ScanBamParam(what = what, flag = flag)
 63 |             bam <- scanBam(bamurl, param = param)[[1]]
 64 |         } else if (seq == "single-end") {
 65 |             flag <- scanBamFlag(isPaired = FALSE, isDuplicate = FALSE,
 66 |                 isUnmappedQuery = FALSE, isNotPassingQualityControls = FALSE)
 67 |             param <- ScanBamParam(what = what, flag = flag)
 68 |             bam <- scanBam(bamurl, param = param)[[1]]
 69 |         }
 70 |         message("Getting coverage for sample ", i, ": ",
 71 |             sampname[i], "...", sep = "")
 72 |         if (length(bam$rname) == 0) {
 73 |             Y[, i] <- 0  # Failed library preparation
 74 |         } else {
 75 |             if (any(grepl("chr", bam$rname) == TRUE)) {
 76 |                 bam.ref <- GRanges(seqnames = bam$rname, ranges =
 77 |                     IRanges(start = bam[["pos"]], width = bam[["qwidth"]]))
 78 |             } else {
 79 |                 bam.ref <- GRanges(seqnames = paste0("chr", bam$rname), ranges =
 80 |                     IRanges(start = bam[["pos"]], width = bam[["qwidth"]]))
 81 |             }
 82 |             bam.ref <- bam.ref[bam$mapq >= mapqthres]
 83 |             bam.ref <- suppressWarnings(bam.ref[countOverlaps(bam.ref,
 84 |                 mask.ref) == 0])
 85 |             Y[, i] <- countOverlaps(ref, bam.ref)
 86 |         }
 87 |     }
 88 |     list(Y = Y)
 89 | }
 90 | 
 91 | 
 92 | get_masked_ref <- function(hgref){
 93 |     if(hgref == "hg19"){
 94 |         # Get segmental duplication regions
 95 |         seg.dup <- read.table(system.file("extdata", 
 96 |                                 "GRCh37GenomicSuperDup.tab", 
 97 |                                 package = "WGSmapp"), header = TRUE)
 98 |         # Get hg19 gaps
 99 |         gaps <- read.table(system.file("extdata", "hg19gaps.txt", 
100 |                                 package = "WGSmapp"), 
101 |                                 header = TRUE)
102 |     } else if(hgref == "hg38"){
103 |         # Get segmental duplication regions
104 |         seg.dup <- read.table(system.file("extdata", 
105 |                                 "GRCh38GenomicSuperDup.tab", 
106 |                                 package = "WGSmapp"))
107 |         # Get hg19 gaps
108 |         gaps <- read.table(system.file("extdata", "hg38gaps.txt", 
109 |                                 package = "WGSmapp"))
110 |     } else if (hgref == "mm10"){
111 |         black.list <- read.table(system.file("extdata", 
112 |                                 "mm10-blacklist.v2.bed", 
113 |                                 package = "WGSmapp"), header = FALSE, 
114 |                                 sep = '\t')
115 |     }
116 |     if(hgref != "mm10"){
117 |         seg.dup <- seg.dup[!is.na(match(seg.dup[,1], 
118 |                             paste('chr', c(seq_len(22), 'X', 'Y'), 
119 |                             sep = ''))),]
120 |         seg.dup <- GRanges(seqnames = as.character(seg.dup[,1]), 
121 |                             ranges = IRanges(start = seg.dup[,2], 
122 |                             end = seg.dup[,3]))
123 |         gaps <- gaps[!is.na(match(gaps[,2], 
124 |                             paste('chr', c(seq_len(22), 'X', 'Y'), 
125 |                             sep = ''))),]
126 |         gaps <- GRanges(seqnames = as.character(gaps[,2]), 
127 |                             ranges = IRanges(start = gaps[,3], 
128 |                             end = gaps[,4]))
129 |         # Generate mask region
130 |         mask.ref <- sort(c(seg.dup, gaps))
131 |     } else{
132 |         black.list <- black.list[!is.na(match(black.list[,1], 
133 |                 paste('chr', c(seq_len(19), 'X', 'Y'), sep = ''))),]
134 |         black.list <- GRanges(seqnames = black.list[,1], 
135 |                 ranges = IRanges(start = black.list[,2], 
136 |                 end = black.list[,3]))
137 |         mask.ref <- black.list
138 |     }
139 |     return(mask.ref)
140 | }
141 | 
142 | 
143 | 


--------------------------------------------------------------------------------
/R/plot_iCN.R:
--------------------------------------------------------------------------------
  1 | #' @title Plot post-segmentation copy number profiles of integer values
  2 | #'
  3 | #' @description Show heatmap of inferred integer copy-number profiles
  4 | #' by SCOPE with cells clustered by hierarchical clustering
  5 | #'
  6 | #' @usage
  7 | #' plot_iCN(iCNmat, ref, Gini, annotation = NULL,
  8 | #'         plot.dendrogram = TRUE, show.names = FALSE, filename)
  9 | #' @param iCNmat inferred integer copy-number matrix by SCOPE,
 10 | #' with each column being a cell and each row being a genomic bin
 11 | #' @param ref GRanges object after quality control procedure
 12 | #' @param Gini vector of Gini coefficients for each cell,
 13 | #' with the same order as that of cells in columns of \code{iCNmat}
 14 | #' @param annotation vector of annotation for each cell,
 15 | #' with the same order as that of cells in columns of \code{iCNmat}.
 16 | #' Default is \code{NULL}.
 17 | #' @param plot.dendrogram logical, whether to plot the dendrogram.
 18 | #' Default is \code{TRUE}.
 19 | #' @param show.names logical, whether to show cell names by y axis. 
 20 | #' Default is \code{FALSE}. 
 21 | #' @param filename name of the output png file
 22 | #'
 23 | #' @return png file with integer copy-number profiles across single cells
 24 | #' with specified annotations
 25 | #'
 26 | #' @examples
 27 | #' Gini <- get_gini(Y_sim)
 28 | #' plot_iCN(iCNmat = iCN_sim,
 29 | #'         ref = ref_sim,
 30 | #'         Gini = Gini,
 31 | #'         filename = 'plot_iCN_demo')
 32 | #'
 33 | #' @author Rujin Wang \email{rujin@email.unc.edu}
 34 | #' @import graphics stats
 35 | #' @importFrom GenomeInfoDb seqnames
 36 | #' @importFrom gplots colorpanel
 37 | #' @importFrom RColorBrewer brewer.pal
 38 | #' @importFrom grDevices png dev.off
 39 | #' @export
 40 | plot_iCN <- function(iCNmat, ref, Gini, annotation = NULL,
 41 |     plot.dendrogram = TRUE, show.names = FALSE, filename) {
 42 |     smart_image <- function(mat, ...) {
 43 |         image(t(mat[rev(seq(nrow(mat))), ]), ...)
 44 |     }
 45 |     hm_col <- c("#2166AC", "#92C5DE", "#FDFDFD", "#FDDBC7",
 46 |         "#F4A582", "#D6604D", "#B2182B", "#67001F")
 47 | 
 48 |     if (!is.matrix(iCNmat)) {
 49 |         stop("Invalid plot object: must be an integer matrix. \n")
 50 |     }
 51 |     if (length(ref) != nrow(iCNmat)) {
 52 |         stop("Invalid GRanges object: length of ref and # of
 53 |             rows in iCNmat must be the same")
 54 |     }
 55 |     if (!is.null(annotation)) {
 56 |         if (!is.null(dim(annotation))) {
 57 |             stop("Invalid annotation object: has to be a vector or
 58 |                 factor with the same # of cells as that of iCNmat")
 59 |         }
 60 |         if (length(annotation) != ncol(iCNmat)) {
 61 |             stop("Invalid annotation object: length of annotation and
 62 |                 # of cells in iCNmat must be the same")
 63 |         }
 64 |     }
 65 |     if (show.names){
 66 |         if (is.null(colnames(iCNmat))){
 67 |             stop("Invalid plot object: cell names cannot be NULL")
 68 |         }
 69 |     }
 70 |     if (length(Gini) != ncol(iCNmat)) {
 71 |         stop("Invalid Gini object: length of Gini coefficient and
 72 |             # of cells in iCNmat must be the same")
 73 |     }
 74 | 
 75 |     # page setup
 76 |     if (is.null(annotation)) {
 77 |         if (plot.dendrogram) {
 78 |             if (show.names) {
 79 |                 mm <- matrix(c(0, 0, 4, 0, 0,
 80 |                             2, 3, 1, 7, 5,
 81 |                             2, 3, 1, 7, 0,
 82 |                             2, 3, 1, 7, 6), nrow = 4, byrow = TRUE)
 83 |                 mh <- c(2, 20, 20, 20)
 84 |                 mh <- mh/sum(mh)
 85 |                 mw <- c(0.25, 0.1, 5, 0.2, 0.5)
 86 |                 mw <- mw/sum(mw) 
 87 |             } else {
 88 |                 mm <- matrix(c(0, 0, 4, 0,
 89 |                             2, 3, 1, 5,
 90 |                             2, 3, 1, 0,
 91 |                             2, 3, 1, 6), nrow = 4, byrow = TRUE)
 92 |                 mh <- c(2, 20, 20, 20)
 93 |                 mh <- mh/sum(mh)
 94 |                 mw <- c(0.25, 0.1, 5, 0.5)
 95 |                 mw <- mw/sum(mw) 
 96 |             }
 97 |         } else {
 98 |             if (show.names) {
 99 |                 mm <- matrix(c(0, 0, 3, 0, 0,
100 |                             0, 2, 1, 6, 4,
101 |                             0, 2, 1, 6, 0,
102 |                             0, 2, 1, 6, 5), nrow = 4, byrow = TRUE)
103 |                 mh <- c(2, 20, 20, 20)
104 |                 mh <- mh/sum(mh)
105 |                 mw <- c(0.25, 0.1, 5, 0.2, 0.5)
106 |                 mw <- mw/sum(mw)
107 |             } else {
108 |                 mm <- matrix(c(0, 0, 3, 0,
109 |                             0, 2, 1, 4,
110 |                             0, 2, 1, 0,
111 |                             0, 2, 1, 5), nrow = 4, byrow = TRUE)
112 |                 mh <- c(2, 20, 20, 20)
113 |                 mh <- mh/sum(mh)
114 |                 mw <- c(0.25, 0.1, 5, 0.5)
115 |                 mw <- mw/sum(mw)
116 |             }
117 |         }
118 |     } else {
119 |         if (plot.dendrogram) {
120 |             if (show.names) {
121 |                 mm <- matrix(c(0, 0, 0, 5, 0, 0,
122 |                             2, 3, 4, 1, 9, 6,
123 |                             2, 3, 4, 1, 9, 7,
124 |                             2, 3, 4, 1, 9, 8), nrow = 4, byrow = TRUE)
125 |                 mh <- c(2, 20, 20, 20)
126 |                 mh <- mh/sum(mh)
127 |                 mw <- c(0.25, 0.1, 0.1, 5, 0.2, 0.5)
128 |                 mw <- mw/sum(mw)
129 |             } else {
130 |                 mm <- matrix(c(0, 0, 0, 5, 0,
131 |                             2, 3, 4, 1, 6,
132 |                             2, 3, 4, 1, 7,
133 |                             2, 3, 4, 1, 8), nrow = 4, byrow = TRUE)
134 |                 mh <- c(2, 20, 20, 20)
135 |                 mh <- mh/sum(mh)
136 |                 mw <- c(0.25, 0.1, 0.1, 5, 0.5)
137 |                 mw <- mw/sum(mw)
138 |             }
139 |         } else {
140 |             if (show.names) {
141 |                 mm <- matrix(c(0, 0, 0, 4, 0, 0,
142 |                             0, 2, 3, 1, 8, 5,
143 |                             0, 2, 3, 1, 8, 6,
144 |                             0, 2, 3, 1, 8, 7), nrow = 4, byrow = TRUE)
145 |                 mh <- c(2, 20, 20, 20)
146 |                 mh <- mh/sum(mh)
147 |                 mw <- c(0.25, 0.1, 0.1, 5, 0.2, 0.5)
148 |                 mw <- mw/sum(mw)
149 |             } else {
150 |                 mm <- matrix(c(0, 0, 0, 4, 0,
151 |                             0, 2, 3, 1, 5,
152 |                             0, 2, 3, 1, 6,
153 |                             0, 2, 3, 1, 7), nrow = 4, byrow = TRUE)
154 |                 mh <- c(2, 20, 20, 20)
155 |                 mh <- mh/sum(mh)
156 |                 mw <- c(0.25, 0.1, 0.1, 5, 0.5)
157 |                 mw <- mw/sum(mw)
158 |             }
159 |         }
160 |     }
161 |     png(paste0(filename, ".png"), width = 2500,
162 |         height = 1600, pointsize = 25)
163 |     layout(mm, widths = mw, heights = mh)
164 |     par(mar = rep(0, 4))
165 | 
166 |     iCNmat <- round(iCNmat)
167 |     if (!is.null(annotation)) {
168 |         annotation <- as.factor(annotation)
169 |     }
170 | 
171 |     chr.pos <- rep(NA, length(unique(seqnames(ref))))
172 |     for (chri in seq_len(length(chr.pos))) {
173 |         chr.pos[chri] <- length(ref[which(as.character(
174 |             seqnames(ref)) == as.character(unique(
175 |             seqnames(ref)))[chri])])
176 |         
177 |     }
178 |     chr.pos <- cumsum(chr.pos)
179 |     xpos <- round(c(0, chr.pos[seq_len(length(chr.pos)-1)]) +
180 |         (chr.pos - c(0, chr.pos[seq_len(length(chr.pos)-1)]))/2)
181 | 
182 |     # 1) iCN heatmap
183 |     dat <- t(iCNmat)
184 |     dat[dat >= 7] <- 7
185 |     dat[dat <= 0] <- 0
186 |     iCNtab <- as.numeric(names(table(dat)))
187 |     rclust <- hclust(dist(dat))
188 |     dat <- dat[rclust$order, ]
189 | 
190 |     smart_image(dat, col = hm_col[iCNtab + 1], xaxs = "i",
191 |         yaxs = "i", axes = FALSE)
192 |     abline(v = 0, lwd = 2)
193 |     for (i in seq_len(length(chr.pos))) {
194 |         abline(v = chr.pos[i]/length(ref), lwd = 2)
195 |     }
196 | 
197 |     # 2) hclust
198 |     if (plot.dendrogram) {
199 |         plot(rev(as.dendrogram(rclust)), leaflab = "none",
200 |             horiz = TRUE, axes = FALSE, yaxs = "i")
201 |     }
202 | 
203 |     # 3) Gini annotation
204 |     anno.Gini <- matrix(Gini[rclust$order], nrow = nrow(dat),
205 |         ncol = 1)
206 |     col.Gini <- gplots::colorpanel(50, "#F7FBFF", "#084594")
207 |     smart_image(anno.Gini, col = col.Gini, xaxs = "i",
208 |         yaxs = "i", axes = FALSE)
209 | 
210 |     if (!is.null(annotation)) {
211 |         # 4) Customized annotation
212 |         anno.level <- levels(annotation)
213 |         anno.mat <- matrix(match(annotation[rclust$order], anno.level),
214 |             nrow = nrow(dat), ncol = 1)
215 |         col.anno <- brewer.pal(n = 12, name = "Set3")[
216 |             sort(unique(match(annotation[rclust$order], anno.level)))]
217 |         smart_image(anno.mat, col = col.anno, xaxs = "i",
218 |             yaxs = "i", axes = FALSE)
219 |     }
220 | 
221 |     # 5) chromosome
222 |     anno.chrom <- NULL
223 |     for (i in seq_len(length(chr.pos))) {
224 |         if (i%%2 == 1) {
225 |             temp <- matrix(rep(1, length(which(as.character(
226 |                 seqnames(ref)) == as.character(unique(
227 |                 seqnames(ref)))[i]))), nrow = 1)
228 |             anno.chrom <- cbind(anno.chrom, temp)
229 |         } else {
230 |             temp <- matrix(rep(2, length(which(as.character(
231 |                 seqnames(ref)) == as.character(unique(
232 |                 seqnames(ref)))[i]))), nrow = 1)
233 |             anno.chrom <- cbind(anno.chrom, temp)
234 |         }
235 |     }
236 |     image(t(anno.chrom), col = c("gray", "black"),
237 |         xaxs = "i", yaxs = "i", axes = FALSE)
238 |     pos.text <- xpos/length(ref)
239 |     chr.noprint <- as.character(unique(seqnames(ref)))
240 |     chr.print <- substr(chr.noprint, 4, nchar(chr.noprint))
241 |     text(pos.text, 0.2, chr.print, col = c("black", "grey"), 
242 |         cex = 1.5)
243 | 
244 |     # 6) Gini legend
245 |     par(mar = c(2, 2, 2, 4))
246 |     image(1, seq_len(length(brewer.pal(n = 8, name = "Blues"))),
247 |         t(as.matrix(seq_len(length(brewer.pal(n = 8, name = "Blues"))))),
248 |         col = brewer.pal(n = 8, name = "Blues"), xlab = "", ylab = "",
249 |         xaxt = "n", yaxt = "n", bty = "n")
250 |     axis(4, at = c(1, length(brewer.pal(n = 8, name = "Blues"))),
251 |         labels = round(c(min(Gini), max(Gini)), 2), col.ticks = "white",
252 |         col = NA, lwd.ticks = 0, cex.axis = 1.5, las = 2, font = 2)
253 |     title("Gini", cex.main = 1.5)
254 | 
255 |     if (!is.null(annotation)) {
256 |         plot(0, 0, type = "n", axes = FALSE)
257 |         legend("center", legend = sort(unique(annotation)),
258 |             col = col.anno, pch = 15, bty = "n", cex = 1.5)
259 |     }
260 | 
261 |     # 8) iCN legend
262 |     par(mar = c(2, 2, 2, 4))
263 |     image(1, seq_len(length(hm_col[iCNtab + 1])),
264 |         t(as.matrix(seq_len(length(hm_col[iCNtab + 1])))),
265 |         col = hm_col[iCNtab + 1], xlab = "", ylab = "",
266 |         xaxt = "n", yaxt = "n", bty = "n")
267 |     axis(4, at = seq_len(length(hm_col[iCNtab + 1])),
268 |         labels = c(0:7)[iCNtab + 1], col.ticks = "white", col = NA,
269 |         lwd.ticks = 0, cex.axis = 1.5, las = 2, font = 2)
270 |     title("integer CN", cex.main = 1.5)
271 |     
272 |     # 9) cell names
273 |     if (show.names) {
274 |         par(mar = rep(0, 4))
275 |         image(1, seq_len(ncol(iCNmat)), 
276 |             t(as.matrix(seq_len(ncol(iCNmat)))),
277 |             col = "white", xlab = "", ylab = "",
278 |             xaxt = "n", yaxt = "n", bty = "n")
279 |         text(1, rev(seq_len(ncol(iCNmat))), 
280 |             labels = colnames(iCNmat)[rclust$order], las = 2, cex = 1.5, font = 2)
281 |     }
282 |     dev.off()
283 | }
284 | 
285 | 
286 | 
287 | 
288 | 
289 | 
290 | 
291 | 


--------------------------------------------------------------------------------
/vignettes/SCOPE_vignette.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "SCOPE: Single-cell Copy Number Estimation"
  3 | author: "Rujin Wang, Danyu Lin, Yuchao Jiang"
  4 | date: "`r format(Sys.Date())`"
  5 | output:
  6 |     html_document:
  7 |         highlight: pygments
  8 |         toc: true
  9 | vignette: >
 10 |     %\VignetteIndexEntry{SCOPE: Single-cell Copy Number Estimation}
 11 |     %\VignetteEngine{knitr::rmarkdown}
 12 |     \usepackage[utf8]{inputenc}
 13 | ---
 14 | 
 15 | # 1. Overview of analysis pipeline
 16 | ## 1.1 Introduction
 17 | SCOPE is a statistical framework designed for calling 
 18 | copy number variants (CNVs) from whole-genome single-cell 
 19 | DNA sequencing read depths. The distinguishing features 
 20 | of SCOPE include: 
 21 | 
 22 | 1. Utilizes cell-specific Gini coefficients for quality 
 23 | controls and for identification of normal/diploid cells. 
 24 | In most single-cell cancer genomics studies, diploid cells 
 25 | are inevitably picked up from adjacent normal tissues for 
 26 | sequencing and can thus serve as normal controls for 
 27 | read depth normalization. However, not all 
 28 | platforms/experiments allow or adopt flow-sorting 
 29 | based techniques before scDNA-seq and thus cell ploidy 
 30 | and case-control labeling are not always readily 
 31 | available. Gini coefficient is able to index diploid 
 32 | cells out of the entire cell populations and serves 
 33 | as good proxies to identify cell outliers. 
 34 | 
 35 | 2. Employs an EM algorithm to model GC content bias, 
 36 | which accounts for the different copy number states 
 37 | along the genome. SCOPE is based on a Poisson latent 
 38 | factor model for cross-sample normalization, 
 39 | borrowing information both across regions and 
 40 | across samples to estimate the bias terms. 
 41 | 
 42 | 3. Incorporates multi-sample segmentation procedure 
 43 | to identify breakpoints that are shared across cells 
 44 | from the same genetic background
 45 | 
 46 | 
 47 | ## 1.2 Bioinformatic pre-processing
 48 | 
 49 | We demonstrate here how the pre-processing bioinformatic
 50 | pipeline works. The 
 51 | [Picard toolkit](https://broadinstitute.github.io/picard/) 
 52 | is open-source and free for all uses. The `split_script.py` 
 53 | python script is pre-stored in the package for demultiplexing. 
 54 | 
 55 | There are two types of scDNA-seq data sources: public 
 56 | data from NCBI Sequence Read Archive and data from 
 57 | 10X Genomics. For the NCBI SRA data, start with the 
 58 | SRA files. Fastq-dump to obtain FASTQ files. Align 
 59 | FASTQ sequences to NCBI hg19 reference genome and 
 60 | convert to BAM files. For the 10X Genomic datasets, 
 61 | process from the original integrated BAM file. 
 62 | Error-corrected chromium cellular barcode information 
 63 | for each read is stored as CB tag fields. Only 
 64 | reads that contain CB tags and are in the list 
 65 | of barcode of interest are demultiplexed via 
 66 | a Python script. Sort, add read group, and 
 67 | dedup on aligned/demultiplexed BAMs. Use 
 68 | deduped BAM files as the input. 
 69 | 
 70 | ```{bash, eval = FALSE}
 71 | # public data from NCBI Sequence Read Archive
 72 | SRR=SRRXXXXXXX
 73 | kim=/pine/scr/r/u/rujin/Kim_Navin_et_al_Cell_2018
 74 | fastq_dir=$kim/fastq
 75 | align_dir=$kim/align
 76 | 
 77 | # Align FASTQ sequences to NCBI hg19 reference genome 
 78 | # (Single-end sequenced cells have only 1 FASTQ file; 
 79 | # paired-end sequencing would generate two FASTQ files, 
 80 | # with suffix "_1" and "_2")
 81 | cd $fastq_dir
 82 | bwa mem -M -t 16 \
 83 |     ucsc.hg19.fasta `ls | grep "$SRR" | tr '\n' ' '` > $align_dir/"$SRR".sam
 84 | 
 85 | # Convert .sam to .bam
 86 | cd $align_dir
 87 | samtools view -bS "$SRR".sam > "$SRR".bam
 88 | 
 89 | # Sort
 90 | java -Xmx30G -jar /proj/yuchaojlab/bin/picard.jar SortSam \
 91 |     INPUT="$SRR".bam OUTPUT="$SRR".sorted.bam \
 92 |     SORT_ORDER=coordinate
 93 | 
 94 | # Add read group
 95 | java -Xmx40G -jar /proj/yuchaojlab/bin/picard.jar AddOrReplaceReadGroups \
 96 |     I="$SRR".sorted.bam O="$SRR".sorted.rg.bam RGID="$SRR" \
 97 |     RGLB=Chung_Et_Al RGPL=ILLUMINA RGPU=machine RGSM="$SRR"
 98 | samtools index "$SRR".sorted.rg.bam
 99 | 
100 | # Dedup
101 | java -Xmx40G -jar /proj/yuchaojlab/bin/picard.jar MarkDuplicates \
102 |     REMOVE_DUPLICATES=true \
103 |     I="$SRR".sorted.rg.bam O="$SRR".sorted.rg.dedup.bam \
104 |     METRICS_FILE="$SRR".sorted.rg.dedup.metrics.txt \
105 |     PROGRAM_RECORD_ID= MarkDuplicates PROGRAM_GROUP_VERSION=null \
106 |     PROGRAM_GROUP_NAME=MarkDuplicates
107 | java -jar /proj/yuchaojlab/bin/picard.jar BuildBamIndex \
108 |     I="$SRR".sorted.rg.dedup.bam
109 | 
110 | # 10X Genomics
111 | XGenomics=/pine/scr/r/u/rujin/10XGenomics
112 | dataset=breast_tissue_A_2k
113 | output_dir=$XGenomics/$dataset/output
114 | align_dir=$XGenomics/$dataset/align
115 | 
116 | # Demultiplex
117 | cd $output_dir
118 | samtools view ${dataset}_possorted_bam.bam | python $XGenomics/split_script.py
119 | 
120 | # Add header to demultiplexed bam files for further processing
121 | cd $XGenomics
122 | samtools view -H $dataset/output/${dataset}_possorted_bam.bam > \
123 |     $dataset/header.txt
124 | barcode=AAAGATGGTGTAAAGT
125 | cat header.txt $align_dir/$barcode/$barcode-1.sam > \
126 |     $align_dir/$barcode/$barcode-1.header.sam
127 | 
128 | # Convert .sam to .bam
129 | cd $align_dir/$barcode
130 | samtools view -bS "$barcode"-1.header.sam > "$barcode".bam
131 | ```
132 | 
133 | 
134 | # 2. Pre-computation and Quality Control
135 | ## 2.1 Pre-preparation
136 | SCOPE enables reconstruction of user-defined genome-wide 
137 | consecutive bins with fixed interval length prior to downstream 
138 | analysis by specifying arguments `genome` and `resolution` in 
139 | `get_bam_bed()` function. Make sure that all chromosomes are 
140 | named consistently and be concordant with `.bam` files. SCOPE 
141 | processes the entire genome altogether. Use function `get_bam_bed()` 
142 | to finish the pre-preparation step. By default, SCOPE is designed 
143 | for hg19 with fixed bin-length = 500kb. 
144 | ```{r, eval=TRUE, message=FALSE}
145 | library(SCOPE)
146 | library(WGSmapp)
147 | library(BSgenome.Hsapiens.UCSC.hg38)
148 | bamfolder <- system.file("extdata", package = "WGSmapp")
149 | bamFile <- list.files(bamfolder, pattern = '*.dedup.bam$')
150 | bamdir <- file.path(bamfolder, bamFile)
151 | sampname_raw <- sapply(strsplit(bamFile, ".", fixed = TRUE), "[", 1)
152 | bambedObj <- get_bam_bed(bamdir = bamdir, sampname = sampname_raw, 
153 |                         hgref = "hg38")
154 | ref_raw <- bambedObj$ref
155 | ```
156 | 
157 | ## 2.2 Getting GC content and mappability
158 | Compute GC content and mappability for each bin. 
159 | By default, SCOPE is intended for hg19 reference genome. 
160 | To compute mappability for hg19, we employed the 100-mers 
161 | mappability track from the ENCODE Project
162 | (`wgEncodeCrgMapabilityAlign100mer.bigwig` from [link](
163 | http://rohsdb.cmb.usc.edu/GBshape/cgi-bin/hgFileUi?
164 | db=hg19&g=wgEncodeMapability)) 
165 | and computed weighted average of the mappability 
166 | scores if multiple ENCODE regions overlap with 
167 | the same bin. For SCOPE, the whole-genome mappability 
168 | track on human hg19 assembly is stored as part of the package. 
169 | 
170 | 
171 | The whole-genome mappability track on human 
172 | hg38 assembly is also stored in SCOPE package. 
173 | For more details on mappability calculation, 
174 | please refer to [CODEX2 for hg38](
175 | https://github.com/yuchaojiang/CODEX2/blob/master/README.md). 
176 | Load the hg38 reference package, specify argument `hgref = "hg38"` 
177 | in `get_mapp()` and `get_gc()`. By default, `hg19` are used. 
178 | 
179 | ```{r, eval=TRUE, message=FALSE}
180 | mapp <- get_mapp(ref_raw, hgref = "hg38")
181 | head(mapp)
182 | gc <- get_gc(ref_raw, hgref = "hg38")
183 | values(ref_raw) <- cbind(values(ref_raw), DataFrame(gc, mapp))
184 | ref_raw
185 | ```
186 | 
187 | Note that SCOPE can also be adapted to 
188 | the mouse genome (mm10) in a similar way 
189 | (see [CODEX2 for mouse genome](https://github.com/yuchaojiang/CODEX2/blob/master/README.md)). 
190 | Specify argument `hgref = "mm10"` in `get_bam_bed()`, `get_mapp()`, 
191 | `get_gc()` and `get_coverage_scDNA()`. Calculation of GC content 
192 | and mappability needs to be modified from the default (hg19). 
193 | For mm10, there are two workarounds: 1) set all mappability to 1 
194 | to avoid extensive computation; 2) adopt QC procedures based on
195 | annotation results, e.g., filter out bins within "blacklist" regions, 
196 | which generally have low mappability. To be specific, we obtained
197 | and pre-stored mouse genome "blacklist" regions from
198 | [Amemiya et al., Scientific Reports, 2019](https://github.com/Boyle-Lab/Blacklist/tree/master/lists/mm10-blacklist.v2.bed.gz). 
199 | ```{r, eval=FALSE}
200 | library(BSgenome.Mmusculus.UCSC.mm10)
201 | mapp <- get_mapp(ref_raw, hgref = "mm10")
202 | gc <- get_gc(ref_raw, hgref = "mm10")
203 | ```
204 | 
205 | 
206 | For unknown reference assembly without 
207 | pre-calculated mappability track, 
208 | refer to [CODEX2: mappability pre-calculation](
209 | https://github.com/yuchaojiang/CODEX2/blob/master/mouse/mapp.R).
210 | 
211 | ## 2.3 Getting coverage
212 | Obtain either single-end or paired-end sequencing 
213 | read depth matrix. SCOPE, by default, adopts a 
214 | fixed binning method to compute the depth of 
215 | coverage while removing reads that are mapped 
216 | to multiple genomic locations and to "blacklist" 
217 | regions. This is followed by an additional step 
218 | of quality control to remove bins with extreme 
219 | mappability to avoid erroneous detections. 
220 | Specifically, the "blacklist" bins, including 
221 | [segmental duplication regions](http://humanparalogy.
222 | gs.washington.edu/build37/data/GRCh37GenomicSuperDup.tab) 
223 | and [gaps in reference assembly](https://gist.github.com/leipzig/6123703) 
224 | from telomere, centromere, and/or 
225 | heterochromatin regions. 
226 | ```{r, eval=TRUE}
227 | # Getting raw read depth
228 | coverageObj <- get_coverage_scDNA(bambedObj, mapqthres = 40, 
229 |                                 seq = 'paired-end', hgref = "hg38")
230 | Y_raw <- coverageObj$Y
231 | ```
232 | 
233 | ## 2.4 Quality control
234 | `get_samp_QC()` is used to perform QC step on 
235 | single cells, where total number/proportion 
236 | of reads, total number/proportion of mapped 
237 | reads, total number/proportion of mapped 
238 | non-duplicate reads, and number/proportion 
239 | of reads with mapping quality greater than 20 
240 | will be returned. Use `perform_qc()` to further 
241 | remove samples/cells with low proportion of 
242 | mapped reads, bins that have extreme GC content 
243 | (less than 20% and greater than 80%) and low 
244 | mappability (less than 0.9) to reduce artifacts. 
245 | ```{r, eval=TRUE}
246 | QCmetric_raw <- get_samp_QC(bambedObj)
247 | qcObj <- perform_qc(Y_raw = Y_raw, 
248 |     sampname_raw = sampname_raw, ref_raw = ref_raw, 
249 |     QCmetric_raw = QCmetric_raw)
250 | Y <- qcObj$Y
251 | sampname <- qcObj$sampname
252 | ref <- qcObj$ref
253 | QCmetric <- qcObj$QCmetric
254 | ```
255 | 
256 | 
257 | 
258 | # 3. Running SCOPE
259 | ## 3.1 Gini coefficient
260 | One feature of SCOPE is to identify normal/diploid 
261 | cells using Gini index. Gini coefficient is 
262 | calculated for each cell as 2 times the area 
263 | between the Lorenz curve and the diagonal. 
264 | The value of the Gini index varies between 
265 | 0 and 1, where 0 is the most uniform and 1 
266 | is the most extreme. Cells with extremely 
267 | high Gini coefficients(greater than 0.5) 
268 | are recommended to be excluded. Set up a 
269 | Gini threshold for identification of 
270 | diploid/normal cells (for example, 
271 | Gini less than 0.12). We demonstrate 
272 | the pre-stored toy dataset as follows. 
273 | ```{r, eval=TRUE, message=FALSE}
274 | # get gini coefficient for each cell
275 | Gini <- get_gini(Y_sim)
276 | ```
277 | 
278 | ## 3.2 Running SCOPE with negative control samples
279 | 
280 | Normal cell index is determined either by Gini coefficients 
281 | or prior knowledge. SCOPE utilizes ploidy estimates from a 
282 | first-pass normalization run to ensure fast convergence
283 | and to avoid local optima. Specify `SoS.plot = TRUE` in 
284 | `initialize_ploidy()` to visualize ploidy pre-estimations. 
285 | The normalization procedure includes an expectation-maximization 
286 | algorithm in the Poisson generalized linear model. Note that, 
287 | by setting `ploidyInt` in the normalization function, SCOPE allows 
288 | users to exploit prior-knowledge ploidies as the input and to 
289 | manually tune a few cells that have poor fitting. 
290 | ```{r, eval=TRUE, message=TRUE}
291 | # first-pass CODEX2 run with no latent factors
292 | normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
293 |                                         gc_qc = ref_sim$gc,
294 |                                         norm_index = which(Gini<=0.12))
295 | 
296 | # Ploidy initialization
297 | ploidy.sim <- initialize_ploidy(Y = Y_sim, Yhat = normObj.sim$Yhat, ref = ref_sim)
298 | 
299 | # If using high performance clusters, parallel computing is 
300 | # easy and improves computational efficiency. Simply use 
301 | # normalize_scope_foreach() instead of normalize_scope(). 
302 | # All parameters are identical. 
303 | normObj.scope.sim <- normalize_scope_foreach(Y_qc = Y_sim, gc_qc = ref_sim$gc,
304 |     K = 1, ploidyInt = ploidy.sim,
305 |     norm_index = which(Gini<=0.12), T = 1:5,
306 |     beta0 = normObj.sim$beta.hat, nCores = 2)
307 | # normObj.scope.sim <- normalize_scope(Y_qc = Y_sim, gc_qc = ref_sim$gc,
308 | #     K = 1, ploidyInt = ploidy.sim,
309 | #     norm_index = which(Gini<=0.12), T = 1:5,
310 | #     beta0 = beta.hat.noK.sim)
311 | Yhat.sim <- normObj.scope.sim$Yhat[[which.max(normObj.scope.sim$BIC)]]
312 | fGC.hat.sim <- normObj.scope.sim$fGC.hat[[which.max(normObj.scope.sim$BIC)]]
313 | ```
314 | 
315 | Visualize selection results for each individual cell. 
316 | By default, BIC is used to choose optimal CNV group. 
317 | ```{r, eval=FALSE}
318 | plot_EM_fit(Y_qc = Y_sim, gc_qc = ref_sim$gc, norm_index = which(Gini<=0.12), 
319 |             T = 1:5,
320 |             ploidyInt = ploidy.sim, beta0 = normObj.sim$beta.hat,
321 |             filename = "plot_EM_fit_demo.pdf")
322 | ```
323 | 
324 | Upon completion of SCOPE's default normalization and segmentation, 
325 | SCOPE includes the option to cluster cells based on the matrix of 
326 | normalized z-scores, estimated copy numbers, or estimated changepoints. 
327 | Given the inferred subclones, SCOPE can opt to perform a second round of 
328 | group-wise ploidy initialization and normalization
329 | ```{r, eval=FALSE}
330 | # Group-wise ploidy initialization
331 | clones <- c("normal", "tumor1", "normal", "tumor1", "tumor1")
332 | ploidy.sim.group <- initialize_ploidy_group(Y = Y_sim, Yhat = Yhat.noK.sim,
333 |                                 ref = ref_sim, groups = clones)
334 | ploidy.sim.group
335 | 
336 | # Group-wise normalization
337 | normObj.scope.sim.group <- normalize_scope_group(Y_qc = Y_sim,
338 |                                     gc_qc = ref_sim$gc,
339 |                                     K = 1, ploidyInt = ploidy.sim.group,
340 |                                     norm_index = which(clones=="normal"),
341 |                                     groups = clones,
342 |                                     T = 1:5,
343 |                                     beta0 = beta.hat.noK.sim)
344 | Yhat.sim.group <- normObj.scope.sim.group$Yhat[[which.max(
345 |                                     normObj.scope.sim.group$BIC)]]
346 | fGC.hat.sim.group <- normObj.scope.sim.group$fGC.hat[[which.max(
347 |                                     normObj.scope.sim.group$BIC)]]
348 | ```
349 | 
350 | ## 3.3 Cross-sample segmentation by SCOPE
351 | 
352 | SCOPE provides the cross-sample segmentation, 
353 | which outputs shared breakpoints 
354 | across cells from the same clone. This step 
355 | processes the entire genome chromosome by 
356 | chromosome. Shared breakpoints and integer copy-number 
357 | profiles will be returned. 
358 | ```{r, eval=TRUE, message=FALSE}
359 | chrs <- unique(as.character(seqnames(ref_sim)))
360 | segment_cs <- vector('list',length = length(chrs))
361 | names(segment_cs) <- chrs
362 | for (chri in chrs) {
363 |     message('\n', chri, '\n')
364 |     segment_cs[[chri]] <- segment_CBScs(Y = Y_sim,
365 |                                     Yhat = Yhat.sim,
366 |                                     sampname = colnames(Y_sim),
367 |                                     ref = ref_sim,
368 |                                     chr = chri,
369 |                                     mode = "integer", max.ns = 1)
370 | }
371 | iCN_sim <- do.call(rbind, lapply(segment_cs, function(z){z[["iCN"]]}))
372 | ```
373 | 
374 | ## 3.4 Visualization
375 | 
376 | SCOPE offers heatmap of inferred integer copy-number 
377 | profiles with cells clustered by hierarchical clustering.
378 | ```{r, eval=FALSE}
379 | plot_iCN(iCNmat = iCN_sim, ref = ref_sim, Gini = Gini, 
380 |         filename = "plot_iCN_demo")
381 | ```
382 | 
383 | 
384 | ## Session information
385 | 
386 | ```{r}
387 | sessionInfo()
388 | ```


--------------------------------------------------------------------------------
/R/normalize_scope_foreach.R:
--------------------------------------------------------------------------------
  1 | #' @title Normalization of read depth with latent factors using
  2 | #' Expectation-Maximization algorithm under the case-control
  3 | #' setting in parallel
  4 | #'
  5 | #' @description Fit a Poisson generalized linear model to normalize
  6 | #' the raw read depth data from single-cell DNA sequencing,
  7 | #' with latent factors under the case-control setting. Model GC
  8 | #' content bias using an expectation-maximization algorithm,
  9 | #' which accounts for the different copy number states.
 10 | #'
 11 | #' @usage
 12 | #' normalize_scope_foreach(Y_qc, gc_qc, K, norm_index, T,
 13 | #'     ploidyInt, beta0, minCountQC = 20, nCores = NULL)
 14 | #' @param Y_qc read depth matrix after quality control
 15 | #' @param gc_qc vector of GC content for each bin after quality control
 16 | #' @param K Number of latent Poisson factors
 17 | #' @param norm_index indices of normal/diploid cells
 18 | #' @param T a vector of integers indicating number of CNV groups.
 19 | #'  Use BIC to select optimal number of CNV groups. If \code{T = 1},
 20 | #'  assume all reads are from normal regions so that EM algorithm is
 21 | #'  not implemented. Otherwise, we assume there is always a CNV group
 22 | #'  of heterozygous deletion and a group of null region. The rest groups
 23 | #'  are representative of different duplication states.
 24 | #' @param ploidyInt a vector of initialized ploidy return
 25 | #'  from \code{initialize_ploidy}. Users are also allowed to provide
 26 | #'  prior-knowledge ploidies as the input and to manually tune a few 
 27 | #'  cells that have poor fitting
 28 | #' @param beta0 a vector of initialized bin-specific biases returned
 29 | #'  from CODEX2 without latent factors
 30 | #' @param minCountQC the minimum read coverage required for normalization
 31 | #'  and EM fitting. Defalut is \code{20}
 32 | #' @param nCores number of cores to use. If \code{NULL}, number of cores
 33 | #'  is detected. Default is \code{NULL}.
 34 | #'
 35 | #' @return A list with components
 36 | #'     \item{Yhat}{A list of normalized read depth matrix with EM}
 37 | #'     \item{alpha.hat}{A list of absolute copy number matrix}
 38 | #'     \item{fGC.hat}{A list of EM estimated GC content bias matrix}
 39 | #'     \item{beta.hat}{A list of EM estimated bin-specific bias vector}
 40 | #'     \item{g.hat}{A list of estimated Poisson latent factor}
 41 | #'     \item{h.hat}{A list of estimated Poisson latent factor}
 42 | #'     \item{AIC}{AIC for model selection}
 43 | #'     \item{BIC}{BIC for model selection}
 44 | #'     \item{RSS}{RSS for model selection}
 45 | #'     \item{K}{Number of latent Poisson factors}
 46 | #'
 47 | #' @examples
 48 | #' Gini <- get_gini(Y_sim)
 49 | #'
 50 | #' # first-pass CODEX2 run with no latent factors
 51 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
 52 | #'                                         gc_qc = ref_sim$gc,
 53 | #'                                         norm_index = which(Gini<=0.12))
 54 | #' Yhat.noK.sim <- normObj.sim$Yhat
 55 | #' beta.hat.noK.sim <- normObj.sim$beta.hat
 56 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat
 57 | #' N.sim <- normObj.sim$N
 58 | #'
 59 | #' # Ploidy initialization
 60 | #' ploidy.sim <- initialize_ploidy(Y = Y_sim,
 61 | #'                             Yhat = Yhat.noK.sim,
 62 | #'                             ref = ref_sim)
 63 | #' ploidy.sim
 64 | #'
 65 | #' # Specify nCores = 2 only for checking examples
 66 | #' normObj.scope.sim <- normalize_scope_foreach(Y_qc = Y_sim, 
 67 | #'                         gc_qc = ref_sim$gc,
 68 | #'                         K = 1, ploidyInt = ploidy.sim,
 69 | #'                         norm_index = which(Gini<=0.12), T = 1:5,
 70 | #'                         beta0 = beta.hat.noK.sim, nCores = 2)
 71 | #' Yhat.sim <- normObj.scope.sim$Yhat[[which.max(normObj.scope.sim$BIC)]]
 72 | #' fGC.hat.sim <- normObj.scope.sim$fGC.hat[[which.max(normObj.scope.sim$BIC)]]
 73 | #'
 74 | #' @author Rujin Wang \email{rujin@email.unc.edu}
 75 | #' @import stats foreach parallel doParallel
 76 | #' @export
 77 | normalize_scope_foreach <- function(Y_qc, gc_qc, K, norm_index, T,
 78 |     ploidyInt, beta0, minCountQC = 20, nCores = NULL) {
 79 |     if (max(K) > length(norm_index))
 80 |         stop("Number of latent Poisson factors K cannot
 81 |             exceed the number of normal samples. ")
 82 |     Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) {
 83 |         !any(x == 0)
 84 |         }), , drop = FALSE]
 85 |     if(dim(Y.nonzero)[1] <= 10){
 86 |         message("Adopt arithmetic mean instead of geometric mean")
 87 |         pseudo.sample <- apply(Y_qc, 1, mean)
 88 |         Ntotal <- apply(apply(Y_qc, 2, function(x) {
 89 |             x/pseudo.sample
 90 |         }), 2, median, na.rm = TRUE)
 91 |     } else{
 92 |         pseudo.sample <- apply(Y.nonzero, 1, function(x) {
 93 |             exp(sum(log(x))/length(x))
 94 |             })
 95 |         Ntotal <- apply(apply(Y.nonzero, 2, function(x) {
 96 |             x/pseudo.sample
 97 |         }), 2, median)
 98 |     }
 99 |     N <- round(Ntotal/median(Ntotal) * median(colSums(Y_qc)))
100 |     Nmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
101 |         data = N, byrow = TRUE)
102 | 
103 |     # Get initialization
104 |     gcfit.temp <- Y_qc/Nmat/beta0
105 |     alpha0 <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc))
106 |     for (j in seq_len(ncol(alpha0))) {
107 |         loe.fit <- loess(gcfit.temp[, j] ~ gc_qc)
108 |         gcfit.null <- loe.fit$fitted/(ploidyInt[j]/2)
109 |         alpha0[, j] <- gcfit.temp[, j]/gcfit.null * 2
110 |     }
111 | 
112 |     Yhat <- vector("list", length(K))
113 |     fGC.hat <- vector("list", length(K))
114 |     alpha.hat <- vector("list", length(K))
115 |     beta.hat <- vector("list", length(K))
116 |     g.hat <- vector("list", length(K))
117 |     h.hat <- vector("list", length(K))
118 |     AIC <- rep(NA, length = length(K))
119 |     BIC <- rep(NA, length = length(K))
120 |     RSS <- rep(NA, length = length(K))
121 | 
122 |     # Initialization
123 |     message("Initialization ...")
124 |     gcfit.temp <- Y_qc/Nmat/beta0
125 |     offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
126 |         data = beta0, byrow = FALSE)
127 |     fhat.temp <- getfGC_foreach(gcfit.temp = gcfit.temp, gctemp = gc_qc,
128 |         Y = Y_qc, norm_index = norm_index, offset = offset,
129 |         T = T, alpha = alpha0, minCountQC = minCountQC, nCores = nCores)
130 |     fhat0 <- fhat.temp$fGC.hat
131 |     alpha0 <- fhat.temp$alpha
132 | 
133 |     for (ki in seq_len(length(K))) {
134 |         k <- K[ki]
135 |         message("Computing normalization with k = ", k,
136 |             " latent factors ...", sep = "")
137 |         message("k = ", k)
138 |         maxiter <- 10
139 |         maxhiter <- 50
140 |         BHTHRESH <- 1e-04
141 |         HHTHRESH <- 1e-05
142 |         iter <- 1
143 |         fhat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = 0)
144 |         betahat <- beta0
145 |         betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
146 |             data = betahat, byrow = FALSE)
147 |         ghat <- matrix(0, nrow = nrow(Y_qc), ncol = k)
148 |         hhat <- matrix(0, nrow = ncol(Y_qc), ncol = k)
149 |         bhdiff <- rep(Inf, maxiter)
150 |         fhdiff <- rep(Inf, maxiter)
151 | 
152 |         betahatlist <- vector("list", maxiter)
153 |         fhatlist <- vector("list", maxiter)
154 |         ghatlist <- vector("list", maxiter)
155 |         hhatlist <- vector("list", maxiter)
156 |         alphahatlist <- vector("list", maxiter)
157 | 
158 |         while (iter <= maxiter) {
159 |             if (iter == 1) {
160 |                 fhatnew <- fhat0
161 |                 alpha <- alpha0
162 |             }
163 |             if (iter > 1) {
164 |                 gcfit.temp <- Y_qc/Nmat/betahat/exp(ghat %*% t(hhat))
165 |                 offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
166 |                     data = betahat, byrow = FALSE) * exp(ghat %*% t(hhat))
167 |                 fhat.temp <- getfGC_foreach(gcfit.temp = gcfit.temp,
168 |                     gctemp = gc_qc,
169 |                     Y = Y_qc, norm_index = norm_index,
170 |                     offset = offset, T = T, alpha = alpha0,
171 |                     minCountQC = minCountQC,
172 |                     nCores = nCores)
173 |                 fhatnew <- fhat.temp$fGC.hat
174 |                 alpha <- fhat.temp$alpha
175 |             }
176 |             fhatnew[fhatnew < quantile(fhatnew, 0.005)] <- quantile(
177 |                 fhatnew, 0.005)
178 |             betahatnew <- apply((Y_qc/(fhatnew * Nmat *
179 |                 exp(ghat %*% t(hhat))))[, norm_index, drop = FALSE], 
180 |                 1, median)
181 |             betahatnew[betahatnew <= 0] <- min(betahatnew[betahatnew > 0])
182 |             bhdiff[iter] <- sum((betahatnew - betahat)^2)/length(betahat)
183 |             fhdiff[iter] <- sum((fhatnew - fhat)^2)/length(fhat)
184 |             if (fhdiff[iter] > min(fhdiff))
185 |                 break
186 |             message("Iteration ", iter, "\t", "beta diff =",
187 |                 signif(bhdiff[iter], 3), "\t", "f(GC) diff =",
188 |                 signif(fhdiff[iter],
189 |                 3))
190 |             fhat <- fhatnew
191 |             betahat <- betahatnew
192 |             betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
193 |                 data = betahat, byrow = FALSE)
194 |             L <- log(Nmat * fhat * betahatmat * alpha/2)
195 |             logmat <- log(pmax(Y_qc, 1)) - L
196 |             logmat <- logmat - matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
197 |                 data = apply(logmat, 1, mean), byrow = FALSE)
198 |             hhat <- svd(logmat, nu = k, nv = k)$v
199 |             hhatnew <- hhat
200 |             hiter <- 1
201 |             hhdiff <- rep(Inf, maxhiter)
202 |             while (hiter <= maxhiter) {
203 |                 for (s in seq_len(nrow(Y_qc))) {
204 |                     temp <- try(glm(formula = Y_qc[s, norm_index] ~
205 |                         hhat[norm_index, ] -
206 |                         1, offset = L[s, norm_index],
207 |                         family = poisson)$coefficients, silent = TRUE)
208 |                     if (is.character(temp)) {
209 |                         temp <- lm(log(pmax(Y_qc[s, norm_index],
210 |                             1)) ~ hhat[norm_index, ] - 1,
211 |                             offset = log(L[s, norm_index]))$coefficients
212 |                     }
213 |                     ghat[s, ] <- temp
214 |                 }
215 |                 # avoid overflow or underflow of the g latent factors
216 |                 ghat[is.na(ghat)] <- 0
217 |                 if (max(ghat) >= 30) {
218 |                     ghat <- apply(ghat, 2, function(z) {
219 |                         z[z > quantile(z, 0.995)] = min(quantile(z, 0.995), 30)
220 |                         z
221 |                     })
222 |                 }
223 |                 if (min(ghat) <= -30) {
224 |                     ghat <- apply(ghat, 2, function(z) {
225 |                         z[z < quantile(z, 0.005)] = max(quantile(z, 0.005), -30)
226 |                         z
227 |                     })
228 |                 }
229 |                 for (t in seq_len(ncol(Y_qc))) {
230 |                     hhatnew[t, ] <- glm(formula = Y_qc[, t] ~ ghat - 1,
231 |                         offset = L[, t], family = poisson)$coefficients
232 |                 }
233 |                 gh <- ghat %*% t(hhatnew)
234 |                 gh <- scale(gh, center = TRUE, scale = FALSE)
235 |                 hhatnew <- svd(gh, nu = k, nv = k)$v
236 |                 hhdiff[hiter] <- sum((hhatnew - hhat)^2)/length(hhat)
237 |                 message("\t\t\t", "hhat diff =", signif(hhdiff[hiter], 3))
238 |                 hhat <- hhatnew
239 |                 if (hhdiff[hiter] < HHTHRESH)
240 |                     break
241 |                 if (hiter > 10 & (rank(hhdiff))[hiter] <= 3)
242 |                     break
243 |                 hiter <- hiter + 1
244 |             }
245 |             alphahatlist[[iter]] <- alpha
246 |             fhatlist[[iter]] <- fhat
247 |             betahatlist[[iter]] <- betahat
248 |             ghatlist[[iter]] <- ghat
249 |             hhatlist[[iter]] <- hhat
250 |             if (bhdiff[iter] < BHTHRESH)
251 |                 break
252 |             if (iter > 5 & bhdiff[iter] > 1)
253 |                 break
254 |             iter <- iter + 1
255 |         }
256 |         optIter <- which.min(fhdiff)
257 |         message(paste("Stop at Iteration ", optIter, ".", sep = ""))
258 |         alpha <- alphahatlist[[optIter]]
259 |         fhat <- fhatlist[[optIter]]
260 |         betahat <- betahatlist[[optIter]]
261 |         ghat <- ghatlist[[optIter]]
262 |         hhat <- hhatlist[[optIter]]
263 |         betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
264 |             data = betahat, byrow = FALSE)
265 |         Yhat[[ki]] <- pmax(round(fhat * Nmat * betahatmat *
266 |             exp(ghat %*% t(hhat)), 0), 1)
267 |         alpha.hat[[ki]] <- alpha
268 |         fGC.hat[[ki]] <- signif(fhat, 3)
269 |         beta.hat[[ki]] <- signif(betahat, 3)
270 |         h.hat[[ki]] <- signif(hhat, 3)
271 |         g.hat[[ki]] <- signif(ghat, 3)
272 |         Yhat.temp <- Yhat[[ki]] * alpha/2
273 |         AIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) - Yhat.temp) -
274 |             2 * (length(ghat) + length(hhat))
275 |         BIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) - Yhat.temp) -
276 |             (length(ghat) + length(hhat)) * log(length(Y_qc))
277 |         RSS[ki] <- sum((Y_qc - Yhat.temp)^2/length(Y_qc))
278 |         message("AIC", k, " = ", round(AIC[ki], 3))
279 |         message("BIC", k, " = ", round(BIC[ki], 3))
280 |         message("RSS", k, " = ", round(RSS[ki], 3), "\n")
281 |     }
282 |     list(Yhat = Yhat, alpha.hat = alpha.hat, fGC.hat = fGC.hat,
283 |         beta.hat = beta.hat, g.hat = g.hat, h.hat = h.hat,
284 |         AIC = AIC, BIC = BIC, RSS = RSS, K = K)
285 | }
286 | 
287 | 
288 | 
289 | getfGCj_foreach <- function(gcfit.tempj, gctemp, Yj, offsetj, T,
290 |     draw.plot = NULL, alphaj, minCountQC) {
291 |     alphaj <- pmax(1, round(alphaj))
292 |     if (is.null(draw.plot)) {
293 |         draw.plot <- FALSE
294 |     }
295 |     fGCi <- fitGC(gctemp, gcfit.tempj)
296 |     resid <- abs(gcfit.tempj - fGCi)
297 | 
298 |     bin.filter <- which(resid > (median(resid) +
299 |         5 * mad(resid)) | Yj < minCountQC)
300 |     if (length(bin.filter) == 0) {
301 |         bin.filter <- which.max(gcfit.tempj)
302 |     }
303 |     Ti <- T
304 |     if (Ti == 1) {
305 |         Z <- matrix(nrow = length(gcfit.tempj), ncol = Ti, data = 1/Ti)
306 |         vec_pi <- 1
307 |         loe.fit.temp <- loess(gcfit.tempj[-bin.filter] ~ gctemp[-bin.filter])
308 |         fGCi <- predict(loe.fit.temp, newdata = gctemp, se = TRUE)$fit
309 |         temp <- min(fGCi[!is.na(fGCi) & fGCi > 0])
310 |         fGCi[fGCi <= 0 | is.na(fGCi)] <- temp
311 |     }
312 |     if (Ti >= 2) {
313 |         Z <- matrix(nrow = length(gcfit.tempj), ncol = Ti, data = 0)
314 |         mintemp <- pmin(Ti, alphaj)
315 |         Z[cbind(seq_len(nrow(Z)), mintemp)] <- 1
316 |         vec_pi <- colSums(Z)/nrow(Z)
317 | 
318 |         loe.fit.temp <- loess((gcfit.tempj/(Z %*%
319 |             as.matrix(seq_len(Ti)/2)))[-bin.filter] ~ gctemp[-bin.filter])
320 |         fGCi <- predict(loe.fit.temp, newdata = gctemp, se = TRUE)$fit
321 |         temp <- min(fGCi[!is.na(fGCi) & fGCi > 0])
322 |         fGCi[fGCi <= 0 | is.na(fGCi)] <- temp
323 | 
324 |         diff.GC <- Inf
325 |         diff.Z <- Inf
326 |         iter <- 1
327 |         while (iter <= 3 | diff.GC > 5e-06 | diff.Z > 0.005) {
328 |             Mtemp <- Mstep(Z, gcfit.tempj, gctemp)
329 |             vec_pi.new <- Mtemp$vec_pi
330 |             fGCi.new <- Mtemp$fGCi
331 |             Z.new <- Estep(fGCi.new, vec_pi.new, Yj, offsetj)
332 |             diff.GC <- sum((fGCi - fGCi.new)^2)/length(fGCi)
333 |             diff.Z <- sum((Z.new - Z)^2)/length(Z)
334 |             vec_pi <- vec_pi.new
335 |             Z <- Z.new
336 |             fGCi <- fGCi.new
337 |             iter <- iter + 1
338 |             if (iter >= 50)
339 |                 break
340 |         }
341 |     }
342 | 
343 |     if (Ti == 1) {
344 |         loe.fit.plot <- loess(gcfit.tempj ~ gctemp)
345 |         fGCi.plot <- loe.fit.plot$fitted
346 |         temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0])
347 |         fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp
348 |         df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df
349 | 
350 |         loglik <- sum(dpois(Yj[-bin.filter],
351 |             lambda = (offsetj * fGCi)[-bin.filter], log = TRUE))
352 |         BIC <- 2 * loglik - (length(gcfit.tempj) - df) *
353 |             log(length(gcfit.tempj))
354 |     } else {
355 |         loe.fit.plot <- loess((gcfit.tempj/(Z %*%
356 |             as.matrix(seq_len(Ti)/2))) ~ gctemp)
357 |         fGCi.plot <- loe.fit.plot$fitted
358 |         temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0])
359 |         fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp
360 |         df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df
361 | 
362 |         loglik <- sum(dpois(Yj[-bin.filter],
363 |             lambda = (offsetj * fGCi * (Z %*%
364 |             as.matrix(seq_len(Ti)/2)))[-bin.filter],
365 |             log = TRUE))
366 |         BIC <- 2 * loglik - (length(gcfit.tempj) - df + Ti - 1) *
367 |             log(length(gcfit.tempj))
368 |     }
369 | 
370 |     if (draw.plot) {
371 |         smoothScatter(gctemp[-bin.filter], gcfit.tempj[-bin.filter],
372 |             xlab = "GC content", ylab = "Y/beta/N/exp(gxh)",
373 |             nrpoints = 0, main = paste("T =", Ti))
374 |         if (Ti == 1) {
375 |             points(gctemp[order(gctemp)], fGCi.plot[order(gctemp)],
376 |                 lty = 2, col = 2, type = "l", lwd = 2)
377 |             points(gctemp, gcfit.tempj, cex = 0.4, col = 2, pch = 16)
378 |         } else {
379 |             for (k in seq_len(Ti)) {
380 |                 points(gctemp[order(gctemp)], fGCi.plot[order(gctemp)] *
381 |                     k/2, lty = 2, col = k, type = "l", lwd = 2)
382 |                 points(gctemp[which((round(Z))[, k] == 1)],
383 |                     (gcfit.tempj)[which((round(Z))[, k] == 1)], cex = 0.4,
384 |                     col = k, pch = 16)
385 |             }
386 |         }
387 |     }
388 | 
389 |     fGCi.obj <- fGCi
390 |     Z.obj <- Z
391 |     vec_pi.obj <- vec_pi
392 |     return(list(fGCi.obj = fGCi.obj, Z.obj = Z.obj,
393 |         vec_pi.obj = vec_pi.obj, bin.filter = bin.filter,
394 |         loglik = loglik, BIC = BIC))
395 | }
396 | 
397 | 
398 | 
399 | if (getRversion() >= "2.15.1") {
400 |     utils::globalVariables(c("Ti"))
401 | }
402 | getfGC_foreach <- function(gcfit.temp, gctemp, Y, norm_index, offset, T,
403 |     alpha, minCountQC, nCores) {
404 |     fGC.hat <- matrix(ncol = ncol(Y), nrow = nrow(Y))
405 |     for (j in seq_len(ncol(Y))) {
406 |         cat(j, "\t")
407 |         if (j %in% norm_index) {
408 |             alpha[, j] <- 2
409 |             loe.fit <- loess(gcfit.temp[, j] ~ gctemp)
410 |             fGC.hat[, j] <- loe.fit$fitted
411 |         } else {
412 |             # begin foreach
413 |             if (is.null(nCores)) {
414 |                 nCores <- detectCores() - 1
415 |             }
416 |             registerDoParallel(nCores)
417 | 
418 |             TList <- foreach(Ti = T, .export = c("fitGC", "Estep", "Mstep",
419 |                 "getfGCj_foreach")) %dopar% {
420 |                 getfGCj_foreach(gcfit.tempj = gcfit.temp[, j], gctemp = gctemp,
421 |                     Yj = Y[, j], offsetj = offset[,j], T = Ti,
422 |                     draw.plot = FALSE,
423 |                     alphaj = alpha[, j], minCountQC = minCountQC)
424 |             }
425 | 
426 |             # When you're done, clean up the cluster
427 |             stopImplicitCluster()
428 |             # end foreach
429 | 
430 |             fGCi.obj <- lapply(TList, function(x) x[["fGCi.obj"]])
431 |             Z.obj <- lapply(TList, function(x) x[["Z.obj"]])
432 |             vec_pi.obj <- lapply(TList, function(x) x[["vec_pi.obj"]])
433 |             loglik <- vapply(TList, function(x) x[["loglik"]], numeric(1))
434 |             BIC <- vapply(TList, function(x) x[["BIC"]], numeric(1))
435 | 
436 |             fGCj <- list(fGCi.obj = fGCi.obj, Z.obj = Z.obj,
437 |                 vec_pi.obj = vec_pi.obj, loglik = loglik, BIC = BIC)
438 |             if (which.max(fGCj$BIC) == 1) {
439 |                 alpha[, j] <- 2
440 |             } else {
441 |                 alpha[, j] <- apply(fGCj$Z.obj[[which.max(fGCj$BIC)]],
442 |                     1, which.max)
443 |             }
444 |             fGC.hat[, j] <- fGCj$fGCi.obj[[which.max(fGCj$BIC)]]
445 |         }
446 |     }
447 |     return(list(fGC.hat = fGC.hat, alpha = alpha))
448 | }
449 | 


--------------------------------------------------------------------------------
/R/segment_CBScs.R:
--------------------------------------------------------------------------------
  1 | #' @title Cross-sample segmentation
  2 | #'
  3 | #' @description SCOPE offers a cross-sample Poisson likelihood-based
  4 | #' recursive segmentation, enabling shared breakpoints across cells
  5 | #' from the same genetic background.
  6 | #' 
  7 | #' @usage segment_CBScs(Y, Yhat, sampname, ref, chr, 
  8 | #'                     mode = "integer", max.ns)
  9 | #'
 10 | #' @param Y raw read depth matrix after quality control procedure
 11 | #' @param Yhat normalized read depth matrix
 12 | #' @param sampname vector of sample names
 13 | #' @param ref GRanges object after quality control procedure
 14 | #' @param chr chromosome name. Make sure it is consistent with the
 15 | #'  reference genome.
 16 | #' @param mode format of returned copy numbers. Only integer mode is
 17 | #'  supported for scDNA-seq data.
 18 | #' @param max.ns a number specifying how many rounds of nested structure
 19 | #'  searching would be performed. Defalut is \code{0}.
 20 | #'
 21 | #' @return A list with components
 22 | #'     \item{poolcall}{Cross-sample CNV callings indicating
 23 | #'         shared breakpoints}
 24 | #'     \item{finalcall}{Final cross-sample segmented callset of
 25 | #'         CNVs with genotyping results}
 26 | #'     \item{image.orig}{A matrix giving logarithm of normalized
 27 | #'         z-scores}
 28 | #'     \item{image.seg}{A matrix of logarithm of estimated
 29 | #'         copy number over 2}
 30 | #'     \item{iCN}{A matrix of inferred integer copy number profiles}
 31 | #'
 32 | #' @examples
 33 | #' Yhat.sim <- normObj.scopeDemo$Yhat[[which.max(normObj.scopeDemo$BIC)]]
 34 | #' segment_cs_chr1 <- segment_CBScs(Y = Y_sim, Yhat = Yhat.sim,
 35 | #'                         sampname = colnames(Y_sim),
 36 | #'                         ref = ref_sim, chr = 'chr1', max.ns = 1)
 37 | #'
 38 | #' @author Rujin Wang \email{rujin@email.unc.edu}
 39 | #' @importFrom GenomicRanges GRanges
 40 | #' @importFrom IRanges IRanges Views countOverlaps
 41 | #' @importFrom GenomeInfoDb seqnames
 42 | #' @export
 43 | segment_CBScs <- function(Y, Yhat, sampname, ref, chr, mode = "integer", 
 44 |     max.ns = 0) {
 45 |     if (is.na(match(chr, unique(as.character(seqnames(ref)))))) {
 46 |         stop("Chromosome not found in the reference genome. Make sure that
 47 |             all chromosomes are named consistently.  \n")
 48 |     }
 49 | 
 50 |     stbin.flag <- min(which(as.character(seqnames(ref)) == chr))
 51 |     Y <- Y[which(as.character(seqnames(ref)) == chr), ]
 52 |     Yhat <- Yhat[which(as.character(seqnames(ref)) == chr), ]
 53 |     ref <- ref[which(as.character(seqnames(ref)) == chr)]
 54 | 
 55 |     poolcall <- NULL
 56 |     message("Cross-sample segmenting for ", ncol(Y), " samples.")
 57 | 
 58 |     chpts0 <- c(1, nrow(Y))
 59 |     cs.scan <- compute_cs_lratio(Y, Yhat, sampname, chpts0)
 60 |     i <- cs.scan$i
 61 |     j <- cs.scan$j
 62 |     Z <- cs.scan$Z
 63 |     init.cs.finalmat <- cs.scan$finalmat
 64 | 
 65 |     if (!is.null(init.cs.finalmat) && nrow(init.cs.finalmat) > 0) {
 66 |         # Further cross-sample nested searching
 67 |         chpts <- init.cs.finalmat[, c(1, 2)]
 68 |         if (is.null(dim(chpts))) {
 69 |             chpts <- t(as.matrix(chpts))
 70 |         }
 71 | 
 72 |         keep_going <- 1
 73 |         # number of nested searching
 74 |         num_ns <- 1
 75 |         while (!all(keep_going == 0) & num_ns <= max.ns) {
 76 |             nested.output <- search_cs_nested(Y, Yhat, sampname, chpts)
 77 |             keep_going <- vapply(nested.output,
 78 |                 function(z) z$is.nested, numeric(1))
 79 | 
 80 |             newchpts <- NULL
 81 |             for (r in seq_len(length(keep_going))) {
 82 |                 if (keep_going[r]) {
 83 |                     newchpts <- rbind(newchpts,
 84 |                         nested.output[[r]]$finalmat[, c(1, 2)])
 85 |                 } else {
 86 |                     newchpts <- rbind(newchpts, chpts[r, ])
 87 |                 }
 88 |             }
 89 |             chpts <- newchpts
 90 |             num_ns <- num_ns + 1
 91 |         }
 92 | 
 93 |         # backward compute Z after nested searching
 94 |         temp <- NULL
 95 |         for (r in seq_len(nrow(chpts))) {
 96 |             idx <- which(i == chpts[r, 1] & j == chpts[r, 2])
 97 |             if (length(idx) != 0) {
 98 |                 temp <- c(temp, Z[idx])
 99 |             } else {
100 |                 temp <- c(temp, rep(NA, 4))
101 |             }
102 |         }
103 |         cs.finalmat <- cbind(chpts, temp)
104 |         cs.finalmat <- cs.finalmat[!is.na(cs.finalmat[, 3]), ,
105 |             drop = FALSE]
106 |         cs.finalmat <- cs.finalmat[order(-cs.finalmat[, 3]), ,
107 |             drop = FALSE]
108 |     } else {
109 |         cs.finalmat <- matrix(data = c(chpts0, 0),
110 |             nrow = 1, ncol = 3, byrow = TRUE)
111 |     }
112 | 
113 |     cs.loglikeij <- rep(NA, nrow(cs.finalmat))
114 |     cs.mBIC <- rep(NA, nrow(cs.finalmat))
115 | 
116 |     kappa1 <- 3/2
117 |     kappa2 <- 2.27
118 |     N <- ncol(Y)
119 |     T <- nrow(Y)
120 | 
121 |     for (s in seq_len(nrow(cs.finalmat))) {
122 |         tau <- sort(unique(c(as.vector(cs.finalmat[seq_len(s),
123 |             c(1, 2)]), 1, nrow(Y))))
124 |         m <- length(tau) - 2
125 |         if (m > 0) {
126 |             J <- matrix(data = NA, nrow = m + 2, ncol = N)
127 | 
128 |             Y0 <- Y
129 |             Yhat0 <- Yhat
130 |             Y0[Y0 <= 20] <- 20
131 |             Yhat0[Yhat0 <= 20] <- 20
132 | 
133 |             muhat <- matrix(data = NA, nrow = m + 1, ncol = N)
134 |             rhat <- matrix(data = NA, nrow = m + 1, ncol = N)
135 |             muhat[1, ] <- round(apply(Y0[seq_len(tau[2]), ,
136 |                 drop = FALSE], 2, sum)/tau[2])
137 |             rhat[1, ] <- round(2 * (apply(Y0[seq_len(tau[2]), ,
138 |                 drop = FALSE], 2, sum)/apply(Yhat0[seq_len(tau[2]),
139 |                 , drop = FALSE], 2, sum)))
140 |             for (r in seq_len(m)) {
141 |                 muhat[r + 1, ] <- round(apply(
142 |                     Yhat0[(tau[r + 1] + 1):tau[r + 2], ,
143 |                     drop = FALSE], 2, sum)/(tau[r + 2] - tau[r + 1] + 1))
144 |                 rhat[r + 1, ] <- round((apply(Y0[
145 |                     (tau[r + 1] + 1):tau[r + 2], ,
146 |                     drop = FALSE], 2, sum)/apply(Yhat0[
147 |                     (tau[r + 1] + 1):tau[r + 2], ,
148 |                     drop = FALSE], 2, sum)))
149 |             }
150 |             carriershat <- rhat[2:(m + 1), ] - rhat[seq_len(m), ]
151 |             carriershat[carriershat != 0] <- 1
152 |             J[2:(m + 1), ] <- carriershat
153 |             deltahat <- muhat[2:(m + 1), ] - muhat[seq_len(m), ]
154 |             deltahatJ <- deltahat * J[2:(m + 1), ]
155 |             if (is.null(dim(deltahatJ))) {
156 |                 deltahatJ <- matrix(data = deltahat, nrow = 1,
157 |                     ncol = length(deltahat))
158 |             }
159 |             deltahatJ.sq.sum <- apply(deltahatJ^2, 1, sum)
160 |             deltahatJ.sq.sum <- max(deltahatJ.sq.sum, 1)
161 | 
162 |             # M and pi
163 |             M <- sum(carriershat)
164 |             pihat <- M/(N * m)
165 | 
166 |             cs.temp <- create_chptsmat(cs.finalmat[seq_len(s), c(1, 2),
167 |                 drop = FALSE], chpts0)
168 |             L <- matrix(data = NA, ncol = N, nrow = nrow(cs.temp))
169 |             for (r in seq_len(nrow(L))) {
170 |                 yact.temp <- apply(Y[cs.temp[r, 1]:cs.temp[r, 2], ,
171 |                     drop = FALSE], 2, sum)
172 |                 lambda.temp <- apply(Yhat[cs.temp[r, 1]:cs.temp[r, 2], ,
173 |                     drop = FALSE], 2, sum)
174 |                 yact.temp[lambda.temp < 20] <- 20
175 |                 lambda.temp[lambda.temp < 20] <- 20
176 |                 L[r, ] <- (1 - round(2 * yact.temp/lambda.temp)/2) *
177 |                     lambda.temp + log((round(2 * (yact.temp/lambda.temp)) +
178 |                     1e-04)/2.0001) * yact.temp
179 |             }
180 |             cs.loglikeij[s] <- sum(L)
181 | 
182 |             term1 <- cs.loglikeij[s]
183 |             if (M == 0) {
184 |                 term2 <- 0
185 |             } else {
186 |                 term2 <- -M/2 * log(2 * cs.loglikeij[s]/M)
187 |             }
188 |             term3 <- -log(choose(T, m))
189 |             # term3 <- - m * log(choose(T, m))
190 |             term4 <- -M/2
191 |             term5 <- -sum(log(deltahatJ.sq.sum))
192 |             term6 <- -m * (kappa1 - kappa2)
193 |             if (pihat == 0 || pihat == 1) {
194 |                 term7 <- 0
195 |             } else {
196 |                 term7 <- (M * log(pihat) + (N * m - M) * log(1 - pihat))
197 |             }
198 |             mbic <- term1 + term2 + term3 + term4 + term5 + term6 + term7
199 |             cs.mBIC[s] <- mbic
200 |         } else {
201 |             cs.mBIC[s] <- 0
202 |         }
203 |     }
204 |     cs.mBIC <- round(cs.mBIC, digits = 3)
205 | 
206 |     cs.finalmat <- (cbind(cs.finalmat, cs.mBIC)[seq_len(which.max(cs.mBIC)), ,
207 |         drop = FALSE])
208 |     poolcall <- create_chptsmat(cs.finalmat[, c(1, 2), drop = FALSE], chpts0)
209 |     colnames(poolcall) <- c("st", "end")
210 | 
211 |     finalcall <- NULL
212 |     image.orig <- log(pmax(0.001, Y)/Yhat)
213 |     image.orig[image.orig <= -2] <- -2
214 |     image.orig[image.orig > 2] <- 2
215 | 
216 |     image.seg <- matrix(data = NA, nrow = nrow(image.orig),
217 |         ncol = ncol(image.orig))
218 |     iCN <- matrix(data = NA, nrow = nrow(image.orig),
219 |         ncol = ncol(image.orig))
220 |     colnames(image.seg) <- colnames(Y)
221 |     colnames(iCN) <- colnames(Y)
222 | 
223 |     for (i in seq_len(nrow(poolcall))) {
224 |         st_bin <- poolcall[i, "st"]
225 |         ed_bin <- poolcall[i, "end"]
226 | 
227 |         yact.ind <- colSums(Y[st_bin:ed_bin, , drop = FALSE])
228 |         lambda.ind <- colSums(Yhat[st_bin:ed_bin, , drop = FALSE])
229 |         if (mode == "integer") {
230 |             chat.ind <- round(2 * (yact.ind/lambda.ind))
231 |         } else if (mode == "fraction") {
232 |             chat.ind <- 2 * (yact.ind/lambda.ind)
233 |         }
234 |         chat.ind[chat.ind > 14] <- 14
235 | 
236 |         image.seg[poolcall[i, 1]:poolcall[i, 2], ] <-
237 |             matrix(nrow = (poolcall[i, 2] - poolcall[i, 1] + 1),
238 |             ncol = ncol(image.seg),
239 |             data = log(chat.ind/2), byrow = TRUE)
240 |         iCN[poolcall[i, 1]:poolcall[i, 2], ] <-
241 |             matrix(nrow = (poolcall[i, 2] - poolcall[i, 1] + 1),
242 |             ncol = ncol(iCN),
243 |             data = chat.ind, byrow = TRUE)
244 | 
245 |         temp <- cbind(colnames(Y), rep(st_bin, ncol(Y)),
246 |             rep(ed_bin, ncol(Y)), chat.ind)
247 |         temp <- temp[chat.ind != 2, ]
248 |         finalcall <- rbind(finalcall, temp)
249 |         rownames(finalcall) <- NULL
250 |         colnames(finalcall) <- c("sample_name", "st_bin",
251 |             "ed_bin", "cnv_no")
252 |     }
253 | 
254 |     finalcall <- as.data.frame(finalcall)
255 | 
256 |     finalcall$st_bin <- as.numeric(paste(finalcall$st_bin))
257 |     finalcall$ed_bin <- as.numeric(paste(finalcall$ed_bin))
258 |     finalcall$cnv_no <- as.numeric(paste(finalcall$cnv_no))
259 | 
260 | 
261 |     poolcall[, "st"] <- poolcall[, "st"] + stbin.flag - 1
262 |     poolcall[, "end"] <- poolcall[, "end"] + stbin.flag - 1
263 |     finalcall[, "st_bin"] <- finalcall[, "st_bin"] + stbin.flag - 1
264 |     finalcall[, "ed_bin"] <- finalcall[, "ed_bin"] + stbin.flag - 1
265 |     return(list(poolcall = poolcall, finalcall = finalcall,
266 |         image.orig = image.orig, image.seg = image.seg, iCN = iCN))
267 | }
268 | 
269 | 
270 | 
271 | create_chptsmat <- function(mat1, st_end) {
272 |     st <- st_end[1]
273 |     end <- st_end[2]
274 |     mat1 <- mat1[order(mat1[, 1]), , drop = FALSE]
275 |     if (mat1[1, 1] != st) {
276 |         newchptsmat <- matrix(data = c(st, mat1[1, 1] - 1, mat1[1, ]),
277 |             ncol = 2, byrow = TRUE)
278 |     } else {
279 |         newchptsmat <- t(as.matrix(mat1[1, ]))
280 |     }
281 |     if (nrow(mat1) > 1) {
282 |         for (r in 2:nrow(mat1)) {
283 |             if (mat1[r, 1] != mat1[r - 1, 2] + 1) {
284 |                 newchptsmat <- rbind(newchptsmat, matrix(data =
285 |                     c(mat1[r - 1, 2] + 1, mat1[r, 1] - 1, mat1[r, ]),
286 |                     ncol = 2, byrow = TRUE))
287 |             } else {
288 |                 newchptsmat <- rbind(newchptsmat, matrix(data = mat1[r, ],
289 |                     ncol = 2, byrow = TRUE))
290 |             }
291 |         }
292 |     }
293 |     if (mat1[nrow(mat1), 2] != end) {
294 |         newchptsmat <- rbind(newchptsmat,
295 |             matrix(data = c(mat1[nrow(mat1), 2] + 1, end), nrow = 1, ncol = 2))
296 |     }
297 |     return(newchptsmat)
298 | }
299 | 
300 | 
301 | 
302 | compute_cs_lratio <- function(Y, Yhat, sampname, this.chpts, msgprint = TRUE) {
303 |     Z0 <- NULL
304 |     if (this.chpts[2] - this.chpts[1] < 2) {
305 |         return(list(i = NA, j = NA, Z = NA,
306 |             finalmat = t(as.matrix(this.chpts)), is.nested = 0))
307 |     } else {
308 |         lmax <- nrow(Y) - 1
309 |         for (sampno in seq_len(ncol(Y))) {
310 |             if (msgprint) {
311 |                 message("Calculating scan statistic for sample ",
312 |                     sampno, ": ", sampname[sampno], ".")
313 |             }
314 | 
315 |             y <- Y[, sampno]
316 |             yhat <- Yhat[, sampno]
317 | 
318 |             if (any(yhat < 10)) {
319 |                 if (any(y[yhat < 10] < 10)) {
320 |                     y[yhat < 10] <- 10
321 |                 }
322 |                 yhat[yhat < 10] <- 10
323 |             }
324 | 
325 |             ysum <- sum(y)
326 |             yhatsum <- sum(yhat)
327 |             num <- length(y)
328 |             y <- c(y, rep(0, lmax))
329 |             yhat <- c(yhat, rep(0, lmax))
330 |             i <- rep(seq_len(num), rep(lmax, num))
331 |             j <- rep(seq_len(lmax), num) + i
332 |             yact <- rep(0, length(i))
333 |             lambda <- rep(0, length(i))
334 |             for (k in seq_len(num)) {
335 |                 yact[(lmax * k - (lmax - 1)):(lmax * k)] <- cumsum(
336 |                     y[k:(k + lmax)])[-1]
337 |                 lambda[(lmax * k - (lmax - 1)):(lmax * k)] <- cumsum(
338 |                     yhat[k:(k + lmax)])[-1]
339 |             }
340 |             i <- i[j <= num]
341 |             yact <- yact[j <= num]
342 |             lambda <- lambda[j <= num]
343 |             j <- j[j <= num]
344 | 
345 |             # integer mode
346 |             chat <- round(2 * (yact/lambda))
347 |             idx.noLeft <- which(i == this.chpts[1])
348 |             idx.noRight <- which(j == this.chpts[2])
349 |             chat.L <- rep(NA, length(chat))
350 |             chat.R <- rep(NA, length(chat))
351 |             yact.L <- rep(NA, length(chat))
352 |             yact.R <- rep(NA, length(chat))
353 |             lambda.L <- rep(NA, length(chat))
354 |             lambda.R <- rep(NA, length(chat))
355 |             for (x in seq_len(length(chat))) {
356 |                 if (x %in% idx.noLeft) {
357 |                     yact.L[x] <- 0
358 |                     lambda.L[x] <- 0
359 |                 } else {
360 |                     yact.L[x] <- sum(y[seq_len((i[x] - 1))])
361 |                     lambda.L[x] <- sum(yhat[seq_len((i[x] - 1))])
362 |                 }
363 | 
364 |                 if (x %in% idx.noRight) {
365 |                     yact.R[x] <- 0
366 |                     lambda.R[x] <- 0
367 |                 } else {
368 |                     yact.R[x] <- sum(y[(j[x] + 1):length(y)])
369 |                     lambda.R[x] <- sum(yhat[(j[x] + 1):length(y)])
370 |                 }
371 |             }
372 |             chat.L <- round(2 * (yact.L/lambda.L))
373 |             chat.R <- round(2 * (yact.R/lambda.R))
374 |             chat.L[which(is.na(chat.L))] <- 2
375 |             chat.R[which(is.na(chat.R))] <- 2
376 | 
377 |             lratio.C <- (1 - chat/2) * lambda +
378 |                 log((chat + 1e-04)/2.0001) * yact
379 |             lratio.L <- (1 - chat.L/2) * lambda.L +
380 |                 log((chat.L + 1e-04)/2.0001) * yact.L
381 |             lratio.R <- (1 - chat.R/2) * lambda.R +
382 |                 log((chat.R + 1e-04)/2.0001) * yact.R
383 |             lratio <- lratio.C + lratio.L + lratio.R
384 |             chat[chat > 14] <- 14
385 | 
386 |             Z0 <- cbind(Z0, lratio)
387 |         }
388 |         Z <- rowSums(Z0)
389 | 
390 |         winlag <- this.chpts[1] - 1
391 |         i <- i + winlag
392 |         j <- j + winlag
393 | 
394 |         if (sum(Z > 0) > 0) {
395 |             if (sum(Z > 0) >= 2) {
396 |                 finalmat <- (cbind(i, j, Z))[Z > 0, ]
397 |                 finalmat <- finalmat[order(-finalmat[, 3]), ]
398 |                 s <- 1
399 |                 while (s <= (nrow(finalmat))) {
400 |                     rowstart <- finalmat[s, 1]
401 |                     rowend <- finalmat[s, 2]
402 |                     rowsel <- (finalmat[, 1] <= rowend &
403 |                         finalmat[, 2] >= rowstart)
404 |                     rowsel[s] <- FALSE
405 |                     finalmat <- finalmat[!rowsel, ]
406 |                     if (is.vector(finalmat)) {
407 |                         finalmat <- t(as.matrix(finalmat))
408 |                     }
409 |                     s <- s + 1
410 |                 }
411 |             }
412 |             if (sum(Z > 0) == 1) {
413 |                 finalmat <- (cbind(i, j, Z))[Z > 0, ]
414 |                 finalmat <- t(as.matrix(finalmat))
415 |             }
416 |             finalmat <- finalmat[finalmat[, 2] - finalmat[, 1] > 1, ,
417 |                 drop = FALSE]
418 |             finalmat <- finalmat[finalmat[, 3] > 10, , drop = FALSE]
419 |             finalmat <- round(finalmat, digits = 3)
420 | 
421 |             if (nrow(finalmat) == 0) {
422 |                 is.nested <- 0
423 |             } else if (nrow(finalmat) == 1 & finalmat[1, 1] ==
424 |                 this.chpts[1] & finalmat[1, 2] == this.chpts[2]) {
425 |                 is.nested <- 0
426 |             } else if (nrow(finalmat) > 2 &
427 |                 length(unique(finalmat[, 3])) == 1) {
428 |                 is.nested <- 0
429 |             } else if (max(finalmat[, 2] - finalmat[, 1]) <= 5) {
430 |                 is.nested <- 0
431 |             } else if (nrow(finalmat) == 1 & (this.chpts[2] -
432 |                 this.chpts[1]) -
433 |                 (finalmat[1, 2] - finalmat[1, 1]) == 1) {
434 |                 is.nested <- 0
435 |             } else {
436 |                 is.nested <- 1
437 |             }
438 |         } else {
439 |             finalmat <- NULL
440 |             is.nested <- 0
441 |         }
442 | 
443 |         # Avoid losing the boundaries
444 |         if (!is.null(finalmat) && nrow(finalmat) > 0) {
445 |             min.st <- min(finalmat[, c(1, 2)])
446 |             if (this.chpts[1] < min.st) {
447 |                 idx <- which(i == this.chpts[1] & j == (min.st - 1))
448 |                 idx2 <- which(finalmat[, 1] == min.st)
449 |                 if (length(idx) != 0) {
450 |                     Z.temp <- Z[idx]
451 |                     if (Z.temp > 0) {
452 |                         finalmat[idx2, 1] <- this.chpts[1]
453 |                     } else {
454 |                         finalmat <- rbind(c(this.chpts[1], min.st - 1, 0),
455 |                         finalmat)
456 |                     }
457 |                 } else {
458 |                     finalmat[idx2, 1] <- this.chpts[1]
459 |                 }
460 |             }
461 |             max.ed <- max(finalmat[, c(1, 2)])
462 |             if (this.chpts[2] > max.ed) {
463 |                 idx <- which(i == (max.ed + 1) & j == this.chpts[2])
464 |                 idx2 <- which(finalmat[, 2] == max.ed)
465 |                 if (length(idx) != 0) {
466 |                     Z.temp <- Z[idx]
467 |                     if (Z.temp > 0) {
468 |                         finalmat[idx2, 2] <- this.chpts[2]
469 |                     } else {
470 |                         finalmat <- rbind(finalmat, c(max.ed + 1,
471 |                             this.chpts[2], 0))
472 |                     }
473 |                 } else {
474 |                     finalmat[idx2, 2] <- this.chpts[2]
475 |                 }
476 |             }
477 |         }
478 | 
479 |         if (!is.null(finalmat) && nrow(finalmat) > 1) {
480 |             for (r in seq_len(nrow(finalmat))) {
481 |                 singlepoint <- finalmat[r, 1] - 1
482 |                 sg.idx <- which(finalmat[, 2] == singlepoint - 1)
483 |                 if (length(sg.idx) != 0 && r < sg.idx) {
484 |                     finalmat[sg.idx, 2] <- singlepoint
485 |                 }
486 |             }
487 |             for (r in seq_len(nrow(finalmat))) {
488 |                 singlepoint <- finalmat[r, 2] + 1
489 |                 sg.idx <- which(finalmat[, 1] == singlepoint + 1)
490 |                 if (length(sg.idx) != 0 && r < sg.idx) {
491 |                     finalmat[sg.idx, 1] <- singlepoint
492 |                 }
493 |             }
494 |         }
495 | 
496 |         if (!is.null(finalmat) && nrow(finalmat) > 1) {
497 |             finalmat <- create_chptsmat(finalmat[, c(1, 2),
498 |                 drop = FALSE], this.chpts)
499 |         }
500 |         return(list(i = i, j = j, Z = Z, finalmat = finalmat,
501 |             is.nested = is.nested))
502 |     }
503 | }
504 | 
505 | 
506 | 
507 | search_cs_nested <- function(Y, Yhat, sampname, chptsmat) {
508 |     message("Performing cross-sample nested search... \n")
509 |     nest.list <- vector("list", nrow(chptsmat))
510 |     for (r in seq_len(nrow(chptsmat))) {
511 |         nest.list[[r]] <- compute_cs_lratio(Y[chptsmat[r, 1]:chptsmat[r, 2], ,
512 |             drop = FALSE], Yhat[chptsmat[r, 1]:chptsmat[r,
513 |             2], , drop = FALSE], sampname, chptsmat[r, ], msgprint = FALSE)
514 |     }
515 |     return(nest.list)
516 | }
517 | 
518 | 
519 | 
520 | 
521 | 


--------------------------------------------------------------------------------
/R/normalize_scope.R:
--------------------------------------------------------------------------------
  1 | #' @title Normalization of read depth with latent factors using
  2 | #' Expectation-Maximization algorithm under the case-control setting
  3 | #'
  4 | #' @description Fit a Poisson generalized linear model to normalize
  5 | #' the raw read depth data from single-cell DNA sequencing, with
  6 | #' latent factors under the case-control setting. Model GC content
  7 | #' bias using an expectation-maximization algorithm, which accounts for
  8 | #' the different copy number states.
  9 | #'
 10 | #' @usage
 11 | #' normalize_scope(Y_qc, gc_qc, K, norm_index, T, ploidyInt,
 12 | #'                 beta0, minCountQC = 20)
 13 | #' @param Y_qc read depth matrix after quality control
 14 | #' @param gc_qc vector of GC content for each bin after quality control
 15 | #' @param K Number of latent Poisson factors
 16 | #' @param norm_index indices of normal/diploid cells
 17 | #' @param T a vector of integers indicating number of CNV groups.
 18 | #'  Use BIC to select optimal number of CNV groups. If \code{T = 1},
 19 | #'  assume all reads are from normal regions so that EM algorithm is
 20 | #'  not implemented. Otherwise, we assume there is always a CNV group
 21 | #'  of heterozygous deletion and a group of null region. The rest
 22 | #'  groups are representative of different duplication states.
 23 | #' @param ploidyInt a vector of initialized ploidy return
 24 | #'  from \code{initialize_ploidy}. Users are also allowed to provide
 25 | #'  prior-knowledge ploidies as the input and to manually tune a few 
 26 | #'  cells that have poor fitting
 27 | #' @param beta0 a vector of initialized bin-specific biases
 28 | #'  returned from CODEX2 without latent factors
 29 | #' @param minCountQC the minimum read coverage required for
 30 | #'  normalization and EM fitting. Defalut is \code{20}
 31 | #'
 32 | #' @return A list with components
 33 | #'     \item{Yhat}{A list of normalized read depth matrix with EM}
 34 | #'     \item{alpha.hat}{A list of absolute copy number matrix}
 35 | #'     \item{fGC.hat}{A list of EM estimated GC content bias matrix}
 36 | #'     \item{beta.hat}{A list of EM estimated bin-specific bias vector}
 37 | #'     \item{g.hat}{A list of estimated Poisson latent factor}
 38 | #'     \item{h.hat}{A list of estimated Poisson latent factor}
 39 | #'     \item{AIC}{AIC for model selection}
 40 | #'     \item{BIC}{BIC for model selection}
 41 | #'     \item{RSS}{RSS for model selection}
 42 | #'     \item{K}{Number of latent Poisson factors}
 43 | #'
 44 | #' @examples
 45 | #' Gini <- get_gini(Y_sim)
 46 | #'
 47 | #' # first-pass CODEX2 run with no latent factors
 48 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
 49 | #'                                         gc_qc = ref_sim$gc,
 50 | #'                                         norm_index = which(Gini<=0.12))
 51 | #' Yhat.noK.sim <- normObj.sim$Yhat
 52 | #' beta.hat.noK.sim <- normObj.sim$beta.hat
 53 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat
 54 | #' N.sim <- normObj.sim$N
 55 | #'
 56 | #' # Ploidy initialization
 57 | #' ploidy.sim <- initialize_ploidy(Y = Y_sim, 
 58 | #'                                 Yhat = Yhat.noK.sim, 
 59 | #'                                 ref = ref_sim)
 60 | #' ploidy.sim
 61 | #'
 62 | #' normObj.scope.sim <- normalize_scope(Y_qc = Y_sim, gc_qc = ref_sim$gc,
 63 | #'                                     K = 1, ploidyInt = ploidy.sim,
 64 | #'                                     norm_index = which(Gini<=0.12), T = 1:5,
 65 | #'                                     beta0 = beta.hat.noK.sim)
 66 | #' Yhat.sim <- normObj.scope.sim$Yhat[[which.max(normObj.scope.sim$BIC)]]
 67 | #' fGC.hat.sim <- normObj.scope.sim$fGC.hat[[which.max(normObj.scope.sim$BIC)]]
 68 | #'
 69 | #' @author Rujin Wang \email{rujin@email.unc.edu}
 70 | #' @import stats
 71 | #' @export
 72 | normalize_scope <- function(Y_qc, gc_qc, K, norm_index, T, ploidyInt,
 73 |     beta0, minCountQC = 20) {
 74 |     if (max(K) > length(norm_index))
 75 |         stop("Number of latent Poisson factors K cannot exceed the number of
 76 |             normal samples. ")
 77 |     Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) {
 78 |         !any(x == 0)
 79 |     }), , drop = FALSE]
 80 |     if(dim(Y.nonzero)[1] <= 10){
 81 |         message("Adopt arithmetic mean instead of geometric mean")
 82 |         pseudo.sample <- apply(Y_qc, 1, mean)
 83 |         Ntotal <- apply(apply(Y_qc, 2, function(x) {
 84 |             x/pseudo.sample
 85 |         }), 2, median, na.rm = TRUE)
 86 |     } else{
 87 |         pseudo.sample <- apply(Y.nonzero, 1, function(x) {
 88 |             exp(sum(log(x))/length(x))
 89 |             })
 90 |         Ntotal <- apply(apply(Y.nonzero, 2, function(x) {
 91 |             x/pseudo.sample
 92 |         }), 2, median)
 93 |     }
 94 |     N <- round(Ntotal/median(Ntotal) * median(colSums(Y_qc)))
 95 |     Nmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = N,
 96 |         byrow = TRUE)
 97 | 
 98 |     # Get initialization
 99 |     gcfit.temp <- Y_qc/Nmat/beta0
100 |     alpha0 <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc))
101 |     for (j in seq_len(ncol(alpha0))) {
102 |         loe.fit <- loess(gcfit.temp[, j] ~ gc_qc)
103 |         gcfit.null <- loe.fit$fitted/(ploidyInt[j]/2)
104 |         alpha0[, j] <- gcfit.temp[, j]/gcfit.null * 2
105 |     }
106 | 
107 | 
108 |     Yhat <- vector("list", length(K))
109 |     fGC.hat <- vector("list", length(K))
110 |     alpha.hat <- vector("list", length(K))
111 |     beta.hat <- vector("list", length(K))
112 |     g.hat <- vector("list", length(K))
113 |     h.hat <- vector("list", length(K))
114 |     AIC <- rep(NA, length = length(K))
115 |     BIC <- rep(NA, length = length(K))
116 |     RSS <- rep(NA, length = length(K))
117 | 
118 |     # Initialization
119 |     message("Initialization ...")
120 |     gcfit.temp <- Y_qc/Nmat/beta0
121 |     offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
122 |         data = beta0, byrow = FALSE)
123 |     fhat.temp <- getfGC(gcfit.temp = gcfit.temp, gctemp = gc_qc,
124 |         Y = Y_qc, norm_index = norm_index, offset = offset,
125 |         T = T, alpha = alpha0, minCountQC = minCountQC)
126 |     fhat0 <- fhat.temp$fGC.hat
127 |     alpha0 <- fhat.temp$alpha
128 | 
129 |     for (ki in seq_len(length(K))) {
130 |         k <- K[ki]
131 |         message("Computing normalization with k = ", k,
132 |             " latent factors ...", sep = "")
133 |         message("k = ", k)
134 |         maxiter <- 10
135 |         maxhiter <- 50
136 |         BHTHRESH <- 1e-04
137 |         HHTHRESH <- 1e-05
138 |         iter <- 1
139 |         fhat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = 0)
140 |         betahat <- beta0
141 |         betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
142 |             data = betahat, byrow = FALSE)
143 |         ghat <- matrix(0, nrow = nrow(Y_qc), ncol = k)
144 |         hhat <- matrix(0, nrow = ncol(Y_qc), ncol = k)
145 |         bhdiff <- rep(Inf, maxiter)
146 |         fhdiff <- rep(Inf, maxiter)
147 | 
148 |         betahatlist <- vector("list", maxiter)
149 |         fhatlist <- vector("list", maxiter)
150 |         ghatlist <- vector("list", maxiter)
151 |         hhatlist <- vector("list", maxiter)
152 |         alphahatlist <- vector("list", maxiter)
153 | 
154 |         while (iter <= maxiter) {
155 |             if (iter == 1) {
156 |                 fhatnew <- fhat0
157 |                 alpha <- alpha0
158 |             }
159 |             if (iter > 1) {
160 |                 gcfit.temp <- Y_qc/Nmat/betahat/exp(ghat %*% t(hhat))
161 |                 offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
162 |                     data = betahat, byrow = FALSE) * exp(ghat %*% t(hhat))
163 |                 fhat.temp <- getfGC(gcfit.temp = gcfit.temp, gctemp = gc_qc,
164 |                     Y = Y_qc, norm_index = norm_index,
165 |                     offset = offset, T = T, alpha = alpha0,
166 |                     minCountQC = minCountQC)
167 |                 fhatnew <- fhat.temp$fGC.hat
168 |                 alpha <- fhat.temp$alpha
169 |             }
170 |             fhatnew[fhatnew < quantile(fhatnew, 0.005)] <- quantile(
171 |                 fhatnew, 0.005)
172 |             betahatnew <- apply((Y_qc/(fhatnew * Nmat * exp(ghat %*%
173 |                 t(hhat))))[, norm_index, drop = FALSE], 1, median)
174 |             betahatnew[betahatnew <= 0] <- min(betahatnew[betahatnew > 0])
175 |             bhdiff[iter] <- sum((betahatnew - betahat)^2)/length(betahat)
176 |             fhdiff[iter] <- sum((fhatnew - fhat)^2)/length(fhat)
177 |             if (fhdiff[iter] > min(fhdiff))
178 |                 break
179 |             message("Iteration ", iter, "\t", "beta diff =",
180 |                 signif(bhdiff[iter], 3), "\t", "f(GC) diff =",
181 |                 signif(fhdiff[iter], 3))
182 |             fhat <- fhatnew
183 |             betahat <- betahatnew
184 |             betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
185 |                 data = betahat, byrow = FALSE)
186 |             L <- log(Nmat * fhat * betahatmat * alpha/2)
187 |             logmat <- log(pmax(Y_qc, 1)) - L
188 |             logmat <- logmat - matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
189 |                 data = apply(logmat, 1, mean), byrow = FALSE)
190 |             hhat <- svd(logmat, nu = k, nv = k)$v
191 |             hhatnew <- hhat
192 |             hiter <- 1
193 |             hhdiff <- rep(Inf, maxhiter)
194 |             while (hiter <= maxhiter) {
195 |                 for (s in seq_len(nrow(Y_qc))) {
196 |                     temp <- try(glm(formula = Y_qc[s, norm_index] ~
197 |                         hhat[norm_index, ] -
198 |                         1, offset = L[s, norm_index],
199 |                         family = poisson)$coefficients,
200 |                         silent = TRUE)
201 |                     if (is.character(temp)) {
202 |                         temp <- lm(log(pmax(Y_qc[s, norm_index], 1)) ~
203 |                             hhat[norm_index, ] -
204 |                             1, offset = log(L[s, norm_index]))$coefficients
205 |                     }
206 |                     ghat[s, ] <- temp
207 |                 }
208 |                 # avoid overflow or underflow of the g latent factors
209 |                 ghat[is.na(ghat)] <- 0
210 |                 if (max(ghat) >= 30) {
211 |                     ghat <- apply(ghat, 2, function(z) {
212 |                         z[z > quantile(z, 0.995)] = min(quantile(z,
213 |                         0.995), 30)
214 |                         z
215 |                     })
216 |                 }
217 |                 if (min(ghat) <= -30) {
218 |                     ghat <- apply(ghat, 2, function(z) {
219 |                         z[z < quantile(z, 0.005)] = max(quantile(z,
220 |                         0.005), -30)
221 |                         z
222 |                     })
223 |                 }
224 |                 for (t in seq_len(ncol(Y_qc))) {
225 |                     hhatnew[t, ] <- glm(formula = Y_qc[, t] ~ ghat - 1,
226 |                         offset = L[, t], family = poisson)$coefficients
227 |                 }
228 |                 gh <- ghat %*% t(hhatnew)
229 |                 gh <- scale(gh, center = TRUE, scale = FALSE)
230 |                 hhatnew <- svd(gh, nu = k, nv = k)$v
231 |                 hhdiff[hiter] <- sum((hhatnew - hhat)^2)/length(hhat)
232 |                 message("\t\t\t", "hhat diff =",
233 |                     signif(hhdiff[hiter], 3))
234 |                 hhat <- hhatnew
235 |                 if (hhdiff[hiter] < HHTHRESH)
236 |                     break
237 |                 if (hiter > 10 & (rank(hhdiff))[hiter] <= 3)
238 |                     break
239 |                 hiter <- hiter + 1
240 |             }
241 |             alphahatlist[[iter]] <- alpha
242 |             fhatlist[[iter]] <- fhat
243 |             betahatlist[[iter]] <- betahat
244 |             ghatlist[[iter]] <- ghat
245 |             hhatlist[[iter]] <- hhat
246 |             if (bhdiff[iter] < BHTHRESH)
247 |                 break
248 |             if (iter > 5 & bhdiff[iter] > 1)
249 |                 break
250 |             iter <- iter + 1
251 |         }
252 |         optIter <- which.min(fhdiff)
253 |         message(paste("Stop at Iteration ", optIter, ".", sep = ""))
254 |         alpha <- alphahatlist[[optIter]]
255 |         fhat <- fhatlist[[optIter]]
256 |         betahat <- betahatlist[[optIter]]
257 |         ghat <- ghatlist[[optIter]]
258 |         hhat <- hhatlist[[optIter]]
259 |         betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
260 |             data = betahat, byrow = FALSE)
261 |         Yhat[[ki]] <- pmax(round(fhat * Nmat * betahatmat *
262 |             exp(ghat %*% t(hhat)), 0), 1)
263 |         alpha.hat[[ki]] <- alpha
264 |         fGC.hat[[ki]] <- signif(fhat, 3)
265 |         beta.hat[[ki]] <- signif(betahat, 3)
266 |         h.hat[[ki]] <- signif(hhat, 3)
267 |         g.hat[[ki]] <- signif(ghat, 3)
268 |         Yhat.temp <- Yhat[[ki]] * alpha/2
269 |         AIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) -
270 |             Yhat.temp) - 2 * (length(ghat) + length(hhat))
271 |         BIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) -
272 |             Yhat.temp) - (length(ghat) + length(hhat)) *
273 |             log(length(Y_qc))
274 |         RSS[ki] <- sum((Y_qc - Yhat.temp)^2/length(Y_qc))
275 |         message("AIC", k, " = ", round(AIC[ki], 3))
276 |         message("BIC", k, " = ", round(BIC[ki], 3))
277 |         message("RSS", k, " = ", round(RSS[ki], 3), "\n")
278 |     }
279 |     list(Yhat = Yhat, alpha.hat = alpha.hat, fGC.hat = fGC.hat,
280 |         beta.hat = beta.hat, g.hat = g.hat, h.hat = h.hat,
281 |         AIC = AIC, BIC = BIC, RSS = RSS, K = K)
282 | }
283 | 
284 | 
285 | 
286 | getfGCj <- function(gcfit.tempj, gctemp, Yj, offsetj, T, draw.plot = NULL,
287 |     alphaj, minCountQC) {
288 |     alphaj <- pmax(1, round(alphaj))
289 |     if (is.null(draw.plot)) {
290 |         draw.plot <- FALSE
291 |     }
292 |     loglik = BIC = rep(NA, length(T))
293 | 
294 |     fGCi.obj <- vector("list", length(T))
295 |     Z.obj <- vector("list", length(T))
296 |     vec_pi.obj <- vector("list", length(T))
297 |     fGCi <- fitGC(gctemp, gcfit.tempj)
298 |     resid <- abs(gcfit.tempj - fGCi)
299 | 
300 |     bin.filter <- which(resid > (median(resid) +
301 |         5 * mad(resid)) | Yj < minCountQC)
302 |     if (length(bin.filter) == 0) {
303 |         bin.filter <- which.max(gcfit.tempj)
304 |     }
305 |     if (draw.plot) {
306 |         par(mfrow = c(5, 2))
307 |         smoothScatter(gctemp[-bin.filter], gcfit.tempj[-bin.filter],
308 |             main = "Original", xlab = "GC content",
309 |             ylab = "Y/beta/N/exp(gxh)")
310 |     }
311 |     for (Ti in T) {
312 |         if (Ti == 1) {
313 |             Z <- matrix(nrow = length(gcfit.tempj), ncol = Ti,
314 |                 data = 1/Ti)
315 |             vec_pi <- 1
316 |             loe.fit.temp <- loess(gcfit.tempj[-bin.filter] ~
317 |                 gctemp[-bin.filter])
318 |             fGCi <- predict(loe.fit.temp, newdata = gctemp,
319 |                 se = TRUE)$fit
320 |             temp <- min(fGCi[!is.na(fGCi) & fGCi > 0])
321 |             fGCi[fGCi <= 0 | is.na(fGCi)] <- temp
322 |         }
323 |         if (Ti >= 2) {
324 |             Z <- matrix(nrow = length(gcfit.tempj), ncol = Ti, data = 0)
325 |             mintemp <- pmin(Ti, alphaj)
326 |             Z[cbind(seq_len(nrow(Z)), mintemp)] <- 1
327 |             vec_pi <- colSums(Z)/nrow(Z)
328 | 
329 |             loe.fit.temp <- loess((gcfit.tempj/(Z %*%
330 |                 as.matrix(seq_len(Ti)/2)))[-bin.filter] ~
331 |                 gctemp[-bin.filter])
332 |             fGCi <- predict(loe.fit.temp, newdata = gctemp, se = TRUE)$fit
333 |             temp <- min(fGCi[!is.na(fGCi) & fGCi > 0])
334 |             fGCi[fGCi <= 0 | is.na(fGCi)] <- temp
335 | 
336 |             diff.GC <- Inf
337 |             diff.Z <- Inf
338 |             iter <- 1
339 |             while (iter <= 3 | diff.GC > 5e-06 | diff.Z > 0.005) {
340 |                 Mtemp <- Mstep(Z, gcfit.tempj, gctemp)
341 |                 vec_pi.new <- Mtemp$vec_pi
342 |                 fGCi.new <- Mtemp$fGCi
343 |                 Z.new <- Estep(fGCi.new, vec_pi.new, Yj, offsetj)
344 |                 diff.GC <- sum((fGCi - fGCi.new)^2)/length(fGCi)
345 |                 diff.Z <- sum((Z.new - Z)^2)/length(Z)
346 |                 vec_pi <- vec_pi.new
347 |                 Z <- Z.new
348 |                 fGCi <- fGCi.new
349 |                 iter <- iter + 1
350 |                 if (iter >= 50)
351 |                     break
352 |             }
353 |         }
354 | 
355 |         if (Ti == 1) {
356 |             loe.fit.plot <- loess(gcfit.tempj ~ gctemp)
357 |             fGCi.plot <- loe.fit.plot$fitted
358 |             temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0])
359 |             fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp
360 |             df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df
361 | 
362 |             loglik[which(T == Ti)] <- sum(dpois(Yj[-bin.filter],
363 |                 lambda = (offsetj * fGCi)[-bin.filter], log = TRUE))
364 |             BIC[which(T == Ti)] <- 2 * loglik[which(T == Ti)] -
365 |                 (length(gcfit.tempj) - df) * log(length(gcfit.tempj))
366 |         } else {
367 |             loe.fit.plot <- loess((gcfit.tempj/(Z %*%
368 |                 as.matrix(seq_len(Ti)/2))) ~ gctemp)
369 |             fGCi.plot <- loe.fit.plot$fitted
370 |             temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0])
371 |             fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp
372 |             df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df
373 | 
374 |             loglik[which(T == Ti)] <- sum(dpois(Yj[-bin.filter],
375 |                 lambda = (offsetj * fGCi *
376 |                 (Z %*% as.matrix(seq_len(Ti)/2)))[-bin.filter],
377 |                 log = TRUE))
378 |             BIC[which(T == Ti)] <- 2 * loglik[which(T == Ti)] -
379 |                 (length(gcfit.tempj) - df + Ti - 1) *
380 |                 log(length(gcfit.tempj))
381 |         }
382 | 
383 |         if (draw.plot) {
384 |             smoothScatter(gctemp[-bin.filter], gcfit.tempj[-bin.filter],
385 |                 xlab = "GC content", ylab = "Y/beta/N/exp(gxh)",
386 |                 nrpoints = 0, main = paste("T =", Ti))
387 |             if (Ti == 1) {
388 |                 points(gctemp[order(gctemp)], fGCi.plot[order(gctemp)],
389 |                     lty = 2, col = 2, type = "l", lwd = 2)
390 |                 points(gctemp, gcfit.tempj, cex = 0.4,
391 |                     col = 2, pch = 16)
392 |             } else {
393 |                 for (k in seq_len(Ti)) {
394 |                     points(gctemp[order(gctemp)],
395 |                         fGCi.plot[order(gctemp)] * k/2,
396 |                         lty = 2, col = k, type = "l", lwd = 2)
397 |                     points(gctemp[which((round(Z))[, k] == 1)],
398 |                         (gcfit.tempj)[which((round(Z))[, k] == 1)],
399 |                         cex = 0.4, col = k, pch = 16)
400 |                 }
401 |             }
402 |         }
403 | 
404 |         fGCi.obj[[which(T == Ti)]] <- fGCi
405 |         Z.obj[[which(T == Ti)]] <- Z
406 |         vec_pi.obj[[which(T == Ti)]] <- vec_pi
407 |     }
408 |     if (draw.plot) {
409 |         plot(T, loglik, type = "b", xlab = "T", ylab = "loglik",
410 |             main = "Log likelihood")
411 |         abline(v = which.max(loglik), lty = 2)
412 |         plot(T, BIC, type = "b", xlab = "T", ylab = "BIC", main = "BIC")
413 |         abline(v = which.max(BIC), lty = 2)
414 |         par(mfrow = c(1, 1))
415 |     }
416 |     return(list(fGCi.obj = fGCi.obj, Z.obj = Z.obj,
417 |         vec_pi.obj = vec_pi.obj, bin.filter = bin.filter,
418 |         loglik = loglik, BIC = BIC))
419 | }
420 | 
421 | 
422 | 
423 | getfGC <- function(gcfit.temp, gctemp, Y, norm_index, offset, T,
424 |     alpha, minCountQC) {
425 |     fGC.hat <- matrix(ncol = ncol(Y), nrow = nrow(Y))
426 |     for (j in seq_len(ncol(Y))) {
427 |         cat(j, "\t")
428 |         if (j %in% norm_index) {
429 |             alpha[, j] <- 2
430 |             loe.fit <- loess(gcfit.temp[, j] ~ gctemp)
431 |             fGC.hat[, j] <- loe.fit$fitted
432 |         } else {
433 |             fGCj <- getfGCj(gcfit.tempj = gcfit.temp[, j],
434 |                 gctemp = gctemp, Yj = Y[, j], offsetj = offset[, j],
435 |                 T = T, draw.plot = FALSE, alphaj = alpha[, j],
436 |                 minCountQC = minCountQC)
437 |             if (which.max(fGCj$BIC) == 1) {
438 |                 alpha[, j] <- 2
439 |             } else {
440 |                 alpha[, j] <- apply(fGCj$Z.obj[[which.max(fGCj$BIC)]],
441 |                     1, which.max)
442 |             }
443 |             fGC.hat[, j] <- fGCj$fGCi.obj[[which.max(fGCj$BIC)]]
444 |         }
445 |     }
446 |     return(list(fGC.hat = fGC.hat, alpha = alpha))
447 | }
448 | 
449 | 
450 | Estep <- function(fGCi, vec_pi, Yj, offsetj) {
451 |     P <- matrix(nrow = length(fGCi), ncol = length(vec_pi))
452 |     vec_pi[vec_pi == 0] <- 1e-100
453 |     lambda <- matrix(nrow = nrow(P), ncol = ncol(P),
454 |         data = offsetj * fGCi) * matrix(nrow = nrow(P), ncol = ncol(P),
455 |         data = seq_len(length(vec_pi))/2, byrow = TRUE)
456 |     P <- dpois(matrix(nrow = nrow(P), ncol = ncol(P), data = Yj),
457 |         lambda = lambda, log = TRUE) + matrix(nrow = nrow(P),
458 |         ncol = ncol(P), data = log(vec_pi), byrow = TRUE)
459 |     Z <- matrix(nrow = length(fGCi), ncol = length(vec_pi))
460 |     for (k in seq_len(length(vec_pi))) {
461 |         Z[, k] <- 1/(1 + apply(exp(apply(P, 2, function(x) {
462 |             x - P[, k]
463 |         })[, -k, drop = FALSE]), 1, sum))
464 |     }
465 |     return(Z)
466 | }
467 | 
468 | 
469 | 
470 | Mstep <- function(Z, gcfit.tempj, gctemp) {
471 |     gcfit.temp <- gcfit.tempj/(Z %*% as.matrix(seq_len(ncol(Z))/2))
472 |     fGCi <- fitGC(gctemp, gcfit.temp)
473 |     vec_pi <- colSums(Z)/nrow(Z)
474 |     return(list(vec_pi = vec_pi, fGCi = fGCi))
475 | }
476 | 
477 | 
478 | 
479 | fitGC <- function(gctemp, gcfit.temp) {
480 |     loe.fit <- loess(gcfit.temp ~ gctemp)
481 |     fGCi <- loe.fit$fitted
482 |     temp <- min(fGCi[fGCi > 0])
483 |     fGCi[fGCi <= 0] <- temp
484 |     return(fGCi)
485 | }
486 | 


--------------------------------------------------------------------------------
/R/normalize_scope_group.R:
--------------------------------------------------------------------------------
  1 | #' @title Group-wise normalization of read depth with latent factors using
  2 | #' Expectation-Maximization algorithm and shared clonal memberships
  3 | #'
  4 | #' @description Fit a Poisson generalized linear model to normalize
  5 | #' the raw read depth data from single-cell DNA sequencing, with
  6 | #' latent factors and shared clonal memberships. Model GC content
  7 | #' bias using an expectation-maximization algorithm, which accounts for
  8 | #' clonal specific copy number states.
  9 | #'
 10 | #' @usage
 11 | #' normalize_scope_group(Y_qc, gc_qc, K, norm_index, groups, T, 
 12 | #'                         ploidyInt, beta0, minCountQC = 20)
 13 | #' @param Y_qc read depth matrix after quality control
 14 | #' @param gc_qc vector of GC content for each bin after quality control
 15 | #' @param K Number of latent Poisson factors
 16 | #' @param norm_index indices of normal/diploid cells using group/clone
 17 | #' labels
 18 | #' @param groups clonal membership labels for each cell
 19 | #' @param T a vector of integers indicating number of CNV groups.
 20 | #'  Use BIC to select optimal number of CNV groups. If \code{T = 1},
 21 | #'  assume all reads are from normal regions so that EM algorithm is
 22 | #'  not implemented. Otherwise, we assume there is always a CNV group
 23 | #'  of heterozygous deletion and a group of null region. The rest
 24 | #'  groups are representative of different duplication states.
 25 | #' @param ploidyInt a vector of group-wise initialized ploidy return
 26 | #'  from \code{initialize_ploidy_group}. Users are also allowed to 
 27 | #'  provide prior-knowledge ploidies as the input and to manually 
 28 | #'  tune a few cells/clones that have poor fitting
 29 | #' @param beta0 a vector of initialized bin-specific biases
 30 | #'  returned from CODEX2 without latent factors
 31 | #' @param minCountQC the minimum read coverage required for
 32 | #'  normalization and EM fitting. Defalut is \code{20}
 33 | #'
 34 | #' @return A list with components
 35 | #'     \item{Yhat}{A list of normalized read depth matrix with EM}
 36 | #'     \item{alpha.hat}{A list of absolute copy number matrix}
 37 | #'     \item{fGC.hat}{A list of EM estimated GC content bias matrix}
 38 | #'     \item{beta.hat}{A list of EM estimated bin-specific bias vector}
 39 | #'     \item{g.hat}{A list of estimated Poisson latent factor}
 40 | #'     \item{h.hat}{A list of estimated Poisson latent factor}
 41 | #'     \item{AIC}{AIC for model selection}
 42 | #'     \item{BIC}{BIC for model selection}
 43 | #'     \item{RSS}{RSS for model selection}
 44 | #'     \item{K}{Number of latent Poisson factors}
 45 | #'
 46 | #' @examples
 47 | #' Gini <- get_gini(Y_sim)
 48 | #'
 49 | #' # first-pass CODEX2 run with no latent factors
 50 | #' normObj.sim <- normalize_codex2_ns_noK(Y_qc = Y_sim,
 51 | #'                                         gc_qc = ref_sim$gc,
 52 | #'                                         norm_index = which(Gini<=0.12))
 53 | #' Yhat.noK.sim <- normObj.sim$Yhat
 54 | #' beta.hat.noK.sim <- normObj.sim$beta.hat
 55 | #' fGC.hat.noK.sim <- normObj.sim$fGC.hat
 56 | #' N.sim <- normObj.sim$N
 57 | #'
 58 | #' # Group-wise ploidy initialization
 59 | #' clones <- c("normal", "tumor1", "normal", "tumor1", "tumor1")
 60 | #' ploidy.sim.group <- initialize_ploidy_group(Y = Y_sim, Yhat = Yhat.noK.sim, 
 61 | #'                                 ref = ref_sim, groups = clones)
 62 | #' ploidy.sim.group
 63 | #'
 64 | #' normObj.scope.sim.group <- normalize_scope_group(Y_qc = Y_sim, 
 65 | #'                                     gc_qc = ref_sim$gc,
 66 | #'                                     K = 1, ploidyInt = ploidy.sim.group,
 67 | #'                                     norm_index = which(clones=="normal"), 
 68 | #'                                     groups = clones, 
 69 | #'                                     T = 1:5,
 70 | #'                                     beta0 = beta.hat.noK.sim)
 71 | #' Yhat.sim.group <- normObj.scope.sim.group$Yhat[[which.max(
 72 | #'                                     normObj.scope.sim.group$BIC)]]
 73 | #' fGC.hat.sim.group <- normObj.scope.sim.group$fGC.hat[[which.max(
 74 | #'                                     normObj.scope.sim.group$BIC)]]
 75 | #'
 76 | #' @author Rujin Wang \email{rujin@email.unc.edu}
 77 | #' @import stats
 78 | #' @export
 79 | normalize_scope_group <- function(Y_qc, gc_qc, K, norm_index, groups, T, 
 80 |                                     ploidyInt, beta0, minCountQC = 20) {
 81 |     if (max(K) > length(norm_index))
 82 |         stop("Number of latent Poisson factors K cannot exceed the number of
 83 |         normal samples. ")
 84 |     Y.nonzero <- Y_qc[apply(Y_qc, 1, function(x) {
 85 |         !any(x == 0)
 86 |     }), , drop = FALSE]
 87 |     if(dim(Y.nonzero)[1] <= 10){
 88 |         message("Adopt arithmetic mean instead of geometric mean")
 89 |         pseudo.sample <- apply(Y_qc, 1, mean)
 90 |         Ntotal <- apply(apply(Y_qc, 2, function(x) {
 91 |             x/pseudo.sample
 92 |             }), 2, median, na.rm = TRUE)
 93 |     } else{
 94 |         pseudo.sample <- apply(Y.nonzero, 1, function(x) {
 95 |             exp(sum(log(x))/length(x))
 96 |         })
 97 |         Ntotal <- apply(apply(Y.nonzero, 2, function(x) {
 98 |             x/pseudo.sample
 99 |         }), 2, median)
100 |     }
101 |     N <- round(Ntotal/median(Ntotal) * median(colSums(Y_qc)))
102 |     Nmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = N, 
103 |                     byrow = TRUE)
104 | 
105 |     # Get initialization
106 |     gcfit.temp <- Y_qc/Nmat/beta0
107 |     alpha0 <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc))
108 |     for (j in seq_len(ncol(alpha0))) {
109 |         loe.fit <- loess(gcfit.temp[, j] ~ gc_qc)
110 |         gcfit.null <- loe.fit$fitted/(ploidyInt[j]/2)
111 |         alpha0[, j] <- gcfit.temp[, j]/gcfit.null * 2
112 |     }
113 |     
114 |     Yhat <- vector("list", length(K))
115 |     fGC.hat <- vector("list", length(K))
116 |     alpha.hat <- vector("list", length(K))
117 |     beta.hat <- vector("list", length(K))
118 |     g.hat <- vector("list", length(K))
119 |     h.hat <- vector("list", length(K))
120 |     AIC <- rep(NA, length = length(K))
121 |     BIC <- rep(NA, length = length(K))
122 |     RSS <- rep(NA, length = length(K))
123 | 
124 |     # Initialization
125 |     message("Initialization ...")
126 |     gcfit.temp <- Y_qc/Nmat/beta0
127 |     offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
128 |                             data = beta0, byrow = FALSE)
129 | 
130 |     fhat.temp <- getfGC2(gcfit.temp = gcfit.temp, gctemp = gc_qc,
131 |                         Y = Y_qc, norm_index = norm_index, groups = groups, 
132 |                         offset = offset,
133 |                         T = T, alpha = alpha0, minCountQC = minCountQC)
134 |     fhat0 <- fhat.temp$fGC.hat
135 |     alpha0 <- fhat.temp$alpha
136 | 
137 |     for (ki in seq_len(length(K))) {
138 |         k <- K[ki]
139 |         message("Computing normalization with k = ", k,
140 |                 " latent factors ...", sep = "")
141 |         message("k = ", k)
142 |         maxiter <- 10
143 |         maxhiter <- 50
144 |         BHTHRESH <- 1e-04
145 |         HHTHRESH <- 1e-05
146 |         iter <- 1
147 |         fhat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc), data = 0)
148 |         betahat <- beta0
149 |         betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
150 |                             data = betahat, byrow = FALSE)
151 |         ghat <- matrix(0, nrow = nrow(Y_qc), ncol = k)
152 |         hhat <- matrix(0, nrow = ncol(Y_qc), ncol = k)
153 |         bhdiff <- rep(Inf, maxiter)
154 |         fhdiff <- rep(Inf, maxiter)
155 |     
156 |         betahatlist <- vector("list", maxiter)
157 |         fhatlist <- vector("list", maxiter)
158 |         ghatlist <- vector("list", maxiter)
159 |         hhatlist <- vector("list", maxiter)
160 |         alphahatlist <- vector("list", maxiter)
161 |     
162 |         while (iter <= maxiter) {
163 |             if (iter == 1) {
164 |                 fhatnew <- fhat0
165 |                 alpha <- alpha0
166 |             }
167 |             if (iter > 1) {
168 |                 gcfit.temp <- Y_qc/Nmat/betahat/exp(ghat %*% t(hhat))
169 |                 offset <- Nmat * matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
170 |                         data = betahat, byrow = FALSE) * exp(ghat %*% t(hhat))
171 |                 fhat.temp <- getfGC2(gcfit.temp = gcfit.temp, gctemp = gc_qc,
172 |                             Y = Y_qc, norm_index = norm_index, groups = groups, 
173 |                             offset = offset, T = T, alpha = alpha0,
174 |                             minCountQC = minCountQC)
175 |                 fhatnew <- fhat.temp$fGC.hat
176 |                 alpha <- fhat.temp$alpha
177 |             }
178 |             fhatnew[fhatnew < quantile(fhatnew, 0.005)] <- quantile(
179 |                 fhatnew, 0.005)
180 |             betahatnew <- apply((Y_qc/(fhatnew * Nmat * exp(ghat %*%
181 |                 t(hhat))))[, norm_index, drop = FALSE], 1, median)
182 |             betahatnew[betahatnew <= 0] <- min(betahatnew[betahatnew > 0])
183 |             bhdiff[iter] <- sum((betahatnew - betahat)^2)/length(betahat)
184 |             fhdiff[iter] <- sum((fhatnew - fhat)^2)/length(fhat)
185 |             if (fhdiff[iter] > min(fhdiff))
186 |                 break
187 |             message("Iteration ", iter, "\t", "beta diff =",
188 |                 signif(bhdiff[iter], 3), "\t", "f(GC) diff =",
189 |                 signif(fhdiff[iter], 3))
190 |             fhat <- fhatnew
191 |             betahat <- betahatnew
192 |             betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
193 |                         data = betahat, byrow = FALSE)
194 |             L <- log(Nmat * fhat * betahatmat * alpha/2)
195 |             logmat <- log(pmax(Y_qc, 1)) - L
196 |             logmat <- logmat - matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
197 |                         data = apply(logmat, 1, mean), byrow = FALSE)
198 |             hhat <- svd(logmat, nu = k, nv = k)$v
199 |             hhatnew <- hhat
200 |             hiter <- 1
201 |             hhdiff <- rep(Inf, maxhiter)
202 |             while (hiter <= maxhiter) {
203 |                 for (s in seq_len(nrow(Y_qc))) {
204 |                     temp <- try(glm(formula = Y_qc[s, norm_index] ~ 
205 |                                 hhat[norm_index, ] -
206 |                                 1, offset = L[s, norm_index],
207 |                                 family = poisson)$coefficients,
208 |                                 silent = TRUE)
209 |                     if (is.character(temp)) {
210 |                         temp <- lm(log(pmax(Y_qc[s, norm_index], 1)) ~
211 |                             hhat[norm_index, ] -
212 |                             1, offset = log(L[s, norm_index]))$coefficients
213 |                     }
214 |                     ghat[s, ] <- temp
215 |                 }
216 |                 # avoid overflow or underflow of the g latent factors
217 |                 ghat[is.na(ghat)] <- 0
218 |                 if (max(ghat) >= 30) {
219 |                     ghat <- apply(ghat, 2, function(z) {
220 |                         z[z > quantile(z, 0.995)] = min(quantile(z, 0.995), 30)
221 |                         z
222 |                         })
223 |                 }
224 |                 if (min(ghat) <= -30) {
225 |                     ghat <- apply(ghat, 2, function(z) {
226 |                         z[z < quantile(z, 0.005)] = max(quantile(z,
227 |                         0.005), -30)
228 |                         z
229 |                         })
230 |                 }
231 |                 for (t in seq_len(ncol(Y_qc))) {
232 |                     hhatnew[t, ] <- glm(formula = Y_qc[, t] ~ ghat - 1,
233 |                         offset = L[, t], family = poisson)$coefficients
234 |                 }
235 |                 gh <- ghat %*% t(hhatnew)
236 |                 gh <- scale(gh, center = TRUE, scale = FALSE)
237 |                 hhatnew <- svd(gh, nu = k, nv = k)$v
238 |                 hhdiff[hiter] <- sum((hhatnew - hhat)^2)/length(hhat)
239 |                 message("\t\t\t", "hhat diff =",
240 |                     signif(hhdiff[hiter], 3))
241 |                 hhat <- hhatnew
242 |                 if (hhdiff[hiter] < HHTHRESH)
243 |                     break
244 |                 if (hiter > 10 & (rank(hhdiff))[hiter] <= 3)
245 |                     break
246 |                 hiter <- hiter + 1
247 |             }
248 |         
249 |             alphahatlist[[iter]] <- alpha
250 |             fhatlist[[iter]] <- fhat
251 |             betahatlist[[iter]] <- betahat
252 |             ghatlist[[iter]] <- ghat
253 |             hhatlist[[iter]] <- hhat
254 |             if (bhdiff[iter] < BHTHRESH)
255 |                 break
256 |             if (iter > 5 & bhdiff[iter] > 1)
257 |                 break
258 |             iter <- iter + 1
259 |         }
260 |         optIter <- which.min(fhdiff)
261 |         message(paste("Stop at Iteration ", optIter, ".", sep = ""))
262 |         alpha <- alphahatlist[[optIter]]
263 |         fhat <- fhatlist[[optIter]]
264 |         betahat <- betahatlist[[optIter]]
265 |         ghat <- ghatlist[[optIter]]
266 |         hhat <- hhatlist[[optIter]]
267 |         betahatmat <- matrix(nrow = nrow(Y_qc), ncol = ncol(Y_qc),
268 |                         data = betahat, byrow = FALSE)
269 |         Yhat[[ki]] <- pmax(round(fhat * Nmat * betahatmat *
270 |                         exp(ghat %*% t(hhat)), 0), 1)
271 |         alpha.hat[[ki]] <- alpha
272 |         fGC.hat[[ki]] <- signif(fhat, 3)
273 |         beta.hat[[ki]] <- signif(betahat, 3)
274 |         h.hat[[ki]] <- signif(hhat, 3)
275 |         g.hat[[ki]] <- signif(ghat, 3)
276 |         Yhat.temp <- Yhat[[ki]] * alpha/2
277 |         AIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) -
278 |                 Yhat.temp) - 2 * (length(ghat) + length(hhat))
279 |         BIC[ki] <- 2 * sum(Y_qc * log(pmax(Yhat.temp, 1)) -
280 |                 Yhat.temp) - (length(ghat) + length(hhat)) * 
281 |                 log(length(Y_qc))
282 |         RSS[ki] <- sum((Y_qc - Yhat.temp)^2/length(Y_qc))
283 |         message("AIC", k, " = ", round(AIC[ki], 3))
284 |         message("BIC", k, " = ", round(BIC[ki], 3))
285 |         message("RSS", k, " = ", round(RSS[ki], 3), "\n")
286 |     }
287 |     list(Yhat = Yhat, alpha.hat = alpha.hat, fGC.hat = fGC.hat,
288 |         beta.hat = beta.hat, g.hat = g.hat, h.hat = h.hat,
289 |         AIC = AIC, BIC = BIC, RSS = RSS, K = K)
290 | }
291 | 
292 | getfGCG <- function(gcfit.tempg, gctemp, Yg, offsetg, T, draw.plot = NULL,
293 |                     alphag, minCountQC) {
294 |     alphag <- pmax(1, round(alphag))
295 |     if (is.null(draw.plot)) {
296 |         draw.plot <- FALSE
297 |     }
298 |     loglik <- rep(NA, length(T))
299 |     BIC <- rep(NA, length(T))
300 |     fGCi.obj <- vector("list", length(T))
301 |     fGCg.plot <- vector("list", length(T))
302 |     Z.obj <- vector("list", length(T))
303 |     vec_pi.obj <- vector("list", length(T))
304 | 
305 |     bin.filter.list = vector("list", length = ncol(gcfit.tempg))
306 |     for (j in seq_len(ncol(gcfit.tempg))) {
307 |         fGCi <- fitGC(gctemp, gcfit.tempg[,j])
308 |         resid <- abs(gcfit.tempg[,j] - fGCi)
309 |         bin.filter.list[[j]] <- which(resid > (median(resid) + 5 * 
310 |             mad(resid)) | Yg[,j] < minCountQC)
311 |         if (length(bin.filter.list[[j]]) == 0) {
312 |             bin.filter.list[[j]] <- which.max(gcfit.tempg[,j])
313 |         }
314 |     }
315 |     bin.filter = sort(unique(unlist(bin.filter.list)))
316 | 
317 |     for (Ti in T) {
318 |         cat(Ti, "\t")
319 |         if (Ti == 1) {
320 |             fGCg <- matrix(NA, ncol = ncol(gcfit.tempg), 
321 |                 nrow = nrow(gcfit.tempg))
322 |             Z <- matrix(nrow = nrow(gcfit.tempg), ncol = Ti,
323 |                 data = 1/Ti)
324 |             vec_pi <- 1
325 |             for (j in seq_len(ncol(gcfit.tempg))) {
326 |                 loe.fit.temp <- loess(gcfit.tempg[-bin.filter,j] ~
327 |                         gctemp[-bin.filter])
328 |                 fGCi <- predict(loe.fit.temp, newdata = gctemp,
329 |                         se = TRUE)$fit
330 |                 temp <- min(fGCi[!is.na(fGCi) & fGCi > 0])
331 |                 fGCi[fGCi <= 0 | is.na(fGCi)] <- temp
332 |                 fGCg[,j] = fGCi
333 |             }
334 |         }
335 |         if (Ti >= 2) {
336 |             Z <- matrix(nrow = nrow(gcfit.tempg), ncol = Ti, data = 0)
337 |             mintemp <- pmin(Ti, alphag)
338 |             Z[cbind(seq_len(nrow(Z)), mintemp)] <- 1
339 |             vec_pi <- colSums(Z)/nrow(Z)
340 | 
341 |             fGCg <- matrix(NA, ncol = ncol(gcfit.tempg), 
342 |                 nrow = nrow(gcfit.tempg))
343 |             for (j in seq_len(ncol(gcfit.tempg))) {
344 |                 loe.fit.temp <- loess((gcfit.tempg[,j]/(Z %*% 
345 |                                 as.matrix(seq_len(Ti)/2)))[-bin.filter] ~
346 |                                 gctemp[-bin.filter])
347 |                 fGCi <- predict(loe.fit.temp, newdata = gctemp, se = TRUE)$fit
348 |                 temp <- min(fGCi[!is.na(fGCi) & fGCi > 0])
349 |                 fGCi[fGCi <= 0 | is.na(fGCi)] <- temp
350 |                 fGCg[,j] <- fGCi
351 |             }
352 | 
353 |             diff.GC <- Inf
354 |             diff.Z <- Inf
355 |             iter <- 1
356 |             while (iter <= 3 | diff.GC > 5e-06 | diff.Z > 0.005) {
357 |                 Mtemp <- MstepG(Z, gcfit.tempg, gctemp)
358 |                 vec_pi.new <- Mtemp$vec_pig
359 |                 fGCi.new <- Mtemp$fGCg
360 |                 Z.new <- EstepG(fGCi.new, vec_pi.new, Yg, offsetg)
361 |                 diff.GC <- sum((fGCg - fGCi.new)^2)/length(fGCg)
362 |                 diff.Z <- sum((Z.new - Z)^2)/length(Z)
363 |                 vec_pi <- vec_pi.new
364 |                 Z <- Z.new
365 |                 fGCg <- fGCi.new
366 |                 iter <- iter + 1
367 |                 if (iter >= 50)
368 |                     break
369 |             }
370 |         }
371 |     
372 |         fGCg.plot[[which(T == Ti)]] <- matrix(NA, ncol = ncol(gcfit.tempg), 
373 |                 nrow = nrow(gcfit.tempg))
374 |         if (Ti == 1) {
375 |             loglik.temp <- rep(NA, ncol(gcfit.tempg))
376 |             BIC.temp <- rep(NA, ncol(gcfit.tempg))
377 |             for (j in seq_len(ncol(gcfit.tempg))) {
378 |                 loe.fit.plot <- loess(gcfit.tempg[,j] ~ gctemp)
379 |                 fGCi.plot <- loe.fit.plot$fitted
380 |                 temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0])
381 |                 fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp
382 |                 df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df
383 |                 fGCg.plot[[which(T == Ti)]][,j] = fGCi.plot
384 |         
385 |                 loglik.temp[j] <- sum(dpois(Yg[-bin.filter,j], 
386 |                         lambda = (offsetg[,j] * fGCg[,j])[-bin.filter], 
387 |                         log = TRUE))
388 |                 BIC.temp[j] <- 2 * loglik.temp[j] - (nrow(gcfit.tempg) - df) * 
389 |                         log(nrow(gcfit.tempg))
390 |             }
391 |             loglik[which(T == Ti)] <- sum(loglik.temp)
392 |             BIC[which(T == Ti)] <- sum(BIC.temp)
393 |         } else {
394 |             loglik.temp <- rep(NA, ncol(gcfit.tempg)) 
395 |             BIC.temp <- rep(NA, ncol(gcfit.tempg))
396 |                 for (j in seq_len(ncol(gcfit.tempg))) {
397 |                     loe.fit.plot <- loess((gcfit.tempg[,j]/(Z %*%
398 |                         as.matrix(seq_len(Ti)/2))) ~ gctemp)
399 |                     fGCi.plot <- loe.fit.plot$fitted
400 |                     temp <- min(fGCi.plot[!is.na(fGCi.plot) & fGCi.plot > 0])
401 |                     fGCi.plot[fGCi.plot <= 0 | is.na(fGCi.plot)] <- temp
402 |                     df <- predict(loe.fit.plot, newdata = gctemp, se = TRUE)$df
403 |                     fGCg.plot[[which(T == Ti)]][,j] <- fGCi.plot
404 |         
405 |                     loglik.temp[j] <- sum(dpois(Yg[-bin.filter,j], lambda = 
406 |                             (offsetg[,j] * fGCg[,j] * (Z %*% 
407 |                             as.matrix(seq_len(Ti)/2)))[-bin.filter], 
408 |                             log = TRUE))
409 |                     BIC.temp[j] <- 2 * loglik.temp[j] - (nrow(gcfit.tempg) - 
410 |                             df + Ti - 1) * log(nrow(gcfit.tempg))
411 |                 }
412 |             loglik[which(T == Ti)] <- sum(loglik.temp)
413 |             BIC[which(T == Ti)] <- sum(BIC.temp)
414 |         }
415 |     
416 |         fGCi.obj[[which(T == Ti)]] <- fGCg
417 |         Z.obj[[which(T == Ti)]] <- Z
418 |         vec_pi.obj[[which(T == Ti)]] <- vec_pi
419 |     }
420 | 
421 | 
422 |     for (j in seq_len(ncol(gcfit.tempg))) {
423 |         if (draw.plot) {
424 |             par(mfrow = c(5, 2))
425 |             smoothScatter(gctemp[-bin.filter], gcfit.tempg[-bin.filter,j],
426 |                     main = "Original", xlab = "GC content",
427 |                     ylab = "Y/beta/N/exp(gxh)")
428 |         }
429 |     
430 |         for (Ti in T) {
431 |             if (draw.plot) {
432 |                 smoothScatter(gctemp[-bin.filter], gcfit.tempg[-bin.filter,j],
433 |                     xlab = "GC content", ylab = "Y/beta/N/exp(gxh)",
434 |                     nrpoints = 0, main = paste("T =", Ti))
435 |                 if (Ti == 1) {
436 |                     points(gctemp[order(gctemp)], fGCg.plot[[which(T == Ti)]][
437 |                         order(gctemp),j],
438 |                     lty = 2, col = 2, type = "l", lwd = 2)
439 |                     points(gctemp, gcfit.tempg[,j], cex = 0.4,
440 |                         col = 2, pch = 16)
441 |                 } else {
442 |                     for (k in seq_len(Ti)) {
443 |                         points(gctemp[order(gctemp)],
444 |                             fGCg.plot[[which(T == Ti)]][order(gctemp),j] * k/2,
445 |                             lty = 2, col = k, type = "l", lwd = 2)
446 |                         points(gctemp[which((round(Z.obj[[which(T == Ti)
447 |                             ]]))[, k] == 1)],
448 |                             (gcfit.tempg[,j])[which((round(Z.obj[[which(T == Ti)
449 |                             ]]))[, k] == 1)],
450 |                             cex = 0.4, col = k, pch = 16)
451 |                     }
452 |                 }
453 |             }
454 |         }
455 |     
456 |         if (draw.plot) {
457 |             plot(T, loglik, type = "b", xlab = "T", ylab = "loglik", 
458 |                 main = "Log likelihood")
459 |             abline(v = which.max(loglik), lty = 2)
460 |             plot(T, BIC, type = "b", xlab = "T", ylab = "BIC", main = "BIC")
461 |             abline(v = which.max(BIC), lty = 2)
462 |             par(mfrow = c(1, 1))
463 |         }
464 |     }
465 |     return(list(fGCi.obj = fGCi.obj, Z.obj = Z.obj,
466 |                 vec_pi.obj = vec_pi.obj, bin.filter = bin.filter,
467 |                 loglik = loglik, BIC = BIC))
468 | }
469 | 
470 | getfGC2 <- function(gcfit.temp, gctemp, Y, norm_index, groups, offset, T,
471 |                     alpha, minCountQC) {
472 |     fGC.hat <- matrix(ncol = ncol(Y), nrow = nrow(Y))
473 |     for (G in unique(groups)) {
474 |         cat(G, "\t")
475 |         g <- which(groups == G)
476 |         if (!any(is.na(match(norm_index, g))) & 
477 |             !any(is.na(match(g, norm_index)))) {
478 |             alpha[, g] <- 2
479 |             for (j in g) {
480 |                 loe.fit <- loess(gcfit.temp[, j] ~ gctemp)
481 |                 fGC.hat[, j] <- loe.fit$fitted  
482 |             }
483 |         } else {
484 |             fGCg <- getfGCG(gcfit.tempg = gcfit.temp[, g, drop = FALSE],
485 |                         gctemp = gctemp, Yg = Y[, g, drop = FALSE], 
486 |                         offsetg = offset[, g, drop = FALSE],
487 |                         T = T, draw.plot = FALSE, 
488 |                         alphag = apply(alpha[, g, drop = FALSE], 1, median),
489 |                         minCountQC = minCountQC)
490 |             if (which.max(fGCg$BIC) == 1) {
491 |                 alpha[, g] <- 2
492 |             } else {
493 |                 alpha[, g] <- apply(fGCg$Z.obj[[which.max(fGCg$BIC)]], 
494 |                                 1, which.max)
495 |             }
496 |             fGC.hat[, g] <- fGCg$fGCi.obj[[which.max(fGCg$BIC)]]
497 |         }
498 |     }
499 |     return(list(fGC.hat = fGC.hat, alpha = alpha))
500 | }
501 | 
502 | 
503 | EstepG <- function(fGCg, vec_pig, Yg, offsetg) {
504 |     P <- matrix(nrow = nrow(fGCg), ncol = length(vec_pig))
505 |     vec_pig[vec_pig == 0] <- 1e-100
506 |     pPoisson <- vector("list", length = ncol(fGCg))
507 |     for (j in seq_len(ncol(Yg))) {
508 |         lambdaj <- matrix(nrow = nrow(P), ncol = ncol(P),
509 |             data = offsetg[, j, drop = FALSE] * fGCg[, j, drop = FALSE]) * 
510 |             matrix(nrow = nrow(P), ncol = ncol(P),
511 |             data = seq_len(length(vec_pig))/2, byrow = TRUE)
512 |         pPoisson[[j]] <- dpois(matrix(nrow = nrow(P), ncol = ncol(P), 
513 |                     data = Yg[, j, drop = FALSE]),
514 |                     lambda = lambdaj, log = TRUE)
515 |     }
516 |     P <- apply(simplify2array(pPoisson), c(1,2), sum) + matrix(nrow = nrow(P), 
517 |                     ncol = ncol(P), data = log(vec_pig), byrow = TRUE)
518 |     Zg <- matrix(nrow = nrow(fGCg), ncol = length(vec_pig))
519 |     for (k in seq_len(length(vec_pig))) {
520 |         Zg[, k] <- 1/(1 + apply(exp(apply(P, 2, function(x) {
521 |             x - P[, k]
522 |             })[, -k, drop = FALSE]), 1, sum))
523 |     }
524 |     return(Zg)
525 | }
526 | 
527 | 
528 | 
529 | MstepG <- function(Zg, gcfit.tempg, gctemp) {
530 |     fGCg <- matrix(NA, nrow = nrow(gcfit.tempg), ncol = ncol(gcfit.tempg))
531 |     for (j in seq_len(ncol(gcfit.tempg))) {
532 |         gcfit.temp <- gcfit.tempg[,j]/(Zg %*% as.matrix(seq_len(ncol(Zg))/2))
533 |         fGCi <- fitGC(gctemp, gcfit.temp)
534 |         fGCg[,j] <- fGCi
535 |     }
536 |     vec_pig <- colSums(Zg)/nrow(Zg)
537 |     return(list(vec_pig = vec_pig, fGCg = fGCg))
538 | }
539 | 
540 | 
541 | 
542 | fitGC <- function(gctemp, gcfit.temp) {
543 |     loe.fit <- loess(gcfit.temp ~ gctemp)
544 |     fGCi <- loe.fit$fitted
545 |     temp <- min(fGCi[fGCi > 0])
546 |     fGCi[fGCi <= 0] <- temp
547 |     return(fGCi)
548 | }
549 | 
550 | 
551 | 


--------------------------------------------------------------------------------