├── .Rbuildignore
├── .github
    └── ISSUE_TEMPLATE
    │   ├── config.yml
    │   └── infercnv-support-and-development-hiatus.md
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── R
    ├── .wip
    │   └── Seurat_integration.R
    ├── SplatterScrape.R
    ├── data.R
    ├── inferCNV.R
    ├── inferCNV_BayesNet.R
    ├── inferCNV_HMM.R
    ├── inferCNV_constants.R
    ├── inferCNV_heatmap.R
    ├── inferCNV_hidden_spike.R
    ├── inferCNV_i3HMM.R
    ├── inferCNV_mask_non_DE.R
    ├── inferCNV_meanVarSim.R
    ├── inferCNV_ops.R
    ├── inferCNV_simple_sim.R
    ├── inferCNV_tumor_subclusters.R
    ├── inferCNV_tumor_subclusters.random_smoothed_trees.R
    ├── infercnv_sampling.R
    ├── noise_reduction.R
    └── seurat_interaction.R
├── README.md
├── Rstudio_helpers
    └── Examine_and_Filter_Cells_and_Genes.Rmd
├── WDL
    └── infercnv.wdl
├── data
    ├── HMM_states.rda
    ├── infercnv_annots_example.rda
    ├── infercnv_data_example.rda
    ├── infercnv_genes_example.rda
    ├── infercnv_object_example.rda
    └── mcmc_obj.rda
├── docker
    └── Dockerfile
├── example
    ├── Makefile
    ├── README.txt
    ├── __alt_exec_modes
    │   ├── run.no_spike.R
    │   ├── run.set_num_ref_groups.R
    │   └── run.use_zscores.R
    ├── example.Rmd
    ├── run.R
    ├── run_memory_profiling_per_step.sh
    └── run_test.R
├── inst
    ├── BUGS_Mixture_Model
    ├── BUGS_Mixture_Model_i3
    ├── CITATION
    ├── NEWS
    ├── extdata
    │   ├── gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt
    │   ├── oligodendroglioma_annotations_downsampled.txt
    │   └── oligodendroglioma_expression_downsampled.counts.matrix.gz
    └── script
    │   └── README.txt
├── man
    ├── CreateInfercnvObject.Rd
    ├── HMM_states.Rd
    ├── MCMC_inferCNV-class.Rd
    ├── add_to_seurat.Rd
    ├── apply_median_filtering.Rd
    ├── color.palette.Rd
    ├── filterHighPNormals.Rd
    ├── inferCNVBayesNet.Rd
    ├── infercnv-class.Rd
    ├── infercnv-package.Rd
    ├── infercnv_annots_example.Rd
    ├── infercnv_data_example.Rd
    ├── infercnv_genes_example.Rd
    ├── infercnv_object_example.Rd
    ├── mcmc_obj.Rd
    ├── plot_cnv.Rd
    ├── plot_per_group.Rd
    ├── plot_subclusters.Rd
    ├── run.Rd
    ├── sample_object.Rd
    └── validate_infercnv_obj.Rd
├── scripts
    ├── ExploratoryPlots.R
    ├── HB_example_to_inferCNV_obj.R
    ├── KS_matrix_comparison.R
    ├── KS_matrix_comparison.use_infercnv_obj.R
    ├── QQ_matrix_comparison.R
    ├── apply_median_filtering.R
    ├── boxplot_cell_exprs.R
    ├── check_matrix_format.py
    ├── cross_cell_scaling_normalization.R
    ├── dropout_matrix_comparison.R
    ├── examine_dropout_logistic.R
    ├── examine_infercnv_data_params.R
    ├── examine_infercnv_data_params.just_dispersion.R
    ├── examine_normal_cutoffs_vs_KS.R
    ├── examine_normal_sampling_distributions.R
    ├── examine_normal_sampling_distributions.i3.R
    ├── examine_simulated_vs_observed_dispersion.R
    ├── examine_simulated_vs_observed_dispersion.from_matrix.R
    ├── explore_HMM_exec.R
    ├── explore_HMM_exec.hspike.R
    ├── explore_steps_by_gene.simple.R
    ├── genome_smoothed_lineplots.R
    ├── gtf_to_position_file.py
    ├── inferCNV.R
    ├── inferCNV_to_HB.R
    ├── inferCNV_utils.R
    ├── infercnv_obj_to_input_files.R
    ├── infercnv_validate.R
    ├── meanvar_sim_counts.R
    ├── plot_hspike.R
    ├── plot_hspike.by_num_cells.R
    ├── plot_hspike.diff_normal_tumor.R
    ├── plot_hspike_vs_sample_chrs.R
    ├── plot_infercnv_obj.R
    ├── plot_tumor_vs_normal_chr_densities.R
    ├── plot_tumor_vs_normal_chr_densities.i3.R
    ├── prepare_sparsematrix.R
    ├── recursive_random_tree_height_cutting.random_trees.R
    ├── recursive_random_tree_height_cutting.sigclust2.R
    ├── recursive_random_tree_height_cutting.using_hmms.R
    ├── run.stub.R
    ├── run_BayesNet.R
    ├── run_HMM_each_cell_separately.R
    ├── run_HMM_on_hspike.R
    ├── run_HMM_on_subclusters.R
    ├── run_HMM_per_chr.R
    ├── run_tests_sampling_and_group_plots.R
    ├── sim_vs_orig_counts.QQplot.R
    └── splatterScrape_sim_counts.R
├── tests
    ├── testthat.R
    └── testthat
    │   └── test_infer_cnv.R
└── vignettes
    ├── .wip
        └── inferCNV.Rmd
    └── inferCNV.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^Meta$
 2 | ^doc$
 3 | ^\.travis\.yml$
 4 | ^R/\.wip
 5 | ^vignettes/\.wip
 6 | ^vignettes/example_output
 7 | ^docker
 8 | ^run_tests
 9 | ^\.
10 | ^inferCNV\.wiki
11 | ^infercnv.Rdata
12 | ^inferCNV.Rproj
13 | ^Rstudio_helpers
14 | ^__simulations
15 | ^example
16 | ^example/full_precision
17 | ^example/example.html
18 | ^example/test_subdir
19 | ^example/oligodendroglioma_expression_downsampled.txt
20 | ^example_output
21 | ^example/C125.matrix.obj
22 | ^output_dir
23 | ^.*\.Rproj$
24 | ^\.Rproj\.user$
25 | ^devel_debug
26 | ^run_tests
27 | ^external
28 | ^\.gitmodules
29 | ^scripts
30 | ^WDL/infercnv.wdl
31 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/infercnv-support-and-development-hiatus.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: InferCNV support and development hiatus
 3 | about: InferCNV support and development is on pause due to lack of dedicated resources available.
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | ---
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | Meta
2 | doc
3 | .idea
4 | .Rproj.user
5 | .Rhistory
6 | .Rprofile
7 | .example_output
8 | vignettes/example_output
9 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: r
 2 | r: bioc-release
 3 | bioc_required: true
 4 | bioc_use_devel: false
 5 | 
 6 | cache:
 7 |   apt: true
 8 |   packages: true
 9 |   timeout: 3000
10 | 
11 | before_install:
12 |   - sudo apt-get update
13 |   - sudo apt-get install jags
14 | 
15 | # r_build_args: --no-build-vignettes --no-manual --no-resave-data
16 | r_check_args: --no-build-vignettes # --no-manual
17 | 
18 | #script:
19 | #- travis_wait R CMD build .
20 | #- R CMD check --no-build-vignettes *tar.gz
21 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: infercnv
 2 | Type: Package
 3 | Title: Infer Copy Number Variation from Single-Cell RNA-Seq Data
 4 | Version: 1.23.0
 5 | Date: 2023-12-01
 6 | Authors@R: c( person("Timothy", "Tickle", email = "ttickle@broadinstitute.org", role = "aut"), person("Itay", "Tirosh", email = "tirosh@broadinstitute.org", role = "aut"), person("Christophe", "Georgescu", email = "cgeorges@broadinstitute.org", role = c("aut", "cre")), person("Maxwell", "Brown", email = "mbrown@broadinstitute.org", role = "aut"), person("Brian", "Haas", email = "bhaas@broadinstitute.org", role = "aut")) 
 7 | BugReports: https://github.com/broadinstitute/inferCNV/issues
 8 | Description: Using single-cell RNA-Seq expression to visualize CNV in cells.
 9 | biocViews: Software, CopyNumberVariation, VariantDetection, StructuralVariation, GenomicVariation, Genetics, Transcriptomics, StatisticalMethod, Bayesian, HiddenMarkovModel, SingleCell
10 | Depends: R(>= 4.0)
11 | License: BSD_3_clause + file LICENSE
12 | LazyData: TRUE
13 | VignetteBuilder: knitr
14 | Suggests: BiocStyle, knitr, rmarkdown, testthat
15 | RoxygenNote: 7.2.3
16 | NeedsCompilation: no
17 | SystemRequirements: JAGS 4.x.y
18 | Imports: graphics, grDevices, RColorBrewer, gplots, futile.logger, stats, utils, methods, ape, phyclust, Matrix, fastcluster, parallelDist, dplyr, HiddenMarkov, ggplot2, edgeR, coin, caTools, digest, RANN, igraph, reshape2, rjags, fitdistrplus, future, foreach, doParallel, Seurat, BiocGenerics, SummarizedExperiment, SingleCellExperiment, tidyr, parallel, coda, gridExtra, argparse
19 | URL: https://github.com/broadinstitute/inferCNV/wiki
20 | Collate: 
21 |     'SplatterScrape.R'
22 |     'data.R'
23 |     'inferCNV.R'
24 |     'inferCNV_BayesNet.R'
25 |     'inferCNV_HMM.R'
26 |     'inferCNV_constants.R'
27 |     'inferCNV_heatmap.R'
28 |     'inferCNV_hidden_spike.R'
29 |     'inferCNV_i3HMM.R'
30 |     'inferCNV_mask_non_DE.R'
31 |     'inferCNV_meanVarSim.R'
32 |     'inferCNV_ops.R'
33 |     'inferCNV_simple_sim.R'
34 |     'inferCNV_tumor_subclusters.R'
35 |     'inferCNV_tumor_subclusters.random_smoothed_trees.R'
36 |     'infercnv_sampling.R'
37 |     'noise_reduction.R'
38 |     'seurat_interaction.R'
39 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2018
2 | COPYRIGHT HOLDER: Timothy Tickle, Christophe Georgescu, Itay Tirosh 
3 | ORGANIZATION: Broad Institute
4 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
  1 | # Generated by roxygen2: do not edit by hand
  2 | 
  3 | export(CreateInfercnvObject)
  4 | export(add_to_seurat)
  5 | export(apply_median_filtering)
  6 | export(color.palette)
  7 | export(filterHighPNormals)
  8 | export(inferCNVBayesNet)
  9 | export(plot_cnv)
 10 | export(plot_per_group)
 11 | export(plot_subclusters)
 12 | export(run)
 13 | export(sample_object)
 14 | exportClasses(MCMC_inferCNV)
 15 | exportClasses(infercnv)
 16 | import(RColorBrewer)
 17 | import(argparse)
 18 | import(coda)
 19 | import(doParallel)
 20 | import(fitdistrplus)
 21 | import(foreach)
 22 | import(futile.logger)
 23 | import(future)
 24 | import(ggplot2)
 25 | importFrom(BiocGenerics,counts)
 26 | importFrom(BiocGenerics,mean)
 27 | importFrom(BiocGenerics,t)
 28 | importFrom(Matrix,Matrix)
 29 | importFrom(Matrix,colSums)
 30 | importFrom(Matrix,rowMeans)
 31 | importFrom(Matrix,sparseMatrix)
 32 | importFrom(RANN,nn2)
 33 | importFrom(Seurat,CreateSeuratObject)
 34 | importFrom(Seurat,FindNeighbors)
 35 | importFrom(Seurat,FindVariableFeatures)
 36 | importFrom(Seurat,RunPCA)
 37 | importFrom(Seurat,ScaleData)
 38 | importFrom(SingleCellExperiment,SingleCellExperiment)
 39 | importFrom(SummarizedExperiment,"assays<-")
 40 | importFrom(SummarizedExperiment,"colData<-")
 41 | importFrom(SummarizedExperiment,"rowData<-")
 42 | importFrom(SummarizedExperiment,assays)
 43 | importFrom(SummarizedExperiment,colData)
 44 | importFrom(SummarizedExperiment,rowData)
 45 | importFrom(SummarizedExperiment,start)
 46 | importFrom(ape,as.phylo)
 47 | importFrom(ape,drop.tip)
 48 | importFrom(ape,read.tree)
 49 | importFrom(ape,write.tree)
 50 | importFrom(caTools,runmean)
 51 | importFrom(coin,oneway_test)
 52 | importFrom(coin,pvalue)
 53 | importFrom(digest,digest)
 54 | importFrom(dplyr,"%>%")
 55 | importFrom(dplyr,count)
 56 | importFrom(edgeR,estimateDisp)
 57 | importFrom(fastcluster,hclust)
 58 | importFrom(gplots,bluered)
 59 | importFrom(grDevices,col2rgb)
 60 | importFrom(grDevices,colorRampPalette)
 61 | importFrom(grDevices,dev.off)
 62 | importFrom(grDevices,pdf)
 63 | importFrom(grDevices,png)
 64 | importFrom(grDevices,rgb)
 65 | importFrom(graphics,abline)
 66 | importFrom(graphics,axis)
 67 | importFrom(graphics,boxplot)
 68 | importFrom(graphics,hist)
 69 | importFrom(graphics,image)
 70 | importFrom(graphics,layout)
 71 | importFrom(graphics,legend)
 72 | importFrom(graphics,lines)
 73 | importFrom(graphics,mtext)
 74 | importFrom(graphics,par)
 75 | importFrom(graphics,plot)
 76 | importFrom(graphics,points)
 77 | importFrom(graphics,rect)
 78 | importFrom(graphics,text)
 79 | importFrom(graphics,title)
 80 | importFrom(gridExtra,gtable_combine)
 81 | importFrom(gridExtra,marrangeGrob)
 82 | importFrom(gridExtra,tableGrob)
 83 | importFrom(gridExtra,ttheme_default)
 84 | importFrom(igraph,cluster_leiden)
 85 | importFrom(igraph,graph_from_adjacency_matrix)
 86 | importFrom(methods,is)
 87 | importFrom(methods,new)
 88 | importFrom(methods,setClass)
 89 | importFrom(parallel,detectCores)
 90 | importFrom(parallelDist,parallelDist)
 91 | importFrom(phyclust,get.rooted.tree.height)
 92 | importFrom(reshape2,melt)
 93 | importFrom(rjags,coda.samples)
 94 | importFrom(rjags,jags.model)
 95 | importFrom(stats,as.dendrogram)
 96 | importFrom(stats,as.dist)
 97 | importFrom(stats,as.hclust)
 98 | importFrom(stats,complete.cases)
 99 | importFrom(stats,cor)
100 | importFrom(stats,cutree)
101 | importFrom(stats,density)
102 | importFrom(stats,dist)
103 | importFrom(stats,dnorm)
104 | importFrom(stats,ecdf)
105 | importFrom(stats,filter)
106 | importFrom(stats,ks.test)
107 | importFrom(stats,lm)
108 | importFrom(stats,median)
109 | importFrom(stats,nls)
110 | importFrom(stats,order.dendrogram)
111 | importFrom(stats,p.adjust)
112 | importFrom(stats,pnorm)
113 | importFrom(stats,predict)
114 | importFrom(stats,qgamma)
115 | importFrom(stats,qnorm)
116 | importFrom(stats,quantile)
117 | importFrom(stats,rbinom)
118 | importFrom(stats,rchisq)
119 | importFrom(stats,reorder)
120 | importFrom(stats,rgamma)
121 | importFrom(stats,rlnorm)
122 | importFrom(stats,rnbinom)
123 | importFrom(stats,rnorm)
124 | importFrom(stats,rpois)
125 | importFrom(stats,runif)
126 | importFrom(stats,sd)
127 | importFrom(stats,shapiro.test)
128 | importFrom(stats,smooth.spline)
129 | importFrom(stats,t.test)
130 | importFrom(stats,update)
131 | importFrom(stats,var)
132 | importFrom(stats,wilcox.test)
133 | importFrom(tidyr,gather)
134 | importFrom(utils,capture.output)
135 | importFrom(utils,flush.console)
136 | importFrom(utils,head)
137 | importFrom(utils,read.csv)
138 | importFrom(utils,read.table)
139 | importFrom(utils,tail)
140 | importFrom(utils,write.table)
141 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' Generated SmartSeq2 expression data with 10 normal cells and 10 tumor cells.
 2 | #' This is only to demonstrate how to use methods, not actual data to be used in an analysis.
 3 | #'
 4 | #' @format A data frame with 8252 rows (genes) and 20 columns (cells)
 5 | #'
 6 | #'
 7 | "infercnv_data_example"
 8 | 
 9 | #' Generated classification for 10 normal cells and 10 tumor cells.
10 | #'
11 | #' @format A data frame with 20 rows (cells) and 1 column (classification)
12 | #'
13 | #'
14 | "infercnv_annots_example"
15 | 
16 | #' Downsampled gene coordinates file from GRCh37
17 | #'
18 | #' @format A data frame with 10338 rows (genes) and 3 columns (chr, start, end)
19 | #'
20 | #'
21 | "infercnv_genes_example"
22 | 
23 | #' infercnv object result of the processing of run() in the example, to be used for other examples.
24 | #'
25 | #' @format An infercnv object
26 | #'
27 | #'
28 | "infercnv_object_example"
29 | 
30 | #' infercnv object result of the processing of run() in the HMM example, to be used for other examples.
31 | #'
32 | #' @format An infercnv object containing HMM predictions
33 | #'
34 | #'
35 | "HMM_states"
36 | 
37 | #' infercnv object result of the processing of inferCNVBayesNet in the example, to be used for other examples.
38 | #'
39 | #' @format An infercnv object containing posterior probability of CNV states
40 | #'
41 | #'
42 | "mcmc_obj"
43 | 


--------------------------------------------------------------------------------
/R/inferCNV_constants.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | ## Shared constants for the package code.
 4 | ## C_CHR is the key used to read the chromosome entry of an infercnv object's gene_order.
 5 | C_CHR <- "chr"
 6 | C_START <- "start"  # presumably the gene start-position key; confirm against callers
 7 | C_STOP <- "stop"  # presumably the gene end-position key; confirm against callers
 8 | C_HCLUST_METHODS <- c("ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid")  # NOTE(review): looks like the accepted clustering method names - confirm
 9 | C_OUTPUT_FORMAT <- c("pdf", "png")  # NOTE(review): looks like the accepted plot output formats - confirm
10 | 
11 | 
12 | ## also including some globals:
13 | infercnv.env <- new.env()  # environment holding the mutable package-level settings set below
14 | infercnv.env$GLOBAL_NUM_THREADS <- 1  # default is single-threaded.
15 | 
16 | 
17 | #' @importFrom grDevices col2rgb colorRampPalette dev.off pdf png rgb
18 | #' @importFrom graphics abline axis boxplot hist image layout lines mtext par plot points rect text title legend
19 | #' @importFrom stats as.dendrogram as.dist as.hclust cutree density dist filter median order.dendrogram quantile reorder sd complete.cases cor t.test p.adjust predict rnorm runif smooth.spline var wilcox.test dnorm ecdf ks.test lm nls pnorm qgamma qnorm rbinom rchisq rgamma rlnorm rnbinom rpois shapiro.test update
20 | #' @importFrom utils flush.console read.table write.table tail read.csv head capture.output
21 | #' @import futile.logger
22 | #' @importFrom methods setClass new is
23 | #' @importFrom gplots bluered
24 | #' @importFrom ape write.tree as.phylo read.tree drop.tip
25 | #' @importFrom phyclust get.rooted.tree.height
26 | #' @importFrom fastcluster hclust
27 | #' @importFrom parallelDist parallelDist
28 | #' @import RColorBrewer
29 | #' @importFrom Matrix Matrix rowMeans colSums sparseMatrix
30 | #' @importFrom dplyr %>% count
31 | #' @import fitdistrplus
32 | #' @import foreach
33 | #' @import doParallel
34 | #' @import future
35 | #' @import coda
36 | #' @import ggplot2
37 | #' @import argparse
38 | #' @importFrom edgeR estimateDisp
39 | #' @importFrom caTools runmean
40 | #' @importFrom coin oneway_test pvalue
41 | #' @importFrom digest digest
42 | #' @importFrom RANN nn2
43 | #' @importFrom igraph graph_from_adjacency_matrix cluster_leiden
44 | #' @importFrom reshape2 melt
45 | #' @importFrom rjags jags.model coda.samples
46 | #' @importFrom BiocGenerics counts t mean
47 | #' @importFrom SummarizedExperiment start colData rowData assays assays<- rowData<- colData<- 
48 | #' @importFrom SingleCellExperiment SingleCellExperiment
49 | #' @importFrom tidyr gather
50 | #' @importFrom parallel detectCores
51 | #' @importFrom gridExtra ttheme_default tableGrob gtable_combine marrangeGrob 
52 | #' @importFrom Seurat CreateSeuratObject FindVariableFeatures ScaleData RunPCA FindNeighbors
53 | 
54 | 
55 | NULL
56 | 
57 | 


--------------------------------------------------------------------------------
/R/inferCNV_meanVarSim.R:
--------------------------------------------------------------------------------
  1 | .get_simulated_cell_matrix_using_meanvar_trend <- function(infercnv_obj, gene_means, num_cells, include.dropout=FALSE) {
  2 | 
  3 |     ## Simulates num_cells cells for gene_means from the mean/variance trend of
  4 |     ## infercnv_obj, which should hold the total sum count normalized data.
  5 | 
  6 |     # mean/variance table that the trend is modelled on
  7 |     trend_table = .get_mean_var_table(infercnv_obj)
  8 | 
  9 |     # a NULL dropout model tells the helper to skip the dropout step
 10 |     dropout_model <- if (include.dropout) {
 11 |         .get_logistic_params(.get_mean_vs_p0_table(infercnv_obj))
 12 |     } else {
 13 |         NULL
 14 |     }
 15 | 
 16 |     simulated <- .get_simulated_cell_matrix_using_meanvar_trend_helper(gene_means, trend_table, num_cells, dropout_model)
 17 | 
 18 |     return(simulated)
 19 | }
 20 | 
 21 | 
 22 | 
 23 | .get_simulated_cell_matrix_using_meanvar_trend_helper <- function(gene_means, mean_var_table, num_cells, dropout_logistic_params=NULL) {
 24 | 
 25 |     ## Builds a genes x num_cells matrix of simulated values: each value is drawn
 26 |     ## around its gene mean with a variance read from a spline fitted to
 27 |     ## log(v + 1) ~ log(m + 1) of mean_var_table (columns m and v).
 28 |     ## Dropout is applied afterwards unless dropout_logistic_params is NULL.
 29 | 
 30 |     log_mean = log(mean_var_table$m + 1)
 31 |     log_var = log(mean_var_table$v + 1)
 32 |     mean_var_spline = smooth.spline(log_var ~ log_mean)
 33 | 
 34 |     ngenes = length(gene_means)
 35 |     sim_cell_matrix = matrix(rep(0,ngenes*num_cells), nrow=ngenes)
 36 |     dimnames(sim_cell_matrix) = list(names(gene_means), paste0('sim_cell_', seq_len(num_cells)))
 37 | 
 38 |     ## one column per simulated cell; every gene gets its own draw, in gene order
 39 |     for (cell_idx in seq_len(num_cells)) {
 40 |         sim_cell_matrix[,cell_idx] = sapply(seq_len(ngenes), FUN=function(gene_idx) {
 41 |             .sim_expr_val_mean_var_no_dropout(gene_means[gene_idx], mean_var_spline)
 42 |         })
 43 |     }
 44 | 
 45 |     ## without a dropout model the raw simulation is the result
 46 |     if (is.null(dropout_logistic_params)) {
 47 |         return(sim_cell_matrix)
 48 |     }
 49 | 
 50 |     ## apply dropout
 51 |     sim_cell_matrix <- .apply_dropout(sim_cell_matrix, dropout_logistic_params)
 52 | 
 53 |     return(sim_cell_matrix)
 54 | 
 55 | }
 56 | 
 57 | .get_simulated_cell_matrix_using_meanvar_trend_given_normal_matrix <- function(gene_means, normal_counts_matrix, num_cells, include.dropout=TRUE, cell_groupings=NULL) {
 58 | 
 59 |     ## Simulates num_cells cells for gene_means, taking the mean/variance trend (and,
 60 |     ## when include.dropout is set, the dropout model) from normal_counts_matrix.
 61 | 
 62 |     trend_table <- .get_mean_var_given_matrix(normal_counts_matrix, cell_groupings)
 63 | 
 64 |     dropout_model <- if (include.dropout) {
 65 |         .get_logistic_params(.get_mean_vs_p0_table_from_matrix(normal_counts_matrix, cell_groupings))
 66 |     } else {
 67 |         NULL
 68 |     }
 69 |     return(.get_simulated_cell_matrix_using_meanvar_trend_helper(gene_means, trend_table, num_cells, dropout_model))
 70 | }
 71 | 
 72 | 
 73 | ##' Simulate one expression value for a gene with mean m, with optional drop-out.
 74 | ##' @keywords internal
 75 | ##' @noRd
 76 | ##'
 77 | .sim_expr_val_mean_var <- function(m,  mean_var_spline, dropout_logistic_params) {
 78 | 
 79 |     # a gene whose mean is not positive always simulates as zero
 80 |     if (!(m > 0)) {
 81 |         return(0)
 82 |     }
 83 | 
 84 |     # the spline is queried at log(m + 1); its prediction is read as log(variance + 1)
 85 |     spline_log_var = predict(mean_var_spline, log(m+1))$y
 86 |     sim_sd = sqrt(max(exp(spline_log_var)-1, 0))
 87 | 
 88 |     # negative draws are raised to zero before rounding
 89 |     val = round(max(rnorm(n=1, mean=m, sd=sim_sd), 0))
 90 | 
 91 |     # include drop-out prediction (only when a model is given and the value is non-zero)
 92 |     if (is.null(dropout_logistic_params) || val <= 0) {
 93 |         return(val)
 94 |     }
 95 | 
 96 |     dropout_prob <- predict(dropout_logistic_params$spline, log(val))$y[1]
 97 |     if (runif(1) <= dropout_prob) {
 98 |         ## a drop-out
 99 |         return(0)
100 |     }
101 |     return(val)
102 | }
103 | 
104 | 
105 | .sim_expr_val_mean_var_no_dropout <- function(m,  mean_var_spline) {
106 |     ## Draws one simulated expression value for a gene with mean m (no drop-out step).
107 |     ## Returns 0 when m is not positive; otherwise a normal draw around m using the
108 |     ## spline-predicted variance, raised to zero if negative and then rounded.
109 |     if (!(m > 0)) {
110 |         return(0)
111 |     }
112 | 
113 |     spline_log_var = predict(mean_var_spline, log(m+1))$y
114 |     sim_sd = sqrt(max(exp(spline_log_var)-1, 0))
115 | 
116 |     sim_val = rnorm(n=1, mean=m, sd=sim_sd)
117 | 
118 |     return(round(max(sim_val, 0)))
119 | }
120 | 
121 | 
122 | .apply_dropout <- function(counts.matrix, dropout_logistic_params) {
123 |     ## Randomly zeroes entries of each row (gene) of counts.matrix so that the row's
124 |     ## zero fraction approaches the dropout probability predicted from the row mean.
125 |     ##
126 |     ## counts.matrix: numeric matrix, one row per gene (values assumed non-negative).
127 |     ## dropout_logistic_params: list whose $spline supports predict(spline, log(mean))$y.
128 |     ## Returns a matrix with the same dimensions and dimnames as counts.matrix.
129 |     ##
130 |     ## Rows are edited in place instead of via apply() + t(): apply() collapses a
131 |     ## one-column input to a vector, which t() then turned into a 1 x ngenes matrix.
132 |     for (gene_idx in seq_len(nrow(counts.matrix))) {
133 | 
134 |         x = counts.matrix[gene_idx, ]
135 |         mean.val = mean(x)
136 |         dropout_prob <- predict(dropout_logistic_params$spline, log(mean.val))$y[1]
137 | 
138 |         nzeros = sum(x==0)
139 |         ntotal = length(x)
140 |         nremaining = ntotal - nzeros
141 | 
142 |         # padj = ( (pzero*total) - (current_nzero) ) / remaining
143 |         # a row that is already all zeros has nothing left to drop (and would divide by zero)
144 |         padj = 0
145 |         if (nremaining > 0) {
146 |             padj = max(( (dropout_prob * ntotal) - (nzeros) ) / nremaining, 0)
147 |         }
148 | 
149 |         flog.debug(sprintf("mean.val: %g, dropout_prob: %g, adj_dropout_prob: %g",
150 |                            mean.val,
151 |                            dropout_prob,
152 |                            padj))
153 | 
154 |         # one uniform draw per cell, taken in cell order
155 |         x[runif(ntotal) <= padj] = 0
156 |         counts.matrix[gene_idx, ] = x
157 |     }
158 | 
159 |     return(counts.matrix)
160 | 
161 | }
162 | 
163 | 
164 | 
165 | ##' .get_mean_var_table()
166 | ##'
167 | ##' Builds the per-gene mean/variance table over every defined cell grouping
168 | ##' (observation groups first, then reference groups), using infercnv_obj@expr.data.
169 | ##'
170 | ##' @param infercnv_obj An infercnv object populated with raw count data
171 | ##'
172 | ##' @return data.frame with 3 columns: g (group name), m (mean), v (variance)
173 | ##'
174 | ##' @keywords internal
175 | ##' @noRd
176 | ##'
177 | 
178 | .get_mean_var_table <- function(infercnv_obj) {
179 | 
180 |     return(.get_mean_var_given_matrix(
181 |         infercnv_obj@expr.data,
182 |         c(infercnv_obj@observation_grouped_cell_indices,
183 |           infercnv_obj@reference_grouped_cell_indices)
184 |     ))
185 | 
186 | }
187 | 
188 | 
189 | .get_mean_var_given_matrix <- function(expr.matrix, cell_cluster_groupings=NULL) {
190 |     ## Per-gene mean (m) and variance (v) of expr.matrix within each named cell group.
191 |     ## cell_cluster_groupings: named list of column indices; NULL means one group, "allcells".
192 |     ## Returns the per-group data.frames (columns g, m, v) stacked by rows, or NULL
193 |     ## when there are no named groups.
194 |     if (is.null(cell_cluster_groupings)) {
195 |         ## use all cells
196 |         cell_cluster_groupings = list(allcells=seq_len(ncol(expr.matrix)))
197 |     }
198 | 
199 |     group_tables <- lapply(names(cell_cluster_groupings), function(group_name) {
200 |         ## drop=FALSE keeps a single-cell group as a one-column matrix for rowMeans/apply
201 |         expr.data = expr.matrix[, cell_cluster_groupings[[ group_name ]], drop=FALSE]
202 |         m = rowMeans(expr.data)
203 |         v = apply(expr.data, 1, var)
204 |         data.frame(g=group_name, m=m, v=v)
205 |     })
206 | 
207 |     ## bind once instead of growing the table with rbind inside the loop
208 |     mean_var_table <- do.call(rbind, group_tables)
209 | 
210 |     return(mean_var_table)
211 | }
212 | 
213 | ##' .get_spike_in_average_bounds()
214 | ##'
215 | ##' return mean bounds for expression of all cells in the spike-in
216 | ##'
217 | ##' @param infercnv_obj An infercnv object populated with raw count data
218 | ##'
219 | ##' @return c(left_bound, right_bound)
220 | ##'
221 | ##' @keywords internal
222 | ##' @noRd
223 | ##'
224 | 
225 | 


--------------------------------------------------------------------------------
/R/noise_reduction.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' @title apply_median_filtering
  3 | #' 
  4 | #' @description Apply a median filtering to the expression matrix within each tumor bounds
  5 | #'
  6 | #' @param infercnv_obj infercnv_object
  7 | #' 
  8 | #' @param window_size Size of the window side centered on the data point to filter (default = 7).
  9 | #' 
 10 | #' @param on_observations  boolean (default=TRUE), run on observations data (tumor cells).
 11 | #'
 12 | #' @param on_references  boolean (default=TRUE), run on references (normal cells).
 13 | #'
 14 | #' @return infercnv_obj with median filtering applied to observations
 15 | #'
 16 | #' @export
 17 | #'
 18 | #' @examples
 19 | #' # data(infercnv_data_example)
 20 | #' # data(infercnv_annots_example)
 21 | #' # data(infercnv_genes_example)
 22 | #'
 23 | #' # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 
 24 | #' #                                                           gene_order_file=infercnv_genes_example,
 25 | #' #                                                           annotations_file=infercnv_annots_example,
 26 | #' #                                                           ref_group_names=c("normal"))
 27 | #'
 28 | #' # infercnv_object_example <- infercnv::run(infercnv_object_example,
 29 | #' #                                          cutoff=1,
 30 | #' #                                          out_dir=tempfile(), 
 31 | #' #                                          cluster_by_groups=TRUE, 
 32 | #' #                                          denoise=TRUE,
 33 | #' #                                          HMM=FALSE,
 34 | #' #                                          num_threads=2,
 35 | #' #                                          no_plot=TRUE)
 36 | #'
 37 | #' data(infercnv_object_example)
 38 | #'
 39 | #' infercnv_object_example <- infercnv::apply_median_filtering(infercnv_object_example)
 40 | #' # plot result object
 41 | #'
 42 | 
 43 | apply_median_filtering <- function(infercnv_obj,
 44 |                                    window_size=7,
 45 |                                    on_observations=TRUE,
 46 |                                    on_references=TRUE) {
 47 |     ## an odd window_size is required so the window can be centered on each data point
 48 |     if (window_size%%2 != 1 || window_size < 2) {
 49 |         flog.error("::apply_median_filtering: Error, window_size is an even or < 2. Please specify an odd number >= 3.")
 50 |         stop("Error, window_size must be an odd number >= 3")  # logging alone let filtering run with a fractional half window
 51 |     }
 52 |     half_window = (window_size - 1) / 2
 53 |     
 54 |     gene_chr_listing = infercnv_obj@gene_order[[C_CHR]]
 55 |     chrs = unlist(unique(gene_chr_listing))
 56 |     
 57 |     if (on_observations) {
 58 |         for (tumor_type in names(infercnv_obj@observation_grouped_cell_indices)) {
 59 |             ## each subcluster of the observation group is filtered on its own
 60 |             tumor_indices_list = infercnv_obj@tumor_subclusters[["subclusters"]][[ tumor_type ]]
 61 |             
 62 |             for (tumor_indices in tumor_indices_list) {
 63 |                 for (chr in chrs) {
 64 |                     ## genes are filtered one chromosome at a time
 65 |                     chr_genes_indices = which(gene_chr_listing == chr)
 66 |                     working_data = infercnv_obj@expr.data[chr_genes_indices, tumor_indices, drop=FALSE]
 67 |                     infercnv_obj@expr.data[chr_genes_indices, tumor_indices] = .median_filter(data=working_data,
 68 |                                                                                               window_size=window_size,
 69 |                                                                                               half_window=half_window)
 70 |                 }
 71 |             }
 72 |         }
 73 |     }
 74 |     
 75 |     if (on_references) {
 76 |         for (ref_indices in infercnv_obj@reference_grouped_cell_indices) {
 77 |             for (chr in chrs) {
 78 |                 chr_genes_indices = which(gene_chr_listing == chr)
 79 |                 working_data = infercnv_obj@expr.data[chr_genes_indices, ref_indices, drop=FALSE]
 80 |                 
 81 |                 infercnv_obj@expr.data[chr_genes_indices, ref_indices] = .median_filter(data=working_data,
 82 |                                                                                         window_size=window_size,
 83 |                                                                                         half_window=half_window)
 84 |             }
 85 |         }
 86 |     }
 87 |     
 88 |     return(infercnv_obj)
 89 | }
 90 | 
 91 | 
 92 | .median_filter <- function(data,
 93 |                            window_size,
 94 |                            half_window) {
 95 |     ## Sliding-window median over a matrix (rows x columns); returns a same-shaped matrix.
 96 |     ## The window reaches half_window entries to either side of each position on both
 97 |     ## axes (2 * half_window + 1 entries per axis) and is clipped at the matrix edges.
 98 |     xdim = dim(data)[1]
 99 |     ydim = dim(data)[2]
100 |     results = data
101 |     for (posx in seq_len(xdim)) {
102 |         ## row window; reaching half_window + 1 here made the window window_size + 2 wide
103 |         posxa <- max(posx - half_window, 1)
104 |         posxb <- min(posx + half_window, xdim)
105 |         for (posy in seq_len(ydim)) {
106 |             posya <- max(posy - half_window, 1)
107 |             posyb <- min(posy + half_window, ydim)
108 |             ## medians are always taken from the untouched input, never from results
109 |             results[posx, posy] = median(data[posxa:posxb, posya:posyb])
110 |         }
111 |     }
112 |     return(results)
113 | }
114 | 
115 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Subclustering
 2 | 
 3 | Subclustering resolution is one of the primary settings that will need to be adjusted in most runs to avoid oversplitting. The tutorial below explains how it works and details about it can also be found on the [wiki](https://github.com/broadinstitute/infercnv/wiki/infercnv-tumor-subclusters#tumor-subclustering-by-leiden-clustering-preferred).
 4 | 
 5 | # Documentation
 6 | ### Full documentation
 7 | 
 8 | Visit project [wiki](https://github.com/broadinstitute/inferCNV/wiki) for InferCNV documentation.
 9 | 
10 | 
11 | ### Infercnv video tutorial
12 | 
13 | A **video** tutorial giving an overview of infercnv features and how to run an analysis can be found below **(click on the image)**:
14 | 
15 | [![Tutorial: Running infercnv](http://img.youtube.com/vi/-qOcHAavZT8/0.jpg)](http://www.youtube.com/watch?v=-qOcHAavZT8 "Tutorial: Running infercnv")
16 | 
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/Rstudio_helpers/Examine_and_Filter_Cells_and_Genes.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Examine and Filter Cells and Genes"
  3 | output: html_document
  4 | ---
  5 | 
  6 | ```{r setup, include=FALSE}
  7 | knitr::opts_chunk$set(echo = TRUE)
  8 | ```
  9 | 
 10 | # Read in data matrix
 11 | ```{r} 
 12 | data = read.table("Glioblastoma_expressed_genes.txt", header=TRUE, row.names=1)  ## CHANGE TO YOUR INPUT MATRIX; first column becomes the row names
 13 | 
 14 | ```
 15 | 
 16 | # Examine distributions of counts of genes and cells
 17 | ```{r}
 18 | reads_per_cell = colSums(data)  # total counts per column (cell)
 19 | reads_per_gene = rowSums(data)  # total counts per row (gene)
 20 | genes_per_cell = colSums(data>0)  # number of rows with a non-zero count, per column
 21 | cells_per_gene = rowSums(data>0)  # number of columns with a non-zero count, per row
 22 | 
 23 | hist(log10(reads_per_cell+1),main='reads per cell',col='wheat')  # +1 keeps log10 finite for zero totals
 24 | hist(log10(genes_per_cell+1), main='genes per cell', col='wheat')
 25 | plot(reads_per_cell, genes_per_cell, log='xy', col='wheat')  # both axes on a log scale
 26 | hist(log10(reads_per_gene+1),main='reads per gene',col='wheat')
 27 | ```
 28 | 
 29 | ```{r}
 30 | plot(sort(genes_per_cell), xlab='cell', log='y', main='genes per cell (ordered)')
 31 | ```
 32 | # Cell filtering criteria: define min and max genes per cell
 33 | 
 34 | ```{r}
 35 | ##################################################
 36 | ## ********* USER DEFINED SECTION ***************
 37 | ##################################################
 38 | 
 39 | #  set upper and lower thresholds for genes per cell:
 40 | MIN_GENES_PER_CELL = 350  ## user-defined setting
 41 | MAX_GENES_PER_CELL = 1800  ## user-defined setting
 42 | 
 43 | # now replot with the thresholds being shown:
 44 | plot(sort(genes_per_cell), xlab='cell', log='y', main='genes per cell (ordered)')
 45 | abline(h=MIN_GENES_PER_CELL, col='green')  # lower threshold
 46 | abline(h=MAX_GENES_PER_CELL, col='green') # upper threshold
 47 | ```
 48 | 
 49 | 
 50 | # Examine percent mitochondrial read content
 51 | ```{r}
 52 | # define the mitochondrial genes: row names starting with "mt-", matched case-insensitively
 53 | mito_genes = grep("^mt-", rownames(data) , ignore.case=TRUE, value=TRUE)
 54 | print(mito_genes)
 55 | ```
 56 | 
 57 | ```{r}
 58 | # compute pct mito
 59 | mito_gene_read_counts = colSums(data[mito_genes,])
 60 | pct_mito = mito_gene_read_counts / reads_per_cell * 100
 61 | plot(sort(pct_mito))
 62 | ```
 63 | 
 64 | # Decide on maximum allowed percent mitochondrial reads:
 65 | ```{r}
 66 | ##################################################
 67 | ## ********* USER DEFINED SECTION ***************
 68 | ##################################################
 69 | 
 70 | MAX_PCT_MITO = 10   ## user-defined setting
 71 | 
 72 | plot(sort(pct_mito))
 73 | abline(h=MAX_PCT_MITO, col='red')
 74 | ```
 75 | 
 76 | 
 77 | # cell selection as per Peter Kharchenko - the Pagoda way
 78 | 
 79 | ```{r}
 80 | df = data.frame(reads_per_cell=reads_per_cell, genes_per_cell=genes_per_cell)
 81 | head(df)
 82 | ```
 83 | 
 84 | 
 85 | # Plot gene_per_cell vs. reads_per_cell, define outliers
 86 | 
 87 | ```{r}
 88 | library(MASS)
 89 | df = df[order(df$reads_per_cell),] # order by reads_per_cell
 90 | plot(df, log='xy')
 91 | m <- rlm(genes_per_cell~reads_per_cell,data=df) # robust linear model, not sensitive to outliers
 92 | p.level = 1e-3
 93 | # predict genes_per_cell based on observed reads_per_cell
 94 | suppressWarnings(pb <- data.frame(predict(m, interval='prediction', 
 95 |                                           level = 1-p.level, # coverage of the prediction interval
 96 |                                           type="response")))
 97 | polygon(c(df$reads_per_cell, rev(df$reads_per_cell)),
 98 |         c(pb$lwr, rev(pb$upr)), col=adjustcolor(2,alpha=0.1), border = NA)
 99 | 
100 | # identify outliers as cells whose observed genes_per_cell falls outside the prediction interval
101 | outliers <- rownames(df)[df$genes_per_cell > pb$upr | df$genes_per_cell < pb$lwr];
102 | points(df[outliers,],col=2,cex=0.6)
103 | ```
104 | 
105 | # Before pruning cells, let's make a backup copy of the original matrix:
106 | ```{r}
107 | data.prefiltered = data
108 | ```
109 | 
110 | # Now, let's do some pruning to remove 'bad' cells
111 | ```{r}
112 | filtered_data = data.prefiltered # just in case we re-run this block using different thresholds.
113 | 
114 | ###############################################################
115 | # prune genes, require a gene to be expressed in at least 3 cells
116 | 
117 | filtered_data.prefiltered = filtered_data  # snapshot taken before gene pruning
118 | filtered_data = filtered_data[cells_per_gene >= 3,]  ## user can change this if needed.
119 | 
120 | ###############################################################
121 | # prune cells
122 | valid_cells = colnames(filtered_data) # all cells
123 | message('starting with: ', length(valid_cells), ' cells') # number starting with
124 | 
125 | ## remove cells based on gene count criteria (positional mask: genes_per_cell is in the same column order as valid_cells here):
126 | valid_cells = valid_cells[genes_per_cell >= MIN_GENES_PER_CELL & genes_per_cell <= MAX_GENES_PER_CELL]  # set values based on your evaluation above
127 | message('after filtering low and high gene count outliers: ', length(valid_cells), ' cells') # number after filtering based on gene count thresholds
128 | 
129 | ## remove cells having excessive mito read content
130 | valid_cells = valid_cells[valid_cells %in% names(pct_mito)[pct_mito <= MAX_PCT_MITO]]
131 | message('after removing high-mito cells: ', length(valid_cells), ' cells') # number remaining after high-mito cells removed
132 | 
133 | ## remove cells identified as outliers via the Kharchenko method
134 | valid_cells = valid_cells[ ! valid_cells %in% outliers]
135 | message('after removing final outliers: ', length(valid_cells), ' cells') # number surviving outlier detection
136 | 
137 | ## update the count matrix to contain only the valid cells
138 | filtered_data = filtered_data[,valid_cells]
139 | 
140 | write.table(filtered_data, file="filtered_data.counts.matrix", quote=FALSE, sep="\t")
141 | ```
142 | 
143 | 
144 | 
145 | 


--------------------------------------------------------------------------------
/WDL/infercnv.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | workflow infercnv {  # forwards every input to run_infercnv and re-exports its outputs
 4 |     input {
 5 |         File raw_counts_matrix # the matrix of genes (rows) vs. cells (columns)
 6 |         File gene_order_file # data file containing the positions of each gene along each chromosome in the genome
 7 |         File annotations_file # a description of the cells, indicating the cell type classifications.
 8 |         String additional_args = "" # appended verbatim to the inferCNV.R command line
 9 |         Int cpu = 1 # also passed to inferCNV.R as --num_threads
10 |         String memory = "12G"
11 |         String docker = "trinityctat/infercnv:1.11.1"
12 |         Int preemptible = 2
13 |         Int extra_disk_space = 10 # added to twice the counts-matrix size (GB) when sizing the disk
14 |     }
15 | 
16 |     call run_infercnv {
17 |         input:
18 |             raw_counts_matrix = raw_counts_matrix,
19 |             gene_order_file = gene_order_file,
20 |             annotations_file = annotations_file,
21 |             additional_args = additional_args,
22 |             cpu = cpu,
23 |             memory = memory,
24 |             extra_disk_space = extra_disk_space,
25 |             docker = docker,
26 |             preemptible = preemptible
27 |     }
28 | 
29 |     output {
30 |         Array[File] infercnv_figures = run_infercnv.infercnv_figures # the task's PNG glob, not its text outputs
31 |         Array[File] infercnv_outputs = run_infercnv.infercnv_outputs
32 |         File infercnv_full_outputs = run_infercnv.infercnv_full_outputs
33 |     }
34 | }
35 | 
36 | task run_infercnv {  # runs inferCNV.R into ./infercnv, then tars that directory
37 |     input {
38 |         File raw_counts_matrix
39 |         File gene_order_file
40 |         File annotations_file
41 |         String memory
42 |         Int cpu
43 |         String docker
44 |         Int preemptible
45 |         String additional_args
46 |         Int extra_disk_space
47 |     }
48 | 
49 |     command {
50 |         set -e
51 | 
52 |         mkdir infercnv
53 | 
54 |         inferCNV.R \
55 |         --raw_counts_matrix ${raw_counts_matrix} \
56 |         --annotations_file ${annotations_file} \
57 |         --gene_order_file ${gene_order_file} \
58 |         --num_threads ${cpu} \
59 |         --out_dir infercnv \
60 |         ${additional_args}
61 | 
62 |         tar -cvzf infercnv_full_outputs.tar.gz infercnv
63 |     }
64 | 
65 |     output {
66 |         File infercnv_full_outputs = "infercnv_full_outputs.tar.gz" # tarball of the whole output directory
67 |         Array[File] infercnv_figures = glob("infercnv/*.png") # every PNG written to the output directory
68 |         Array[File] infercnv_outputs = glob("infercnv/infercnv.*.txt infercnv/top_*.txt infercnv/*pred_cnv_*.dat") # NOTE(review): three space-separated patterns in one glob() call; presumably relies on the backend expanding them in a shell - confirm for the engine in use
69 |     }
70 | 
71 |     runtime {
72 |         docker: docker
73 |         memory: memory
74 |         bootDiskSizeGb: 12
75 |         disks: "local-disk " + ceil(size(raw_counts_matrix, "GB")*2 + extra_disk_space) + " HDD" # twice the counts-matrix size plus extra_disk_space, rounded up
76 |         cpu: cpu
77 |         preemptible: preemptible
78 |     }
79 | }
80 | 
81 | 


--------------------------------------------------------------------------------
/data/HMM_states.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/HMM_states.rda


--------------------------------------------------------------------------------
/data/infercnv_annots_example.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/infercnv_annots_example.rda


--------------------------------------------------------------------------------
/data/infercnv_data_example.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/infercnv_data_example.rda


--------------------------------------------------------------------------------
/data/infercnv_genes_example.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/infercnv_genes_example.rda


--------------------------------------------------------------------------------
/data/infercnv_object_example.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/infercnv_object_example.rda


--------------------------------------------------------------------------------
/data/mcmc_obj.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/mcmc_obj.rda


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Docker file for inferCNV, built on the Bioconductor devel image
 2 | FROM bioconductor/bioconductor_docker:devel
 3 | 
 4 | LABEL org.label-schema.license="BSD-3-Clause" \
 5 |       org.label-schema.vendor="Broad Institute" \
 6 |       maintainer="Christophe Georgescu <cgeorges@broadinstitute.org>"
 7 | 
 8 | RUN apt-get update && apt-get -y install curl libssl-dev libcurl4-openssl-dev \
 9 |                                         libxml2-dev git python3 jags \
10 |                                         r-cran-rjags time && \
11 |                       apt-get clean && rm -rf /var/tmp/* \
12 |                                           /tmp/* /var/lib/apt/lists/*
13 | 
14 | # Set a default CRAN mirror, then install infercnv from Bioconductor plus extra CRAN packages
15 | RUN echo "options(repos = c(CRAN = 'https://cran.rstudio.com'))" >.Rprofile
16 | RUN R -e "BiocManager::install('infercnv')"
17 | #RUN R -e "install.packages(c('cluster', 'Seurat', 'parallelDist', 'optparse'), repos = 'http://cran.us.r-project.org')"
18 | RUN R -e "install.packages(c('cluster', 'Seurat', 'optparse', 'igraph', 'reshape2'), repos = 'http://cran.us.r-project.org')"
19 | #RUN R -e "install.packages('phyclust', repos = 'http://cran.us.r-project.org')"
20 | 
21 | # RUN pip3 install numpy igraph pandas leidenalg
22 | # ENV RETICULATE_PYTHON=/usr/bin/python3
23 | 
24 | # Clone infercnv and install it from source at a pinned commit
25 | # (update the hash below to the version bump commit when releasing)
26 | RUN git clone https://github.com/broadinstitute/infercnv && cd infercnv && \
27 |       git checkout master && git checkout 1b46b48303bac4a882bcb758e78fcf7f832fdefb && \
28 |       R CMD INSTALL . 
29 | 
30 | ENV PATH=${PATH}:/infercnv/scripts
31 | 
32 | CMD inferCNV.R --help
33 | 
34 | 


--------------------------------------------------------------------------------
/example/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | # No target here creates a file of its own name, so declare them all phony.
 4 | .PHONY: all clean debug i3 Bayes
 5 | all:
 6 | 	./run.R
 7 | 
 8 | clean:
 9 | 	rm -rf ./output_dir
10 | 
11 | debug:
12 | 	../scripts/plot_hspike.by_num_cells.R --infercnv_obj output_dir/run.final.infercnv_obj
13 | 	../scripts/plot_hspike.R --infercnv_obj output_dir/run.final.infercnv_obj
14 | 	../scripts/run_HMM_on_hspike.R --infercnv_obj output_dir/run.final.infercnv_obj
15 | 	../scripts/plot_hspike_vs_sample_chrs.R --infercnv_obj output_dir/preliminary.infercnv_obj
16 | 
17 | 
18 | 
19 | # i3 / Bayes: derive a variant of run.R by sed-rewriting its HMM=TRUE argument, then run it.
20 | i3:
21 | 	cat run.R  | sed s/HMM=TRUE/HMM=TRUE,HMM_type=\'i3\'/ > run.i3.R
22 | 	Rscript ./run.i3.R
23 | 
24 | Bayes:
25 | 	cat run.R  | sed s/HMM=TRUE/HMM=TRUE,BayesMaxPNormal=0.35/ > run.Bayes.R
26 | 	Rscript ./run.Bayes.R
27 | 
28 | 


--------------------------------------------------------------------------------
/example/README.txt:
--------------------------------------------------------------------------------
1 | This example uses an abridged version of the gencode annotations. You do not want to use that file with your own data. It's abridged here only to reduce space in R packaging.
2 | 
3 | The complete gencode annotation file can be found here:
4 | https://github.com/broadinstitute/inferCNV_examples/tree/master/__gene_position_data
5 | 
6 | 


--------------------------------------------------------------------------------
/example/__alt_exec_modes/run.no_spike.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | library("infercnv")
 4 | # Example: run infercnv with include.spike turned off.
 5 | # create the infercnv object
 6 | infercnv_obj = CreateInfercnvObject(raw_counts_matrix="../oligodendroglioma_expression_downsampled.counts.matrix",
 7 |                                     annotations_file="../oligodendroglioma_annotations_downsampled.txt",
 8 |                                     delim="\t",
 9 |                                     gene_order_file="../gencode_downsampled.txt",
10 |                                     ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)"))
11 | 
12 | out_dir="output_dir.no_spike"
13 | # perform infercnv operations to reveal cnv signal
14 | infercnv_obj = infercnv::run(infercnv_obj,
15 |                              cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics
16 |                              out_dir=out_dir, 
17 |                              cluster_by_groups=TRUE, 
18 |                              plot_steps=FALSE,
19 |                              include.spike=FALSE  # used for final scaling to fit range (0,2) centered at 1.
20 |                              )
21 | 
22 | 


--------------------------------------------------------------------------------
/example/__alt_exec_modes/run.set_num_ref_groups.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | library("infercnv")
 4 | # Example: run infercnv with num_ref_groups=4, then draw the final heatmap.
 5 | # create the infercnv object
 6 | infercnv_obj = CreateInfercnvObject(raw_counts_matrix="../oligodendroglioma_expression_downsampled.counts.matrix",
 7 |                                     annotations_file="../oligodendroglioma_annotations_downsampled.txt",
 8 |                                     delim="\t",
 9 |                                     gene_order_file="../gencode_downsampled.txt",
10 |                                     ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)"))
11 | 
12 | 
13 | out_dir="output_dir_ref_grps_4"
14 | 
15 | # perform infercnv operations to reveal cnv signal
16 | infercnv_obj = infercnv::run(infercnv_obj,
17 |                              cutoff=1, 
18 |                              out_dir=out_dir,
19 |                              cluster_by_groups=TRUE, 
20 |                              plot_steps=TRUE,
21 |                              num_ref_groups=4
22 |                              )
23 | 
24 | # generate final plot
25 | plot_cnv(infercnv_obj,
26 |          out_dir=out_dir, 
27 |          cluster_by_groups=TRUE,
28 |          color_safe_pal=FALSE,
29 |          x.center=1,
30 |          x.range=c(0.6,1.4),
31 |          title="inferCNV",
32 |          obs_title="Observations (Cells)",
33 |          ref_title="References (Cells)",
34 |          output_filename="infercnv")
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/example/__alt_exec_modes/run.use_zscores.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | library("infercnv")
 4 | # Example: run infercnv with use_zscores and include.spike switched on.
 5 | # create the infercnv object
 6 | infercnv_obj = CreateInfercnvObject(raw_counts_matrix="../oligodendroglioma_expression_downsampled.counts.matrix",
 7 |                                     annotations_file="../oligodendroglioma_annotations_downsampled.txt",
 8 |                                     delim="\t",
 9 |                                     gene_order_file="../gencode_downsampled.txt",
10 |                                     ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)"))
11 | 
12 | 
13 | out_dir="output_dir_use_zscores"
14 | 
15 | # perform infercnv operations to reveal cnv signal
16 | infercnv_obj = infercnv::run(infercnv_obj,
17 |                              cutoff=1, 
18 |                              out_dir=out_dir,
19 |                              cluster_by_groups=TRUE, 
20 |                              plot_steps=TRUE,
21 |                              use_zscores=TRUE,
22 |                              include.spike=TRUE
23 |                              )
24 | 
25 | 


--------------------------------------------------------------------------------
/example/run.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | options(error = function() traceback(2))  # on error, print the call stack
 4 | 
 5 | library("infercnv")
 6 | 
 7 | # create the infercnv object from the example inputs under the installed package's extdata directory
 8 | infercnv_obj = CreateInfercnvObject(raw_counts_matrix=system.file("extdata", "oligodendroglioma_expression_downsampled.counts.matrix.gz", package = "infercnv"),
 9 |                                     annotations_file=system.file("extdata", "oligodendroglioma_annotations_downsampled.txt", package = "infercnv"),
10 |                                     delim="\t",
11 |                                     gene_order_file=system.file("extdata", "gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt", package = "infercnv"),
12 |                                     ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)"))
13 | 
14 | out_dir="output_dir"  # same directory the example Makefile's clean target removes
15 | # perform infercnv operations to reveal cnv signal
16 | infercnv_obj = infercnv::run(infercnv_obj,
17 |                              cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics
18 |                              out_dir=out_dir, 
19 |                              cluster_by_groups=TRUE, 
20 |                              analysis_mode="subclusters",    
21 |                              plot_steps=FALSE,
22 |                              denoise=TRUE,
23 |                              sd_amplifier=2,
24 |                              HMM=TRUE  # NOTE: the example Makefile (i3 / Bayes targets) sed-rewrites this exact argument text; keep its spelling unchanged
25 |                              )
26 | 
27 | 


--------------------------------------------------------------------------------
/example/run_memory_profiling_per_step.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | mkdir -p profiling  # the redirections below fail if this output directory is missing
4 | for i in $(seq 1 21); do  # profile runs stopping at each of steps 1..21, two repeats per step
5 |     gtime -v Rscript run_test.R "$i" > "profiling/up_to_step_${i}_1.log" 2> "profiling/up_to_step_${i}_1.times"
6 |     gtime -v Rscript run_test.R "$i" > "profiling/up_to_step_${i}_2.log" 2> "profiling/up_to_step_${i}_2.times"
7 | done


--------------------------------------------------------------------------------
/example/run_test.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # usage: Rscript run_test.R <up_to_step>
 4 | args = commandArgs(trailingOnly=TRUE)
 5 | if (length(args) != 1) {
 6 |     stop("usage: Rscript run_test.R <up_to_step>")
 7 | }
 8 | up_to_step = suppressWarnings(as.numeric(args[1]))  # parse once; NA signals a non-numeric argument
 9 | if (is.na(up_to_step)) stop("up_to_step must be numeric, got: ", args[1])
10 | print(up_to_step)  # the original echoed the parsed value to stdout; keep that
11 | options(error = function() traceback(2))
12 | 
13 | library("infercnv")
14 | 
15 | # create the infercnv object
16 | # infercnv_obj = CreateInfercnvObject(raw_counts_matrix=system.file("extdata", "oligodendroglioma_expression_downsampled.counts.matrix.gz", package = "infercnv"),
17 | #                                     annotations_file=system.file("extdata", "oligodendroglioma_annotations_downsampled.txt", package = "infercnv"),
18 | #                                     delim="\t",
19 | #                                     gene_order_file=system.file("extdata", "gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt", package = "infercnv"),
20 | #                                     ref_group_names=NULL) 
21 | 
22 | infercnv_obj <- readRDS("default_input_infercnv_object.rds")
23 | 
24 | out_dir="output_dir_memory_test"
25 | # perform infercnv operations to reveal cnv signal
26 | infercnv_obj = infercnv::run(infercnv_obj,
27 |                              cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics
28 |                              out_dir=out_dir, 
29 |                              cluster_by_groups=TRUE,
30 |                              plot_steps=FALSE,
31 |                              no_plot=TRUE,
32 |                              denoise=TRUE,
33 |                              debug=TRUE,
34 |                              HMM=TRUE,
35 |                              up_to_step=up_to_step)
36 | 
37 | 


--------------------------------------------------------------------------------
/inst/BUGS_Mixture_Model:
--------------------------------------------------------------------------------
 1 | model {
 2 |     ## Likelihood
 3 |     ## Single cell
 4 |     for( j in 1:C ) {                                   # for each cell j (1..C)
 5 |         for ( i in 1:G ) {                             # for each gene i (1..G)
 6 |             ## Likelihood 
 7 |             ## generative distribution of the observed data 
 8 |             gexp[i, j] ~ dnorm(mu.1[j], tau.1[j])      # NOTE(review): dnorm's second argument is a precision in BUGS/JAGS; sig[] is presumably supplied as precisions - confirm
 9 |         }
10 |         
11 |         ## mu and tau are Fixed Effects dependent on the cluster/state assignment 
12 |         ## result in cluster specific means and precision
13 |         ## gamma is group specific Random Effect (NOTE(review): no gamma node is defined in this model)
14 |         
15 |         mu.1[j] <-  mu[1] * (equals(epsilon[j], 1 )) +
16 |                     mu[2] * (equals(epsilon[j], 2 )) +
17 |                     mu[3] * (equals(epsilon[j], 3 )) +
18 |                     mu[4] * (equals(epsilon[j], 4 )) +
19 |                     mu[5] * (equals(epsilon[j], 5 )) +
20 |                     mu[6] * (equals(epsilon[j], 6 )) 
21 | 
22 |         tau.1[j] <- sig[1] * (equals( epsilon[j], 1 )) +
23 |                     sig[2] * (equals( epsilon[j], 2 )) +
24 |                     sig[3] * (equals( epsilon[j], 3 )) +
25 |                     sig[4] * (equals( epsilon[j], 4 )) +
26 |                     sig[5] * (equals( epsilon[j], 5 )) +
27 |                     sig[6] * (equals( epsilon[j], 6 ))
28 |         
29 |     # PRIOR
30 |     ##     Epsilons hold our cluster/state assignment 
31 |     ##     theta are the mixture probabilities for states 
32 |     ##      cell specific 
33 |      
34 |         epsilon[j] ~ dcat(theta[])    
35 |     }
36 | 
37 | 
38 |     # HYPERPARAMETERS 
39 |     ## hyperparameter for gamma, a flat gamma distribution (sigma is not referenced elsewhere in this model)
40 |     sigma ~ dgamma(1,1)
41 | 
42 |     #      Dirichlet with equal probabilities for each state, equals to a uniform 
43 |     #      provides the probabilities distribution of states 
44 |     #      alpha can be 1 or (1/number of states)
45 | 
46 |    
47 |     ## Hyperparameter for epsilon, 
48 |     ## This is the mixing property!
49 | 
50 |     theta[1:6] ~ ddirich(alpha[])
51 | 
52 |     # HYPERHYPERPARAMETER
53 | 
54 |     for(i in 1:6){
55 |         alpha[i] <- 1
56 |     }
57 | }


--------------------------------------------------------------------------------
/inst/BUGS_Mixture_Model_i3:
--------------------------------------------------------------------------------
 1 | model {
 2 |     ## Likelihood
 3 |     ## Single cell
 4 |     for( j in 1:C ) {                                   # for each cell j (1..C)
 5 |         for ( i in 1:G ) {                             # for each gene i (1..G)
 6 |             ## Likelihood 
 7 |             ## generative distribution of the observed data 
 8 |             gexp[i, j] ~ dnorm(mu.1[j], tau.1[j])      # NOTE(review): dnorm's second argument is a precision in BUGS/JAGS; sig[] is presumably supplied as precisions - confirm
 9 |         }
10 |         
11 |         ## mu and tau are Fixed Effects dependent on the cluster/state assignment 
12 |         ## result in cluster specific means and precision
13 |         ## gamma is group specific Random Effect (NOTE(review): no gamma node is defined in this model)
14 |         
15 |         mu.1[j] <-  mu[1] * (equals(epsilon[j], 1 )) +
16 |                     mu[2] * (equals(epsilon[j], 2 )) +
17 |                     mu[3] * (equals(epsilon[j], 3 ))
18 | 
19 |         tau.1[j] <- sig[1] * (equals( epsilon[j], 1 )) +
20 |                     sig[2] * (equals( epsilon[j], 2 )) +
21 |                     sig[3] * (equals( epsilon[j], 3 ))
22 |         
23 |     # PRIOR
24 |     ##     Epsilons hold our cluster/state assignment 
25 |     ##     theta are the mixture probabilities for states 
26 |     ##      cell specific 
27 |      
28 |         epsilon[j] ~ dcat(theta[])    
29 |     }
30 | 
31 | 
32 |     # HYPERPARAMETERS 
33 |     ## hyperparameter for gamma, a flat gamma distribution (sigma is not referenced elsewhere in this model)
34 |     sigma ~ dgamma(1,1)
35 | 
36 |     #      Dirichlet with equal probabilities for each state, equals to a uniform 
37 |     #      provides the probabilities distribution of states 
38 |     #      alpha can be 1 or (1/number of states)
39 | 
40 |    
41 |     ## Hyperparameter for epsilon, 
42 |     ## This is the mixing property!
43 | 
44 |     theta[1:3] ~ ddirich(alpha[])
45 | 
46 |     # HYPERHYPERPARAMETER
47 | 
48 |     for(i in 1:3){
49 |         alpha[i] <- 1
50 |     }
51 | }


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite the inferCNV package in publications, please use:")
 2 | 
 3 | # Manual-type entry; bibentry() and c() replace the old-style citEntry()/personList().
 4 | bibentry(bibtype = "Manual",
 5 | 	title = "inferCNV of the Trinity CTAT Project.",
 6 | 	author = c(
 7 | 		person("Timothy", "Tickle", email = "ttickle@broadinstitute.org", role = c("aut", "cre")),
 8 | 		person("Itay", "Tirosh", email = "tirosh@broadinstitute.org", role = "aut"),
 9 | 		person("Christophe", "Georgescu", email = "cgeorges@broadinstitute.org", role = "aut"),
10 | 		person("Maxwell", "Brown", email = "mbrown@broadinstitute.org", role = "aut"),
11 | 		person("Brian", "Haas", email = "bhaas@broadinstitute.org", role = "aut")
12 | 	),
13 | 	organization = "Klarman Cell Observatory, Broad Institute of MIT and Harvard",
14 | 	address = "Cambridge, MA, USA",
15 | 	year = 2019,
16 | 	url = "https://github.com/broadinstitute/inferCNV",
17 | 	textVersion = "inferCNV of the Trinity CTAT Project. https://github.com/broadinstitute/inferCNV"
18 | )
19 | #citEntry(entry = "article",
20 | #	title = "",
21 | #	author = personList(
22 | #		person(),
23 | #		person()
24 | #	),
25 | #	journal = "",
26 | #	year = "2018",
27 | #	volume = "",
28 | #	pages = "",
29 | #	textVersion = paste("authors", "title", "journal")
30 | #)
31 | 
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/inst/extdata/oligodendroglioma_annotations_downsampled.txt:
--------------------------------------------------------------------------------
  1 | MGH54_P2_C12	Microglia/Macrophage
  2 | MGH36_P6_F03	Microglia/Macrophage
  3 | MGH53_P4_H08	Microglia/Macrophage
  4 | MGH53_P2_E09	Microglia/Macrophage
  5 | MGH36_P5_E12	Microglia/Macrophage
  6 | MGH54_P2_H07	Microglia/Macrophage
  7 | MGH36_P4_H06	Microglia/Macrophage
  8 | MGH53_P1_C01	Microglia/Macrophage
  9 | MGH36_P4_A10	Microglia/Macrophage
 10 | MGH36_P3_D10	Microglia/Macrophage
 11 | MGH54_P2_F09	Microglia/Macrophage
 12 | MGH36_P7_H06	Microglia/Macrophage
 13 | MGH54_P2_H03	Microglia/Macrophage
 14 | MGH36_P8_A02	Microglia/Macrophage
 15 | MGH53_P2_C08	Microglia/Macrophage
 16 | MGH53_P2_A07	Microglia/Macrophage
 17 | MGH53_P1_F10	Microglia/Macrophage
 18 | MGH36_P3_H06	Microglia/Macrophage
 19 | MGH54_P2_F03	Microglia/Macrophage
 20 | MGH54_P16_F12	Oligodendrocytes (non-malignant)
 21 | MGH54_P12_C10	Oligodendrocytes (non-malignant)
 22 | MGH54_P11_C11	Oligodendrocytes (non-malignant)
 23 | MGH54_P15_D06	Oligodendrocytes (non-malignant)
 24 | MGH54_P16_A03	Oligodendrocytes (non-malignant)
 25 | MGH53_P7_B09	Oligodendrocytes (non-malignant)
 26 | MGH54_P10_G04	Oligodendrocytes (non-malignant)
 27 | MGH53_P2_A02	Oligodendrocytes (non-malignant)
 28 | MGH53_P7_F07	Oligodendrocytes (non-malignant)
 29 | MGH53_P5_G02	Oligodendrocytes (non-malignant)
 30 | MGH53_P11_H03	Oligodendrocytes (non-malignant)
 31 | MGH53_P1_A10	Oligodendrocytes (non-malignant)
 32 | MGH53_P5_H09	Oligodendrocytes (non-malignant)
 33 | MGH53_P11_E03	Oligodendrocytes (non-malignant)
 34 | MGH53_P10_F11	Oligodendrocytes (non-malignant)
 35 | MGH53_P1_D07	Oligodendrocytes (non-malignant)
 36 | MGH53_P2_G04	Oligodendrocytes (non-malignant)
 37 | MGH53_P2_G09	Oligodendrocytes (non-malignant)
 38 | MGH53_P5_F04	Oligodendrocytes (non-malignant)
 39 | MGH53_P11_F08	Oligodendrocytes (non-malignant)
 40 | MGH53_P8_F03	Oligodendrocytes (non-malignant)
 41 | MGH53_P6_B11	Oligodendrocytes (non-malignant)
 42 | MGH53_P6_H06	Oligodendrocytes (non-malignant)
 43 | MGH36_P1_B02	malignant_MGH36
 44 | MGH36_P1_H10	malignant_MGH36
 45 | MGH36_P3_A09	malignant_MGH36
 46 | MGH36_P3_B02	malignant_MGH36
 47 | MGH36_P3_C04	malignant_MGH36
 48 | MGH36_P3_E06	malignant_MGH36
 49 | MGH36_P4_B09	malignant_MGH36
 50 | MGH36_P4_D11	malignant_MGH36
 51 | MGH36_P4_G03	malignant_MGH36
 52 | MGH36_P6_C04	malignant_MGH36
 53 | MGH36_P6_G08	malignant_MGH36
 54 | MGH36_P7_B04	malignant_MGH36
 55 | MGH36_P7_D03	malignant_MGH36
 56 | MGH36_P7_F04	malignant_MGH36
 57 | MGH36_P7_G04	malignant_MGH36
 58 | MGH36_P5_B08	malignant_MGH36
 59 | MGH36_P5_F05	malignant_MGH36
 60 | MGH36_P5_F11	malignant_MGH36
 61 | MGH36_P5_H05	malignant_MGH36
 62 | MGH36_P10_B08	malignant_MGH36
 63 | MGH36_P10_C10	malignant_MGH36
 64 | MGH36_P10_E07	malignant_MGH36
 65 | MGH36_P10_F09	malignant_MGH36
 66 | MGH36_P8_E05	malignant_MGH36
 67 | MGH36_P8_H09	malignant_MGH36
 68 | MGH36_P9_B01	malignant_MGH36
 69 | MGH36_P9_B11	malignant_MGH36
 70 | MGH36_P9_H03	malignant_MGH36
 71 | MGH36_P2_A08	malignant_MGH36
 72 | MGH36_P2_C02	malignant_MGH36
 73 | MGH36_P2_G01	malignant_MGH36
 74 | MGH36_P2_G02	malignant_MGH36
 75 | MGH36_P2_H06	malignant_MGH36
 76 | MGH53_P5_A08	malignant_MGH53
 77 | MGH53_P5_D02	malignant_MGH53
 78 | MGH53_P6_F03	malignant_MGH53
 79 | MGH53_P6_H04	malignant_MGH53
 80 | MGH53_P7_B10	malignant_MGH53
 81 | MGH53_P7_C03	malignant_MGH53
 82 | MGH53_P7_E02	malignant_MGH53
 83 | MGH53_P7_G11	malignant_MGH53
 84 | MGH53_P7_H03	malignant_MGH53
 85 | MGH53_P8_A07	malignant_MGH53
 86 | MGH53_P8_C11	malignant_MGH53
 87 | MGH53_P8_E05	malignant_MGH53
 88 | MGH53_P8_E10	malignant_MGH53
 89 | MGH53_P8_H04	malignant_MGH53
 90 | MGH53_P1_B04	malignant_MGH53
 91 | MGH53_P12_A01	malignant_MGH53
 92 | MGH53_P12_B09	malignant_MGH53
 93 | MGH53_P12_C02	malignant_MGH53
 94 | MGH53_P12_C09	malignant_MGH53
 95 | MGH53_P12_D12	malignant_MGH53
 96 | MGH53_P12_E03	malignant_MGH53
 97 | MGH53_P10_B02	malignant_MGH53
 98 | MGH53_P10_C09	malignant_MGH53
 99 | MGH53_P10_E09	malignant_MGH53
100 | MGH53_P10_H08	malignant_MGH53
101 | MGH53_P11_A03	malignant_MGH53
102 | MGH53_P11_B02	malignant_MGH53
103 | MGH53_P11_B11	malignant_MGH53
104 | MGH53_P11_F12	malignant_MGH53
105 | MGH53_P11_H12	malignant_MGH53
106 | MGH53_P9_A09	malignant_MGH53
107 | MGH53_P9_C12	malignant_MGH53
108 | MGH53_P4_C03	malignant_MGH53
109 | MGH53_P4_F01	malignant_MGH53
110 | 97_P3_G07	malignant_97
111 | 97_P3_E04	malignant_97
112 | 97_P3_D10	malignant_97
113 | 97_P3_E01	malignant_97
114 | 97_P3_E03	malignant_97
115 | 97_P3_B10	malignant_97
116 | 97_P3_B04	malignant_97
117 | 97_P3_B01	malignant_97
118 | 97_P3_B03	malignant_97
119 | 97_P3_D01	malignant_97
120 | 97_P3_D04	malignant_97
121 | 97_P3_D12	malignant_97
122 | 97_P3_F12	malignant_97
123 | 97_P3_E12	malignant_97
124 | 97_P5_D09	malignant_97
125 | 97_P6_H01	malignant_97
126 | 97_P5_C10	malignant_97
127 | 97_P6_E07	malignant_97
128 | 97_P5_D02	malignant_97
129 | 97_P6_G10	malignant_97
130 | 97_P5_G05	malignant_97
131 | 97_P6_B09	malignant_97
132 | 97_P5_H08	malignant_97
133 | 97_P5_F04	malignant_97
134 | 97_P5_D01	malignant_97
135 | 97_P6_F05	malignant_97
136 | 97_P6_A06	malignant_97
137 | 97_P5_A07	malignant_97
138 | 97_P6_E01	malignant_97
139 | 97_P6_D09	malignant_97
140 | 97_P5_G06	malignant_97
141 | 97_P5_E12	malignant_97
142 | 97_P6_A07	malignant_97
143 | 97_P6_G12	malignant_97
144 | 97_P6_H06	malignant_97
145 | 93_P3_B02	malignant_93
146 | 93_P3_G05	malignant_93
147 | 93_P3_H04	malignant_93
148 | 93_P3_A10	malignant_93
149 | 93_P3_C04	malignant_93
150 | 93_P3_D07	malignant_93
151 | 93_P3_G07	malignant_93
152 | 93_P3_E09	malignant_93
153 | 93_P3_G11	malignant_93
154 | 93_P3_A11	malignant_93
155 | 93_P6_H11	malignant_93
156 | 93_P5_H06	malignant_93
157 | 93_P5_C12	malignant_93
158 | 93_P6_A02	malignant_93
159 | 93_P5_D07	malignant_93
160 | 93_P6_C07	malignant_93
161 | 93_P9_C04	malignant_93
162 | 93_P9_E04	malignant_93
163 | 93_P9_H01	malignant_93
164 | 93_P8_B06	malignant_93
165 | 93_P10_E05	malignant_93
166 | 93_P9_B10	malignant_93
167 | 93_P8_G11	malignant_93
168 | 93_P9_F02	malignant_93
169 | 93_P10_F03	malignant_93
170 | 93_P9_G11	malignant_93
171 | 93_P8_E09	malignant_93
172 | 93_P8_C11	malignant_93
173 | 93_P9_A03	malignant_93
174 | 93_P10_G11	malignant_93
175 | 93_P9_B11	malignant_93
176 | 93_P9_D06	malignant_93
177 | 93_P8_B02	malignant_93
178 | 93_P8_C09	malignant_93
179 | 93_P9_H03	malignant_93
180 | 93_P10_D04	malignant_93
181 | 93_P8_G09	malignant_93
182 | 93_P10_B10	malignant_93
183 | 93_P9_C07	malignant_93
184 | 93_P8_A12	malignant_93
185 | 


--------------------------------------------------------------------------------
/inst/extdata/oligodendroglioma_expression_downsampled.counts.matrix.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/inst/extdata/oligodendroglioma_expression_downsampled.counts.matrix.gz


--------------------------------------------------------------------------------
/inst/script/README.txt:
--------------------------------------------------------------------------------
1 | This example uses an abridged version of the gencode annotations. You do not want to use that file with your own data. It's abridged here only to reduce space in R packaging.
2 | 
3 | The complete gencode annotation file can be found here:
4 | https://github.com/broadinstitute/inferCNV_examples/tree/master/__gene_position_data
5 | 
6 | 


--------------------------------------------------------------------------------
/man/CreateInfercnvObject.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/inferCNV.R
 3 | \name{CreateInfercnvObject}
 4 | \alias{CreateInfercnvObject}
 5 | \title{CreateInfercnvObject}
 6 | \usage{
 7 | CreateInfercnvObject(
 8 |   raw_counts_matrix,
 9 |   gene_order_file,
10 |   annotations_file,
11 |   ref_group_names,
12 |   delim = "\\t",
13 |   max_cells_per_group = NULL,
14 |   min_max_counts_per_cell = c(100, +Inf),
15 |   chr_exclude = c("chrX", "chrY", "chrM")
16 | )
17 | }
18 | \arguments{
19 | \item{raw_counts_matrix}{the matrix of genes (rows) vs. cells (columns) containing the raw counts
20 | If a filename is given, it'll be read via read.table()
21 | otherwise, if matrix or Matrix, will use the data directly.}
22 | 
23 | \item{gene_order_file}{data file containing the positions of each gene along each chromosome in the genome.}
24 | 
25 | \item{annotations_file}{a description of the cells, indicating the cell type classifications}
26 | 
27 | \item{ref_group_names}{a vector containing the classifications of the reference (normal) cells to use for inferring CNV}
28 | 
29 | \item{delim}{delimiter used in the input files}
30 | 
31 | \item{max_cells_per_group}{maximum number of cells to use per group. Default=NULL, using all cells defined in the annotations_file. This option is useful for randomly subsetting the existing data for a quicker preview run, such as using 50 cells per group instead of hundreds.}
32 | 
33 | \item{min_max_counts_per_cell}{minimum and maximum counts allowed per cell. Any cells outside this range will be removed from the counts matrix. default=(100, +Inf) and uses all cells. If used, should be set as c(min_counts, max_counts)}
34 | 
35 | \item{chr_exclude}{list of chromosomes in the reference genome annotations that should be excluded from analysis.  Default = c('chrX', 'chrY', 'chrM')}
36 | }
37 | \value{
38 | infercnv
39 | }
40 | \description{
41 | Creation of an infercnv object. This requires the following inputs: raw_counts_matrix, gene_order_file, annotations_file, and ref_group_names.
42 | A more detailed description of each input is provided below:
43 | 
44 | The raw_counts_matrix:
45 | 
46 |           MGH54_P16_F12 MGH53_P5_C12 MGH54_P12_C10 MGH54_P16_F02 MGH54_P11_C11  ...
47 | DDX11L1     0.0000000     0.000000      0.000000      0.000000     0.0000000
48 | WASH7P      0.0000000     2.231939      7.186235      5.284944     0.9650009
49 | FAM138A     0.1709991     0.000000      0.000000      0.000000     0.0000000
50 | OR4F5       0.0000000     0.000000      0.000000      0.000000     0.0000000
51 | OR4F29      0.0000000     0.000000      0.000000      0.000000     0.0000000
52 | ...
53 | 
54 | The gene_order_file, contains chromosome, start, and stop position for each gene, tab-delimited:
55 | 
56 |          chr  start   stop
57 | DDX11L1 chr1  11869  14412
58 | WASH7P  chr1  14363  29806
59 | FAM138A chr1  34554  36081
60 | OR4F5   chr1  69091  70008
61 | OR4F29  chr1 367640 368634
62 | OR4F16  chr1 621059 622053
63 | ...
64 | 
65 | The annotations_file, containing the cell name and the cell type classification, tab-delimited.
66 | 
67 |             V1                   V2
68 | 1 MGH54_P2_C12 Microglia/Macrophage
69 | 2 MGH36_P6_F03 Microglia/Macrophage
70 | 3 MGH53_P4_H08 Microglia/Macrophage
71 | 4 MGH53_P2_E09 Microglia/Macrophage
72 | 5 MGH36_P5_E12 Oligodendrocytes (non-malignant)
73 | 6 MGH54_P2_H07 Oligodendrocytes (non-malignant)
74 | ...
75 | 179  93_P9_H03 malignant
76 | 180 93_P10_D04 malignant
77 | 181  93_P8_G09 malignant
78 | 182 93_P10_B10 malignant
79 | 183  93_P9_C07 malignant
80 | 184  93_P8_A12 malignant
81 | ...
82 | 
83 | 
84 | and the ref_group_names vector might look like so:  c("Microglia/Macrophage","Oligodendrocytes (non-malignant)")
85 | }
86 | \examples{
87 | data(infercnv_data_example)
88 | data(infercnv_annots_example)
89 | data(infercnv_genes_example)
90 | 
91 | infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 
92 |                                                gene_order_file=infercnv_genes_example,
93 |                                                annotations_file=infercnv_annots_example,
94 |                                                ref_group_names=c("normal"))
95 | 
96 | }
97 | 


--------------------------------------------------------------------------------
/man/HMM_states.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{HMM_states}
 5 | \alias{HMM_states}
 6 | \title{infercnv object result of the processing of run() in the HMM example, to be used for other examples.}
 7 | \format{
 8 | An infercnv object containing HMM predictions
 9 | }
10 | \usage{
11 | HMM_states
12 | }
13 | \description{
14 | infercnv object result of the processing of run() in the HMM example, to be used for other examples.
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/MCMC_inferCNV-class.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/inferCNV_BayesNet.R
 3 | \docType{class}
 4 | \name{MCMC_inferCNV-class}
 5 | \alias{MCMC_inferCNV-class}
 6 | \alias{MCMC_inferCNV}
 7 | \title{MCMC_inferCNV class}
 8 | \description{
 9 | Uses Markov Chain Monte Carlo (MCMC) and Gibbs sampling to estimate the posterior
10 | probability of being in one of six Copy Number Variation states (states: 0, 0.5, 1, 1.5, 2, 3) for CNV's identified by
11 | inferCNV's HMM. Posterior probabilities are found for the entire CNV cluster and each individual
12 | cell line in the CNV.
13 | }
14 | \section{Slots}{
15 | 
16 | \describe{
17 | \item{\code{bugs_model}}{BUGS model.}
18 | 
19 | \item{\code{sig}}{fitted values for cell lines, 1/standard deviation to be used for determining the distribution of each cell line}
20 | 
21 | \item{\code{mu}}{Mean values to be used for determining the distribution of each cell line}
22 | 
23 | \item{\code{group_id}}{ID's given to the cell clusters.}
24 | 
25 | \item{\code{cell_gene}}{List containing the Cells and Genes that make up each CNV.}
26 | 
27 | \item{\code{cnv_probabilities}}{Probabilities of each CNV belonging to a particular state from 0 (least likely) to 1 (most likely).}
28 | 
29 | \item{\code{cell_probabilities}}{Probabilities of each cell being in a particular state, from 0 (least likely) to 1 (most likely).}
30 | 
31 | \item{\code{args}}{Input arguments given by the user}
32 | 
33 | \item{\code{cnv_regions}}{ID for each CNV found by the HMM}
34 | }}
35 | 
36 | \keyword{classes}
37 | 


--------------------------------------------------------------------------------
/man/add_to_seurat.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/seurat_interaction.R
 3 | \name{add_to_seurat}
 4 | \alias{add_to_seurat}
 5 | \title{add_to_seurat()}
 6 | \usage{
 7 | add_to_seurat(
 8 |   seurat_obj = NULL,
 9 |   assay_name = "RNA",
10 |   infercnv_output_path,
11 |   top_n = 10,
12 |   bp_tolerance = 2e+06,
13 |   column_prefix = NULL
14 | )
15 | }
16 | \arguments{
17 | \item{seurat_obj}{Seurat object to add meta.data to (default: NULL)}
18 | 
19 | \item{assay_name}{Name of the assay in the Seurat object if provided. (default: "RNA")}
20 | 
21 | \item{infercnv_output_path}{Path to the output folder of the infercnv run to use}
22 | 
23 | \item{top_n}{How many of the largest CNA (in number of genes) to get.}
24 | 
25 | \item{bp_tolerance}{How many bp of tolerance to have around feature start/end positions for top_n largest CNVs.}
26 | 
27 | \item{column_prefix}{String to add as a prefix to the Seurat metadata columns. Only applied to the seurat_obj, if supplied. Default is NULL}
28 | }
29 | \value{
30 | seurat_obj
31 | }
32 | \description{
33 | Add meta.data about CNAs to a Seurat object from an infercnv_obj
34 | }
35 | 


--------------------------------------------------------------------------------
/man/apply_median_filtering.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/noise_reduction.R
 3 | \name{apply_median_filtering}
 4 | \alias{apply_median_filtering}
 5 | \title{apply_median_filtering}
 6 | \usage{
 7 | apply_median_filtering(
 8 |   infercnv_obj,
 9 |   window_size = 7,
10 |   on_observations = TRUE,
11 |   on_references = TRUE
12 | )
13 | }
14 | \arguments{
15 | \item{infercnv_obj}{infercnv_object}
16 | 
17 | \item{window_size}{Size of the window side centered on the data point to filter (default = 7).}
18 | 
19 | \item{on_observations}{boolean (default=TRUE), run on observations data (tumor cells).}
20 | 
21 | \item{on_references}{boolean (default=TRUE), run on references (normal cells).}
22 | }
23 | \value{
24 | infercnv_obj with median filtering applied to observations
25 | }
26 | \description{
27 | Apply a median filtering to the expression matrix within each tumor bounds
28 | }
29 | \examples{
30 | # data(infercnv_data_example)
31 | # data(infercnv_annots_example)
32 | # data(infercnv_genes_example)
33 | 
34 | # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 
35 | #                                                           gene_order_file=infercnv_genes_example,
36 | #                                                           annotations_file=infercnv_annots_example,
37 | #                                                           ref_group_names=c("normal"))
38 | 
39 | # infercnv_object_example <- infercnv::run(infercnv_object_example,
40 | #                                          cutoff=1,
41 | #                                          out_dir=tempfile(), 
42 | #                                          cluster_by_groups=TRUE, 
43 | #                                          denoise=TRUE,
44 | #                                          HMM=FALSE,
45 | #                                          num_threads=2,
46 | #                                          no_plot=TRUE)
47 | 
48 | data(infercnv_object_example)
49 | 
50 | infercnv_object_example <- infercnv::apply_median_filtering(infercnv_object_example)
51 | # plot result object
52 | 
53 | }
54 | 


--------------------------------------------------------------------------------
/man/color.palette.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/inferCNV_ops.R
 3 | \name{color.palette}
 4 | \alias{color.palette}
 5 | \title{Helper function allowing greater control over the steps in a color palette.}
 6 | \usage{
 7 | color.palette(steps, between = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{steps}{Vector of colors to use in the palette}
11 | 
12 | \item{between}{Steps where gradients change}
13 | 
14 | \item{...}{Additional arguments of colorRampPalette}
15 | }
16 | \value{
17 | Color palette
18 | }
19 | \description{
20 | Helper function allowing greater control over the steps in a color palette.
21 |              Source: http://menugget.blogspot.com/2011/11/define-color-steps-for-colorramppalette.html#more
23 | }
24 | \examples{
25 | color.palette(c("darkblue", "white", "darkred"),
26 |               c(2, 2))
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/man/filterHighPNormals.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/inferCNV_BayesNet.R
 3 | \name{filterHighPNormals}
 4 | \alias{filterHighPNormals}
 5 | \title{filterHighPNormals: Filter the HMM identified CNV's by the CNV's posterior probability
 6 | of belonging to a normal state.}
 7 | \usage{
 8 | filterHighPNormals(MCMC_inferCNV_obj, HMM_states, BayesMaxPNormal, useRaster)
 9 | }
10 | \arguments{
11 | \item{MCMC_inferCNV_obj}{MCMC inferCNV object.}
12 | 
13 | \item{HMM_states}{InferCNV object with HMM states in expression data.}
14 | 
15 | \item{BayesMaxPNormal}{Option to filter CNV or cell lines by some probability threshold.}
16 | 
17 | \item{useRaster}{Option to use rasterization when plotting}
18 | }
19 | \value{
20 | Returns a list of (MCMC_inferCNV_obj, HMM_states) with the filtered CNVs removed.
21 | }
22 | \description{
23 | The following function will filter the HMM identified CNV's by the CNV's posterior
24 | probability of belonging to a normal state identified by the function inferCNVBayesNet(). Will filter
25 | CNV's based on a user desired threshold probability. Any CNV with a probability of being normal above
26 | the threshold will be removed.
27 | }
28 | \examples{
29 | data(mcmc_obj)
30 | 
31 | mcmc_obj_hmm_states_list <- infercnv::filterHighPNormals( MCMC_inferCNV_obj = mcmc_obj, 
32 |                                           HMM_states        = HMM_states, 
33 |                                           BayesMaxPNormal   = 0.5)
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/man/inferCNVBayesNet.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/inferCNV_BayesNet.R
  3 | \name{inferCNVBayesNet}
  4 | \alias{inferCNVBayesNet}
  5 | \title{inferCNVBayesNet: Run Bayesian Network Mixture Model To Obtain Posterior Probabilities For HMM Predicted States}
  6 | \usage{
  7 | inferCNVBayesNet(
  8 |   file_dir,
  9 |   infercnv_obj,
 10 |   HMM_states,
 11 |   out_dir,
 12 |   resume_file_token,
 13 |   model_file = NULL,
 14 |   CORES = 1,
 15 |   postMcmcMethod = NULL,
 16 |   plotingProbs = TRUE,
 17 |   quietly = TRUE,
 18 |   diagnostics = FALSE,
 19 |   HMM_type = HMM_type,
 20 |   k_obs_groups = k_obs_groups,
 21 |   cluster_by_groups = cluster_by_groups,
 22 |   reassignCNVs = TRUE,
 23 |   no_plot = no_plot,
 24 |   useRaster
 25 | )
 26 | }
 27 | \arguments{
 28 | \item{file_dir}{Location of the directory of the inferCNV outputs.}
 29 | 
 30 | \item{infercnv_obj}{InferCNV object.}
 31 | 
 32 | \item{HMM_states}{InferCNV object with HMM states in expression data.}
 33 | 
 34 | \item{out_dir}{(string) Path to where the output file should be saved to.}
 35 | 
 36 | \item{resume_file_token}{(string) String token that contains some info on settings used to name files.}
 37 | 
 38 | \item{model_file}{Path to the BUGS Model file.}
 39 | 
 40 | \item{CORES}{Option to run parallel by specifying the number of cores to be used. (Default: 1)}
 41 | 
 42 | \item{postMcmcMethod}{What actions to take after finishing the MCMC.}
 43 | 
 44 | \item{plotingProbs}{Option for adding plots of Cell and CNV probabilities. (Default: TRUE)}
 45 | 
 46 | \item{quietly}{Option to print descriptions along each step. (Default: TRUE)}
 47 | 
 48 | \item{diagnostics}{Option to plot Diagnostic plots and tables. (Default: FALSE)}
 49 | 
 50 | \item{HMM_type}{The type of HMM that was run, either 'i3' or 'i6'. Determines how many states were predicted by the HMM.}
 51 | 
 52 | \item{k_obs_groups}{Number of groups in which to break the observations. (default: 1)}
 53 | 
 54 | \item{cluster_by_groups}{If observations are defined according to groups (ie. patients), each group
 55 | of cells will be clustered separately. (default=FALSE, instead will use k_obs_groups setting)}
 56 | 
 57 | \item{reassignCNVs}{(boolean) Given the CNV associated probability of belonging to each possible state, 
 58 | reassign the state assignments made by the HMM to the state that has the highest probability. (default: TRUE)}
 59 | 
 60 | \item{no_plot}{(boolean) Option set by infercnv::run() for producing visualizations.}
 61 | 
 62 | \item{useRaster}{Option to use rasterization when plotting}
 63 | }
 64 | \value{
 65 | Returns a MCMC_inferCNV_obj and posterior probability of being in one of six Copy Number Variation states
 66 | (states: 0, 0.5, 1, 1.5, 2, 3) for CNV's identified by inferCNV's HMM.
 67 | }
 68 | \description{
 69 | Uses Markov Chain Monte Carlo (MCMC) and Gibbs sampling to estimate the posterior
 70 | probability of being in one of six Copy Number Variation states (states: 0, 0.5, 1, 1.5, 2, 3) for CNV's identified by
 71 | inferCNV's HMM. Posterior probabilities are found for the entire CNV cluster and each individual
 72 | cell line in the CNV.
 73 | }
 74 | \examples{
 75 | data(infercnv_data_example)
 76 | data(infercnv_annots_example)
 77 | data(infercnv_genes_example)
 78 | data(HMM_states)
 79 | 
 80 | infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 
 81 |                                                           gene_order_file=infercnv_genes_example,
 82 |                                                           annotations_file=infercnv_annots_example,
 83 |                                                           ref_group_names=c("normal"))
 84 |           
 85 | out_dir = tempfile()
 86 | infercnv_object_example <- infercnv::run(infercnv_object_example,
 87 |                                          cutoff=1,
 88 |                                          out_dir=out_dir, 
 89 |                                          cluster_by_groups=TRUE,
 90 |                                          analysis_mode="samples",
 91 |                                          denoise=TRUE,
 92 |                                          HMM=TRUE,
 93 |                                          num_threads=2,
 94 |                                          no_plot=TRUE)
 95 | mcmc_obj <- infercnv::inferCNVBayesNet(infercnv_obj      = infercnv_object_example,
 96 |                                        HMM_states        = HMM_states,
 97 |                                        file_dir          = out_dir,
 98 |                                        postMcmcMethod    = "removeCNV",
 99 |                                        out_dir           = out_dir,
100 |                                        resume_file_token = "HMMi6.hmm_mode-samples",
101 |                                        quietly           = TRUE,
102 |                                        CORES             = 2,
103 |                                        plotingProbs      = FALSE,
104 |                                        diagnostics       = FALSE,
105 |                                        HMM_type          = 'i6',
106 |                                        k_obs_groups      = 1,
107 |                                        cluster_by_groups = FALSE,
108 |                                        reassignCNVs      = FALSE,
109 |                                        no_plot           = TRUE)
110 |                               
111 | }
112 | 


--------------------------------------------------------------------------------
/man/infercnv-class.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/inferCNV.R
 3 | \docType{class}
 4 | \name{infercnv-class}
 5 | \alias{infercnv-class}
 6 | \alias{infercnv}
 7 | \title{The infercnv Class}
 8 | \description{
 9 | An infercnv object encapsulates the expression data and gene chromosome ordering information
10 | that is leveraged by infercnv for data exploration.  The infercnv object is passed among the
11 | infercnv data processing and plotting routines.
12 | }
13 | \details{
14 | Slots in the infercnv object include:
15 | }
16 | \section{Slots}{
17 | 
18 | \describe{
19 | \item{\code{expr.data}}{<matrix>  the count or expression data matrix, manipulated throughout infercnv ops}
20 | 
21 | \item{\code{count.data}}{<matrix>  retains the original count data, but shrinks along with expr.data when genes are removed.}
22 | 
23 | \item{\code{gene_order}}{<data.frame> chromosomal gene order}
24 | 
25 | \item{\code{reference_grouped_cell_indices}}{<list>  mapping [['group_name']] to c(cell column indices) for reference (normal) cells}
26 | 
27 | \item{\code{observation_grouped_cell_indices}}{<list> mapping [['group_name']] to c(cell column indices) for observation (tumor) cells}
28 | 
29 | \item{\code{tumor_subclusters}}{<list> stores subclustering of tumors if requested}
30 | 
31 | \item{\code{options}}{<list> stores the options relevant to the analysis in itself (in contrast with options relevant to plotting or paths)}
32 | 
33 | \item{\code{.hspike}}{a hidden infercnv object populated with simulated spiked-in data}
34 | }}
35 | 
36 | 


--------------------------------------------------------------------------------
/man/infercnv-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/inferCNV.R
 3 | \docType{package}
 4 | \name{infercnv-package}
 5 | \alias{infercnv-package}
 6 | \alias{_PACKAGE}
 7 | \title{infercnv: Infer Copy Number Variation from Single-Cell RNA-Seq Data}
 8 | \description{
 9 | Using single-cell RNA-Seq expression to visualize CNV in cells.
10 | }
11 | \details{
12 | The main functions you will need to use are CreateInfercnvObject() and run(infercnv_object).
13 | For additional details on running the analysis step by step, please refer to the example vignette.
14 | }
15 | \seealso{
16 | Useful links:
17 | \itemize{
18 |   \item \url{https://github.com/broadinstitute/inferCNV/wiki}
19 |   \item Report bugs at \url{https://github.com/broadinstitute/inferCNV/issues}
20 | }
21 | 
22 | }
23 | \author{
24 | \strong{Maintainer}: Christophe Georgescu \email{cgeorges@broadinstitute.org}
25 | 
26 | Authors:
27 | \itemize{
28 |   \item Timothy Tickle \email{ttickle@broadinstitute.org}
29 |   \item Itay Tirosh \email{tirosh@broadinstitute.org}
30 |   \item Maxwell Brown \email{mbrown@broadinstitute.org}
31 |   \item Brian Haas \email{bhaas@broadinstitute.org}
32 | }
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/man/infercnv_annots_example.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{infercnv_annots_example}
 5 | \alias{infercnv_annots_example}
 6 | \title{Generated classification for 10 normal cells and 10 tumor cells.}
 7 | \format{
 8 | A data frame with 20 rows (cells) and 1 column (classification)
 9 | }
10 | \usage{
11 | infercnv_annots_example
12 | }
13 | \description{
14 | Generated classification for 10 normal cells and 10 tumor cells.
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/infercnv_data_example.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{infercnv_data_example}
 5 | \alias{infercnv_data_example}
 6 | \title{Generated SmartSeq2 expression data with 10 normal cells and 10 tumor cells.
 7 | This is only to demonstrate how to use methods, not actual data to be used in an analysis.}
 8 | \format{
 9 | A data frame with 8252 rows (genes) and 20 columns (cells)
10 | }
11 | \usage{
12 | infercnv_data_example
13 | }
14 | \description{
15 | Generated SmartSeq2 expression data with 10 normal cells and 10 tumor cells.
16 | This is only to demonstrate how to use methods, not actual data to be used in an analysis.
17 | }
18 | \keyword{datasets}
19 | 


--------------------------------------------------------------------------------
/man/infercnv_genes_example.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{infercnv_genes_example}
 5 | \alias{infercnv_genes_example}
 6 | \title{Downsampled gene coordinates file from GRCh37}
 7 | \format{
 8 | A data frame with 10338 rows (genes) and 3 columns (chr, start, end)
 9 | }
10 | \usage{
11 | infercnv_genes_example
12 | }
13 | \description{
14 | Downsampled gene coordinates file from GRCh37
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/infercnv_object_example.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{infercnv_object_example}
 5 | \alias{infercnv_object_example}
 6 | \title{infercnv object result of the processing of run() in the example, to be used for other examples.}
 7 | \format{
 8 | An infercnv object
 9 | }
10 | \usage{
11 | infercnv_object_example
12 | }
13 | \description{
14 | infercnv object result of the processing of run() in the example, to be used for other examples.
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/mcmc_obj.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{mcmc_obj}
 5 | \alias{mcmc_obj}
 6 | \title{infercnv object result of the processing of inferCNVBayesNet in the example, to be used for other examples.}
 7 | \format{
 8 | An infercnv object containing posterior probability of CNV states
 9 | }
10 | \usage{
11 | mcmc_obj
12 | }
13 | \description{
14 | infercnv object result of the processing of inferCNVBayesNet in the example, to be used for other examples.
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/plot_cnv.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/inferCNV_heatmap.R
  3 | \name{plot_cnv}
  4 | \alias{plot_cnv}
  5 | \title{Plot the matrix as a heatmap, with cells as rows and genes as columns, ordered according to chromosome}
  6 | \usage{
  7 | plot_cnv(
  8 |   infercnv_obj,
  9 |   out_dir = ".",
 10 |   title = "inferCNV",
 11 |   obs_title = "Observations (Cells)",
 12 |   ref_title = "References (Cells)",
 13 |   cluster_by_groups = TRUE,
 14 |   cluster_references = TRUE,
 15 |   plot_chr_scale = FALSE,
 16 |   chr_lengths = NULL,
 17 |   k_obs_groups = 1,
 18 |   contig_cex = 1,
 19 |   x.center = mean(infercnv_obj@expr.data),
 20 |   x.range = "auto",
 21 |   hclust_method = "ward.D",
 22 |   custom_color_pal = NULL,
 23 |   color_safe_pal = FALSE,
 24 |   output_filename = "infercnv",
 25 |   output_format = "png",
 26 |   png_res = 300,
 27 |   dynamic_resize = 0,
 28 |   ref_contig = NULL,
 29 |   write_expr_matrix = FALSE,
 30 |   write_phylo = FALSE,
 31 |   useRaster = TRUE
 32 | )
 33 | }
 34 | \arguments{
 35 | \item{infercnv_obj}{infercnv object}
 36 | 
 37 | \item{out_dir}{Directory in which to save pdf and other output.}
 38 | 
 39 | \item{title}{Plot title.}
 40 | 
 41 | \item{obs_title}{Title for the observations matrix.}
 42 | 
 43 | \item{ref_title}{Title for the reference matrix.}
 44 | 
 45 | \item{cluster_by_groups}{Whether to cluster observations by their annotations or not. Using this ignores k_obs_groups.}
 46 | 
 47 | \item{cluster_references}{Whether to cluster references within their annotations or not. (dendrogram not displayed)}
 48 | 
 49 | \item{plot_chr_scale}{Whether to scale the chromosome widths on the heatmap based on their actual size rather than just the number of expressed genes.}
 50 | 
 51 | \item{chr_lengths}{A named list of chromosome lengths to use when plot_chr_scale=TRUE, or else chromosome size is assumed to be the last chromosome's stop position + 10k bp}
 52 | 
 53 | \item{k_obs_groups}{Number of groups to break observation into.}
 54 | 
 55 | \item{contig_cex}{Contig text size.}
 56 | 
 57 | \item{x.center}{Value on which to center expression.}
 58 | 
 59 | \item{x.range}{vector containing the extreme values in the heatmap (ie. c(-3,4) )}
 60 | 
 61 | \item{hclust_method}{Clustering method to use for hclust.}
 62 | 
 63 | \item{custom_color_pal}{Specify a custom set of colors for the heatmap. 
 64 | Has to be in the shape color.palette(c("darkblue", "white", "darkred"),
 65 |                                      c(2, 2))}
 66 | 
 67 | \item{color_safe_pal}{Logical indication of using a color blindness safe palette.}
 68 | 
 69 | \item{output_filename}{Filename to save the figure to.}
 70 | 
 71 | \item{output_format}{format for heatmap image file (default: 'png'), options('png', 'pdf', NA)
 72 | If set to NA, will print graphics natively}
 73 | 
 74 | \item{png_res}{Resolution for png output.}
 75 | 
 76 | \item{dynamic_resize}{Factor (>= 0) by which to scale the dynamic resize of the observation 
 77 | heatmap and the overall plot based on how many cells there are.
 78 | Default is 0, which disables the scaling. Try 1 first if you want to enable.}
 79 | 
 80 | \item{ref_contig}{If given, will focus cluster on only genes in this contig.}
 81 | 
 82 | \item{write_expr_matrix}{Includes writing a matrix file containing the expression data that is plotted in the heatmap.}
 83 | 
 84 | \item{write_phylo}{Write newick strings of the dendrograms displayed on the left side of the heatmap to file.}
 85 | 
 86 | \item{useRaster}{Whether to use rasterization for drawing heatmap. Only disable if it produces an error as it is much faster than not using it.}
 87 | }
 88 | \value{
 89 | A list of all relevant settings used for the plotting to be able to reuse them in another plot call while keeping consistent plotting settings, most importantly x.range.
 90 | }
 91 | \description{
 92 | Formats the data and sends it for plotting.
 93 | }
 94 | \examples{
 95 | # data(infercnv_data_example)
 96 | # data(infercnv_annots_example)
 97 | # data(infercnv_genes_example)
 98 | 
 99 | # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 
100 | #                                                           gene_order_file=infercnv_genes_example,
101 | #                                                           annotations_file=infercnv_annots_example,
102 | #                                                           ref_group_names=c("normal"))
103 | 
104 | # infercnv_object_example <- infercnv::run(infercnv_object_example,
105 | #                                          cutoff=1,
106 | #                                          out_dir=tempfile(), 
107 | #                                          cluster_by_groups=TRUE, 
108 | #                                          denoise=TRUE,
109 | #                                          HMM=FALSE,
110 | #                                          num_threads=2,
111 | #                                          no_plot=TRUE)
112 | 
113 | data(infercnv_object_example)
114 | 
115 | plot_cnv(infercnv_object_example,
116 |          out_dir=tempfile(),
117 |          obs_title="Observations (Cells)",
118 |          ref_title="References (Cells)",
119 |          cluster_by_groups=TRUE,
120 |          x.center=1,
121 |          x.range="auto",
122 |          hclust_method='ward.D',
123 |          color_safe_pal=FALSE,
124 |          output_filename="infercnv",
125 |          output_format="png",
126 |          png_res=300,
127 |          dynamic_resize=0
128 |          )
129 | 
130 | }
131 | 


--------------------------------------------------------------------------------
/man/plot_per_group.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/infercnv_sampling.R
 3 | \name{plot_per_group}
 4 | \alias{plot_per_group}
 5 | \title{plot_per_group}
 6 | \usage{
 7 | plot_per_group(
 8 |   infercnv_obj,
 9 |   on_references = TRUE,
10 |   on_observations = TRUE,
11 |   sample = FALSE,
12 |   n_cells = 1000,
13 |   every_n = NULL,
14 |   above_m = 1000,
15 |   k_obs_groups = 1,
16 |   base_filename = "infercnv_per_group",
17 |   output_format = "png",
18 |   write_expr_matrix = TRUE,
19 |   save_objects = FALSE,
20 |   png_res = 300,
21 |   dynamic_resize = 0,
22 |   useRaster = TRUE,
23 |   out_dir
24 | )
25 | }
26 | \arguments{
27 | \item{infercnv_obj}{infercnv_object}
28 | 
29 | \item{on_references}{boolean (default=TRUE), plot references (normal cells).}
30 | 
31 | \item{on_observations}{boolean (default=TRUE), plot observations data (tumor cells).}
32 | 
33 | \item{sample}{Whether unique groups of cells should be sampled from or not. (see other parameters for how sampling is done) (Default: FALSE)}
34 | 
35 | \item{n_cells}{Number of cells that should be sampled per group if sampling is enabled (default = 1000) .}
36 | 
37 | \item{every_n}{Sample 1 cell every_n cells for each group that has above_m cells, if sampling is enabled. 
38 | If subclusters are defined, this will make sure that at least one cell per subcluster is sampled. 
39 | Requires above_m to be set to work, overriding n_cells parameter. (Default: NULL)}
40 | 
41 | \item{above_m}{Sample only groups that have at least above_m cells if sampling is enabled. (default: 1000)
42 | Does not require every_n to be set.}
43 | 
44 | \item{k_obs_groups}{Number of groups to break each group in with cutree (in the color bars on the left side of the plot only). (Default: 1)}
45 | 
46 | \item{base_filename}{Base prefix for the output files names. 
47 | Will be followed by OBS/REF to indicate the type of the group, and the group name. (Default: "infercnv_per_group")
48 | 
49 | \item{output_format}{Output format for the figure. Choose between "png", "pdf" and NA. NA means to only write the text outputs without generating the figure itself. (default: "png")}
50 | 
51 | \item{write_expr_matrix}{Includes writing a matrix file containing the expression data that is plotted in the heatmap. (default: TRUE)}
52 | 
53 | \item{save_objects}{Whether to save the infercnv objects generated for each group as RDS. (default: FALSE)}
54 | 
55 | \item{png_res}{Resolution for png output. (Default: 300)}
56 | 
57 | \item{dynamic_resize}{Factor (>= 0) by which to scale the dynamic resize of the observation 
58 | heatmap and the overall plot based on how many cells there are.
59 | Default is 0, which disables the scaling. Try 1 first if you want to enable. (Default: 0)}
60 | 
61 | \item{useRaster}{Whether to use rasterization for drawing heatmap. Only disable if it produces an error as it is much faster than not using it.}
62 | 
63 | \item{out_dir}{Directory in which to save plots and other outputs.}
64 | }
65 | \value{
66 | void
67 | }
68 | \description{
69 | Takes an infercnv object and subdivides it into one object per group of cells 
70 | to allow plotting of each group on a separate plot. If references are selected, they will appear
71 | on the observation heatmap area as it is larger.
72 | }
73 | \examples{
74 | # data(infercnv_data_example)
75 | # data(infercnv_annots_example)
76 | # data(infercnv_genes_example)
77 | 
78 | # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 
79 | #                                                           gene_order_file=infercnv_genes_example,
80 | #                                                           annotations_file=infercnv_annots_example,
81 | #                                                           ref_group_names=c("normal"))
82 | 
83 | # infercnv_object_example <- infercnv::run(infercnv_object_example,
84 | #                                          cutoff=1,
85 | #                                          out_dir=tempfile(), 
86 | #                                          cluster_by_groups=TRUE, 
87 | #                                          denoise=TRUE,
88 | #                                          HMM=FALSE,
89 | #                                          num_threads=2,
90 | #                                          no_plot=TRUE)
91 | 
92 | data(infercnv_object_example)
93 | 
94 | infercnv::plot_per_group(infercnv_object_example, out_dir=tempfile())
95 | 
96 | }
97 | 


--------------------------------------------------------------------------------
/man/plot_subclusters.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/inferCNV_tumor_subclusters.R
 3 | \name{plot_subclusters}
 4 | \alias{plot_subclusters}
 5 | \title{Plot a heatmap of the data in the infercnv object with the subclusters being displayed as annotations.}
 6 | \usage{
 7 | plot_subclusters(
 8 |   infercnv_obj,
 9 |   out_dir,
10 |   output_filename = "subcluster_as_annotations"
11 | )
12 | }
13 | \arguments{
14 | \item{infercnv_obj}{infercnv object}
15 | 
16 | \item{out_dir}{Directory in which to output.}
17 | 
18 | \item{output_filename}{Filename to save the figure to.}
19 | }
20 | \value{
21 | infercnv_obj the modified infercnv object that was plotted where subclusters are assigned as annotation groups
22 | }
23 | \description{
24 | Formats the data and sends it for plotting.
25 | }
26 | \examples{
27 | # data(infercnv_data_example)
28 | # data(infercnv_annots_example)
29 | # data(infercnv_genes_example)
30 | 
31 | # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 
32 | #                                                           gene_order_file=infercnv_genes_example,
33 | #                                                           annotations_file=infercnv_annots_example,
34 | #                                                           ref_group_names=c("normal"))
35 | 
36 | # infercnv_object_example <- infercnv::run(infercnv_object_example,
37 | #                                          cutoff=1,
38 | #                                          out_dir=tempfile(), 
39 | #                                          cluster_by_groups=TRUE, 
40 | #                                          denoise=TRUE,
41 | #                                          HMM=FALSE,
42 | #                                          num_threads=2,
43 | #                                          no_plot=TRUE)
44 | 
45 | data(infercnv_object_example)
46 | 
47 | plot_subclusters(infercnv_object_example,
48 |                  out_dir=tempfile(),
49 |                  output_filename="subclusters_as_annotations"
50 |                  )
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/man/sample_object.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/infercnv_sampling.R
 3 | \name{sample_object}
 4 | \alias{sample_object}
 5 | \title{sample_object}
 6 | \usage{
 7 | sample_object(
 8 |   infercnv_obj,
 9 |   n_cells = 100,
10 |   every_n = NULL,
11 |   above_m = NULL,
12 |   on_references = TRUE,
13 |   on_observations = TRUE
14 | )
15 | }
16 | \arguments{
17 | \item{infercnv_obj}{infercnv_object}
18 | 
19 | \item{n_cells}{Number of cells that should be sampled per group (default = 100).}
20 | 
21 | \item{every_n}{Sample 1 cell every_n cells for each group. If subclusters are defined, 
22 | this will make sure that at least one cell per subcluster is sampled. 
23 | Requires above_m to be set to work, overriding n_cells parameter.}
24 | 
25 | \item{above_m}{Sample groups that have at least above_m cells. 
26 | Requires every_n to be set to work, overriding n_cells parameter}
27 | 
28 | \item{on_references}{boolean (default=TRUE), sample references (normal cells).}
29 | 
30 | \item{on_observations}{boolean (default=TRUE), sample observations data (tumor cells).}
31 | }
32 | \value{
33 | sampled infercnv_obj
34 | }
35 | \description{
36 | Apply sampling on an infercnv object to reduce the number of cells in it 
37 | and allow faster plotting or have all groups take up the same height on the heatmap
38 | }
39 | \examples{
40 | # data(infercnv_data_example)
41 | # data(infercnv_annots_example)
42 | # data(infercnv_genes_example)
43 | 
44 | # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 
45 | #                                                           gene_order_file=infercnv_genes_example,
46 | #                                                           annotations_file=infercnv_annots_example,
47 | #                                                           ref_group_names=c("normal"))
48 | 
49 | # infercnv_object_example <- infercnv::run(infercnv_object_example,
50 | #                                          cutoff=1,
51 | #                                          out_dir=tempfile(), 
52 | #                                          cluster_by_groups=TRUE, 
53 | #                                          denoise=TRUE,
54 | #                                          HMM=FALSE,
55 | #                                          num_threads=2,
56 | #                                          no_plot=TRUE)
57 | 
58 | data(infercnv_object_example)
59 | 
60 | infercnv_object_example <- infercnv::sample_object(infercnv_object_example, n_cells=5)
61 | # plot result object
62 | 
63 | }
64 | 


--------------------------------------------------------------------------------
/man/validate_infercnv_obj.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/inferCNV.R
 3 | \name{validate_infercnv_obj}
 4 | \alias{validate_infercnv_obj}
 5 | \title{validate_infercnv_obj()}
 6 | \usage{
 7 | validate_infercnv_obj(infercnv_obj)
 8 | }
 9 | \arguments{
10 | \item{infercnv_obj}{infercnv_object}
11 | }
12 | \value{
13 | none
14 | }
15 | \description{
16 | validate an infercnv_obj
17 | ensures that order of genes in the @gene_order slot match up perfectly with the gene rows in the @expr.data matrix.
18 | Otherwise, throws an error and stops execution.
19 | }
20 | 


--------------------------------------------------------------------------------
/scripts/HB_example_to_inferCNV_obj.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | library(HoneyBADGER)
 4 | library(infercnv)
 5 | 
 6 | data(gexp) ## tumor cells, dim:  [6082,75]
 7 | data(ref) ## reference, length: 6082
 8 | 
 9 | ## Build one matrix: the tumor cells plus the reference appended as a
10 | ## single extra column named 'GTEX'.
11 | raw.data = cbind(gexp, data.frame('GTEX'=ref))
12 | ## Cell annotations: every gexp column is 'tumor', the GTEX column is 'normal'.
13 | cell.annots = data.frame(cell=colnames(gexp), type='tumor')
14 | cell.annots = rbind(cell.annots, data.frame(cell='GTEX', type='normal'))
15 | ## Write both as tab-delimited text; the annotations get no header or row names.
16 | write.table(raw.data, file="hb.example.matrix", quote=F, sep="\t")
17 | write.table(cell.annots, file='hb.example.cell_annots', quote=F, sep="\t", col.names=F, row.names=F)
18 | 
19 | 


--------------------------------------------------------------------------------
/scripts/KS_matrix_comparison.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | set.seed(1234)
 4 | 
 5 | suppressPackageStartupMessages(library("argparse"))
 6 | 
 7 | library(tidyverse)
 8 | 
 9 | parser = ArgumentParser()
10 | parser$add_argument("--matrix1", required=TRUE, nargs=1)
11 | parser$add_argument("--matrix2", required=TRUE, nargs=1)
12 | parser$add_argument("--log", required=FALSE, default=FALSE, action="store_true")
13 | parser$add_argument("--output", required=TRUE, nargs=1, help="output filename png")
14 | 
15 | args = parser$parse_args()
16 | 
17 | 
18 | ## read the two matrices to compare (header row; first column holds the row names)
19 | data1 = as.matrix(read.table(args$matrix1, header=TRUE, row.names=1))
20 | data2 = as.matrix(read.table(args$matrix2, header=TRUE, row.names=1))
21 | 
22 | ## the figure is rendered with png(), so --output should name a png file
23 | png(args$output)
24 | if (args$log) {
25 |     data1 = log(data1+1)
26 |     data2 = log(data2+1)
27 | }
28 | 
29 | 
30 | ## plotting ideas borrowed from
31 | ## https://stackoverflow.com/questions/39162178/kolmogorov-smirnov-plot-in-r-ggplot
32 | 
33 | ## evaluate both empirical CDFs on a shared grid spanning the pooled value range
34 | m1_ecdf = ecdf(data1)
35 | m2_ecdf = ecdf(data2)
36 | val_range = range(data1, data2)
37 | step = (val_range[2] - val_range[1])/100
38 | vals = seq(val_range[1], val_range[2], step)
39 | 
40 | 
41 | m1_cdf = m1_ecdf(vals)
42 | m2_cdf = m2_ecdf(vals)
43 | 
44 | cdfs = data.frame(vals,
45 |                   m1_cdf,
46 |                   m2_cdf)
47 | ## grid point where the two CDFs are furthest apart (the KS distance on this grid)
48 | ks_point = which.max(abs(cdfs$m1_cdf - cdfs$m2_cdf))
49 | ks_point_info = cdfs[ks_point,]
50 | ##message("KS point info: ", paste(ks_point_info, collapse=', '))
51 | 
52 | cdfs = cdfs %>% gather('m1_cdf', 'm2_cdf', key='type', value='cdf')
53 | 
54 | ## build the plot, then print() it explicitly so it is drawn even when the script is source()d
55 | ks_plot = ggplot(cdfs, aes(x=vals, y=cdf)) +
56 |     geom_line(aes(color=type, linetype=type)) +
57 |     geom_segment(aes(x=ks_point_info$vals,
58 |                      y=ks_point_info$m1_cdf,
59 |                      xend=ks_point_info$vals,
60 |                      yend=ks_point_info$m2_cdf), color='magenta', size=2) +
61 |     ggtitle(sprintf("%s vs. %s KS", args$matrix1, args$matrix2)) + xlab("number") + ylab("cdf")
62 | print(ks_plot)
63 | invisible(dev.off())  ## close the png device so the file is flushed to disk
64 | 
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/scripts/KS_matrix_comparison.use_infercnv_obj.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | set.seed(1234)
 4 | 
 5 | suppressPackageStartupMessages(library("argparse"))
 6 | 
 7 | library(tidyverse)
 8 | 
 9 | parser = ArgumentParser()
10 | parser$add_argument("--matrix1", required=TRUE, nargs=1)
11 | parser$add_argument("--infercnv_obj", required=TRUE, nargs=1)
12 | parser$add_argument("--log", required=FALSE, default=FALSE, action="store_true")
13 | parser$add_argument("--output", required=TRUE, nargs=1, help="output filename png")
14 | 
15 | args = parser$parse_args()
16 | 
17 | 
18 | ## first data set: a matrix read from file (header row; first column holds the row names)
19 | data1 = as.matrix(read.table(args$matrix1, header=TRUE, row.names=1))
20 | 
21 | 
22 | ## second data set: the expr.data columns of the reference cells of a saved infercnv object
23 | infercnv_obj_file = args$infercnv_obj
24 | infercnv_obj = readRDS(infercnv_obj_file)
25 | data2 = as.matrix(infercnv_obj@expr.data[, unlist(infercnv_obj@reference_grouped_cell_indices)])
26 | 
27 | ## the figure is rendered with png(), so --output should name a png file
28 | png(args$output)
29 | if (args$log) {
30 |     data1 = log(data1+1)
31 |     data2 = log(data2+1)
32 | }
33 | 
34 | 
35 | ## plotting ideas borrowed from
36 | ## https://stackoverflow.com/questions/39162178/kolmogorov-smirnov-plot-in-r-ggplot
37 | 
38 | ## evaluate both empirical CDFs on a shared grid spanning the pooled value range
39 | m1_ecdf = ecdf(data1)
40 | m2_ecdf = ecdf(data2)
41 | val_range = range(data1, data2)
42 | step = (val_range[2] - val_range[1])/100
43 | vals = seq(val_range[1], val_range[2], step)
44 | 
45 | 
46 | m1_cdf = m1_ecdf(vals)
47 | m2_cdf = m2_ecdf(vals)
48 | 
49 | cdfs = data.frame(vals,
50 |                   m1_cdf,
51 |                   m2_cdf)
52 | ## grid point where the two CDFs are furthest apart (the KS distance on this grid)
53 | ks_point = which.max(abs(cdfs$m1_cdf - cdfs$m2_cdf))
54 | ks_point_info = cdfs[ks_point,]
55 | ##message("KS point info: ", paste(ks_point_info, collapse=', '))
56 | 
57 | cdfs = cdfs %>% gather('m1_cdf', 'm2_cdf', key='type', value='cdf')
58 | 
59 | ## title names the two inputs; this script has no --matrix2, the second input is --infercnv_obj
60 | ks_plot = ggplot(cdfs, aes(x=vals, y=cdf)) +
61 |     geom_line(aes(color=type, linetype=type)) +
62 |     geom_segment(aes(x=ks_point_info$vals,
63 |                      y=ks_point_info$m1_cdf,
64 |                      xend=ks_point_info$vals,
65 |                      yend=ks_point_info$m2_cdf), color='magenta', size=2) +
66 |     ggtitle(sprintf("%s vs. %s KS", args$matrix1, args$infercnv_obj)) + xlab("number") + ylab("cdf")
67 | print(ks_plot)  ## explicit print so the plot is drawn even when the script is source()d
68 | invisible(dev.off())  ## close the png device so the file is flushed to disk
69 | 
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------
/scripts/QQ_matrix_comparison.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | set.seed(1234)
 4 | 
 5 | suppressPackageStartupMessages(library("argparse"))
 6 | 
 7 | parser = ArgumentParser()
 8 | parser$add_argument("--matrix1", required=TRUE, nargs=1)
 9 | parser$add_argument("--matrix2", required=TRUE, nargs=1)
10 | parser$add_argument("--log", required=FALSE, default=FALSE, action="store_true")
11 | parser$add_argument("--output", required=TRUE, nargs=1, help="output filename png")
12 | 
13 | args = parser$parse_args()
14 | 
15 | 
16 | ## read the two matrices to compare (header row; first column holds the row names)
17 | data1 = as.matrix(read.table(args$matrix1, header=TRUE, row.names=1))
18 | data2 = as.matrix(read.table(args$matrix2, header=TRUE, row.names=1))
19 | 
20 | ## Q-Q plot of the two value sets, with the y = x reference line drawn in red
21 | png(args$output)
22 | if (args$log) {
23 |     data1 = log(data1+1)
24 |     data2 = log(data2+1)
25 | }
26 | qqplot(data1, data2)
27 | abline(a=0,b=1, col='red')
28 | invisible(dev.off())  ## close the png device so the file is flushed to disk
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/scripts/apply_median_filtering.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 |     
 5 | parser = ArgumentParser()
 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | parser$add_argument("--window_size", help="window size", required=FALSE, type='integer', default=11)
 8 | args = parser$parse_args()
 9 | 
10 | library(infercnv)
11 | library(ggplot2)
12 | 
13 | infercnv_obj_file = args$infercnv_obj
14 | ## load the serialized infercnv object
15 | infercnv_obj = readRDS(infercnv_obj_file)
16 | ## internal (:::) helper; presumably assigns the tumor subclusters used by the filtering step - TODO confirm
17 | infercnv_obj = infercnv:::.subcluster_tumors_general(infercnv_obj)
18 | ## internal (:::) helper; called with the requested --window_size
19 | mf_infercnv_obj = infercnv:::.apply_heatmap_median_filtering(infercnv_obj, window_size=args$window_size)
20 | ## save the filtered object under the input path plus a suffix carrying the window size
21 | saveRDS(mf_infercnv_obj, file=sprintf("%s-median_filtered.W%d.obj", infercnv_obj_file, args$window_size) )
22 | ## out_dir is not passed here, so plot_cnv falls back to its own default
23 | plot_cnv(mf_infercnv_obj, output_filename=paste0(infercnv_obj_file, sprintf(".mf.W%d", args$window_size)))
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/scripts/boxplot_cell_exprs.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 |     
 5 | parser = ArgumentParser()
 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | parser$add_argument("--log", help="log(x+1) transform expr", action='store_true', default=FALSE)
 8 | 
 9 | args = parser$parse_args()
10 | 
11 | library(infercnv)
12 | library(ggplot2)
13 | library(tidyverse)
14 | 
15 | infercnv_obj_file = args$infercnv_obj
16 | 
17 | infercnv_obj = readRDS(infercnv_obj_file)
18 | 
19 | expr.data = infercnv_obj@expr.data
20 | 
21 | if (args$log) {
22 |     expr.data = log(expr.data+1)
23 | }
24 | 
25 | ## build df of expr values: one row per (cell, value), labelled with the cell's group name.
26 | cell_groups = c(infercnv_obj@reference_grouped_cell_indices, infercnv_obj@observation_grouped_cell_indices)
27 | 
28 | cell_group_names = names(cell_groups)
29 | 
30 | 
31 | pngname = sprintf("%s-boxplot.png", infercnv_obj_file)
32 | png(pngname)
33 | 
34 | expr_tibble = do.call(rbind, lapply(cell_group_names, function(cell_group_name) {
35 |     ## drop=FALSE keeps a matrix (and the cell's column name) when a group holds a single cell
36 |     cell_group_expr = expr.data[, cell_groups[[ cell_group_name ]], drop=FALSE]
37 |     ## as_tibble() replaces the deprecated as.tibble()
38 |     cell_group_expr = as_tibble(cell_group_expr)
39 |     
40 |     cell_group_expr = cell_group_expr %>% gather(key='cellname', value='expr')
41 |     
42 |     cell_group_expr %>% mutate(group_name=cell_group_name)
43 | }))
44 | 
45 | ## one box per cell, one facet per group; outlier points are not drawn
46 | p = expr_tibble %>% ggplot(aes(y=expr, x=cellname, color=group_name)) + geom_boxplot(outlier.shape=NA) + facet_wrap(~group_name, scales='free_x')
47 | 
48 | plot(p)
49 | invisible(dev.off())  ## close the png device so the file is flushed to disk
50 | saveRDS(expr_tibble, 'my.tibble.obj')
51 | 
52 | 


--------------------------------------------------------------------------------
/scripts/check_matrix_format.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | """Converts a square expression matrix to an R-format compatible expression matrix
 5 | """
 6 | 
 7 | 
 8 | # Import statements
 9 | import argparse
10 | import csv
11 | import os
12 | 
13 | __author__ = 'Jon Bistline'
14 | __copyright__ = 'Copyright 2018'
15 | __credits__ = ["Jon Bistline"]
16 | __license__ = 'BSD-3'
17 | __maintainer__ = 'Jon Bistline'
18 | __email__ = 'bistline@broadinstitute.org'
19 | __status__ = 'Development'
20 | 
21 | def convert_matrix_format(input_matrix, delimiter, output_name):
22 |     """ Convert input expression matrix to R-compatible expression matrix (header line is 1 cell shorter than data lines)
23 | 
24 |     :param input_matrix: Path to input expression matrix
25 |     :type input_matrix: String
26 |     :param delimiter: delimiter to parse input matrix with (tab, comma, etc.)
27 |     :type delimiter: String
28 | 
29 |     """
30 | 
31 |     if not input_matrix or not os.path.exists(input_matrix):
32 |         print("".join(["check_matrix_format.py:: ",
33 |                        "Could not find input matrix : " + input_matrix]))
34 | 
35 |     # read first line
36 |     with open(input_matrix, "r") as exp_matrix:
37 |         print("".join(["Opening input matrix and checking header format: ", input_matrix]))
38 |         print("".join(["Using delimiter: ", delimiter]))
39 |         rewrite_file = False
40 |         matrix = csv.reader(exp_matrix, delimiter=delimiter)
41 |         header_list = next(matrix)
42 |         # check if first value in header_list needs to be removed
43 |         headers_to_remove = ['GENE', 'gene', '']
44 |         if header_list[0] in headers_to_remove:
45 |             print("Input matrix is being converted to R format.")
46 |             rewrite_file = True
47 |             header_list.pop(0)
48 |             with open(output_name, 'w+') as new_expression_matrix:
49 |                 writer = csv.writer(new_expression_matrix, delimiter=delimiter)
50 |                 writer.writerow(header_list)
51 |                 for line in matrix:
52 |                     writer.writerow(line)
53 | 
54 |     if rewrite_file is True:
55 |         print("".join(["Conversion complete, new output file: ", output_name]))
56 |     else:
57 |         os.rename(input_matrix, output_name)
58 |         print("".join(["No conversion necessary, input matrix is in R format already, renamed to new output file: ", output_name]))
59 | 
60 | if __name__ == "__main__":
61 | 
62 |     # Parse arguments
63 |     prsr_arguments = argparse.ArgumentParser(prog='check_matrix_format.py',
64 |                                              description=__doc__, # Use text from file summary up top
65 |                                              formatter_class=argparse.RawDescriptionHelpFormatter)
66 |     # Add positional argument
67 |     prsr_arguments.add_argument("--input_matrix",
68 |                                 metavar="input_matrix",
69 |                                 help="Path to the input expression matrix")
70 |     prsr_arguments.add_argument("--delimiter",
71 |                                 metavar="delimiter",
72 |                                 default="\t",
73 |                                 help="delimiter to parse input matrix with (tab, comma, etc.)")
74 |     prsr_arguments.add_argument("--output_name",
75 |                                 metavar="output_name",
76 |                                 default="expression.r_format.txt",
77 |                                 help="path to output expression matrix")
78 |     args = prsr_arguments.parse_args()
79 | 
80 |     # Run Script
81 |     convert_matrix_format(args.input_matrix, args.delimiter, args.output_name)
82 | 


--------------------------------------------------------------------------------
/scripts/cross_cell_scaling_normalization.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 |     
 5 | parser = ArgumentParser()
 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | parser$add_argument("--log", help="log transform expr", action='store_true', default=FALSE)
 8 | 
 9 | args = parser$parse_args()
10 | 
11 | library(infercnv)
12 | 
13 | infercnv_obj_file = args$infercnv_obj
14 | 
15 | infercnv_obj = readRDS(infercnv_obj_file)
16 | 
17 | expr.data = infercnv_obj@expr.data
18 | ## NOTE(review): --log is accepted above but args$log is never read in this script - confirm whether a log transform was intended
19 | ## each column is multiplied by (mean of all columns' 75th percentiles) / (its own 75th percentile)
20 | ## do upper quartile normalization
21 | upper_quart = apply(expr.data, 2, quantile, probs=0.75)
22 | mean_upper_quart = mean(upper_quart)
23 | revised.expr.data = sweep(expr.data, 2, mean_upper_quart/upper_quart, "*")
24 | ## NOTE(review): a column whose 75th percentile is 0 gets a non-finite scale factor - confirm inputs exclude this
25 | new_upper_quart = apply(revised.expr.data, 2, quantile, probs=0.75) 
26 | ## print the per-column 75th percentiles after rescaling
27 | print(new_upper_quart)
28 | ## store the rescaled matrix back into the object
29 | infercnv_obj@expr.data = revised.expr.data
30 | ## always written to 'rescaled.obj' in the current working directory
31 | saveRDS(infercnv_obj, 'rescaled.obj')
32 | 
33 | 


--------------------------------------------------------------------------------
/scripts/dropout_matrix_comparison.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | set.seed(1234)
 4 | 
 5 | suppressPackageStartupMessages(library("argparse"))
 6 | 
 7 | library(tidyverse)
 8 | library(infercnv)
 9 | 
10 | parser = ArgumentParser()
11 | parser$add_argument("--matrix1", required=TRUE, nargs=1)
12 | parser$add_argument("--matrix2", required=TRUE, nargs=1)
13 | parser$add_argument("--output", required=TRUE, nargs=1, help="output filename pdf")
14 | 
15 | args = parser$parse_args()
16 | 
17 | 
18 | ## read the two matrices to compare (header row; first column holds the row names)
19 | data1 = as.matrix(read.table(args$matrix1, header=TRUE, row.names=1))
20 | data2 = as.matrix(read.table(args$matrix2, header=TRUE, row.names=1))
21 | 
22 | ## total sum normalize each, using the median column sum pooled over both matrices:
23 | median_cs = median(c(colSums(data1), colSums(data2)))  ## c() pools both; median()'s 2nd argument is na.rm
24 | data1 <- infercnv:::.normalize_data_matrix_by_seq_depth(data1, median_cs)
25 | data2 <- infercnv:::.normalize_data_matrix_by_seq_depth(data2, median_cs)
26 | 
27 | ## every plot produced from here on goes into the --output pdf
28 | pdf(args$output)
29 | 
30 | 
31 | data1.mean_vs_p0 <- infercnv:::.get_mean_vs_p0_from_matrix(data1)
32 | data2.mean_vs_p0 <- infercnv:::.get_mean_vs_p0_from_matrix(data2)
33 | 
34 | plot_mean_vs_p0_with_data <- function(title='title', mean_vs_p0_table) {
35 |     ## Plot p0 against log(m + 1) from mean_vs_p0_table, overlay a logistic and a spline fit,
36 |     ## and return list(logistic_x, logistic_y, spline_x, spline_y) holding the fitted curves.
37 |     logm <- log(mean_vs_p0_table$m + 1)
38 |     p0 <- mean_vs_p0_table$p0
39 | 
40 |     plot(logm, p0, pch='.', main=title)
41 |     ## starting guess for the logistic midpoint: median log(m + 1) where p0 lies strictly between 0.2 and 0.8
42 |     x_approx_mid <- median(logm[which(p0>0.2 & p0 < 0.8)])
43 | 
44 |     x <- logm
45 |     y <- p0
46 |     df <- data.frame(x,y)
47 | 
48 |     fit <- nls(y ~ infercnv:::.logistic(x, x0 = x0, k = k), data = df,
49 |                start = list(x0 = x_approx_mid, k = -1))
50 | 
51 |     logistic_x <- x
52 |     logistic_y <- predict(fit, newdata=x)
53 |     points(x, logistic_y, col='green')
54 | 
55 |     ## also try fitting a spline
56 |     spline.fit <- smooth.spline(x,y)
57 |     spline.pts = predict(spline.fit, newdata=x)
58 |     points(spline.pts$x, spline.pts$y, col='magenta')
59 |     legend('topright', c('logistic', 'spline'), col=c('green', 'magenta'), pch=1)
60 | 
61 |     ## '=' names the list elements; '<-' here would leave them unnamed and create stray locals
62 |     ret = list(logistic_x = logistic_x,
63 |                logistic_y = logistic_y,
64 |                spline_x = spline.pts$x,
65 |                spline_y = spline.pts$y)
66 |     return(ret)
67 | }
68 | 
69 | 
70 | p1 <- plot_mean_vs_p0_with_data(args$matrix1, data1.mean_vs_p0)
71 | p2 <- plot_mean_vs_p0_with_data(args$matrix2, data2.mean_vs_p0)
72 | 
73 | 
74 | ## plot both logistics in a single plot
75 | plot(p1$logistic_x, p1$logistic_y, col='blue')
76 | points(p2$logistic_x, p2$logistic_y, col='magenta')
77 | legend('topright', c(args$matrix1, args$matrix2), col=c('blue', 'magenta'), pch=1)
78 | 
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/scripts/examine_dropout_logistic.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | args<-commandArgs(TRUE)
 4 | 
 5 | if (length(args) == 0) {
 6 |     stop("Error, require params: infercnv.obj");
 7 | }
 8 | 
 9 | infercnv_obj_file = args[1]
10 | 
11 | pdf(paste0(infercnv_obj_file, '.dropout.pdf'))
12 | 
13 | infercnv_obj = readRDS(infercnv_obj_file)
14 | 
15 | 
16 | library(edgeR)
17 | library(fitdistrplus)
18 | library(infercnv)
19 | 
20 | # borrowing some code from splatter
21 | 
22 | get_parameters <- function(group_name, expr.matrix) {
23 | 
24 |     params = list()
25 |     params[['group_name']] = group_name
26 |     
27 |     # estimate gamma for  genes
28 |     lib.sizes <- colSums(expr.matrix)
29 |     lib.med <- median(lib.sizes)
30 |     norm.counts <- t(t(expr.matrix) / lib.sizes * lib.med)
31 |     norm.counts <- norm.counts[rowSums(norm.counts > 0) > 1, ]
32 | 
33 |     
34 |     # estimate dropout params
35 |     mean_vs_p0_table = infercnv:::.get_mean_vs_p0_from_matrix(expr.matrix)
36 |     logistic_params = infercnv:::.get_logistic_params(mean_vs_p0_table)
37 | 
38 |     params[['dropout.logistic.midpt']] = logistic_params$midpt
39 |     params[['dropout.logistic.slope']] = logistic_params$slope
40 |         
41 |     
42 | 
43 |     mean_vs_p0_table = cbind(mean_vs_p0_table, logm=log(mean_vs_p0_table$m + 1))
44 |     smoothScatter(mean_vs_p0_table$logm, mean_vs_p0_table$p0, main=group_name)
45 |     points(mean_vs_p0_table$logm,
46 |            infercnv:::.logistic(mean_vs_p0_table$logm, logistic_params$midpt, logistic_params$slope), col='red')
47 | 
48 | 
49 |     midpt_use = mean(mean_vs_p0_table$logm[mean_vs_p0_table$p0>0.48 & mean_vs_p0_table$p0<0.52])
50 |     
51 |     points(mean_vs_p0_table$logm,
52 |            infercnv:::.logistic(mean_vs_p0_table$logm, midpt_use, logistic_params$slope), col='magenta')
53 |     
54 | 
55 |     s = smooth.spline(mean_vs_p0_table$logm, mean_vs_p0_table$p0)
56 |     r = range(mean_vs_p0_table$logm)
57 |     x=seq(r[1], r[2], 0.1)
58 |     points(x, predict(s, x)$y, col='orange')
59 |     
60 | 
61 |     return(params)
62 | 
63 | }
64 | 
65 | 
66 | 
67 | 
68 | # examine each group
69 | all_groups = c(infercnv_obj@observation_grouped_cell_indices,  infercnv_obj@reference_grouped_cell_indices)
70 | all_groups[['combined_normal']] <- unlist(infercnv_obj@reference_grouped_cell_indices)
71 | 
72 | for (group in names(all_groups)) {
73 | 
74 |     group_idxs = all_groups[[ group ]]
75 |     expr.data = infercnv_obj@expr.data[,  group_idxs]
76 | 
77 |     params = get_parameters(group, expr.data)
78 |     params = t(as.data.frame(params))
79 |     
80 |     print(params)
81 |     
82 | }
83 | 
84 | 


--------------------------------------------------------------------------------
/scripts/examine_infercnv_data_params.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | args<-commandArgs(TRUE)
 4 | 
 5 | if (length(args) == 0) {
 6 |     stop("Error, require params: infercnv.obj");
 7 | }
 8 | 
 9 | infercnv_obj_file = args[1]
10 | 
11 | infercnv_obj = readRDS(infercnv_obj_file)
12 | 
13 | 
14 | library(edgeR)
15 | library(fitdistrplus)
16 | library(infercnv)
17 | library(Matrix)
18 | 
19 | # borrowing some code from splatter
20 | 
21 | get_parameters <- function(group_name, expr.matrix) {
22 | 
23 |     message(sprintf("getting params for: %s", group_name))
24 |     params = list()
25 |     params[['group_name']] = group_name
26 |     
27 |     # estimate gamma for  genes
28 |     lib.sizes <- colSums(expr.matrix)
29 |     lib.med <- median(lib.sizes)
30 |     norm.counts <- t(t(expr.matrix) / lib.sizes * lib.med)
31 |     norm.counts <- norm.counts[rowSums(norm.counts > 0) > 1, ]
32 | 
33 |     ## note, fitting the gamma is done differently in splatter... using method = "mge", gof = "CvM", and first winsorizing the data at q=0.1
34 |     means <- rowMeans(norm.counts)
35 |     means.fit <- fitdistrplus::fitdist(means, "gamma", method = "mme")
36 |     mean.shape = unname(means.fit$estimate["shape"])
37 |     mean.rate = unname(means.fit$estimate["rate"])
38 |     
39 |     params[[ 'gamma.mean.shape' ]] = mean.shape
40 |     params[[ 'gamma.mean.rate' ]] = mean.rate
41 |     
42 |     
43 |     # estimate dropout params
44 |     mean_vs_p0_table = infercnv:::.get_mean_vs_p0_from_matrix(expr.matrix)
45 |     logistic_params = infercnv:::.get_logistic_params(mean_vs_p0_table)
46 | 
47 |     params[['dropout.logistic.midpt']] = logistic_params$midpt
48 |     params[['dropout.logistic.slope']] = logistic_params$slope
49 |         
50 |     
51 |     # estimate common dispersion
52 |     design <- matrix(1, ncol(expr.matrix), 1)
53 |     disps <- edgeR::estimateDisp(expr.matrix, design = design)
54 | 
55 |     params[[ 'common.dispersion' ]] = disps$common.dispersion
56 |     
57 | 
58 |     return(params)
59 | 
60 | }
61 | 
62 | 
63 | 
64 | # examine each group
65 | all_groups = c(infercnv_obj@observation_grouped_cell_indices,  infercnv_obj@reference_grouped_cell_indices)
66 | all_groups[['combined_normal']] <- unlist(infercnv_obj@reference_grouped_cell_indices)
67 | 
68 | for (group in names(all_groups)) {
69 | 
70 |     group_idxs = all_groups[[ group ]]
71 |     expr.data = infercnv_obj@expr.data[,  group_idxs]
72 | 
73 |     params = get_parameters(group, expr.data)
74 |     params = t(as.data.frame(params))
75 |     
76 |     print(params)
77 |     
78 | }
79 | 
80 | 


--------------------------------------------------------------------------------
/scripts/examine_infercnv_data_params.just_dispersion.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | args<-commandArgs(TRUE)
 4 | 
 5 | if (length(args) == 0) {
 6 |     stop("Error, require params: infercnv.obj");
 7 | }
 8 | 
 9 | infercnv_obj_file = args[1]
10 | 
11 | infercnv_obj = readRDS(infercnv_obj_file)
12 | 
13 | 
14 | library(edgeR)
15 | library(fitdistrplus)
16 | library(infercnv)
17 | library(Matrix)
18 | 
19 | # borrowing some code from splatter
20 | 
21 | get_parameters <- function(group_name, expr.matrix) {
22 | 
23 |     message(sprintf("getting params for: %s", group_name))
24 |     params = list()
25 |     params[['group_name']] = group_name
26 |     
27 |     
28 |     # estimate common dispersion
29 |     design <- matrix(1, ncol(expr.matrix), 1)
30 |     disps <- edgeR::estimateDisp(expr.matrix, design = design)
31 | 
32 |     params[[ 'common.dispersion' ]] = disps$common.dispersion
33 |     
34 | 
35 |     return(params)
36 | 
37 | }
38 | 
39 | 
40 | 
41 | # examine each group
42 | all_groups = c(infercnv_obj@observation_grouped_cell_indices,  infercnv_obj@reference_grouped_cell_indices)
43 | all_groups[['combined_normal']] <- unlist(infercnv_obj@reference_grouped_cell_indices)
44 | 
45 | for (group in names(all_groups)) {
46 | 
47 |     group_idxs = all_groups[[ group ]]
48 |     expr.data = infercnv_obj@expr.data[,  group_idxs]
49 | 
50 |     params = get_parameters(group, expr.data)
51 |     params = t(as.data.frame(params))
52 |     
53 |     print(params)
54 |     
55 | }
56 | 
57 | 


--------------------------------------------------------------------------------
/scripts/examine_normal_cutoffs_vs_KS.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 |     
 5 | parser = ArgumentParser()
 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | parser$add_argument("--scale", help="scale", action='store_true', default=FALSE)
 8 | parser$add_argument("--subtract", help="subtract", action='store_true', default=FALSE)
 9 | parser$add_argument("--smooth", help="smooth", action='store_true', default=TRUE)
10 | parser$add_argument("--show_tumor", help="show tumor instead of normal", action='store_true', default=FALSE)
11 | parser$add_argument("--output", help="name of output png file", required=TRUE)
12 | 
13 | args = parser$parse_args()
14 | 
15 | library(infercnv)
16 | library(tidyverse)
17 | library(futile.logger)
18 | 
19 | infercnv_obj_file = args$infercnv_obj
20 | 
21 | infercnv_obj = readRDS(infercnv_obj_file)
22 | 
23 | if (! infercnv:::has_reference_cells(infercnv_obj)) {
24 |     stop("Error, cannot tune parameters without reference 'normal' cells defined")
25 | }
26 | 
27 | if (args$scale) {
28 |     infercnv_obj <- infercnv:::scale_infercnv_expr(infercnv_obj)
29 | }
30 | 
31 | if (args$subtract) {
32 |     infercnv_obj <- subtract_ref_expr_from_obs(infercnv_obj, inv_log=FALSE)
33 | }
34 | 
35 | 
36 | if (args$smooth) {
37 |     infercnv_obj <- smooth_by_chromosome(infercnv_obj, window_length=101, smooth_ends=TRUE)
38 | }
39 | 
40 | if (args$show_tumor) {
41 |     expr_vals <- infercnv_obj@expr.data[, unlist(infercnv_obj@observation_grouped_cell_indices)]
42 | } else {
43 |     expr_vals <- infercnv_obj@expr.data[, unlist(infercnv_obj@reference_grouped_cell_indices)]
44 | }
45 | 
46 | 
47 | mu = mean(expr_vals)
48 | sigma = sd(expr_vals)
49 | 
50 | data.want = data.frame(vals=as.numeric(expr_vals))
51 | 
52 | mean_delta = infercnv:::determine_mean_delta_via_Z(sigma, p=0.05)
53 | KS_delta = infercnv:::get_HoneyBADGER_setGexpDev(gexp.sd=sigma, alpha=0.05)
54 | 
55 | 
56 | png(args$output)
57 | 
58 | message("plotting ncells distribution")
59 | 
60 | message("mean delta: ", mean_delta)
61 | message("KS_delta: ", KS_delta)
62 | 
63 | p = data.want %>% ggplot(aes(vals)) +
64 |     geom_density(alpha=0.3)
65 | 
66 | p = p +
67 |     stat_function(fun=dnorm, color='black', args=list('mean'=mu,'sd'=sigma))
68 | 
69 | 
70 | ## add Z-based
71 | 
72 | p = p +
73 |     stat_function(fun=dnorm, color='blue', args=list('mean'=mu-mean_delta,'sd'=sigma)) +
74 |     stat_function(fun=dnorm, color='blue', args=list('mean'=mu+mean_delta,'sd'=sigma))
75 | 
76 | ## add KS-based
77 | 
78 | p = p +
79 |     stat_function(fun=dnorm, color='magenta', args=list('mean'=mu-KS_delta,'sd'=sigma)) +
80 |     stat_function(fun=dnorm, color='magenta', args=list('mean'=mu+KS_delta,'sd'=sigma))
81 | 
82 | plot(p)
83 | 


--------------------------------------------------------------------------------
/scripts/examine_normal_sampling_distributions.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressPackageStartupMessages(library("argparse"))
  4 |     
  5 | parser = ArgumentParser()
  6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
  7 | args = parser$parse_args()
  8 | 
  9 | library(infercnv)
 10 | library(tidyverse)
 11 | library(futile.logger)
 12 | 
 13 | infercnv_obj_file = args$infercnv_obj
 14 | 
 15 | infercnv_obj = readRDS(infercnv_obj_file)
 16 | 
 17 | if (! infercnv:::has_reference_cells(infercnv_obj)) {
 18 |     stop("Error, cannot tune parameters without reference 'normal' cells defined")
 19 | }
 20 | 
 21 | expr_vals <- infercnv_obj@expr.data
 22 | mu = mean(expr_vals)
 23 | sigma = sd(expr_vals)
 24 | nrounds = 1000
 25 | sds = c()
 26 | ngenes = nrow(expr_vals)
 27 | 
 28 | normal_samples = infercnv_obj@reference_grouped_cell_indices
 29 | 
 30 | num_normal_samples = length(normal_samples)
 31 | 
 32 | mean_vals_df = NULL;
 33 | z_p_val = 0.05
 34 | 
 35 | num_cells_to_empirical_sd = list()
 36 | 
 37 | ncells_partitions = seq (1,100,5)
 38 | for (ncells in ncells_partitions) {
 39 |     means = c()
 40 |     ## for each partition size: collect nrounds means, each over ncells values resampled (with replacement) from one random gene in one random reference group
 41 |     message(sprintf("num cells: %g", ncells))
 42 | 
 43 |     cells_counted = 0;
 44 |     
 45 |     for(i in 1:nrounds) {
 46 |         ## pick a single random gene (size=1; without it sample() returns a full permutation)
 47 |         rand.gene = sample(1:ngenes, size=1)
 48 |         
 49 |         ## pick a single random normal cell type
 50 |         rand.sample = sample(1:num_normal_samples, size=1)
 51 |         #rand.sample=1
 52 |         
 53 |         vals = sample(expr_vals[rand.gene, normal_samples[[rand.sample]] ], size=ncells, replace=T)
 54 |         m_val = mean(vals)
 55 |         means = c(means,  m_val)
 56 | 
 57 |         cells_counted = cells_counted + length(vals)
 58 | 
 59 |         
 60 |     }
 61 |     my.sd = sd(means)
 62 |     sds = c(sds, my.sd)
 63 | 
 64 |     num_cells_to_empirical_sd[[ ncells ]] = my.sd
 65 |     
 66 |     df = data.frame(num_cells=ncells, vals=means)
 67 |                                         #print(df)
 68 |     if(is.null(mean_vals_df)) {
 69 |         mean_vals_df = df
 70 |     } else {
 71 |         mean_vals_df = rbind(mean_vals_df, df)
 72 |     }
 73 |     
 74 | }
 75 | 
 76 | ## fit linear model
 77 | num_cells = ncells_partitions
 78 | 
 79 | write.table(data.frame(num_cells=num_cells, sds=sds), file='num_cells_vs_sds.table.dat', quote=F, sep="\t")
 80 | 
 81 | 
 82 | fit = lm(log(sds) ~ log(num_cells)) #note, hbadger does something similar, but not for the hmm cnv state levels
 83 | 
 84 | my.spline = smooth.spline(log(num_cells), log(sds)) 
 85 | 
 86 | message("plotting log(sd) vs. log(num_cells)")
 87 | 
 88 | plot(log(num_cells), log(sds), main='log(sd) vs. log(num_cells)')
 89 | 
 90 | plot(num_cells, sds, main='sd vs. num_cells')
 91 | 
 92 | my.spline2 = smooth.spline(num_cells, sds) 
 93 | 
 94 | ## store mean_delta for the single gene for convenience sake
 95 | mean_delta = qnorm(p=1-z_p_val, sd=sigma, mean=0)
 96 | 
 97 | normal_sd_trend = list(mu=mu,
 98 |                        sigma=sigma,
 99 |                        fit=fit,
100 |                        spline=my.spline,
101 |                        mean_delta=mean_delta)
102 | 
103 | 
104 | 
105 | ### do some plotting
106 | 
107 | 
108 | for (ncells in ncells_partitions) {
109 | 
110 |     message(sprintf("plotting ncells distribution: %g", ncells))
111 |     
112 |     data.want = mean_vals_df %>% filter(num_cells == ncells)
113 |     
114 |     
115 |     p = data.want %>% ggplot(aes(vals, fill=num_cells)) +
116 |         geom_density(alpha=0.3)
117 | 
118 |     sigma <- exp(predict(normal_sd_trend$fit,
119 |                          newdata=data.frame(num_cells=ncells))[[1]]) 
120 | 
121 |     message("ncells:", ncells, " sigma: ", sigma)
122 |     
123 |     p = p +
124 |         stat_function(fun=dnorm, color='black', args=list('mean'=1,'sd'=sigma))  +
125 |         ggtitle(sprintf("num_cells: %g, sd: %g", ncells, sigma))
126 | 
127 |     p = p +
128 |         stat_function(fun=dnorm, color='magenta', args=list('mean'=1,'sd'=num_cells_to_empirical_sd[[ ncells]] )) 
129 | 
130 | 
131 |     pval=0.01
132 |     
133 |     left_mean = 1 - 2 * (1-qnorm(p=pval, mean=1, sd=sigma))
134 |     message("left_mean: ", left_mean)
135 |     p = p +
136 |         stat_function(fun=dnorm, color='blue', args=list('mean'=left_mean,'sd'=sigma))
137 | 
138 | 
139 |     right_mean = 1 + 2 * (qnorm(p=1-pval, mean=1, sd=sigma)-1)
140 |     message("right_mean: ", right_mean)        
141 |         p = p +
142 |             stat_function(fun=dnorm, color='blue', args=list('mean'=right_mean,'sd'=sigma))  
143 |     
144 |     
145 | 
146 |     
147 | 
148 |     if (FALSE) {
149 |     
150 |         spline.sd = exp(predict(my.spline, x=log(ncells))$y)
151 |         
152 |         
153 |         p = p +
154 |             stat_function(fun=dnorm, color='green', args=list('mean'=1,'sd'=spline.sd)) 
155 |         
156 |         spline2.sd = predict(my.spline2, x=ncells)$y
157 |         
158 |         message(spline2.sd)
159 |         
160 |         p = p +
161 |             stat_function(fun=dnorm, color='orange', args=list('mean'=1,'sd'=spline2.sd)) 
162 |     }
163 |     
164 |     plot(p)
165 | }
166 | 
167 | 


--------------------------------------------------------------------------------
/scripts/examine_normal_sampling_distributions.i3.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressPackageStartupMessages(library("argparse"))
  4 |     
  5 | parser = ArgumentParser()
  6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
  7 | args = parser$parse_args()
  8 | 
  9 | library(infercnv)
 10 | library(tidyverse)
 11 | library(futile.logger)
 12 | 
 13 | infercnv_obj_file = args$infercnv_obj
 14 | 
 15 | infercnv_obj = readRDS(infercnv_obj_file)
 16 | 
 17 | 
 18 | expr_vals <- infercnv_obj@expr.data
 19 | 
 20 | 
 21 | sd_trend_info = infercnv:::.i3HMM_get_sd_trend_by_num_cells_fit(infercnv_obj)
 22 | 
 23 | 
 24 | mu = sd_trend_info$mu
 25 | sigma = sd_trend_info$sigma
 26 | 
 27 | sds = c()
 28 | ngenes = nrow(expr_vals)
 29 | 
 30 | tumor_samples = infercnv_obj@observation_grouped_cell_indices
 31 | 
 32 | print(tumor_samples)
 33 | 
 34 | num_tumor_samples = length(tumor_samples)
 35 | 
 36 | print(num_tumor_samples)
 37 | 
 38 | mean_vals_df = NULL;
 39 | z_p_val = 0.05
 40 | 
 41 | 
 42 | num_cells_to_empirical_sd = list()
 43 | 
 44 | nrounds=100
 45 | 
 46 | ncells_partitions = seq (1,100,5)
 47 | for (ncells in ncells_partitions) {
 48 |     means = c()
 49 |     
 50 |     message(sprintf("num cells: %g", ncells))
 51 | 
 52 |     cells_counted = 0;
 53 |     
 54 |     for(i in 1:nrounds) {
 55 |         ## pick a random gene
 56 |         rand.gene = sample(1:ngenes, size=1)
 57 |         
 58 |         ## pick a random tumor sample
 59 |         rand.sample = sample(1:num_tumor_samples, size=1)
 60 |                                         #rand.sample=1
 61 |         #print(rand.sample)
 62 |         
 63 |         vals = sample(expr_vals[rand.gene, tumor_samples[[rand.sample]] ], size=ncells, replace=T)
 64 |         m_val = mean(vals)
 65 |         means = c(means,  m_val)
 66 |         
 67 |         cells_counted = cells_counted + length(vals)
 68 |                 
 69 |     }
 70 |     means.sd = sd(means)
 71 |     means.mean = mean(means)
 72 |     
 73 |     num_cells_to_empirical_sd[[ ncells ]] = means.sd
 74 |     
 75 |     df = data.frame(num_cells=ncells, vals=means)
 76 | 
 77 |     message(sprintf("plotting ncells distribution: %g", ncells))
 78 |     
 79 |     data.want = df
 80 |     
 81 |     
 82 |     p = data.want %>% ggplot(aes(vals, fill=num_cells)) +
 83 |         geom_density(alpha=0.3)   +
 84 |         ggtitle(sprintf("num_cells: %g", ncells))
 85 | 
 86 |     ## draw parameterized distribution
 87 |     p = p +
 88 |         stat_function(fun=dnorm, color='black', args=list('mean'=means.mean,'sd'=means.sd))
 89 |     
 90 | 
 91 |     alpha=0.05
 92 |     ks_delta = infercnv:::get_HoneyBADGER_setGexpDev(gexp.sd=sd_trend_info$sigma, k_cells=ncells, alpha=alpha, plot=T)
 93 |     
 94 |     left_mean = means.mean - ks_delta
 95 |     message("left_mean: ", left_mean)
 96 |     p = p +
 97 |         stat_function(fun=dnorm, color='blue', args=list('mean'=left_mean,'sd'=means.sd))
 98 | 
 99 | 
100 |     right_mean = means.mean + ks_delta
101 |     message("right_mean: ", right_mean)        
102 |         p = p +
103 |             stat_function(fun=dnorm, color='blue', args=list('mean'=right_mean,'sd'=means.sd))  
104 |     
105 |         
106 |     plot(p)
107 | }
108 | 
109 | 


--------------------------------------------------------------------------------
/scripts/examine_simulated_vs_observed_dispersion.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | args<-commandArgs(TRUE)
 4 | 
 5 | if (length(args) == 0) {
 6 |     stop("Error, require params: infercnv.obj");
 7 | }
 8 | 
 9 | infercnv_obj_file = args[1]
10 | 
11 | pdf(paste0(infercnv_obj_file, '.dispersion_estimation.pdf'))
12 | 
13 | infercnv_obj = readRDS(infercnv_obj_file)
14 | 
15 | 
16 | library(edgeR)
17 | library(fitdistrplus)
18 | library(infercnv)
19 | 
20 | # examine each group
21 | normal_grp_idx <- unlist(infercnv_obj@reference_grouped_cell_indices)
22 | expr.matrix = infercnv_obj@expr.data[, normal_grp_idx]
23 | 
24 | 
25 | ## estimate dropout params
26 | mean_vs_p0_table = infercnv:::.get_mean_vs_p0_from_matrix(expr.matrix)
27 | logistic_params = infercnv:::.get_logistic_params(mean_vs_p0_table)
28 | 
29 | iterations=1
30 | dispersion_params = c(0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10)
31 | 
32 | resultset=matrix(0, ncol=3, nrow=iterations*length(dispersion_params))
33 | colnames(resultset) = c('target', 'before_Zinf', 'after_Zinf')
34 | 
35 | 
36 | row = 0
37 | 
38 | 
39 | for (common.dispersion in dispersion_params) {
40 |     message(sprintf("Exploring common.dispersion set at: %g", common.dispersion)) 
41 |     for (iter in 1:iterations) {
42 |         message(sprintf("\titer: %d", iter))
43 |         
44 |         row = row + 1
45 | 
46 |         ## simulate w/o zero-inflation
47 |         sim_counts = infercnv:::.get_simulated_cell_matrix(mean_vs_p0_table$m, NULL, 100, common_dispersion=common.dispersion)
48 | 
49 |         ## estimate common disp from these data:
50 |         design <- matrix(1, ncol(sim_counts), 1)
51 | 
52 | 
53 |         disps <- edgeR::estimateDisp(sim_counts, design = design)
54 |                                         #print(sprintf("estimated disp before dropouts: %g", disps$common.dispersion))
55 | 
56 |         resultset[row,1] <- common.dispersion
57 |         resultset[row,2] <- disps$common.dispersion
58 |         
59 | 
60 |         ## include zero-inflation
61 |         sim_counts = infercnv:::.get_simulated_cell_matrix(mean_vs_p0_table$m, mean_vs_p0_table, 100,
62 |                                                            common_dispersion=common.dispersion)
63 |         
64 |         
65 |         disps <- edgeR::estimateDisp(sim_counts, design = design)
66 |         resultset[row,3] <- disps$common.dispersion   
67 | 
68 |     }
69 | 
70 |     
71 | }
72 | 
73 | 
74 | resultset = as.data.frame(resultset)
75 | print(resultset)
76 | write.table(resultset, file=paste0(infercnv_obj_file, ".dispersion_estimation.dat"), quote=F, sep="\t")
77 | 
78 | ## examples:
79 | ## 10x:  0.221 + 1.05 * (true_dispersion)  # colon single sample
80 | ##       0.223 + 1.05 * (true_dispersion)  # multiple colon samples
81 | 
82 | ## smrtSeq: 0.95 + 1.56 * (true_dispersion)   # oligodendro
83 | ##          1.073 + 1.628 * (true_dispersion) # melanoma
84 | 
85 | 
86 | res.lm = lm(resultset[,3] ~ resultset[,1])
87 | 
88 | print(res.lm)
89 | 
90 | coeff  = res.lm$coefficients
91 | intercept = coeff[1]
92 | slope = coeff[2]
93 | 
94 | plot(resultset[,1], resultset[,3], main=sprintf("y=%g + %g * x", intercept, slope), col='green')
95 | points(resultset[,1], resultset[,2])
96 | 
97 | 
98 | 


--------------------------------------------------------------------------------
/scripts/examine_simulated_vs_observed_dispersion.from_matrix.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | args<-commandArgs(TRUE)
 4 | 
 5 | if (length(args) == 0) {
 6 |     stop("Error, require params: normal_cells.matrix");
 7 | }
 8 | 
 9 | matrix.file = args[1]
10 | 
11 | pdf(paste0(matrix.file, '.dispersion_estimation.pdf'))
12 | 
13 | library(edgeR)
14 | library(fitdistrplus)
15 | library(infercnv)
16 | 
17 | expr.matrix = read.table(matrix.file)
18 | 
19 | 
20 | ## estimate dropout params
21 | mean_vs_p0_table = infercnv:::.get_mean_vs_p0_from_matrix(expr.matrix)
22 | logistic_params = infercnv:::.get_logistic_params(mean_vs_p0_table)
23 | 
24 | iterations=1
25 | dispersion_params = c(0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10)
26 | 
27 | resultset=matrix(0, ncol=3, nrow=iterations*length(dispersion_params))
28 | colnames(resultset) = c('target', 'before_Zinf', 'after_Zinf')
29 | 
30 | 
31 | row = 0
32 | 
33 | 
34 | for (common.dispersion in dispersion_params) {
35 |     message(sprintf("Exploring common.dispersion set at: %g", common.dispersion)) 
36 |     for (iter in 1:iterations) {
37 |         message(sprintf("\titer: %d", iter))
38 |         
39 |         row = row + 1
40 | 
41 |         ## simulate w/o zero-inflation
42 |         sim_counts = infercnv:::.get_simulated_cell_matrix(mean_vs_p0_table$m, NULL, 100, common_dispersion=common.dispersion)
43 | 
44 |         ## estimate common disp from these data:
45 |         design <- matrix(1, ncol(sim_counts), 1)
46 | 
47 | 
48 |         disps <- edgeR::estimateDisp(sim_counts, design = design)
49 |                                         #print(sprintf("estimated disp before dropouts: %g", disps$common.dispersion))
50 | 
51 |         resultset[row,1] <- common.dispersion
52 |         resultset[row,2] <- disps$common.dispersion
53 |         
54 | 
55 |         ## include zero-inflation
56 |         sim_counts = infercnv:::.get_simulated_cell_matrix(mean_vs_p0_table$m, mean_vs_p0_table, 100,
57 |                                                            common_dispersion=common.dispersion)
58 |         
59 |         
60 |         disps <- edgeR::estimateDisp(sim_counts, design = design)
61 |         resultset[row,3] <- disps$common.dispersion   
62 | 
63 |     }
64 | 
65 |     
66 | }
67 | 
68 | 
69 | resultset = as.data.frame(resultset)
70 | print(resultset)
71 | write.table(resultset, file=paste0(matrix.file, ".dispersion_estimation.dat"), quote=F, sep="\t")
72 | 
73 | ## examples:
74 | ## 10x:  0.221 + 1.05 * (true_dispersion)  # colon single sample
75 | ##       0.223 + 1.05 * (true_dispersion)  # multiple colon samples
76 | 
77 | ## smrtSeq: 0.95 + 1.56 * (true_dispersion)   # oligodendro
78 | ##          1.073 + 1.628 * (true_dispersion) # melanoma
79 | 
80 | 
81 | res.lm = lm(resultset[,3] ~ resultset[,1])
82 | 
83 | print(res.lm)
84 | 
85 | coeff  = res.lm$coefficients
86 | intercept = coeff[1]
87 | slope = coeff[2]
88 | 
89 | plot(resultset[,1], resultset[,3], main=sprintf("y=%g + %g * x", intercept, slope), col='green')
90 | points(resultset[,1], resultset[,2])
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/scripts/explore_HMM_exec.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressPackageStartupMessages(library("argparse"))
  4 |     
  5 | parser = ArgumentParser()
  6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
  7 | parser$add_argument("--chr", help='restrict to chr', required=FALSE, nargs=1, default=NULL)
  8 | args = parser$parse_args()
  9 | 
 10 | library(infercnv)
 11 | library(futile.logger)
 12 | library(HiddenMarkov)
 13 | 
 14 | infercnv_obj_file = args$infercnv_obj
 15 | 
 16 | infercnv_obj = readRDS(infercnv_obj_file)
 17 | 
 18 | cnv_mean_sd=infercnv:::get_spike_dists(infercnv_obj@.hspike)
 19 | cnv_level_to_mean_sd_fit=infercnv:::get_hspike_cnv_mean_sd_trend_by_num_cells_fit(infercnv_obj@.hspike)
 20 | transition_out_p=1e-6
 21 | p_val=0.05
 22 | hclust_method='ward.D2'
 23 | 
 24 | 
 25 | flog.info(sprintf("predict_CNV_via_HMM_on_tumor_subclusters(p_val=%g)", p_val))
 26 | HMM_info  <- infercnv:::.get_HMM(cnv_mean_sd, transition_out_p)
 27 | chrs = unique(infercnv_obj@gene_order$chr)
 28 | expr.data = infercnv_obj@expr.data
 29 | gene_order = infercnv_obj@gene_order
 30 | hmm.data = expr.data
 31 | hmm.data[,] = -1 #init to invalid state
 32 | 
 33 | tumor_subclusters <- unlist(infercnv_obj@tumor_subclusters[["subclusters"]], recursive=F)
 34 | if (is.null(tumor_subclusters)) {
 35 |     message("No subclusters defined, running per-sample instead")
 36 |     tumor_subclusters <- infercnv_obj@observation_grouped_cell_indices
 37 | }
 38 | 
 39 | if (! is.null(args$chr)) {
 40 |    chrs = c(args$chr)
 41 | }  	  
 42 | 
 43 | 
 44 | ##########################################
 45 | #chrs = c('chr1')
 46 | ##########################################
 47 | 
 48 | 
 49 | ##############################################
 50 | ## From HiddenMarkovPackage
 51 | getj <- function (x, j)  {
 52 |     if (is.null(x)) 
 53 |         return(NULL)
 54 |     n <- length(x)
 55 |     for (i in 1:n) x[[i]] <- x[[i]][j]
 56 |     return(x)
 57 | }
 58 | 
 59 | 
 60 | local.Viterbi.dthmm <- function (object, ...) {
 61 |     x <- object$x
 62 |     dfunc <- HiddenMarkov:::makedensity(object$distn)
 63 |     n <- length(x)
 64 |     m <- nrow(object$Pi) # transition matrix
 65 |     nu <- matrix(NA, nrow = n, ncol = m)  # scoring matrix
 66 |     y <- rep(NA, n) # final trace
 67 |     pseudocount = 1e-20
 68 |     
 69 |     object$pm$sd = max(object$pm$sd)
 70 | 
 71 |     emissions <- matrix(NA, nrow = n, ncol = m) 
 72 |     emissions_pre <- emissions
 73 |     
 74 |     ## init first row
 75 |     emission <- pnorm(abs(x[1]-object$pm$mean)/object$pm$sd, log=T, lower.tail=F)
 76 |     #emissions_pre[1,] <- emission
 77 |     emissions_pre[1,] <- abs(x[1]-object$pm$mean)/object$pm$sd 
 78 | 
 79 |     emission <- 1 / (-1 * emission)
 80 |     emission <- emission / sum(emission)
 81 |     
 82 |     emissions[1,] <- log(emission)
 83 |     
 84 |     nu[1, ] <- log(object$delta) + # start probabilities
 85 |         emissions[1,]
 86 |     
 87 |     logPi <- log(object$Pi) # convert transition matrix to log(p)
 88 |     
 89 |     for (i in 2:n) {
 90 | 
 91 |         matrixnu <- matrix(nu[i - 1, ], nrow = m, ncol = m)
 92 |         
 93 |         #nu[i, ] <- apply(matrixnu + logPi, 2, max) +
 94 |         #              dfunc(x=x[i], object$pm, getj(object$pn, i),
 95 |         #                    log=TRUE)
 96 | 
 97 |         
 98 |         #emission <- dfunc(x=x[i], object$pm, getj(object$pn, i), log=T)
 99 |         ## normalize emission p-values
100 |         ## first add pseudcounts
101 |         #missions[i, ] <- emissions[i, ] + pseudocount
102 |         #emissions[i, ] <- emissions[i, ] / sum(emissions[i, ]) 
103 |  
104 |         #emissions[i, ] <- log(emissions[i, ])
105 |                 
106 | 
107 |         emission <- pnorm(abs(x[i]-object$pm$mean)/object$pm$sd, log=T, lower.tail=F)
108 |        	#emissions_pre[i,] <- emission
109 | 	emissions_pre[i,] <- abs(x[i]-object$pm$mean)/object$pm$sd
110 | 
111 | 	emission <- 1 / (-1 * emission)
112 |         emission <- emission / sum(emission)
113 |         
114 |         emissions[i, ] <- log(emission)
115 |         
116 |         nu[i, ] <- apply(matrixnu + logPi, 2, max) + emissions[i, ] 
117 | 
118 |         #print(matrixnu)
119 |         #print(logPi)
120 |     }
121 |     if (any(nu[n, ] == -Inf)) 
122 |         stop("Problems With Underflow")
123 | 
124 |     write.table(nu, file='nu.txt', quote=F, sep="\t")
125 |     write.table(emissions, file='emissions.txt', quote=F, sep="\t")
126 |     write.table(emissions_pre, file='emissions_pre.txt', quote=F, sep="\t")
127 | 
128 |     ## traceback
129 |     y[n] <- which.max(nu[n, ])
130 | 
131 |     for (i in seq(n - 1, 1, -1))
132 |         y[i] <- which.max(logPi[, y[i + 1]] + nu[i, ])
133 | 
134 |     return(y)
135 | }
136 | 
137 | 
138 | ##########################################
139 | 
140 | 
## Decode an HMM state path per chromosome from subcluster-averaged expression.
## Uses objects created earlier in this script: chrs, gene_order, expr.data,
## tumor_subclusters, cnv_mean_sd, cnv_level_to_mean_sd_fit, HMM_info, hmm.data.
for (chr in chrs) {
    print(chr)
    ## row indices of the genes annotated to this chromosome
    chr_gene_idx = which(gene_order$chr == chr)
    
    ## run through each cell for this chromosome:
    for (tumor_subcluster_name in names(tumor_subclusters)) {
        print(tumor_subcluster_name)
        tumor_subcluster_cells_idx <- tumor_subclusters[[tumor_subcluster_name]]
                
        ## one value per gene: the mean across the cells of this subcluster
        gene_expr_vals = rowMeans(expr.data[chr_gene_idx,tumor_subcluster_cells_idx,drop=F])
        ##gene_expr_vals = apply(expr.data[chr_gene_idx,tumor_subcluster_cells_idx,drop=F], 1, median)
        ## skip chromosomes with fewer than two genes
        if (length(gene_expr_vals) < 2) { next; }
        num_cells = length(tumor_subcluster_cells_idx)
        
        ## emission parameters are requested for this specific number of cells
        state_emission_params <- infercnv:::.get_state_emission_params(num_cells, cnv_mean_sd, cnv_level_to_mean_sd_fit)
        print(state_emission_params)
        print(gene_expr_vals)
        
        ## build the discrete-time HMM object with normal ("norm") emissions
        hmm <- HiddenMarkov::dthmm(gene_expr_vals,
                                   HMM_info[['state_transitions']],
                                   HMM_info[['delta']],
                                   "norm",
                                   state_emission_params)

        ## decode with the script-local Viterbi variant rather than the package one
        hmm_trace <- local.Viterbi.dthmm(hmm)
        
        print(hmm_trace)
        
        ## the per-gene state path is written to every cell column of the subcluster
        hmm.data[chr_gene_idx,tumor_subcluster_cells_idx] <- hmm_trace

        ## NOTE(review): this stops after the first subcluster that reaches this
        ## point for each chromosome -- looks like a debugging leftover; confirm.
        break
    }
}
174 | 
175 | 
176 | 
177 | 
178 | 


--------------------------------------------------------------------------------
/scripts/explore_HMM_exec.hspike.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressPackageStartupMessages(library("argparse"))
  4 |     
## Command line: a single required --infercnv_obj argument naming an RDS file.
parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)
library(futile.logger)
library(HiddenMarkov)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

## Both of these are derived from the object's .hspike slot via infercnv internals.
cnv_mean_sd=infercnv:::get_spike_dists(infercnv_obj@.hspike)
cnv_level_to_mean_sd_fit=infercnv:::get_hspike_cnv_mean_sd_trend_by_num_cells_fit(infercnv_obj@.hspike)
transition_out_p=1e-6
## p_val is only used in the log message below; hclust_method is not used in this script.
p_val=0.05
hclust_method='ward.D2'


flog.info(sprintf("predict_CNV_via_HMM_on_tumor_subclusters(p_val=%g)", p_val))
HMM_info  <- infercnv:::.get_HMM(cnv_mean_sd, transition_out_p)

## From here on the script operates on the .hspike object itself, not the loaded object.
infercnv_obj = infercnv_obj@.hspike

chrs = unique(infercnv_obj@gene_order$chr)
expr.data = infercnv_obj@expr.data
gene_order = infercnv_obj@gene_order
## hmm.data has the same dimensions as expr.data and receives the decoded states
hmm.data = expr.data
hmm.data[,] = -1 #init to invalid state

## Observation and reference groups are decoded alike; each named group is one "subcluster".
tumor_subclusters <- c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices)
 36 | 
 37 | 
 38 | ##########################################
 39 | #chrs = c('chr1')
 40 | ##########################################
 41 | 
 42 | 
 43 | ##############################################
 44 | ## From HiddenMarkovPackage
 45 | getj <- function (x, j)  {
 46 |     if (is.null(x)) 
 47 |         return(NULL)
 48 |     n <- length(x)
 49 |     for (i in 1:n) x[[i]] <- x[[i]][j]
 50 |     return(x)
 51 | }
 52 | 
 53 | 
 54 | local.Viterbi.dthmm <- function (object, ...){
 55 |     x <- object$x
 56 |     dfunc <- HiddenMarkov:::makedensity(object$distn)
 57 |     n <- length(x)
 58 |     m <- nrow(object$Pi) # transition matrix
 59 |     nu <- matrix(NA, nrow = n, ncol = m)  # scoring matrix
 60 |     y <- rep(NA, n) # final trace
 61 |     pseudocount = 1e-20
 62 |     
 63 |     emissions <- matrix(NA, nrow = n, ncol = m) 
 64 |     
 65 |     ## init first row
 66 |     emission <- pnorm(abs(x[1]-object$pm$mean)/object$pm$sd, log=T, lower.tail=F)
 67 |     emission <- 1 / (-1 * emission)
 68 |     emission <- emission / sum(emission)
 69 |     
 70 |     emissions[1,] <- log(emission)
 71 |     
 72 |     nu[1, ] <- log(object$delta) + # start probabilities
 73 |         emissions[1,]
 74 |     
 75 |     logPi <- log(object$Pi) # convert transition matrix to log(p)
 76 |     
 77 |     for (i in 2:n) {
 78 |         
 79 |         matrixnu <- matrix(nu[i - 1, ], nrow = m, ncol = m)
 80 |         
 81 |         #nu[i, ] <- apply(matrixnu + logPi, 2, max) +
 82 |         #              dfunc(x=x[i], object$pm, getj(object$pn, i),
 83 |         #                    log=TRUE)
 84 | 
 85 |         
 86 |         #emission <- dfunc(x=x[i], object$pm, getj(object$pn, i), log=T)
 87 |         ## normalize emission p-values
 88 |         ## first add pseudcounts
 89 |         #missions[i, ] <- emissions[i, ] + pseudocount
 90 |         #emissions[i, ] <- emissions[i, ] / sum(emissions[i, ]) 
 91 |  
 92 |         #emissions[i, ] <- log(emissions[i, ])
 93 |                 
 94 | 
 95 |         emission <- pnorm(abs(x[i]-object$pm$mean)/object$pm$sd, log=T, lower.tail=F)
 96 |         emission <- 1 / (-1 * emission)
 97 |         emission <- emission / sum(emission)
 98 |         
 99 |         emissions[i, ] <- log(emission)
100 |         
101 |         nu[i, ] <- apply(matrixnu + logPi, 2, max) + emissions[i, ] 
102 | 
103 |         #print(matrixnu)
104 |         #print(logPi)
105 |     }
106 |     if (any(nu[n, ] == -Inf)) 
107 |         stop("Problems With Underflow")
108 | 
109 |     write.table(nu, file='nu.txt', quote=F, sep="\t")
110 |     write.table(emissions, file='emissions.txt', quote=F, sep="\t")
111 | 
112 |     ## traceback
113 |     y[n] <- which.max(nu[n, ])
114 | 
115 |     for (i in seq(n - 1, 1, -1))
116 |         y[i] <- which.max(logPi[, y[i + 1]] + nu[i, ])
117 | 
118 |     return(y)
119 | }
120 | 
121 | 
122 | ##########################################
123 | 
124 | #chrs = c("chr13")
## Decode an HMM state path for every chromosome and every cell group of the
## hspike object, storing the states in hmm.data.
for (chr in chrs) {
    print(chr)
    ## row indices of the genes annotated to this chromosome
    chr_gene_idx = which(gene_order$chr == chr)
    
    ## run through each cell for this chromosome:
    for (tumor_subcluster_name in names(tumor_subclusters)) {
        print(tumor_subcluster_name)
        tumor_subcluster_cells_idx <- tumor_subclusters[[tumor_subcluster_name]]
                
        ## one value per gene: the mean across the cells of this group
        gene_expr_vals = rowMeans(expr.data[chr_gene_idx,tumor_subcluster_cells_idx,drop=F])
        ##gene_expr_vals = apply(expr.data[chr_gene_idx,tumor_subcluster_cells_idx,drop=F], 1, median)
        
        ## NOTE(review): there is no guard here for chromosomes with a single
        ## gene; the decoder's loops are written with 2:n -- confirm such
        ## chromosomes cannot occur in the hspike data.
        num_cells = length(tumor_subcluster_cells_idx)
        
        ## emission parameters are requested for this specific number of cells
        state_emission_params <- infercnv:::.get_state_emission_params(num_cells, cnv_mean_sd, cnv_level_to_mean_sd_fit)
        print(state_emission_params)
        print(gene_expr_vals)
        
        ## build the discrete-time HMM object with normal ("norm") emissions
        hmm <- HiddenMarkov::dthmm(gene_expr_vals,
                                   HMM_info[['state_transitions']],
                                   HMM_info[['delta']],
                                   "norm",
                                   state_emission_params)

        ## decode with the script-local Viterbi variant defined above
        hmm_trace <- local.Viterbi.dthmm(hmm)
        
        print(hmm_trace)
        
        ## the per-gene state path is written to every cell column of the group
        hmm.data[chr_gene_idx,tumor_subcluster_cells_idx] <- hmm_trace

        
    }
}
158 | 
159 | 
160 | 
161 | 
162 | 


--------------------------------------------------------------------------------
/scripts/genome_smoothed_lineplots.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 |     
## Command line: a single required --infercnv_obj argument naming an RDS file.
parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)
library(ggplot2)
library(futile.logger)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

## All plots go to one PDF named after the input file. The script never calls
## dev.off() itself.
pdf(sprintf("%s.chr_lineplots.pdf", infercnv_obj_file))

## Named lists of cell column indices; read as globals by the functions below.
normal_groups = infercnv_obj@reference_grouped_cell_indices
tumor_groups = infercnv_obj@observation_grouped_cell_indices

expr.data = infercnv_obj@expr.data

## num_tumor_groups is not referenced again in this script
num_tumor_groups = length(tumor_groups)

## One panel per smoothing window size, stacked vertically on each page.
windowsizes = c(25,50,75,100)
num_windowsizes = length(windowsizes)
par(mfrow=c(num_windowsizes, 1))

library(tidyverse)
31 | 
32 | 
## Draw one line-plot panel with every normal curve plus the tumor curve.
##
## normal_pts: named list of numeric vectors, one per normal group.
## tumor_pts:  named list of numeric vectors; its last name is used in the title.
## windowsize: smoothing window size, shown in the panel title only.
plotme <- function(normal_pts, tumor_pts, windowsize) {

    ## normals first, tumor last: the title below relies on this order
    all_pts = c(normal_pts, tumor_pts)

    all_pts_names = names(all_pts)

    my.colors = rainbow(length(all_pts))
    
    ## shared y range so every curve fits in the panel
    yrange = range(unlist(all_pts))

    text.adj = 0.7
    for (i in 1:length(all_pts)) {
        if (i == 1) {
            ## the first curve creates the plot; the y-axis label is not set
            ## here, so it defaults to the deparsed `all_pts[[i]]` expression
            plot(all_pts[[i]], t='l', col=my.colors[i], main=sprintf("windowsize: %g, tumor: %s", windowsize, all_pts_names[length(all_pts_names)]), ylim=yrange,
                 cex.lab=text.adj, cex.main=text.adj, cex.axis=text.adj)
        } else {
            ## the remaining curves are overlaid as lines
            points(all_pts[[i]], t='l', col=my.colors[i])
        }
    }
    ## horizontal reference line at zero
    abline(h=0)
    legend('top', legend=all_pts_names, col=my.colors, pch=1, horiz=T, bty='n', cex=text.adj)
    
}
56 | 
57 | 
58 | 
## Median-centred mean of per-cell running-mean profiles for one cell group.
##
## cell_idx:   column indices into the global expr.data matrix.
## windowsize: window size k passed to caTools::runmean.
##
## Returns one value per gene: each cell's values are smoothed along the gene
## axis, the smoothed cells are averaged, and the median is subtracted.
get_smoothed <- function(cell_idx, windowsize) {
    ## drop=FALSE keeps a matrix for single-cell groups; without it the
    ## subset collapses to a vector and apply() fails
    group_expr_data = expr.data[, cell_idx, drop=FALSE]
    smoothed = apply(group_expr_data, 2, caTools::runmean, k=windowsize)
    smoothed_mean = rowMeans(smoothed)

    ## center it:
    smoothed_mean = smoothed_mean - median(smoothed_mean)
    
    return(smoothed_mean)
}
69 | 
## Plot one panel per window size for a tumor group against all normal groups.
##
## tumor_type: name of an entry in the global tumor_groups list.
##
## Reads the globals windowsizes, normal_groups and tumor_groups, and draws
## through plotme() on the current graphics device.
plot_chr_smooths <- function(tumor_type) {

    for (windowsize in windowsizes) {
        message(sprintf("\t-plotting %s", tumor_type))

        ## smoothed profile of every normal group at this window size
        normal_pts = list()
        for (normal_type in names(normal_groups)) {
            normal_pts[[ normal_type ]] <- get_smoothed(normal_groups[[normal_type]], windowsize)
        }

        ## single-entry list, so plotme() can read the tumor name from names()
        tumor_pts = list()
        tumor_pts[[ tumor_type ]] = get_smoothed(tumor_groups[[tumor_type]], windowsize)
        plotme(normal_pts, tumor_pts, windowsize)
    }
}
89 | 
90 | 
91 | 
92 | 
## Produce the smoothed line plots for every tumor (observation) group in turn.
for (tumor_type in names(tumor_groups)) {
    message(sprintf("plotting for %s", tumor_type))
    plot_chr_smooths(tumor_type)
}
97 | 


--------------------------------------------------------------------------------
/scripts/gtf_to_position_file.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | 
  4 | """
  5 | Converts GTF files to proprietary formats.
  6 | """
  7 | 
  8 | 
  9 | # Import statements
 10 | import argparse
 11 | import csv
 12 | import os
 13 | 
 14 | __author__ = 'Timothy Tickle, Itay Tirosh, Brian Haas'
 15 | __copyright__ = 'Copyright 2016'
 16 | __credits__ = ["Timothy Tickle"]
 17 | __license__ = 'BSD-3'
 18 | __maintainer__ = 'Timothy Tickle'
 19 | __email__ = 'ttickle@bbroadinstitute.org'
 20 | __status__ = 'Development'
 21 | 
 22 | 
 23 | def convert_to_positional_file(input_gtf, output_positional, attribute_key):
 24 |     """ Convert input GTF file to positional file.
 25 | 
 26 |     :param input_gtf: Path to input gtf file
 27 |     :type input_gtf: String
 28 |     :param output_positional: Path to output positional file
 29 |     :type output_positional: String
 30 |     :param attribute_key: Key of the GTF attribute to use for feature/row names
 31 |     :type attribute_key: String
 32 | 
 33 |     :returns: Indicator of success (True) or Failure (False)
 34 |     :rtype: boolean
 35 |     """
 36 | 
 37 |     if not input_gtf or not os.path.exists(input_gtf):
 38 |         print("".join(["gtf_to_position_file.py:: ",
 39 |                        "Could not find input file : " + input_gtf]))
 40 | 
 41 |     all_genes_found = set()
 42 | 
 43 |     # Holds lines to output after parsing.
 44 |     output_line = []
 45 |     previous_gene = None
 46 |     previous_chr = None
 47 |     gene_positions = []
 48 | 
 49 |     # Metrics for the file
 50 |     i_comments = 0
 51 |     i_duplicate_entries = 0
 52 |     i_entries = 0
 53 |     i_accepted_entries = 0
 54 |     i_written_lines = 0
 55 | 
 56 |     with open(input_gtf, "r") as gtf:
 57 |         gtf_file = csv.reader(gtf,delimiter="\t")
 58 |         for gtf_line in gtf_file:
 59 |             if gtf_line[0][0] == "#":
 60 |                 i_comments += 1
 61 |                 continue
 62 |             i_entries += 1
 63 |             # Clean up the attribute keys and match the one of interest.
 64 |             attributes = gtf_line[8].split(";")
 65 |             attributes = [entry.strip(" ") for entry in attributes]
 66 |             attributes = [entry.split(" ") for entry in attributes if entry]
 67 |             attributes = [[entry[0].strip('"'),entry[1].strip('"')] for entry in attributes]
 68 |             attributes = dict([[entry[0].split("|")[0],entry[1]] for entry in attributes])
 69 |             if attribute_key in attributes:
 70 |                 gene_name = attributes[attribute_key]
 71 |             else:
 72 |                 print("Could not find an attribute in the GTF with the name '"+attribute_key+"'. Line="+"\t".join(gtf_line))
 73 |                 exit(99)
 74 |             if not gene_name == previous_gene:
 75 |                 if len(gene_positions) > 1 and previous_gene not in all_genes_found:
 76 |                     i_accepted_entries += 1
 77 |                     gene_positions.sort()
 78 |                     output_line.append("\t".join([previous_gene,
 79 |                                                   previous_chr,
 80 |                                                   str(gene_positions[0]),
 81 |                                                   str(gene_positions[-1])]))
 82 |                     all_genes_found.add(previous_gene)
 83 |                 gene_positions = []
 84 |             else:
 85 |                 i_duplicate_entries += 1
 86 |             gene_positions += [int(gtf_line[3]), int(gtf_line[4])]
 87 |             previous_gene = gene_name
 88 |             previous_chr = gtf_line[0]
 89 |         if previous_gene and previous_chr and len(gene_positions) > 1:
 90 |             i_accepted_entries += 1
 91 |             gene_positions.sort()
 92 |             output_line.append("\t".join([previous_gene,
 93 |                                           previous_chr,
 94 |                                           str(gene_positions[0]),
 95 |                                           str(gene_positions[-1])]))
 96 | 
 97 |     with open(output_positional, "w") as positional_file:
 98 |         i_written_lines += len(output_line)
 99 |         positional_file.write("\n".join(output_line))
100 | 
101 |     # Print metrics
102 |     print("Number of lines read: " + str(i_entries))
103 |     print("Number of comments: " + str(i_comments))
104 |     print("Number of entries: " + str(i_accepted_entries))
105 |     print("Number of duplicate entries: " + str(i_duplicate_entries))
106 |     print("Number of entries written: " + str(i_written_lines))
107 | 
if __name__ == "__main__":

    # Parse arguments
    prsr_arguments = argparse.ArgumentParser(prog='gtf_to_position_file.py',
                                             description='Convert a GTF file to a positional file.',
                                             formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Add positional argument
    prsr_arguments.add_argument("input_gtf",
                                metavar="input_gtf",
                                help="Path to the input GTF file.")
    prsr_arguments.add_argument("--attribute_name",
                                metavar="attribute_name",
                                default="gene_id",
                                help="The name of the attribute in the GTF attributes to use instead of gene name, for example 'gene_name' or 'transcript_id'.")
    prsr_arguments.add_argument("output_positional",
                                metavar="output_positional",
                                help="Path for the output positional file.")
    args = prsr_arguments.parse_args()

    # Run Script
    # The converter is documented to return a success indicator. Exit with a
    # non-zero status only on an explicit False, so a converter that returns
    # nothing is not treated as a failure.
    if convert_to_positional_file(args.input_gtf, args.output_positional, args.attribute_name) is False:
        raise SystemExit(1)
129 | 


--------------------------------------------------------------------------------
/scripts/inferCNV_to_HB.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressPackageStartupMessages(library("argparse"))
  4 |     
## Command line: required --infercnv_obj RDS file, optional --no_scale_data flag.
parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
parser$add_argument("--no_scale_data", help="dont scale the data (ie. already scaled)", required=F, action='store_true', default=FALSE)
args = parser$parse_args()

library(infercnv)
library(ggplot2)
library(futile.logger)
library(HoneyBADGER)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

require(biomaRt) ## for gene coordinates
## The mart is pinned to one archived Ensembl host and the human gene dataset.
mart.obj <- useMart(biomart = "ENSEMBL_MART_ENSEMBL",
                    dataset = 'hsapiens_gene_ensembl',
                    host = "jul2015.archive.ensembl.org")

## do_scale is read as a global by run_hbadger(); it is on unless --no_scale_data is given.
do_scale=TRUE
if (args$no_scale_data) {
    do_scale=FALSE
}
 28 | 
 29 | 
 30 | run_hbadger <- function(tumor_group_name, normal_matrix, tumor_matrix) {
 31 | 
 32 |     hb <- new('HoneyBADGER', name=tumor_group_name)
 33 | 
 34 |     ref_normal <- rowMeans(normal_matrix)
 35 |     
 36 |     hb$setGexpMats(tumor_matrix, ref_normal, mart.obj, filter=FALSE, scale=do_scale, verbose=TRUE)
 37 |     
 38 |     pdf(sprintf("%s-hb.pdf", tumor_group_name))
 39 | 
 40 |     hb$plotGexpProfile() ## initial visualization
 41 | 
 42 | 
 43 |     hb$setMvFit(verbose=TRUE)
 44 |     hb$setGexpDev(verbose=TRUE)
 45 |     hb$calcGexpCnvBoundaries(init=TRUE, verbose=FALSE)
 46 |     
 47 | 
 48 |     ## double check what CNVs were identified
 49 |     bgf <- hb$bound.genes.final
 50 |     genes <- hb$genes
 51 |     regions.genes <- range(genes[unlist(bgf)])
 52 |     
 53 |     print(regions.genes)
 54 | 
 55 |     if (length(regions.genes) == 0) {
 56 |         message("No cnv regions identified")
 57 |         return()
 58 |     }
 59 |     
 60 |     ## Indeed, our initial HMM has identified a number of candidate CNVs to test. We can now retest all identified CNVs on all cells to derive the final posterior probability of each CNV in each cell. We can cluster cells on these posterior probabilities and visualize them as a heatmap.
 61 |     
 62 |     hb$retestIdentifiedCnvs(retestBoundGenes = TRUE, retestBoundSnps = FALSE, verbose=FALSE)
 63 |     
 64 |     ## look at final results
 65 |     results <- hb$summarizeResults(geneBased=TRUE, alleleBased=FALSE)
 66 |     print(head(results[,1:7]))
 67 |     write.table(results[,1:7], sprintf("%s-hb.cnvs.tsv", tumor_group_name), quote=F, sep="\t")
 68 |     
 69 |     
 70 |     ## visualize as heatmap 
 71 |     trees <- hb$visualizeResults(geneBased=TRUE, alleleBased=FALSE, details=TRUE, margins=c(25,15))
 72 |     
 73 |     ## order cells
 74 |     hc <- trees$hc
 75 |     order <- hc$labels[hc$order]
 76 |     ## plot all chromosomes
 77 |     hb$plotGexpProfile(cellOrder=order)
 78 |     
 79 |     
 80 |     ## plot just identified cnvs
 81 |     hb$plotGexpProfile(cellOrder=order, region=hb$cnvs[['gene-based']][['amp']])
 82 |     
 83 |     hb$plotGexpProfile(cellOrder=order, region=hb$cnvs[['gene-based']][['del']])
 84 |     
 85 |     
 86 | }
 87 | 
 88 | 
 89 | 
 90 | 
 91 | normal_matrix = infercnv_obj@expr.data[, unlist(infercnv_obj@reference_grouped_cell_indices), drop=F]
 92 | 
 93 | tumor_groups = infercnv_obj@observation_grouped_cell_indices
 94 | 
 95 | tumor_group_names = names(tumor_groups)
 96 | tumor_group_name = tumor_group_names[1] # for debugging
 97 | for (tumor_group_name in tumor_group_names) {
 98 |     tumor_grp_idx = tumor_groups[[tumor_group_name]]
 99 | 
100 |     tumor_matrix = infercnv_obj@expr.data[,tumor_grp_idx]
101 | 
102 |     run_hbadger(tumor_group_name, normal_matrix, tumor_matrix)
103 | }
104 | 


--------------------------------------------------------------------------------
/scripts/inferCNV_utils.R:
--------------------------------------------------------------------------------
  1 | 
  2 | library(tidyverse)
  3 | library(futile.logger)
  4 | 
  5 | # plot expression density by chromosome for each observation group, reference groups are shown as single 'normal' group.
  6 | plot_density_by_chr <- function(infercnv_obj, pdf_filename=NULL, exclude_range=NULL, include_range = NULL, chrs=NULL) {
  7 | 
  8 |     ref_group_cell_indices = infercnv:::get_reference_grouped_cell_indices(infercnv_obj)
  9 |     
 10 |     
 11 |     if (is.null(chrs)) {
 12 |         chrs = unique(infercnv_obj@gene_order$chr) 
 13 |     }
 14 |     
 15 |     if (! is.null(pdf_filename)) {
 16 |         pdf(pdf_filename)
 17 |     }
 18 | 
 19 | 
 20 |     chr_expr_vals = list()
 21 |     
 22 |     for (chr in chrs) {
 23 | 
 24 |         
 25 |         gene_idx = which(infercnv_obj@gene_order$chr == chr)
 26 |         
 27 |         ref_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx,ref_group_cell_indices])
 28 |         
 29 |         df = data.frame(class='normal', vals=ref_data_pts)
 30 |         
 31 |         for (tumor in names(infercnv_obj@observation_grouped_cell_indices) ) {
 32 |             
 33 |             tumor_cell_idx = infercnv_obj@observation_grouped_cell_indices[[ tumor ]]
 34 |             tumor_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx, tumor_cell_idx])
 35 |             
 36 |             df = rbind(df, data.frame(class=tumor, vals=tumor_data_pts))
 37 |         }
 38 | 
 39 |         flog.info(sprintf("Plotting data for chr: %s", chr))
 40 | 
 41 |         if (! is.null(exclude_range)) {
 42 |             excl_range_left = exclude_range[1]
 43 |             excl_range_right = exclude_range[2]
 44 | 
 45 |             df = df %>% filter(vals < excl_range_left | vals > excl_range_right)
 46 |         } else if (! is.null(include_range)) {
 47 |             include_range_left = include_range[1]
 48 |             include_range_right = include_range[2]
 49 | 
 50 |             df = df %>% filter(vals >= include_range_left & vals <= include_range_right)
 51 |         }
 52 |                 
 53 |         p = df %>% ggplot(aes(vals, fill=class)) + geom_density(alpha=0.3) + scale_y_continuous(trans='log10', limits=c(1,NA)) + ggtitle(chr)
 54 |         plot(p)
 55 | 
 56 |         chr_expr_vals[[ chr ]] = df
 57 |         
 58 |     }
 59 | 
 60 |     if (! is.null(pdf_filename)) {
 61 |         dev.off()
 62 |     }
 63 | 
 64 |     return(chr_expr_vals)
 65 |     
 66 | }
 67 | 
 68 | 
 69 | 
 70 | # plot the spike distribution for each specified chromosome in a single density plot
 71 | plot_spike_dist <- function(infercnv_obj, chrs) {
 72 | 
 73 | 
 74 |     spike_cell_idx = infercnv_obj@observation_grouped_cell_indices[[ 'SPIKE' ]]
 75 | 
 76 |     spike_expr = infercnv_obj@expr.data[ , spike_cell_idx ]
 77 | 
 78 |     df = data.frame(class='rest', vals=as.numeric(spike_expr[ -1 * which(infercnv_obj@gene_order$chr %in% chrs), ]))
 79 | 
 80 |     for (chr in chrs) {
 81 | 
 82 |         df = rbind(df, data.frame(class=chr, vals=as.numeric(spike_expr[ which(infercnv_obj@gene_order$chr == chr), ])))
 83 |     }
 84 | 
 85 |     p = df %>% ggplot(aes(vals, fill=class)) + geom_density(alpha=0.3) + scale_y_continuous(trans='log10', limits=c(1,NA)) + ggtitle('spike')
 86 |     plot(p)
 87 |     
 88 | }
 89 | 
 90 | ## examine dist of counts of non-zero valued genes per cell per grouping 
 91 | plot_dist_counts_expr_genes_by_chr <- function(infercnv_obj, pdf_filename=NULL, chrs=NULL) {
 92 | 
 93 |     group_indices = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices)
 94 | 
 95 |     if (is.null(chrs)) {
 96 |         chrs = unique(infercnv_obj@gene_order$chr) 
 97 |     }
 98 |     
 99 |     if (! is.null(pdf_filename)) {
100 |         pdf(pdf_filename)
101 |     }
102 | 
103 |     gene_counts_dfs = list()
104 |     
105 |     for (chr in chrs) {
106 |         gene_idx = which(infercnv_obj@gene_order$chr == chr)
107 | 
108 |         df = NULL
109 |         for (group in names(group_indices)) {
110 |             cell_idx = group_indices[[group]]
111 |             expr.data = infercnv_obj@expr.data[gene_idx, cell_idx]
112 |             gene_counts = apply(expr.data, 2, function(x) { sum(x != 0) } )
113 |             if (is.null(df)) {
114 |                 df = data.frame(class=group, gene_counts=gene_counts)
115 |             } else {
116 |                 df = rbind(df, data.frame(class=group, gene_counts=gene_counts))
117 |             }
118 |         }
119 |         p = df %>% ggplot(aes(gene_counts, fill=class)) + geom_density(alpha=0.3) + ggtitle(chr)
120 |         plot(p)
121 | 
122 |         gene_counts_dfs[[ chr ]] = df
123 |     }
124 | 
125 |     if (! is.null(pdf_filename)) {
126 |         dev.off()
127 |     }
128 | 
129 |     return(gene_counts_dfs)
130 | }
131 | 
132 | 
133 | 
#' takes the mean expr per gene per group
#' returns dataframe with mean_gene_grpA, mean_gene_grpB 
#'
#' infercnv_obj: infercnv object.
#' groupA, groupB: names of observation or reference groups, or "normal" for
#'                 the indices returned by get_reference_grouped_cell_indices().
#' chr: optional chromosome name; when given, only genes on it are used.
#'
#' Also draws a smoothScatter of the two mean vectors with the identity line.
#' Stops when either group name is unknown.
compare_gene_expr_means_by_group_pair <- function(infercnv_obj, groupA, groupB, chr=NULL) {

    group_indices = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices)

    group_indices[[ "normal" ]] = infercnv:::get_reference_grouped_cell_indices(infercnv_obj)

    ## an unknown name would otherwise select zero columns and give NaN means
    unknown_groups = setdiff(c(groupA, groupB), names(group_indices))
    if (length(unknown_groups) > 0) {
        stop(sprintf("unknown group name(s): %s", paste(unknown_groups, collapse=", ")))
    }
    
    expr.data = infercnv_obj@expr.data
    
    if (! is.null(chr)) {
        gene_idx = which(infercnv_obj@gene_order$chr == chr)
        ## drop=FALSE keeps a matrix when the chromosome has a single gene
        expr.data = expr.data[gene_idx, , drop=FALSE]
    }
    ## drop=FALSE keeps a matrix when a group has a single cell; rowMeans()
    ## needs two dimensions
    groupA.expr.data = expr.data[, group_indices[[ groupA ]], drop=FALSE ]
    groupB.expr.data = expr.data[, group_indices[[ groupB ]], drop=FALSE ]

    groupA.gene_mean = rowMeans(groupA.expr.data)
    groupB.gene_mean = rowMeans(groupB.expr.data)

    #plot(groupA.gene_mean, groupB.gene_mean)
    smoothScatter(groupA.gene_mean, groupB.gene_mean)
    abline(a=0, b=1, col='magenta')
    
    df=data.frame(groupA=groupA.gene_mean, groupB=groupB.gene_mean)

    return(df)
    
}
163 | 
#' Overlay two comparisons against the 'normal' group in one scatter plot:
#' per-gene means of the given tumor group (default colour) and of the
#' 'SPIKE' group (red), with the identity line drawn in blue.
#'
#' infercnv_obj: infercnv object.
#' tumor_type: name of the tumor group, also used as the y-axis label.
#' chr: chromosome passed through to compare_gene_expr_means_by_group_pair().
#' xlim, ylim: optional axis limits for the scatter plot.

compare_gene_expr_means_spike_vs_cancer_to_normal <- function(infercnv_obj, tumor_type, chr, xlim=NULL, ylim=NULL) {

    ## each helper call also draws its own smoothScatter before returning
    normal_and_spike <- compare_gene_expr_means_by_group_pair(infercnv_obj, 'normal', 'SPIKE', chr)
    normal_and_tumor <- compare_gene_expr_means_by_group_pair(infercnv_obj, 'normal', tumor_type, chr)

    ## column 1 holds the normal means, column 2 the compared group's means
    plot(normal_and_tumor[,1], normal_and_tumor[,2],
         xlab='normal', ylab=tumor_type,
         xlim=xlim, ylim=ylim)
    points(normal_and_spike[,1], normal_and_spike[,2], col='red')
    abline(a=0, b=1, col='blue')

}
176 |                                                               
177 | 
#' model the mean-to-variance relationship
#'
#' infercnv_obj: infercnv object; expr.data and both sets of grouped cell indices are read.
#'
#' Returns a data frame with one row per gene per group: g (group name),
#' m (mean across the group's cells) and v (variance across the group's
#' cells). Returns NULL when there are no groups.

get_mean_var <- function(infercnv_obj) {

    group_indices = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices)

    mean_var_table = NULL
    
    for (group_name in names(group_indices)) {
        flog.info(sprintf("processing group: %s", group_name))
        ## drop=FALSE keeps a matrix for single-cell groups
        expr.data = infercnv_obj@expr.data[, group_indices[[ group_name ]], drop=FALSE ]
        m = rowMeans(expr.data)
        v = apply(expr.data, 1, var)
        ## label rows with the loop variable; the original referenced an
        ## undefined `group` here
        group_table = data.frame(g=group_name, m=m, v=v)
        if (is.null(mean_var_table)) {
            mean_var_table = group_table
        } else {
            mean_var_table = rbind(mean_var_table, group_table)
        }
    }

    
    
    return(mean_var_table)
}
202 | 
## Plot variance against mean on a log2(x + 1) scale with a fitted spline.
##
## mvtable: data frame with columns m (mean) and v (variance), as produced by
##          get_mean_var().
plot_mean_var_table <- function(mvtable) {
    ## smoothing spline of log2 variance on log2 mean
    s = smooth.spline(log2(mvtable$m+1), log2(mvtable$v+1))
    ## fitted values at the observed means
    p = predict(s, log2(mvtable$m+1))
    ## axis labels are not set explicitly here, so they derive from these
    ## argument expressions
    smoothScatter(log2(mvtable$m+1), log2(mvtable$v+1))
    points(p, col='green', pch='.')
}
209 | 


--------------------------------------------------------------------------------
/scripts/infercnv_obj_to_input_files.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 |     
## Command line: a single required --infercnv_obj argument naming an RDS file.
parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

## write counts matrix
write.table(infercnv_obj@count.data, file='sc.counts.matrix', quote=F, sep="\t")

cellnames = colnames(infercnv_obj@count.data)

## reference groups first, then observation groups
groupings = c(infercnv_obj@reference_grouped_cell_indices, infercnv_obj@observation_grouped_cell_indices)

## write cell annotation file
## one (cell name, group name) row per cell, stacked over all groups
cell.annots = do.call(rbind, lapply(names(groupings), function(groupname) {
    cell_idx = groupings[[ groupname ]]
    group.cellnames = cellnames[cell_idx]

    return(data.frame(cells=group.cellnames, type=groupname))
}))

## keep only rows whose cell name is a column of the counts matrix
cell.annots = cell.annots[ cell.annots$cells %in% colnames(infercnv_obj@count.data), ]

## two tab-separated columns, no header: cell name, group name
write.table(cell.annots, file="cell_annots.txt", quote=F, row.names=F, col.names=F, sep="\t")
33 | 
## write infercnv runner:
## The template's single %s is filled with the reference group names joined by
## "','", giving the contents of ref_group_names=c('...'). The generated
## script reads the two files written above and expects
## gencode_v19_gene_pos.txt as the gene order file.

cat(file='run.infercnv.R', sprintf("#!/usr/bin/env Rscript

options(error = function() { traceback(2); q(status = 1) } )

library(\"infercnv\")

# create the infercnv object
infercnv_obj = CreateInfercnvObject(raw_counts_matrix=\"sc.counts.matrix\",
                                    annotations_file=\"cell_annots.txt\",
                                    delim=\"\t\",
                                    gene_order_file=\"gencode_v19_gene_pos.txt\",
                                    ref_group_names=c(\'%s\'))

out_dir=\"output_dir\"
# perform infercnv operations to reveal cnv signal
infercnv_obj = infercnv::run(infercnv_obj,
                             cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics
                             out_dir=out_dir, 
                             cluster_by_groups=T, 
                             plot_steps=T,
                             HMM=T,
                             #HMM_mode='subclusters',
                             HMM_mode='samples',
                             sim_method='meanvar'
                             )
", paste(names(infercnv_obj@reference_grouped_cell_indices),collapse="','") ) )

## make the generated runner executable
Sys.chmod('run.infercnv.R', mode = "0775")
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/scripts/infercnv_validate.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript 
 2 | # Script to validate inferCNV docker container instances.
 3 | 
 4 | #####
 5 | # Set up logging
 6 | #####
 7 | 
 8 | library(logging)
 9 | # Logging level choices
10 | C_LEVEL_CHOICES <- names(loglevels)
11 | logging::basicConfig(level='INFO') #initialize to info setting.
12 | 
13 | #####
14 | # Data sources
15 | #####
16 | 
17 | ## input data for validation (provided in docker image)
18 | infercnv_root <- '/inferCNV/'
19 | validation_input_dir <- paste0(infercnv_root,'example/')
20 | raw_counts_matrix <- paste0(validation_input_dir, 
21 |   'oligodendroglioma_expression_downsampled.counts.matrix')
22 | annotations_file <- paste0(validation_input_dir, 
23 |   'oligodendroglioma_annotations_downsampled.txt')
24 | gene_order_file <- paste0(validation_input_dir, 
25 |   'gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt')
26 | out_dir <- 'output_cli'
27 | 
28 | ## reference output for validation
29 | validation_reference <- paste0(validation_input_dir,'validation/',
30 |   'reference-infercnv.observations.txt')
31 | 
32 | # Make sure the input data and the reference output exist before starting the run
33 | logging::loginfo(paste("Checking for inferCNV validation input files.", sep=""))
34 | if (!file.exists(raw_counts_matrix) || 
35 |     !file.exists(annotations_file)  || 
36 |     !file.exists(gene_order_file)   ||
37 |     !file.exists(validation_reference)){
38 |     logging::logerror(paste("Missing input file(s)", sep=""))
39 |     stop(paste0('Error: expected input files cannot be found.'))
40 | }
41 | 
42 | #####
43 | # Run inferCNV for validation
44 | #####
45 | logging::loginfo(paste("Running inferCNV on validation input files.", sep=""))
46 | inferCNV_exe <- paste0(infercnv_root, 'scripts/inferCNV.R')
47 | validate_cmd <- paste0(inferCNV_exe,
48 |                 ' --raw_counts_matrix=', raw_counts_matrix,  
49 |                 ' --annotations_file=', annotations_file,
50 |                 ' --gene_order_file=', gene_order_file,
51 |                 ' --ref_group_names=',
52 |                   '\"Microglia/Macrophage,Oligodendrocytes (non-malignant)\"',
53 |                 ' --cutoff=1',
54 |                 ' --out_dir=', out_dir,
55 |                 ' --cluster_by_groups',
56 |                 ' --denoise')
57 | logging::loginfo(validate_cmd)
58 | system(validate_cmd)
59 | 
60 | validation_input <- paste0(out_dir, '/infercnv.observations.txt')
61 | 
62 | if (!file.exists(validation_input)){
63 |     logging::logerror(paste("Error: expected output file, infercnv.observations.txt, not found.", sep=""))
64 |     stop('Validation aborted - inferCNV analysis on test data failed.\n')
65 | }
66 | 
67 | 
68 | #####
69 | # Read in data for validation
70 | #####
71 | 
72 | ref <- as.matrix(read.csv(validation_reference, header=T, sep = ' '))
73 | obs <- as.matrix(read.csv(validation_input, header=T, sep = ' '))
74 | # Relative error per entry; pmax() keeps reference zeros from dividing by zero, and a shape mismatch yields NA.
75 | rel_err <- if (identical(dim(obs), dim(ref))) abs(obs - ref) / pmax(abs(ref), .Machine$double.eps) else NA
76 | #####
77 | # Perform validation
78 | #####
79 | logging::loginfo(paste("Performing validation.", sep=""))
80 | if (isTRUE(max(rel_err) < 1.0e-8)){  # isTRUE(): an NA/NaN error counts as a failed check instead of crashing `if`
81 |   unlink(out_dir, recursive=TRUE)
82 |   logging::loginfo(paste("Successful validation - output passes similarity check.", sep=""))
83 | } else { 
84 |   logging::logerror(paste("Error: generated output fails similarity check", sep=""))
85 |   logging::loginfo(paste("Saving validation files in current working directory", sep=""))
86 |   file.copy(validation_reference, "./reference-infercnv.observations.txt")
87 |   file.copy(validation_input, "./infercnv.observations.txt")
88 |   unlink(out_dir, recursive=TRUE)
89 |   stop('Validation failed - max relative error exceeds threshold.\n')
90 | }
91 | 


--------------------------------------------------------------------------------
/scripts/meanvar_sim_counts.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Simulate a counts matrix from gene means sampled from the reference cells of a saved infercnv object.
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | options(error = function() {traceback(2);quit(save = "no", status = 1, runLast = FALSE)})  # non-zero exit so callers can detect failure
 5 | 
 6 | parser = ArgumentParser()
 7 | 
 8 | parser$add_argument("--infercnv_obj", help="total sum normalized infercnv obj", required=TRUE, default=NULL, nargs=1)
 9 | parser$add_argument("--ncells", help="number of cells to simulate", required=FALSE, type='integer', nargs=1, default=-1)
10 | parser$add_argument("--ngenes", help="number of genes to simulate", required=FALSE, type='integer', nargs=1, default=-1)
11 | parser$add_argument("--output_prefix", help='prefix for output matrix file', required=TRUE, nargs=1)
12 | 
13 | args = parser$parse_args()
14 | 
15 | library(infercnv)
16 | library(SingleCellExperiment)
17 | library("methods")
18 | library(tidyverse)
19 | 
20 | 
21 | infercnv_obj_file = args$infercnv_obj
22 | 
23 | ncells = args$ncells
24 | ngenes = args$ngenes
25 | output_prefix = args$output_prefix
26 | 
27 | infercnv_obj = readRDS(infercnv_obj_file)
28 | 
29 | expr.data = infercnv_obj@expr.data[, unlist(infercnv_obj@reference_grouped_cell_indices)]
30 | # negative (default) sizes mean "same size as the reference expression matrix"
31 | if (ncells < 0) {
32 |     ncells = ncol(expr.data)
33 | }
34 | if (ngenes < 0) {
35 |     ngenes = nrow(expr.data)
36 | }
37 | 
38 | ## sim using specified gene means
39 | gene_means = rowMeans(expr.data)
40 | gene_means = gene_means[gene_means>0]
41 | 
42 | gene_means = sample(x=gene_means, size=ngenes, replace=T)
43 | 
44 | newnames = paste0('gene', 1:ngenes)
45 | 
46 | names(gene_means) = newnames
47 | 
48 | 
49 | sim_matrix <- infercnv:::.get_simulated_cell_matrix_using_meanvar_trend(infercnv_obj, gene_means, ncells, TRUE)
50 | 
51 | 
52 | output_filename = paste0(output_prefix, ".counts.matrix")
53 | write.table(sim_matrix, file=output_filename, quote=F, sep='\t')
54 | 
55 | pdf(paste0(output_prefix, ".KS.pdf"))
56 | infercnv:::KS_plot("meanVarSim", as.numeric(log(expr.data+1)), as.numeric(log(sim_matrix+1)))
57 | if (dev.cur() > 1) invisible(dev.off())  # finalize the PDF explicitly if a device is still open
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 


--------------------------------------------------------------------------------
/scripts/plot_hspike.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Plot the hidden-spike (hspike) object stored in a saved infercnv object, plus its expression-by-CNV distribution.
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | 
 5 | cli <- ArgumentParser()
 6 | cli$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | args <- cli$parse_args()
 8 | 
 9 | library(infercnv)
10 | library(ggplot2)
11 | 
12 | rds_path <- args$infercnv_obj
13 | infercnv_obj <- readRDS(rds_path)
14 | spike <- infercnv_obj@.hspike
15 | 
16 | if (is.null(spike)) {
17 |     message("no hspike to plot")
18 | } else {
19 |     # plot_cnv output goes to the input file's directory, named <input basename>.hspike
20 |     plot_cnv(spike,
21 |              out_dir = dirname(rds_path),
22 |              output_filename = basename(paste0(rds_path, '.hspike')))
23 | 
24 |     # expression values grouped by CNV level, and the mean/sd summary of each group
25 |     expr_by_cnv <- infercnv:::.get_gene_expr_by_cnv(spike)
26 |     mean_sd_by_cnv <- infercnv:::.get_gene_expr_mean_sd_by_cnv(expr_by_cnv)
27 |     dist_plot <- infercnv:::.plot_gene_expr_by_cnv(gene_expr_by_cnv = expr_by_cnv,
28 |                                                    cnv_mean_sd = mean_sd_by_cnv)
29 |     pdf(paste0(rds_path, '.hspike.dist.pdf'))
30 |     plot(dist_plot)
31 |     dev.off()
32 | }
33 | 
34 | 
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/scripts/plot_hspike.by_num_cells.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # For several sample sizes, plot the distribution of resampled mean hspike expression per CNV level.
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | 
 5 | cli <- ArgumentParser()
 6 | cli$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | args <- cli$parse_args()
 8 | 
 9 | library(infercnv)
10 | library(ggplot2)
11 | library(dplyr)
12 | 
13 | infercnv_obj_file <- args$infercnv_obj
14 | 
15 | infercnv_obj <- readRDS(infercnv_obj_file)
16 | 
17 | if (is.null(infercnv_obj@.hspike)) {
18 |     message("no hspike to plot")
19 | } else {
20 |     hspike_obj <- infercnv_obj@.hspike
21 | 
22 |     pdf(paste0(infercnv_obj_file, '.hspike.dist_by_numcells.pdf'))
23 | 
24 |     gene_expr_by_cnv <- infercnv:::.get_gene_expr_by_cnv(hspike_obj)
25 |     cnv_level_to_mean_sd <- list()
26 | 
27 |     # CNV levels whose normal curve is overlaid on every page
28 |     overlay_levels <- c("cnv:0.01", "cnv:0.5", "cnv:1", "cnv:1.5", "cnv:2", "cnv:3")
29 |     nrounds <- 100
30 | 
31 |     for (ncells in c(1,2,3,4,5,10,20,50,100)) {
32 |         cnv_to_means <- list()
33 |         cnv_mean_sd <- list()
34 | 
35 |         for (cnv_level in names(gene_expr_by_cnv)) {
36 |             expr_vals <- gene_expr_by_cnv[[cnv_level]]
37 |             # mean of `ncells` values drawn with replacement, repeated `nrounds` times
38 |             means <- vapply(seq_len(nrounds),
39 |                             function(i) mean(sample(expr_vals, size=ncells, replace=TRUE)),
40 |                             numeric(1))
41 |             cnv_to_means[[cnv_level]] <- means
42 |             cnv_mean_sd[[cnv_level]] <- list(sd=sd(means), mean=mean(means))
43 |         }
44 | 
45 |         ## plot
46 |         df <- do.call(rbind, lapply(names(cnv_to_means), function(lvl) {
47 |             data.frame(cnv=lvl, expr=cnv_to_means[[lvl]])
48 |         }))
49 | 
50 |         p <- ggplot(df, aes(expr, fill=cnv, colour=cnv)) + geom_density(alpha=0.1)
51 | 
52 |         # black curves: normal densities using each level's resampled mean and sd
53 |         p <- Reduce(function(plt, lvl) {
54 |             lvl_stats <- cnv_mean_sd[[lvl]]
55 |             plt + stat_function(fun=dnorm, color='black',
56 |                                 args=list('mean'=lvl_stats$mean, 'sd'=lvl_stats$sd))
57 |         }, overlay_levels, p)
58 | 
59 |         p <- p + ggtitle(sprintf("num cells: %g", ncells))
60 | 
61 |         plot(p)
62 |     }
63 | 
64 |     dev.off()
65 | }
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 


--------------------------------------------------------------------------------
/scripts/plot_hspike.diff_normal_tumor.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Plot per-gene mean expression of the hspike reference vs. observation cells, and their difference.
 3 | suppressPackageStartupMessages(library("argparse"))
 4 |     
 5 | parser = ArgumentParser()
 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | args = parser$parse_args()
 8 | 
 9 | library(infercnv)
10 | library(ggplot2)
11 | 
12 | infercnv_obj_file = args$infercnv_obj
13 | 
14 | infercnv_obj = readRDS(infercnv_obj_file)
15 | 
16 | if (! is.null(infercnv_obj@.hspike)) {
17 |     pdfname = paste0(infercnv_obj_file, '.hspike.diff_normal_tumor.pdf')
18 | 
19 |     pdf(pdfname)
20 |     hspike = infercnv_obj@.hspike
21 |     # drop=FALSE keeps a matrix even when only one cell is selected, so rowMeans() still works
22 |     normal_matrix = hspike@expr.data[,unlist(hspike@reference_grouped_cell_indices), drop=FALSE]
23 |     tumor_matrix = hspike@expr.data[,unlist(hspike@observation_grouped_cell_indices), drop=FALSE]
24 | 
25 |     normal.means = rowMeans(normal_matrix)
26 |     tumor.means = rowMeans(tumor_matrix)
27 |     # page 1: both mean profiles on a shared y-range
28 |     plot(normal.means, ylim=range(normal.means, tumor.means))
29 |     points(tumor.means, col='green')
30 |     # page 2: difference, with a zero line and a running mean (window k=31)
31 |     plot(tumor.means - normal.means)
32 |     abline(h=0, col='red')
33 |     
34 |     sm = caTools::runmean(tumor.means - normal.means, k=31)
35 |     points(sm, col='magenta')
36 |     invisible(dev.off())  # close the PDF explicitly, as the sibling plot_hspike.R does
37 | } else {
38 |     message("no hspike to plot")
39 | }
40 | 
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/scripts/plot_hspike_vs_sample_chrs.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Per cell group and chromosome: expression density of the group vs. all reference cells, with hspike CNV-level normals overlaid.
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | 
 5 | parser = ArgumentParser()
 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | 
 8 | args = parser$parse_args()
 9 | 
10 | library(infercnv)
11 | library(futile.logger)
12 | library(tidyverse)
13 | 
14 | 
15 | infercnv_obj_file = args$infercnv_obj
16 | 
17 | infercnv_obj = readRDS(infercnv_obj_file)
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 
24 | expr.data <- infercnv_obj@expr.data
25 | 
26 | cnv_mean_sd = infercnv:::get_spike_dists(infercnv_obj@.hspike)
27 | 
28 | chrs = unique(infercnv_obj@gene_order$chr)
29 | 
30 | groups = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices)
31 | 
32 | samples = names(groups)
33 | 
34 | # one PDF per group; gsub() replaces EVERY non-alphanumeric character so names containing '/' etc. give a valid file name
35 | for (sample in samples) {
36 |     pdf_name = sprintf("%s-%s.cnv_expr_densities_each_chr.pdf", infercnv_obj_file, gsub("[^A-Za-z0-9]", "_", sample, perl=TRUE))
37 |     pdf(pdf_name)
38 |     
39 |     message(sprintf("plotting sample: %s", sample))
40 | 
41 |     sample_cells = groups[[ sample ]]
42 |     # drop=FALSE keeps a matrix for single-cell groups so the row subsetting below still works
43 |     sample_expr = expr.data[, sample_cells, drop=FALSE]
44 | 
45 |     for (chr in chrs) {
46 |         chr_gene_idx = which(infercnv_obj@gene_order$chr == chr)
47 | 
48 |         sample_gene_expr = sample_expr[chr_gene_idx,]
49 | 
50 |         normal_gene_expr = expr.data[chr_gene_idx, unlist(infercnv_obj@reference_grouped_cell_indices)]
51 | 
52 |         df = rbind(data.frame(class='allnormal', vals=as.numeric(normal_gene_expr) ),
53 |                    data.frame(class='sample', vals=as.numeric(sample_gene_expr)) )
54 |         
55 |         message(sprintf("plotting sample: %s, %s", sample, chr))
56 | 
57 |         p = df %>% ggplot(aes(vals, fill=class)) + geom_density(alpha=0.3) + ggtitle(sprintf("%s, %s", sample, chr))
58 | 
59 |         p = p +
60 |             stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:0.01"]]$mean,'sd'=cnv_mean_sd[["cnv:0.01"]]$sd)) +
61 |             stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:0.5"]]$mean,'sd'=cnv_mean_sd[["cnv:0.5"]]$sd)) +
62 |             stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:1"]]$mean,'sd'=cnv_mean_sd[["cnv:1"]]$sd)) +
63 |             stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:1.5"]]$mean,'sd'=cnv_mean_sd[["cnv:1.5"]]$sd)) +
64 |             stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:2"]]$mean,'sd'=cnv_mean_sd[["cnv:2"]]$sd)) +
65 |             stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:3"]]$mean,'sd'=cnv_mean_sd[["cnv:3"]]$sd))
66 | 
67 | 
68 | 
69 |         plot(p)
70 | 
71 |     }
72 |     dev.off()
73 | }
74 | 
75 | 


--------------------------------------------------------------------------------
/scripts/plot_infercnv_obj.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Run plot_cnv on a saved infercnv object, naming the output after the input file.
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | 
 5 | cli <- ArgumentParser()
 6 | cli$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | cli_args <- cli$parse_args()
 8 | 
 9 | library(infercnv)
10 | 
11 | rds_path <- cli_args$infercnv_obj
12 | 
13 | loaded_obj <- readRDS(rds_path)
14 | 
15 | plot_cnv(loaded_obj,
16 |          output_filename = basename(rds_path))
17 | 
18 | 


--------------------------------------------------------------------------------
/scripts/plot_tumor_vs_normal_chr_densities.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Per chromosome: expression density of reference cells vs. each observation group, with hspike CNV-level normals overlaid.
 3 | suppressPackageStartupMessages(library("argparse"))
 4 |     
 5 | parser = ArgumentParser()
 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | args = parser$parse_args()
 8 | 
 9 | library(infercnv)
10 | library(ggplot2)
11 | library(futile.logger)
12 | library(dplyr)
13 | 
14 | infercnv_obj_file = args$infercnv_obj
15 | 
16 | infercnv_obj = readRDS(infercnv_obj_file)
17 | 
18 | ref_group_cell_indices = infercnv:::get_reference_grouped_cell_indices(infercnv_obj)
19 | pdf_filename = paste0(infercnv_obj_file, ".chr_expr_densities.pdf")
20 | 
21 | cnv_mean_sd = infercnv:::get_spike_dists(infercnv_obj@.hspike)
22 | 
23 | pdf(pdf_filename)
24 | 
25 | chrs = unique(infercnv_obj@gene_order$chr)
26 | 
27 | # one page per chromosome
28 | for (chr in chrs) {
29 |         
30 |     gene_idx = which(infercnv_obj@gene_order$chr == chr)
31 |     
32 |     ref_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx,ref_group_cell_indices])
33 |     
34 |     df = data.frame(class='normal', vals=ref_data_pts)
35 |     
36 |     for (tumor in names(infercnv_obj@observation_grouped_cell_indices) ) {
37 |         
38 |         tumor_cell_idx = infercnv_obj@observation_grouped_cell_indices[[ tumor ]]
39 |         tumor_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx, tumor_cell_idx])
40 |         
41 |         df = rbind(df, data.frame(class=tumor, vals=tumor_data_pts))
42 |     }
43 | 
44 |     flog.info(sprintf("Plotting data for chr: %s", chr))
45 |     
46 |     p = df %>% ggplot(aes(vals, fill=class)) + geom_density(alpha=0.3) + ggtitle(chr) # + scale_y_continuous(trans='log10', limits=c(1,NA))
47 |     
48 |     
49 |     p = p +
50 |         stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:0.01"]]$mean,'sd'=cnv_mean_sd[["cnv:0.01"]]$sd)) +
51 |         stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:0.5"]]$mean,'sd'=cnv_mean_sd[["cnv:0.5"]]$sd)) +
52 |         stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:1"]]$mean,'sd'=cnv_mean_sd[["cnv:1"]]$sd)) +
53 |         stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:1.5"]]$mean,'sd'=cnv_mean_sd[["cnv:1.5"]]$sd)) +
54 |         stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:2"]]$mean,'sd'=cnv_mean_sd[["cnv:2"]]$sd)) +
55 |         stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:3"]]$mean,'sd'=cnv_mean_sd[["cnv:3"]]$sd)) 
56 |     
57 | 
58 | 
59 |     plot(p)
60 | }
61 | invisible(dev.off())  # close the PDF explicitly, matching plot_hspike_vs_sample_chrs.R
62 | 


--------------------------------------------------------------------------------
/scripts/plot_tumor_vs_normal_chr_densities.i3.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Per chromosome: expression density of reference cells vs. each observation group, with i3HMM normals at mu and mu +/- delta overlaid.
 3 | suppressPackageStartupMessages(library("argparse"))
 4 |     
 5 | parser = ArgumentParser()
 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | args = parser$parse_args()
 8 | 
 9 | library(infercnv)
10 | library(ggplot2)
11 | library(futile.logger)
12 | library(dplyr)
13 | 
14 | infercnv_obj_file = args$infercnv_obj
15 | 
16 | infercnv_obj = readRDS(infercnv_obj_file)
17 | 
18 | ref_group_cell_indices = infercnv:::get_reference_grouped_cell_indices(infercnv_obj)
19 | pdf_filename = paste0(infercnv_obj_file, ".i3.chr_expr_densities.pdf")
20 | 
21 | normal_sd_trend = infercnv:::.i3HMM_get_sd_trend_by_num_cells_fit(infercnv_obj)
22 | 
23 | mu = normal_sd_trend$mu
24 | sigma = normal_sd_trend$sigma
25 | 
26 | 
27 | 
28 | pdf(pdf_filename)
29 | 
30 | chrs = unique(infercnv_obj@gene_order$chr)
31 | # offset between the central curve and the two blue curves drawn below
32 | delta = infercnv:::get_HoneyBADGER_setGexpDev(gexp.sd=sigma, alpha=0.05, k_cells=7)
33 | # one page per chromosome
34 | for (chr in chrs) {
35 |         
36 |     gene_idx = which(infercnv_obj@gene_order$chr == chr)
37 |     
38 |     ref_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx,ref_group_cell_indices])
39 |     
40 |     df = data.frame(class='normal', vals=ref_data_pts)
41 |     
42 |     for (tumor in names(infercnv_obj@observation_grouped_cell_indices) ) {
43 |         
44 |         tumor_cell_idx = infercnv_obj@observation_grouped_cell_indices[[ tumor ]]
45 |         tumor_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx, tumor_cell_idx])
46 |         
47 |         df = rbind(df, data.frame(class=tumor, vals=tumor_data_pts))
48 |     }
49 | 
50 |     flog.info(sprintf("Plotting data for chr: %s", chr))
51 |     
52 |     p = df %>% ggplot(aes(vals, fill=class)) + geom_density(alpha=0.3) + ggtitle(chr) # + scale_y_continuous(trans='log10', limits=c(1,NA))
53 |     
54 |     
55 |     p = p +
56 |         stat_function(fun=dnorm, color='black', args=list('mean'=mu,'sd'=sigma)) +
57 |         stat_function(fun=dnorm, color='blue', args=list('mean'=mu-delta,'sd'=sigma)) +
58 |         stat_function(fun=dnorm, color='blue', args=list('mean'=mu+delta,'sd'=sigma)) 
59 |     
60 | 
61 |     plot(p)
62 | }
63 | invisible(dev.off())  # close the PDF explicitly, matching plot_hspike_vs_sample_chrs.R
64 | 


--------------------------------------------------------------------------------
/scripts/prepare_sparsematrix.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | # Convert a delimited expression matrix into a sparse Matrix saved as <output>.rds, and record in
  3 | # prepare_smallest.txt whichever of the input path or the .rds path had the smaller in-memory object.
  4 | library(optparse)
  5 | library(Matrix)
  6 | library(data.table)
  7 | library(logging)
  8 | 
  9 | 
 10 | logging::basicConfig(level='INFO')
 11 | 
 12 | pargs <- optparse::OptionParser(usage=paste("%prog [options]",
 13 |                                             "--input data_matrix ",
 14 |                                             "--output sparse_matrix ",
 15 |                                             "--delim matrix_delimiter"
 16 |                                             ))
 17 | 
 18 | pargs <- optparse::add_option(pargs, c("--input"),
 19 |                               type="character",
 20 |                               default=NULL,
 21 |                               action="store",
 22 |                               dest="input",
 23 |                               metavar="input",
 24 |                               help=paste("Input raw counts matrix ",
 25 |                                          "to prepare for infercnv run."))
 26 | 
 27 | pargs <- optparse::add_option(pargs, c("--output"),
 28 |                               type="character",
 29 |                               default=NULL,
 30 |                               action="store",
 31 |                               dest="output",
 32 |                               metavar="output",
 33 |                               help=paste("Output raw counts matrix ",
 34 |                                          "as a sparseMatrix for infercnv run."))
 35 | 
 36 | pargs <- optparse::add_option(pargs, c("--delim"),
 37 |                               type="character",
 38 |                               action="store",
 39 |                               default="\t",
 40 |                               dest="delim",
 41 |                               metavar="delim",
 42 |                               help=paste("Delimiter for reading expression matrix",
 43 |                                          "[Default %default]"))
 44 | 
 45 | args <- optparse::parse_args(pargs)
 46 | 
 47 | if (is.null(args$input) || is.null(args$output)) {
 48 |     logging::logerror("Please provide input and output arguments")
 49 |     stop("Missing required --input and/or --output argument.")  # abort instead of failing later inside fread()
 50 | }
 51 | logging::loginfo("Reading header.")
 52 | 
 53 | data_head = fread(input=args$input,
 54 |       sep=args$delim,
 55 |       header=FALSE,
 56 |       nrows=1,
 57 |       stringsAsFactors=FALSE,
 58 |       check.names=FALSE,
 59 |       nThread=1,
 60 |       logical01=FALSE,
 61 |       data.table=FALSE)
 62 | 
 63 | logging::loginfo("Done reading header.")
 64 | logging::loginfo("Reading matrix data.")
 65 | 
 66 | ddata = fread(input=args$input,
 67 |       sep=args$delim,
 68 |       header=FALSE,
 69 |       skip=1,
 70 |       stringsAsFactors=FALSE,
 71 |       check.names=FALSE,
 72 |       nThread=1,
 73 |       logical01=FALSE,
 74 |       data.table=FALSE)
 75 | 
 76 | logging::loginfo("Done reading matrix data.")
 77 | 
 78 | logging::loginfo("Backing up rownames.")
 79 | # store row names (first column) before dropping that column from the matrix
 80 | saved_names = as.vector(unlist(ddata[, 1]))
 81 | ddata = ddata[, -1, drop=FALSE]
 82 | in_size = object.size(ddata)
 83 | header_names = as.vector(unlist(data_head))
 84 | # A header that also labels the row-name column has one extra leading field; drop it so lengths match.
 85 | if (length(header_names) == ncol(ddata) + 1) header_names = header_names[-1]
 86 | colnames(ddata) = header_names
 87 | logging::loginfo("Converting data.frame to Matrix.")
 88 | basic_matrix = as.matrix(ddata)
 89 | logging::loginfo("Done converting data.frame to Matrix.")
 90 | logging::loginfo("Freeing data.frame.")
 91 | rm(ddata)  # make memory available
 92 | gc()
 93 | logging::loginfo("Converting Matrix to sparseMatrix.")
 94 | sparse_matrix = Matrix(basic_matrix, sparse=T)
 95 | logging::loginfo("Done converting Matrix to sparseMatrix.")
 96 | logging::loginfo("Freeing Matrix.")
 97 | rm(basic_matrix)  # make memory available
 98 | gc()
 99 | logging::loginfo("Setting rownames.")
100 | row.names(sparse_matrix) = saved_names
101 | 
102 | logging::loginfo("Saving sparseMatrix to RDS file.")
103 | saveRDS(sparse_matrix, file=paste(args$output, "rds", sep="."))
104 | 
105 | out_size = object.size(sparse_matrix)
106 | # record which representation was smaller in memory: the original input path or the new .rds
107 | fileConn<-file("prepare_smallest.txt")
108 | if (in_size < out_size) {
109 |     writeLines(args$input, fileConn)
110 | } else {
111 |     writeLines(paste(args$output, "rds", sep="."), fileConn)
112 | }
113 | close(fileConn)
114 | 
115 | 


--------------------------------------------------------------------------------
/scripts/recursive_random_tree_height_cutting.random_trees.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Run infercnv's 'random_trees' tumor subcluster partitioning on a saved infercnv object,
 3 | # with a PDF device (test.recursive_trees.pdf) open during the call.
 4 | hclust_method='ward.D2'
 5 | 
 6 | num_rand_iters = 100
 7 | MAX_PVAL=0.05
 8 | 
 9 | suppressPackageStartupMessages(library("argparse"))
10 |     
11 | parser = ArgumentParser()
12 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
13 | args = parser$parse_args()
14 | 
15 | library(infercnv)
16 | library(ggplot2)
17 | library(futile.logger)
18 | library(pheatmap)
19 | 
20 | infercnv_obj = readRDS(args$infercnv_obj)
21 | 
22 | 
23 | pdf("test.recursive_trees.pdf")
24 | # pass the constants declared at the top instead of repeating their literal values
25 | adj.obj = infercnv:::define_signif_tumor_subclusters(infercnv_obj, p_val=MAX_PVAL, hclust_method=hclust_method, partition_method='random_trees')
26 | 
27 | if (dev.cur() > 1) invisible(dev.off())  # finalize the PDF if a device is still open
28 | 
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/scripts/recursive_random_tree_height_cutting.sigclust2.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Plot sigclust2 shc() results for the observation cells of a saved infercnv object:
 3 | # once over all genes, then once per chromosome.
 4 | hclust_method='ward.D2'
 5 | 
 6 | num_rand_iters = 100
 7 | MAX_PVAL=0.05
 8 | 
 9 | suppressPackageStartupMessages(library("argparse"))
10 |     
11 | parser = ArgumentParser()
12 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
13 | args = parser$parse_args()
14 | 
15 | library(infercnv)
16 | library(ggplot2)
17 | library(futile.logger)
18 | library(pheatmap)
19 | 
20 | obj = readRDS(args$infercnv_obj)
21 | 
22 | tumor.expr.data = obj@expr.data[, unlist(obj@observation_grouped_cell_indices)]
23 | 
24 | gene_order = obj@gene_order
25 | chrs = unique(gene_order$chr)
26 | 
27 | 
28 | pdf("test.recursive_trees.pdf")
29 | 
30 | 
31 | ALL_CLUSTERS = list()
32 | MIN_CLUSTER_SIZE=3
33 | 
34 | library(sigclust2)
35 | 
36 | recursive_cluster_cutting <- function(expr.matrix) {
37 |     # expr.matrix: genes as rows, cells as columns. Matrices with fewer than MIN_CLUSTER_SIZE
38 |     # columns are stored in the global ALL_CLUSTERS and not plotted.
39 |     message("recursive_cluster_cutting()")
40 |     print(dim(expr.matrix))
41 | 
42 |     if (dim(expr.matrix)[2] < MIN_CLUSTER_SIZE) {
43 |         message("cluster size too small. Storing cluster")
44 |         ALL_CLUSTERS[[length(ALL_CLUSTERS)+1]] <<- colnames(expr.matrix)
45 | 
46 |         print("Returning")
47 |         return(NULL)
48 |     }
49 | 
50 |     print("Onward")
51 |     print(dim(expr.matrix))
52 |     
53 |     t_tumor.expr.data = t(expr.matrix) # cells as rows, genes as cols
54 | 
55 |     shc_result = shc(t_tumor.expr.data, metric='euclidean', linkage=hclust_method)
56 |     plot(shc_result)
57 |     
58 |     for(chr in chrs) {
59 |         chr_genes = which(gene_order$chr == chr)
60 |         
61 |         message(sprintf("plotting %s", chr))
62 |         # drop=FALSE keeps a matrix even when a chromosome has a single gene
63 |         shc_result = shc(t_tumor.expr.data[,chr_genes,drop=FALSE], metric='euclidean', linkage=hclust_method)
64 |         plot(shc_result)
65 |     }
66 |     
67 |     
68 |             
69 | }
70 | 
71 | recursive_cluster_cutting(tumor.expr.data)
72 | 
73 | dev.off()
74 | 
75 | print(ALL_CLUSTERS)
76 | 
77 | 
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
84 | 


--------------------------------------------------------------------------------
/scripts/recursive_random_tree_height_cutting.using_hmms.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | 
  4 | hclust_method='ward.D2'
  5 | 
  6 | num_rand_iters = 100
  7 | MAX_PVAL=0.05
  8 | 
  9 | suppressPackageStartupMessages(library("argparse"))
 10 |     
 11 | parser = ArgumentParser()
 12 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 13 | args = parser$parse_args()
 14 | 
 15 | library(infercnv)
 16 | library(ggplot2)
 17 | library(futile.logger)
 18 | library(pheatmap)
 19 | 
 20 | obj = readRDS(args$infercnv_obj)
 21 | 
 22 | tumor.expr.data = obj@expr.data[, unlist(obj@observation_grouped_cell_indices)]
 23 | 
 24 | gene_order = obj@gene_order
 25 | chrs = unique(gene_order$chr)
 26 | 
 27 | tumor.expr.data[tumor.expr.data>3] <- 4
 28 | tumor.expr.data[tumor.expr.data<3] <- 2
 29 | 
 30 | 
 31 | pdf("test.recursive_trees.pdf")
 32 | 
 33 | 
 34 | ALL_CLUSTERS = list()
 35 | MIN_CLUSTER_SIZE=3
 36 | 
 37 | 
 38 | recursive_cluster_cutting <- function(expr.matrix) {
 39 | 
 40 |     message("recursive_cluster_cutting()")
 41 |     print(dim(expr.matrix))
 42 | 
 43 |     if (dim(expr.matrix)[2] < MIN_CLUSTER_SIZE) {
 44 |         message("cluster size too small. Storing cluster")
 45 |         ALL_CLUSTERS[[length(ALL_CLUSTERS)+1]] <<- colnames(expr.matrix)
 46 | 
 47 |         print("Returning")
 48 |         return(NULL)
 49 |         print("Didn't actually return...")
 50 |     }
 51 | 
 52 |     print("Onward")
 53 |     print(dim(expr.matrix))
 54 |     
 55 |     t_tumor.expr.data = t(expr.matrix) # cells as rows, genes as cols
 56 |     d = dist(t_tumor.expr.data)
 57 | 
 58 |     h_obs = hclust(d, method=hclust_method)
 59 | 
 60 |     # permute by chromosomes
 61 |     
 62 |     permute_chr_col_vals <- function(df) {
 63 | 
 64 |         num_cells = nrow(df)
 65 | 
 66 |         for(chr in chrs) {
 67 |             chr_genes = which(gene_order$chr == chr)
 68 | 
 69 |             df[, chr_genes] = df[sample(x=1:num_cells, size=num_cells, replace=F), chr_genes]
 70 |         }
 71 | 
 72 |         df
 73 |     }
 74 | 
 75 |     permute_col_vals <- function(df) {
 76 | 
 77 |         num_cells = nrow(df)
 78 |         for (i in 1:ncol(df)) {
 79 |             df[,i] = df[sample(x=1:num_cells, size=num_cells, replace=F), i]
 80 |         }
 81 |         
 82 |         df
 83 |     }
 84 | 
 85 |     
 86 |     example_rand_matrix <- NULL
 87 |     max_rand_heights = c()
 88 |     for (i in 1:num_rand_iters) {
 89 |         
 90 |         ##rand.tumor.expr.data = permute_chr_col_vals(t_tumor.expr.data)
 91 |         rand.tumor.expr.data = permute_col_vals(t_tumor.expr.data)
 92 |         example_rand_matrix <- rand.tumor.expr.data
 93 |         rand.dist = dist(rand.tumor.expr.data)
 94 |         h_rand <- hclust(rand.dist, method=hclust_method)
 95 | 
 96 |         max_rand_heights = c(max_rand_heights, max(h_rand$height))
 97 |     }
 98 |         
 99 |     h = h_obs$height
100 | 
101 |     max_height = max(h)
102 |     
103 |     message(sprintf("Max Rand Heights(h): %s", paste(max_rand_heights, sep=",", collapse=",")))
104 |     
105 |     max_rand_height_dens = density(max_rand_heights)
106 |     plot(max_rand_height_dens, xlim=range(max_rand_height_dens$x, max_height))
107 |     
108 |     e = ecdf(max_rand_heights)
109 |     message(sprintf("pvals(Lengths(h)): %s", paste(1-e(h), sep=",", collapse=",")))
110 |     
111 |     pval = 1- e(max_height)
112 |     message(sprintf("pval for max obs height: %g = %g", max_height, pval))
113 |     
114 |     abline(v=max_height, col='red')
115 |     
116 |     pheatmap(t(expr.matrix), cluster_cols=F)
117 |     pheatmap(example_rand_matrix, cluster_cols=F)
118 | 
119 | 
120 |     #stop("stopping")
121 |     
122 |     if (max_height > 0 & pval <= MAX_PVAL) {
123 |         ## keep on cutting.
124 |         cut_height = mean(c(h[length(h)-1], h[length(h)]))
125 |         message(sprintf("cutting at height: %g",  cut_height))
126 |         grps = cutree(h_obs, h=cut_height)
127 |         print(grps)
128 |         uniqgrps = unique(grps)
129 |         for (grp in uniqgrps) {
130 |             grp_idx = which(grps==grp)
131 |             
132 |             message(sprintf("grp: %s  contains idx: %s", grp, paste(grp_idx,sep=",", collapse=","))) 
133 |             df = expr.matrix[,grp_idx,drop=F]
134 |             recursive_cluster_cutting(df)
135 |         }
136 |     } else {
137 |         message("No cluster pruning")
138 |         ALL_CLUSTERS[[length(ALL_CLUSTERS)+1]] <<- colnames(expr.matrix)
139 |     }
140 |         
141 | }
142 | # Run the recursive splitting on tumor.expr.data; final clusters are appended to ALL_CLUSTERS via `<<-`.
143 | recursive_cluster_cutting(tumor.expr.data)
144 | # Close the active graphics device. NOTE(review): presumably a pdf() opened earlier in this script -- confirm.
145 | dev.off()
146 | # Print the collected clusters (each entry holds the column names of one cluster's matrix).
147 | print(ALL_CLUSTERS)
148 | 
149 | 
150 | 
151 | 
152 | 
153 | 
154 | 
155 | 
156 | 


--------------------------------------------------------------------------------
/scripts/run.stub.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | 
 5 | # Command line: a single required path to a serialized infercnv object.
 6 | parser <- ArgumentParser()
 7 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 8 | args <- parser$parse_args()
 9 | 
10 | library(infercnv)
11 | library(ggplot2)
12 | library(futile.logger)
13 | 
14 | # Load the object from the given path with readRDS.
15 | infercnv_obj_file <- args$infercnv_obj
16 | infercnv_obj <- readRDS(infercnv_obj_file)
17 | 
18 | # Open a PDF graphics device; the stub ends here, so nothing is drawn to it yet.
19 | pdf('ladeda.pdf')


--------------------------------------------------------------------------------
/scripts/run_BayesNet.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | 
 5 | # Required inputs: two serialized infercnv objects, the BayesMaxPNormal value and an output directory.
 6 | parser <- ArgumentParser()
 7 | parser$add_argument("--prelim_infercnv_obj", help="preliminary infercnv_obj file", required=TRUE, nargs=1)
 8 | parser$add_argument("--i6HMM_infercnv_obj", help="i6HMM infercnv_obj file", required=TRUE, nargs=1)
 9 | parser$add_argument("--BayesMaxPNormal", help="BayesMaxPNormal", required=TRUE, nargs=1, type='double')
10 | parser$add_argument("--out_dir", help="output directory", required=TRUE, nargs=1)
11 | args <- parser$parse_args()
12 | 
13 | library(infercnv)
14 | library(futile.logger)
15 | 
16 | # Deserialize both objects before handing them to the Bayesian network step.
17 | infercnv_obj_prelim <- readRDS(args$prelim_infercnv_obj)
18 | hmm.infercnv_obj <- readRDS(args$i6HMM_infercnv_obj)
19 | 
20 | flog.info("Running Bayesian Network Model on HMM predicted CNV's\n")
21 | 
22 | # The call receives <out_dir>/BayesNetOutput as its out_dir; its return value overwrites hmm.infercnv_obj.
23 | # NOTE(review): argument names are passed through unchanged -- confirm they match the installed inferCNVBayesNet signature.
24 | bayes_out_dir <- file.path(args$out_dir, "BayesNetOutput")
25 | hmm.infercnv_obj <- infercnv::inferCNVBayesNet(infercnv_obj    = infercnv_obj_prelim,
26 |                                                HMM_obj         = hmm.infercnv_obj,
27 |                                                BayesMaxPNormal = args$BayesMaxPNormal,
28 |                                                file_dir        = args$out_dir,
29 |                                                postMcmcMethod  = "removeCNV",
30 |                                                out_dir         = bayes_out_dir,
31 |                                                quietly         = TRUE)
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/scripts/run_HMM_each_cell_separately.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | 
 5 | # Command line: one required path to a serialized infercnv object (read with readRDS below).
 6 | parser <- ArgumentParser()
 7 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 8 | args <- parser$parse_args()
 9 | 
10 | library(infercnv)
11 | library(ggplot2)
12 | library(futile.logger)
13 | 
14 | infercnv_obj_file <- args$infercnv_obj
15 | infercnv_obj <- readRDS(infercnv_obj_file)
16 | 
17 | # Run the package-internal HMM predictor (per its name, applied to individual cells).
18 | infercnv_obj.hmm <- infercnv:::predict_CNV_via_HMM_on_indiv_cells(infercnv_obj)
19 | 
20 | # Persist the result using the input path as prefix, so it lands next to the input file.
21 | saveRDS(infercnv_obj.hmm, file=sprintf("%s-HMM-icells.obj", infercnv_obj_file))
22 | 
23 | # Pass directory and file name separately so an input path with a directory part (incl. absolute paths) is not glued onto the default out_dir.
24 | plot_cnv(infercnv_obj.hmm, output_filename=paste0(basename(infercnv_obj_file), "-HMM-icells"), out_dir=dirname(infercnv_obj_file))


--------------------------------------------------------------------------------
/scripts/run_HMM_on_hspike.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | 
 5 | # Command line: one required path to a serialized infercnv object.
 6 | parser <- ArgumentParser()
 7 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 8 | args <- parser$parse_args()
 9 | 
10 | library(infercnv)
11 | library(ggplot2)
12 | library(futile.logger)
13 | 
14 | infercnv_obj_file <- args$infercnv_obj
15 | infercnv_obj <- readRDS(infercnv_obj_file)
16 | 
17 | # The object stored in the .hspike slot is what the HMM is run on; both helper inputs are derived from it.
18 | hspike <- infercnv_obj@.hspike
19 | spike_dists <- infercnv:::get_spike_dists(hspike)
20 | mean_sd_trend_fit <- infercnv:::get_hspike_cnv_mean_sd_trend_by_num_cells_fit(hspike)
21 | hspike.hmm <- infercnv:::predict_CNV_via_HMM_on_tumor_subclusters(infercnv_obj=hspike,
22 |                                                                   cnv_mean_sd=spike_dists,
23 |                                                                   cnv_level_to_mean_sd_fit=mean_sd_trend_fit)
24 | 
25 | # Plot into the input file's directory, then save the HMM object with the input path as prefix.
26 | plot_cnv(hspike.hmm, x.center=3, x.range=c(0,6), output_filename=paste0(basename(infercnv_obj_file), ".hspike.hmm"), out_dir=dirname(infercnv_obj_file))
27 | 
28 | saveRDS(hspike.hmm, file=sprintf("%s-HMM.obj", infercnv_obj_file))


--------------------------------------------------------------------------------
/scripts/run_HMM_on_subclusters.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | 
 5 | # Command line: one required path to a serialized infercnv object (read with readRDS below).
 6 | parser <- ArgumentParser()
 7 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 8 | args <- parser$parse_args()
 9 | 
10 | library(infercnv)
11 | library(ggplot2)
12 | library(futile.logger)
13 | 
14 | infercnv_obj_file <- args$infercnv_obj
15 | infercnv_obj <- readRDS(infercnv_obj_file)
16 | 
17 | # Subclusters are computed only when the loaded object's tumor_subclusters slot is empty.
18 | if (length(infercnv_obj@tumor_subclusters) == 0) {
19 |     flog.info("Computing tumor subclusters")
20 |     infercnv_obj <- infercnv:::.subcluster_tumors_general(infercnv_obj)
21 | }
22 | 
23 | # Run the package-internal HMM predictor on the (possibly just subclustered) object.
24 | infercnv_obj.hmm <- infercnv:::predict_CNV_via_HMM_on_tumor_subclusters(infercnv_obj)
25 | 
26 | saveRDS(infercnv_obj.hmm, file=sprintf("%s-HMM.obj", infercnv_obj_file))
27 | # Pass directory and file name separately so an input path with a directory part (incl. absolute paths) is not glued onto the default out_dir.
28 | plot_cnv(infercnv_obj.hmm, output_filename=paste0(basename(infercnv_obj_file), "-HMM"), out_dir=dirname(infercnv_obj_file))
29 | 
30 | 


--------------------------------------------------------------------------------
/scripts/run_HMM_per_chr.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | 
 5 | parser <- ArgumentParser()
 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
 7 | args <- parser$parse_args()
 8 | 
 9 | library(infercnv)
10 | library(ggplot2)
11 | library(futile.logger)
12 | 
13 | infercnv_obj_file <- args$infercnv_obj
14 | infercnv_obj <- readRDS(infercnv_obj_file)
15 | 
16 | # NOTE(review): this PDF device is never closed explicitly; presumably it captures plots drawn by the HMM step -- confirm it is still needed.
17 | pdf('ladeda.pdf')
18 | infercnv_obj.hmm <- infercnv:::predict_CNV_via_HMM_each_chr_separately(infercnv_obj)
19 | 
20 | saveRDS(infercnv_obj.hmm, file=sprintf("%s-HMM.obj", infercnv_obj_file))
21 | # Pass directory and file name separately so an input path with a directory part (incl. absolute paths) is not glued onto the default out_dir.
22 | plot_cnv(infercnv_obj.hmm, output_filename=paste0(basename(infercnv_obj_file), "-HMM"), out_dir=dirname(infercnv_obj_file))
23 | 
24 | 


--------------------------------------------------------------------------------
/scripts/run_tests_sampling_and_group_plots.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | options(error = function() traceback(2))
  4 | options("warning.length" = 8000)
  5 | 
  6 | library("infercnv")
  7 | 
  8 | # Build the infercnv object from the example files looked up in the installed package's extdata directory.
  9 | infercnv_obj <- CreateInfercnvObject(
 10 |     raw_counts_matrix = system.file("extdata", "oligodendroglioma_expression_downsampled.counts.matrix.gz", package = "infercnv"),
 11 |     annotations_file  = system.file("extdata", "oligodendroglioma_annotations_downsampled.txt", package = "infercnv"),
 12 |     gene_order_file   = system.file("extdata", "gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt", package = "infercnv"),
 13 |     delim             = "\t",
 14 |     ref_group_names   = c("Microglia/Macrophage", "Oligodendrocytes (non-malignant)"))
 15 | 
 16 | # Run the pipeline with denoising on, HMM off, and neither per-step nor preliminary plots.
 17 | out_dir <- "../example/output_dir_sampling_testscript"
 18 | infercnv_obj <- infercnv::run(infercnv_obj,
 19 |                               cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics
 20 |                               out_dir=out_dir,
 21 |                               cluster_by_groups=TRUE,
 22 |                               plot_steps=FALSE,
 23 |                               denoise=TRUE,
 24 |                               HMM=FALSE,
 25 |                               no_prelim_plot=TRUE)
 26 | # Directory for the per-group plots; created (including parents) only when it does not exist yet.
 27 | t_out_dir <- file.path(out_dir, "subplots_1")
 28 | if (!dir.exists(t_out_dir)) {
 29 |     dir.create(t_out_dir, recursive=TRUE)
 30 | }
 31 | infercnv:::plot_per_group(infercnv_obj, out_dir=t_out_dir, png_res=100, sample=TRUE, n_cells=100)
 32 | 
 33 | # Directory that receives the plots of the resampled objects built below.
 34 | t_out_dir <- file.path(out_dir, "subsamples_1")
 35 | if (!dir.exists(t_out_dir)) {
 36 |     dir.create(t_out_dir, recursive=TRUE)
 37 | }
 38 | 
 39 | # Resampled variants of the object, produced with different sample_object settings (two of them resample an earlier variant).
 40 | sample_obj <- infercnv:::sample_object(infercnv_obj)
 41 | subsample_obj <- infercnv:::sample_object(infercnv_obj, n_cells=10)
 42 | upsubsample_obj <- infercnv:::sample_object(subsample_obj, n_cells=100)
 43 | every_2_object2 <- infercnv:::sample_object(infercnv_obj, every_n=2, above_m=2)
 44 | only_1_per_object <- infercnv:::sample_object(infercnv_obj, every_n=1000, above_m=2)
 45 | only_1_10times_per_object <- infercnv:::sample_object(only_1_per_object, n_cells=10)
 46 | 
 47 | # Median-filtered copy of the full (unsampled) object.
 48 | infercnv_obj_filtered <- infercnv::apply_median_filtering(infercnv_obj, window_size=5)
 49 | 
 50 | 
 51 | # Plot each resampled object into t_out_dir with identical settings; only the object and output_filename differ.
 52 | infercnv::plot_cnv(sample_obj,
 53 |                    k_obs_groups=2,
 54 |                    cluster_by_groups=TRUE,
 55 |                    out_dir=t_out_dir,
 56 |                    x.center=1,
 57 |                    x.range="auto",
 58 |                    title="infercnv",
 59 |                    output_filename="infercnv_sampled",
 60 |                    png_res=300,
 61 |                    output_format="png",
 62 |                    write_expr_matrix=TRUE)
 63 | # Object sampled with n_cells=10.
 64 | infercnv::plot_cnv(subsample_obj,
 65 |                    k_obs_groups=2,
 66 |                    cluster_by_groups=TRUE,
 67 |                    out_dir=t_out_dir,
 68 |                    x.center=1,
 69 |                    x.range="auto",
 70 |                    title="infercnv",
 71 |                    output_filename="infercnv_subsampled",
 72 |                    png_res=300,
 73 |                    output_format="png",
 74 |                    write_expr_matrix=TRUE)
 75 | # The n_cells=10 object resampled with n_cells=100.
 76 | infercnv::plot_cnv(upsubsample_obj,
 77 |                    k_obs_groups=2,
 78 |                    cluster_by_groups=TRUE,
 79 |                    out_dir=t_out_dir,
 80 |                    x.center=1,
 81 |                    x.range="auto",
 82 |                    title="infercnv",
 83 |                    output_filename="infercnv_subsampled_then_upsampled",
 84 |                    png_res=300,
 85 |                    output_format="png",
 86 |                    write_expr_matrix=TRUE)
 87 | 
 88 | # Object sampled with every_n=2, above_m=2.
 89 | infercnv::plot_cnv(every_2_object2,
 90 |                    k_obs_groups=2,
 91 |                    cluster_by_groups=TRUE,
 92 |                    out_dir=t_out_dir,
 93 |                    x.center=1,
 94 |                    x.range="auto",
 95 |                    title="infercnv",
 96 |                    output_filename="infercnv_sample_every_2",
 97 |                    png_res=300,
 98 |                    output_format="png",
 99 |                    write_expr_matrix=TRUE)
100 | 
101 | 
102 | # Object sampled with every_n=1000, above_m=2.
103 | infercnv::plot_cnv(only_1_per_object,
104 |                    k_obs_groups=2,
105 |                    cluster_by_groups=TRUE,
106 |                    out_dir=t_out_dir,
107 |                    x.center=1,
108 |                    x.range="auto",
109 |                    title="infercnv",
110 |                    output_filename="infercnv_sample_only_1",
111 |                    png_res=300,
112 |                    output_format="png",
113 |                    write_expr_matrix=TRUE)
114 | 
115 | # The every_n=1000 object resampled with n_cells=10.
116 | infercnv::plot_cnv(only_1_10times_per_object,
117 |                    k_obs_groups=2,
118 |                    cluster_by_groups=TRUE,
119 |                    out_dir=t_out_dir,
120 |                    x.center=1,
121 |                    x.range="auto",
122 |                    title="infercnv",
123 |                    output_filename="infercnv_sample_only_1_10_times",
124 |                    png_res=300,
125 |                    output_format="png",
126 |                    write_expr_matrix=TRUE)
127 | # Median-filtered object: unlike the calls above this one targets out_dir (not t_out_dir) and passes output_format = NA.
128 | infercnv::plot_cnv(infercnv_obj_filtered,
129 |                    k_obs_groups=2,
130 |                    cluster_by_groups=TRUE,
131 |                    out_dir=out_dir,
132 |                    x.center=1,
133 |                    x.range="auto",
134 |                    title="infercnv",
135 |                    output_filename="infercnv_sampled_median_filtered",
136 |                    png_res=300,
137 |                    output_format = NA,
138 |                    write_expr_matrix=TRUE)
139 | 


--------------------------------------------------------------------------------
/scripts/sim_vs_orig_counts.QQplot.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressPackageStartupMessages(library("argparse"))
  4 | library(infercnv)
  5 | library(tidyverse)
  6 | 
  7 | ## Command-line interface
  8 | parser <- ArgumentParser()
  9 | parser$add_argument("--counts_matrix", help="raw counts matrix file", required=TRUE, nargs=1)
 10 | parser$add_argument("--sim_method", help="simulation method: splatter, simple, meanvar", required=TRUE)
 11 | parser$add_argument("--include_dropout", default=FALSE, action='store_true', help='include dropout modeling')
 12 | args <- parser$parse_args()
 13 | 
 14 | include.dropout <- args$include_dropout
 15 | 
 16 | ## The counts table is read as a matrix and kept untouched in orig.counts;
 17 | ## `data` starts out as the same matrix and is depth-normalized further down.
 18 | orig.counts <- as.matrix(read.table(args$counts_matrix))
 19 | data <- orig.counts
 20 | 
 21 | ## Stop on an unknown simulation method before normalizing.
 22 | known_sim_methods <- c('splatter', 'simple', 'meanvar')
 23 | if (! any(args$sim_method %in% known_sim_methods)) {
 24 |     stop(sprintf("Error, not recognizing sim method: %s", args$sim_method))
 25 | }
 26 | 
 27 | ## Normalize: divide every column by its sum, then multiply by the
 28 | ## median of the original column sums.
 29 | cs <- colSums(data)
 30 | median_cs <- median(cs)
 31 | data <- sweep(data, MARGIN=2, STATS=cs, FUN="/") * median_cs
 32 | 
 33 | ## Row means of the normalized matrix and its column count, both used by the simulation step.
 34 | gene_means <- rowMeans(data)
 35 | 
 36 | num_cells <- ncol(data)
 37 | 
 38 | ## sim the tumor matrix
 39 | ## Exactly one branch runs; an unmatched method name falls through to stop().
 40 | sim_method <- args$sim_method
 41 | sim_matrix <- switch(sim_method,
 42 | 
 43 |     simple = {
 44 |         message('-using simple sim')
 45 |         ## the mean-vs-p0 table is only computed when dropout modeling was requested
 46 |         mean_p0_table <- if (include.dropout) infercnv:::.get_mean_vs_p0_from_matrix(data) else NULL
 47 |         infercnv:::.get_simulated_cell_matrix(gene_means,
 48 |                                               mean_p0_table=mean_p0_table,
 49 |                                               num_cells=num_cells,
 50 |                                               common_dispersion=0.1)
 51 |     },
 52 | 
 53 |     splatter = {
 54 |         message('-using splatter sim')
 55 |         ## parameters are estimated from orig.counts, not from the normalized `data`
 56 |         params <- infercnv:::.estimateSingleCellParamsSplatterScrape(orig.counts)
 57 |         params[['nCells']] <- num_cells
 58 |         params[['include.dropout']] <- include.dropout
 59 |         ## means that are exactly zero are replaced by 1e-3 before simulating
 60 |         gene_means[gene_means == 0] <- 1e-3
 61 |         counts(infercnv:::.simulateSingleCellCountsMatrixSplatterScrape(params, gene_means))
 62 |     },
 63 | 
 64 |     meanvar = {
 65 |         message('-using meanvar sim')
 66 |         infercnv:::.get_simulated_cell_matrix_using_meanvar_trend_given_normal_matrix(gene_means, data, num_cells, include.dropout=include.dropout)
 67 |     },
 68 | 
 69 |     ## default: no branch matched
 70 |     stop(sprintf("not recognizing --sim_method: %s", args$sim_method))
 71 | )
 72 | 
 73 | 
 74 | ## Output: tag the method name with the dropout setting, write the simulated matrix, then draw the two comparison plots.
 75 | if (include.dropout) {
 76 |     sim_method <- sprintf("%s-With_Dropout", sim_method)
 77 | } else {
 78 |     sim_method <- sprintf("%s-NO_Dropout", sim_method)
 79 | }
 80 | 
 81 | rownames(sim_matrix) <- names(gene_means)
 82 | colnames(sim_matrix) <- colnames(data)
 83 | sim_matrix_filename <- sprintf("sim.%s.counts.matrix", sim_method)
 84 | message("-writing matrix")
 85 | write.table(sim_matrix, sim_matrix_filename, quote=FALSE, sep="\t")
 86 | 
 87 | ## total sum normalize sim matrix before plotting
 88 | sim_matrix <- infercnv:::.normalize_data_matrix_by_seq_depth(sim_matrix, median_cs)
 89 | 
 90 | message("-plotting QQ plot")
 91 | png(sprintf("sim_vs_orig_counts.%s.qqplots.png", sim_method))
 92 | qqplot(log(as.numeric(data)+1), log(as.numeric(sim_matrix)+1), main='orig vs. full sim')
 93 | abline(a=0,b=1,col='red')
 94 | invisible(dev.off()) # close the QQ-plot device so its file is complete before the next png() is opened
 95 | message("-plotting KS plot")
 96 | png(sprintf("sim_vs_orig_counts.%s.KS.png", sim_method))
 97 | infercnv:::KS_plot(sprintf("KS, %s", sim_method), log(as.numeric(data)+1), log(as.numeric(sim_matrix)+1), names=c('orig',  sim_method))
 98 | invisible(dev.off()) # close the KS-plot device explicitly instead of relying on interpreter shutdown
 99 | 
100 | 


--------------------------------------------------------------------------------
/scripts/splatterScrape_sim_counts.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("argparse"))
 4 | # On error: print the traceback and exit with a non-zero status so callers (make, pipelines) can detect the failure.
 5 | options(error = function() {traceback(2);quit(save = "no", status = 1, runLast = FALSE)})
 6 | 
 7 | parser <- ArgumentParser()
 8 | parser$add_argument("--counts_matrix", help="raw counts matrix file", required=TRUE, default=NULL, nargs=1)
 9 | parser$add_argument("--ncells", help="number of cells to simulate", required=TRUE, type='integer', nargs=1)
10 | parser$add_argument("--ngenes", help="number of genes to simulate", required=TRUE, type='integer', nargs=1)
11 | parser$add_argument("--output", help='name of output matrix file', required=TRUE, nargs=1)
12 | 
13 | args <- parser$parse_args()
14 | 
15 | library(infercnv)
16 | library(SingleCellExperiment)
17 | library("methods")
18 | library(splatter)
19 | 
20 | # Estimated parameters are cached in "<counts_matrix>.params_obj" and reused when that file already exists.
21 | counts_matrix <- read.table(args$counts_matrix)
22 | params_file <- sprintf("%s.params_obj", args$counts_matrix)
23 | if (file.exists(params_file)) {
24 |     message("-note, reusing stored params")
25 |     params <- readRDS(params_file)
26 | } else {
27 |     params <- infercnv:::.estimateSingleCellParamsSplatterScrape(counts_matrix)
28 |     saveRDS(params, file=params_file)
29 | }
30 | 
31 | ncells <- args$ncells
32 | ngenes <- args$ngenes
33 | output_filename <- args$output
34 | 
35 | data <- as.matrix(counts_matrix)
36 | 
37 | ## normalize first: divide every column by its sum, then multiply by the median column sum
38 | cs <- colSums(data)
39 | median_cs <- median(cs)
40 | data <- sweep(data, STATS=cs, MARGIN=2, FUN="/")
41 | data <- data * median_cs
42 | 
43 | ## sim using specified gene means (only rows with a positive mean are eligible)
44 | gene_means <- rowMeans(data)
45 | gene_means <- gene_means[gene_means>0]
46 | ## draw by index: sample(x=...) would treat a single remaining mean as the upper bound of 1:x
47 | gene_means <- gene_means[sample.int(length(gene_means), size=ngenes, replace=TRUE)]
48 | 
49 | newnames <- paste0('gene', seq_len(ngenes))
50 | 
51 | names(gene_means) <- newnames
52 | 
53 | ## override the gene and cell counts in the parameter object with the requested sizes
54 | params[['nGenes']] <- ngenes
55 | params[['nCells']] <- ncells
56 | 
57 | ## simulate, then extract the counts matrix from the returned object
58 | sim_matrix <- infercnv:::.simulateSingleCellCountsMatrixSplatterScrape(params, gene_means)
59 | sim_matrix <- counts(sim_matrix)
60 | 
61 | write.table(sim_matrix, file=output_filename, quote=FALSE, sep='\t')
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | library(testthat)
4 | library(infercnv)
5 | # Run the testthat test suite of the infercnv package.
6 | test_check("infercnv")
7 | 


--------------------------------------------------------------------------------
/vignettes/inferCNV.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Visualizing Large-scale Copy Number Variation in Single-Cell RNA-Seq Expression Data"
  3 | author: 
  4 | - name: Timothy Tickle
  5 |   affiliation: &kco Klarman Cell Observatory, Broad Institute of MIT and Harvard, Cambridge, MA, USA
  6 | - name: Itay Tirosh
  7 |   affiliation: 
  8 |     - *kco
  9 |     - Weizmann Institute of Science, Rehovot, Israel
 10 | - name: Christophe Georgescu
 11 |   affiliation: *kco
 12 | - name: Maxwell Brown
 13 |   affiliation: *kco
 14 | - name: Brian Haas
 15 |   affiliation: *kco
 16 | date: "`r Sys.Date()`"
 17 | output:
 18 |   BiocStyle::html_document: default
 19 | package: infercnv
 20 | abstract: >
 21 |   InferCNV is used to explore tumor single cell RNA-Seq data to identify evidence for large-scale chromosomal copy number variations, such as gains or deletions of entire chromosomes or large segments of chromosomes. This is done by exploring expression intensity of genes across positions of the genome in comparison to the average or a set of reference 'normal' cells. A heatmap is generated illustrating the relative expression intensities across each chromosome, and it becomes readily apparent as to which regions of the genome are over-abundant or less-abundant as compared to normal cells (or the average, if reference normal cells are not provided).
 22 | vignette: >
 23 |   %\VignetteIndexEntry{Visualizing Large-scale Copy Number Variation in Single-Cell RNA-Seq Expression Data}
 24 |   %\VignetteEncoding{UTF-8}
 25 |   %\VignetteEngine{knitr::rmarkdown}
 26 | ---
 27 | 
 28 | 
 29 | # Installation
 30 | ## Required dependencies
 31 | 
 32 | _inferCNV_ uses the _R_ packages `r CRANpkg("ape")`, `r Biocpkg("BiocGenerics")`, `r CRANpkg("binhf")`, `r CRANpkg("caTools")`, `r CRANpkg("coda")`, `r CRANpkg("coin")`, `r CRANpkg("dplyr")`, `r CRANpkg("doParallel")`, `r Biocpkg("edgeR")`, `r CRANpkg("fastcluster")`, `r CRANpkg("fitdistrplus")`, `r CRANpkg("foreach")`, `r CRANpkg("futile.logger")`, `r CRANpkg("future")`, `r CRANpkg("gplots")`, `r CRANpkg("ggplot2")`, `r CRANpkg("HiddenMarkov")`, `r CRANpkg("leiden")`, `r CRANpkg("phyclust")`, `r CRANpkg("RANN")`, `r CRANpkg("reshape")`, `r CRANpkg("rjags")`, `r CRANpkg("RColorBrewer")`, `r Biocpkg("SingleCellExperiment")`, `r Biocpkg("SummarizedExperiment")`, `r CRANpkg("tidyr")` and imports functions from the archived `r CRANpkg("GMD")`.
 33 | 
 34 | <!--
 35 | [https://cran.r-project.org/web/packages/ape/index.html]
 36 | [https://bioconductor.org/packages/release/bioc/html/BiocGenerics.html]
 37 | [https://cran.r-project.org/web/packages/binhf/index.html]
 38 | [https://cran.r-project.org/web/packages/caTools/index.html]
 39 | [https://cran.r-project.org/web/packages/coda/index.html]
 40 | [https://cran.r-project.org/web/packages/coin/index.html]
 41 | [https://cran.r-project.org/web/packages/doParallel/index.html]
 42 | [https://cran.r-project.org/web/packages/dplyr/index.html]
 43 | [https://bioconductor.org/packages/release/bioc/html/edgeR.html]
 44 | [https://cran.r-project.org/web/packages/fastcluster/index.html]
 45 | [https://cran.r-project.org/web/packages/fitdistrplus/index.html]
 46 | [https://cran.r-project.org/web/packages/foreach/index.html]
 47 | [https://cran.r-project.org/web/packages/futile.logger/index.html]
 48 | [https://cran.r-project.org/web/packages/future/index.html]
 49 | [https://cran.r-project.org/web/packages/ggplot2/index.html]
 50 | [https://cran.r-project.org/web/packages/gplots/index.html]
 51 | [https://cran.r-project.org/web/packages/HiddenMarkov/index.html]
 52 | [https://cran.r-project.org/web/packages/reshape/index.html]
 53 | [https://cran.r-project.org/web/packages/rjags/index.html]
 54 | [https://cran.r-project.org/web/packages/RColorBrewer/index.html]
 55 | [https://cran.r-project.org/web/packages/Seurat/index.html]
 56 | [https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html]
 57 | [https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html]
 58 | [https://cran.r-project.org/src/contrib/Archive/GMD/]
 59 | -->
 60 | 
 61 | 
 62 | ## Installing
 63 | ```{r install, eval=FALSE}
 64 | if (!requireNamespace("BiocManager", quietly = TRUE))
 65 |     install.packages("BiocManager")
 66 | BiocManager::install("infercnv")
 67 | ```
 68 | 
 69 | 
 70 | ## Optional extension
 71 | If you want to use the interactive heatmap visualization, please check the add-on _R_ package `r Githubpkg("broadinstitute/inferCNV_NGCHM")` after installing the packages `r CRANpkg("tibble")`, `r Githubpkg("bmbroom/tsvio")` and `r Githubpkg("bmbroom/NGCHMR")`. To install optional packages, type the following in an R command window:
 72 | 
 73 | <!--
 74 | [https://cran.r-project.org/web/packages/tibble/index.html]
 75 | [https://github.com/bmbroom/tsvio]
 76 | [https://github.com/bmbroom/NGCHMR]
 77 | -->
 78 | 
 79 | ```{r install-optionals, eval = FALSE}
 80 | install.packages("tibble")
 81 | 
 82 | install.packages("devtools")
 83 | devtools::install_github("bmbroom/tsvio")
 84 | devtools::install_github("bmbroom/NGCHMR", ref="stable")
 85 | devtools::install_github("broadinstitute/inferCNV_NGCHM")
 86 | 
 87 | ```
 88 | 
 89 | And download the NGCHM java application by typing the following in a regular shell:
 90 | ```{bash, eval = FALSE}
 91 | wget http://tcga.ngchm.net/NGCHM/ShaidyMapGen.jar
 92 | ```
 93 | 
 94 | 
 95 | 
 96 | ```{r setup, include=FALSE}
 97 | knitr::opts_chunk$set(echo = TRUE)
 98 | library(infercnv)
 99 | 
100 | ```
101 | 
102 | # Running InferCNV
103 | ## Create the InferCNV Object
104 | 
105 | Reading in the raw counts matrix and meta data, populating the infercnv object
106 | 
107 | ```{r}
108 | infercnv_obj = CreateInfercnvObject(
109 |   raw_counts_matrix="../inst/extdata/oligodendroglioma_expression_downsampled.counts.matrix.gz",
110 |   annotations_file="../inst/extdata/oligodendroglioma_annotations_downsampled.txt",
111 |   delim="\t",
112 |   gene_order_file="../inst/extdata/gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt",
113 |   ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)"))
114 | 
115 | ```
116 | 
117 | 
118 | 
119 | ## Running the full default analysis
120 | ```{r, results="hide"}
121 | out_dir = tempfile()  # temporary path, passed to run() below as its out_dir
122 | infercnv_obj_default = infercnv::run(
123 |     infercnv_obj,
124 |     cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics
125 |     out_dir=out_dir,
126 |     cluster_by_groups=TRUE, 
127 |     plot_steps=FALSE,
128 |     denoise=TRUE,
129 |     HMM=FALSE,
130 |     no_prelim_plot=TRUE,
131 |     png_res=60
132 | )
133 | 
134 | ```
135 | 
136 | Basic output from running inferCNV.
137 | ```{r, echo=FALSE}
138 | knitr::include_graphics(paste(out_dir, "infercnv.png", sep="/"))
139 | ```
140 | 
141 | 
142 | 
143 | # Additional Information
144 | ## Online Documentation
145 | 
146 | For additional explanations on files, usage, and a tutorial please visit the [wiki](https://github.com/broadinstitute/inferCNV/wiki).
147 | 
148 | 
149 | ## TrinityCTAT
150 | This tool is a part of the TrinityCTAT toolkit focused on leveraging the use of RNA-Seq to better understand cancer transcriptomes. To find out more please visit [TrinityCTAT](https://github.com/NCIP/Trinity_CTAT/wiki)
151 | 
152 | 
153 | ## Applications
154 | 
155 | This methodology was used in:
156 | 
157 | [Anoop P. Patel et al. Single-cell RNA-seq highlights intratumoral heterogeneity in primary glioblastoma. Science. 2014 Jun 20: 1396-1401](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4123637/)
158 | 
159 | [Tirosh I et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science. 2016 Apr 8;352(6282):189-96](http://www.ncbi.nlm.nih.gov/pubmed/27124452)
160 | 
161 | 
162 | 
163 | 
164 | # Session info
165 | 
166 | ```{r sessioninfo, echo=FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=60), out.width=60}
167 | sessionInfo()
168 | ```
169 | 
170 | 


--------------------------------------------------------------------------------