├── .Rbuildignore ├── .github └── ISSUE_TEMPLATE │ ├── config.yml │ └── infercnv-support-and-development-hiatus.md ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R ├── .wip │ └── Seurat_integration.R ├── SplatterScrape.R ├── data.R ├── inferCNV.R ├── inferCNV_BayesNet.R ├── inferCNV_HMM.R ├── inferCNV_constants.R ├── inferCNV_heatmap.R ├── inferCNV_hidden_spike.R ├── inferCNV_i3HMM.R ├── inferCNV_mask_non_DE.R ├── inferCNV_meanVarSim.R ├── inferCNV_ops.R ├── inferCNV_simple_sim.R ├── inferCNV_tumor_subclusters.R ├── inferCNV_tumor_subclusters.random_smoothed_trees.R ├── infercnv_sampling.R ├── noise_reduction.R └── seurat_interaction.R ├── README.md ├── Rstudio_helpers └── Examine_and_Filter_Cells_and_Genes.Rmd ├── WDL └── infercnv.wdl ├── data ├── HMM_states.rda ├── infercnv_annots_example.rda ├── infercnv_data_example.rda ├── infercnv_genes_example.rda ├── infercnv_object_example.rda └── mcmc_obj.rda ├── docker └── Dockerfile ├── example ├── Makefile ├── README.txt ├── __alt_exec_modes │ ├── run.no_spike.R │ ├── run.set_num_ref_groups.R │ └── run.use_zscores.R ├── example.Rmd ├── run.R ├── run_memory_profiling_per_step.sh └── run_test.R ├── inst ├── BUGS_Mixture_Model ├── BUGS_Mixture_Model_i3 ├── CITATION ├── NEWS ├── extdata │ ├── gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt │ ├── oligodendroglioma_annotations_downsampled.txt │ └── oligodendroglioma_expression_downsampled.counts.matrix.gz └── script │ └── README.txt ├── man ├── CreateInfercnvObject.Rd ├── HMM_states.Rd ├── MCMC_inferCNV-class.Rd ├── add_to_seurat.Rd ├── apply_median_filtering.Rd ├── color.palette.Rd ├── filterHighPNormals.Rd ├── inferCNVBayesNet.Rd ├── infercnv-class.Rd ├── infercnv-package.Rd ├── infercnv_annots_example.Rd ├── infercnv_data_example.Rd ├── infercnv_genes_example.Rd ├── infercnv_object_example.Rd ├── mcmc_obj.Rd ├── plot_cnv.Rd ├── plot_per_group.Rd ├── plot_subclusters.Rd ├── run.Rd ├── sample_object.Rd └── validate_infercnv_obj.Rd ├── scripts 
├── ExploratoryPlots.R ├── HB_example_to_inferCNV_obj.R ├── KS_matrix_comparison.R ├── KS_matrix_comparison.use_infercnv_obj.R ├── QQ_matrix_comparison.R ├── apply_median_filtering.R ├── boxplot_cell_exprs.R ├── check_matrix_format.py ├── cross_cell_scaling_normalization.R ├── dropout_matrix_comparison.R ├── examine_dropout_logistic.R ├── examine_infercnv_data_params.R ├── examine_infercnv_data_params.just_dispersion.R ├── examine_normal_cutoffs_vs_KS.R ├── examine_normal_sampling_distributions.R ├── examine_normal_sampling_distributions.i3.R ├── examine_simulated_vs_observed_dispersion.R ├── examine_simulated_vs_observed_dispersion.from_matrix.R ├── explore_HMM_exec.R ├── explore_HMM_exec.hspike.R ├── explore_steps_by_gene.simple.R ├── genome_smoothed_lineplots.R ├── gtf_to_position_file.py ├── inferCNV.R ├── inferCNV_to_HB.R ├── inferCNV_utils.R ├── infercnv_obj_to_input_files.R ├── infercnv_validate.R ├── meanvar_sim_counts.R ├── plot_hspike.R ├── plot_hspike.by_num_cells.R ├── plot_hspike.diff_normal_tumor.R ├── plot_hspike_vs_sample_chrs.R ├── plot_infercnv_obj.R ├── plot_tumor_vs_normal_chr_densities.R ├── plot_tumor_vs_normal_chr_densities.i3.R ├── prepare_sparsematrix.R ├── recursive_random_tree_height_cutting.random_trees.R ├── recursive_random_tree_height_cutting.sigclust2.R ├── recursive_random_tree_height_cutting.using_hmms.R ├── run.stub.R ├── run_BayesNet.R ├── run_HMM_each_cell_separately.R ├── run_HMM_on_hspike.R ├── run_HMM_on_subclusters.R ├── run_HMM_per_chr.R ├── run_tests_sampling_and_group_plots.R ├── sim_vs_orig_counts.QQplot.R └── splatterScrape_sim_counts.R ├── tests ├── testthat.R └── testthat │ └── test_infer_cnv.R └── vignettes ├── .wip └── inferCNV.Rmd └── inferCNV.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^Meta$ 2 | ^doc$ 3 | ^\.travis\.yml$ 4 | ^R/\.wip 5 | ^vignettes/\.wip 6 | ^vignettes/example_output 7 | ^docker 8 | ^run_tests 9 | ^\. 
10 | ^inferCNV\.wiki 11 | ^infercnv.Rdata 12 | ^inferCNV.Rproj 13 | ^Rstudio_helpers 14 | ^__simulations 15 | ^example 16 | ^example/full_precision 17 | ^example/example.html 18 | ^example/test_subdir 19 | ^example/oligodendroglioma_expression_downsampled.txt 20 | ^example_output 21 | ^example/C125.matrix.obj 22 | ^output_dir 23 | ^.*\.Rproj$ 24 | ^\.Rproj\.user$ 25 | ^devel_debug 26 | ^run_tests 27 | ^external 28 | ^\.gitmodules 29 | ^scripts 30 | ^WDL/infercnv.wdl 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/infercnv-support-and-development-hiatus.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: InferCNV support and development hiatus 3 | about: InferCNV support and development is on pause due to lack of dedicated resources available. 
4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | --- 8 | 9 | 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Meta 2 | doc 3 | .idea 4 | .Rproj.user 5 | .Rhistory 6 | .Rprofile 7 | .example_output 8 | vignettes/example_output 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | r: bioc-release 3 | bioc_required: true 4 | bioc_use_devel: false 5 | 6 | cache: 7 | apt: true 8 | packages: true 9 | timeout: 3000 10 | 11 | before_install: 12 | - sudo apt-get update 13 | - sudo apt-get install jags 14 | 15 | # r_build_args: --no-build-vignettes --no-manual --no-resave-data 16 | r_check_args: --no-build-vignettes # --no-manual 17 | 18 | #script: 19 | #- travis_wait R CMD build . 20 | #- R CMD check --no-build-vignettes *tar.gz 21 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: infercnv 2 | Type: Package 3 | Title: Infer Copy Number Variation from Single-Cell RNA-Seq Data 4 | Version: 1.23.0 5 | Date: 2023-12-01 6 | Authors@R: c( person("Timothy", "Tickle", email = "ttickle@broadinstitute.org", role = "aut"), person("Itay", "Tirosh", email = "tirosh@broadinstitute.org", role = "aut"), person("Christophe", "Georgescu", email = "cgeorges@broadinstitute.org", role = c("aut", "cre")), person("Maxwell", "Brown", email = "mbrown@broadinstitute.org", role = "aut"), person("Brian", "Haas", email = "bhaas@broadinstitute.org", role = "aut")) 7 | BugReports: https://github.com/broadinstitute/inferCNV/issues 8 | Description: Using single-cell RNA-Seq expression to visualize CNV in cells. 
9 | biocViews: Software, CopyNumberVariation, VariantDetection, StructuralVariation, GenomicVariation, Genetics, Transcriptomics, StatisticalMethod, Bayesian, HiddenMarkovModel, SingleCell 10 | Depends: R(>= 4.0) 11 | License: BSD_3_clause + file LICENSE 12 | LazyData: TRUE 13 | VignetteBuilder: knitr 14 | Suggests: BiocStyle, knitr, rmarkdown, testthat 15 | RoxygenNote: 7.2.3 16 | NeedsCompilation: no 17 | SystemRequirements: JAGS 4.x.y 18 | Imports: graphics, grDevices, RColorBrewer, gplots, futile.logger, stats, utils, methods, ape, phyclust, Matrix, fastcluster, parallelDist, dplyr, HiddenMarkov, ggplot2, edgeR, coin, caTools, digest, RANN, igraph, reshape2, rjags, fitdistrplus, future, foreach, doParallel, Seurat, BiocGenerics, SummarizedExperiment, SingleCellExperiment, tidyr, parallel, coda, gridExtra, argparse 19 | URL: https://github.com/broadinstitute/inferCNV/wiki 20 | Collate: 21 | 'SplatterScrape.R' 22 | 'data.R' 23 | 'inferCNV.R' 24 | 'inferCNV_BayesNet.R' 25 | 'inferCNV_HMM.R' 26 | 'inferCNV_constants.R' 27 | 'inferCNV_heatmap.R' 28 | 'inferCNV_hidden_spike.R' 29 | 'inferCNV_i3HMM.R' 30 | 'inferCNV_mask_non_DE.R' 31 | 'inferCNV_meanVarSim.R' 32 | 'inferCNV_ops.R' 33 | 'inferCNV_simple_sim.R' 34 | 'inferCNV_tumor_subclusters.R' 35 | 'inferCNV_tumor_subclusters.random_smoothed_trees.R' 36 | 'infercnv_sampling.R' 37 | 'noise_reduction.R' 38 | 'seurat_interaction.R' 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2018 2 | COPYRIGHT HOLDER: Timothy Tickle, Christophe Georgescu, Itay Tirosh 3 | ORGANIZATION: Broad Institute 4 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(CreateInfercnvObject) 4 | export(add_to_seurat) 5 | 
export(apply_median_filtering) 6 | export(color.palette) 7 | export(filterHighPNormals) 8 | export(inferCNVBayesNet) 9 | export(plot_cnv) 10 | export(plot_per_group) 11 | export(plot_subclusters) 12 | export(run) 13 | export(sample_object) 14 | exportClasses(MCMC_inferCNV) 15 | exportClasses(infercnv) 16 | import(RColorBrewer) 17 | import(argparse) 18 | import(coda) 19 | import(doParallel) 20 | import(fitdistrplus) 21 | import(foreach) 22 | import(futile.logger) 23 | import(future) 24 | import(ggplot2) 25 | importFrom(BiocGenerics,counts) 26 | importFrom(BiocGenerics,mean) 27 | importFrom(BiocGenerics,t) 28 | importFrom(Matrix,Matrix) 29 | importFrom(Matrix,colSums) 30 | importFrom(Matrix,rowMeans) 31 | importFrom(Matrix,sparseMatrix) 32 | importFrom(RANN,nn2) 33 | importFrom(Seurat,CreateSeuratObject) 34 | importFrom(Seurat,FindNeighbors) 35 | importFrom(Seurat,FindVariableFeatures) 36 | importFrom(Seurat,RunPCA) 37 | importFrom(Seurat,ScaleData) 38 | importFrom(SingleCellExperiment,SingleCellExperiment) 39 | importFrom(SummarizedExperiment,"assays<-") 40 | importFrom(SummarizedExperiment,"colData<-") 41 | importFrom(SummarizedExperiment,"rowData<-") 42 | importFrom(SummarizedExperiment,assays) 43 | importFrom(SummarizedExperiment,colData) 44 | importFrom(SummarizedExperiment,rowData) 45 | importFrom(SummarizedExperiment,start) 46 | importFrom(ape,as.phylo) 47 | importFrom(ape,drop.tip) 48 | importFrom(ape,read.tree) 49 | importFrom(ape,write.tree) 50 | importFrom(caTools,runmean) 51 | importFrom(coin,oneway_test) 52 | importFrom(coin,pvalue) 53 | importFrom(digest,digest) 54 | importFrom(dplyr,"%>%") 55 | importFrom(dplyr,count) 56 | importFrom(edgeR,estimateDisp) 57 | importFrom(fastcluster,hclust) 58 | importFrom(gplots,bluered) 59 | importFrom(grDevices,col2rgb) 60 | importFrom(grDevices,colorRampPalette) 61 | importFrom(grDevices,dev.off) 62 | importFrom(grDevices,pdf) 63 | importFrom(grDevices,png) 64 | importFrom(grDevices,rgb) 65 | 
importFrom(graphics,abline) 66 | importFrom(graphics,axis) 67 | importFrom(graphics,boxplot) 68 | importFrom(graphics,hist) 69 | importFrom(graphics,image) 70 | importFrom(graphics,layout) 71 | importFrom(graphics,legend) 72 | importFrom(graphics,lines) 73 | importFrom(graphics,mtext) 74 | importFrom(graphics,par) 75 | importFrom(graphics,plot) 76 | importFrom(graphics,points) 77 | importFrom(graphics,rect) 78 | importFrom(graphics,text) 79 | importFrom(graphics,title) 80 | importFrom(gridExtra,gtable_combine) 81 | importFrom(gridExtra,marrangeGrob) 82 | importFrom(gridExtra,tableGrob) 83 | importFrom(gridExtra,ttheme_default) 84 | importFrom(igraph,cluster_leiden) 85 | importFrom(igraph,graph_from_adjacency_matrix) 86 | importFrom(methods,is) 87 | importFrom(methods,new) 88 | importFrom(methods,setClass) 89 | importFrom(parallel,detectCores) 90 | importFrom(parallelDist,parallelDist) 91 | importFrom(phyclust,get.rooted.tree.height) 92 | importFrom(reshape2,melt) 93 | importFrom(rjags,coda.samples) 94 | importFrom(rjags,jags.model) 95 | importFrom(stats,as.dendrogram) 96 | importFrom(stats,as.dist) 97 | importFrom(stats,as.hclust) 98 | importFrom(stats,complete.cases) 99 | importFrom(stats,cor) 100 | importFrom(stats,cutree) 101 | importFrom(stats,density) 102 | importFrom(stats,dist) 103 | importFrom(stats,dnorm) 104 | importFrom(stats,ecdf) 105 | importFrom(stats,filter) 106 | importFrom(stats,ks.test) 107 | importFrom(stats,lm) 108 | importFrom(stats,median) 109 | importFrom(stats,nls) 110 | importFrom(stats,order.dendrogram) 111 | importFrom(stats,p.adjust) 112 | importFrom(stats,pnorm) 113 | importFrom(stats,predict) 114 | importFrom(stats,qgamma) 115 | importFrom(stats,qnorm) 116 | importFrom(stats,quantile) 117 | importFrom(stats,rbinom) 118 | importFrom(stats,rchisq) 119 | importFrom(stats,reorder) 120 | importFrom(stats,rgamma) 121 | importFrom(stats,rlnorm) 122 | importFrom(stats,rnbinom) 123 | importFrom(stats,rnorm) 124 | importFrom(stats,rpois) 125 | 
importFrom(stats,runif) 126 | importFrom(stats,sd) 127 | importFrom(stats,shapiro.test) 128 | importFrom(stats,smooth.spline) 129 | importFrom(stats,t.test) 130 | importFrom(stats,update) 131 | importFrom(stats,var) 132 | importFrom(stats,wilcox.test) 133 | importFrom(tidyr,gather) 134 | importFrom(utils,capture.output) 135 | importFrom(utils,flush.console) 136 | importFrom(utils,head) 137 | importFrom(utils,read.csv) 138 | importFrom(utils,read.table) 139 | importFrom(utils,tail) 140 | importFrom(utils,write.table) 141 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Generated SmartSeq2 expression data with 10 normal cells and 10 tumor cells. 2 | #' This is only to demonstrate how to use methods, not actual data to be used in an analysis. 3 | #' 4 | #' @format A data frame with 8252 rows (genes) and 20 columns (cells) 5 | #' 6 | #' 7 | "infercnv_data_example" 8 | 9 | #' Generated classification for 10 normal cells and 10 tumor cells. 10 | #' 11 | #' @format A data frame with 20 rows (cells) and 1 columns (classification) 12 | #' 13 | #' 14 | "infercnv_annots_example" 15 | 16 | #' Downsampled gene coordinates file from GrCh37 17 | #' 18 | #' @format A data frame with 10338 rows (genes) and 3 columns (chr, start, end) 19 | #' 20 | #' 21 | "infercnv_genes_example" 22 | 23 | #' infercnv object result of the processing of run() in the example, to be used for other examples. 24 | #' 25 | #' @format An infercnv object 26 | #' 27 | #' 28 | "infercnv_object_example" 29 | 30 | #' infercnv object result of the processing of run() in the HMM example, to be used for other examples. 31 | #' 32 | #' @format An infercnv object containing HMM predictions 33 | #' 34 | #' 35 | "HMM_states" 36 | 37 | #' infercnv object result of the processing of inferCNVBayesNet in the example, to be used for other examples. 
38 | #' 39 | #' @format An infercnv object containing posterior probability of CNV states 40 | #' 41 | #' 42 | "mcmc_obj" 43 | -------------------------------------------------------------------------------- /R/inferCNV_constants.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | 5 | C_CHR <- "chr" 6 | C_START <- "start" 7 | C_STOP <- "stop" 8 | C_HCLUST_METHODS <- c("ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid") 9 | C_OUTPUT_FORMAT <- c("pdf", "png") 10 | 11 | 12 | ## also including some globals: 13 | infercnv.env <- new.env() 14 | infercnv.env$GLOBAL_NUM_THREADS <- 1 # default is single-threaded. 15 | 16 | 17 | #' @importFrom grDevices col2rgb colorRampPalette dev.off pdf png rgb 18 | #' @importFrom graphics abline axis boxplot hist image layout lines mtext par plot points rect text title legend 19 | #' @importFrom stats as.dendrogram as.dist as.hclust cutree density dist filter median order.dendrogram quantile reorder sd complete.cases cor t.test p.adjust predict rnorm runif smooth.spline var wilcox.test dnorm ecdf ks.test lm nls pnorm qgamma qnorm rbinom rchisq rgamma rlnorm rnbinom rpois shapiro.test update 20 | #' @importFrom utils flush.console read.table write.table tail read.csv head capture.output 21 | #' @import futile.logger 22 | #' @importFrom methods setClass new is 23 | #' @importFrom gplots bluered 24 | #' @importFrom ape write.tree as.phylo read.tree drop.tip 25 | #' @importFrom phyclust get.rooted.tree.height 26 | #' @importFrom fastcluster hclust 27 | #' @importFrom parallelDist parallelDist 28 | #' @import RColorBrewer 29 | #' @importFrom Matrix Matrix rowMeans colSums sparseMatrix 30 | #' @importFrom dplyr %>% count 31 | #' @import fitdistrplus 32 | #' @import foreach 33 | #' @import doParallel 34 | #' @import future 35 | #' @import coda 36 | #' @import ggplot2 37 | #' @import argparse 38 | #' @importFrom edgeR estimateDisp 39 | #' 
@importFrom caTools runmean 40 | #' @importFrom coin oneway_test pvalue 41 | #' @importFrom digest digest 42 | #' @importFrom RANN nn2 43 | #' @importFrom igraph graph_from_adjacency_matrix cluster_leiden 44 | #' @importFrom reshape2 melt 45 | #' @importFrom rjags jags.model coda.samples 46 | #' @importFrom BiocGenerics counts t mean 47 | #' @importFrom SummarizedExperiment start colData rowData assays assays<- rowData<- colData<- 48 | #' @importFrom SingleCellExperiment SingleCellExperiment 49 | #' @importFrom tidyr gather 50 | #' @importFrom parallel detectCores 51 | #' @importFrom gridExtra ttheme_default tableGrob gtable_combine marrangeGrob 52 | #' @importFrom Seurat CreateSeuratObject FindVariableFeatures ScaleData RunPCA FindNeighbors 53 | 54 | 55 | NULL 56 | 57 | -------------------------------------------------------------------------------- /R/inferCNV_meanVarSim.R: -------------------------------------------------------------------------------- 1 | .get_simulated_cell_matrix_using_meanvar_trend <- function(infercnv_obj, gene_means, num_cells, include.dropout=FALSE) { 2 | 3 | # should be working on the total sum count normalized data. 
4 | # model the mean variance relationship 5 | 6 | # per-group gene mean/variance table produced by .get_mean_var_table() for this object 7 | mean_var_table = .get_mean_var_table(infercnv_obj) 8 | 9 | dropout_logistic_params <- NULL 10 | # the dropout model is only fit when include.dropout is TRUE; NULL makes the helper skip the dropout step 11 | if (include.dropout) { 12 | 13 | mean_p0_table <- .get_mean_vs_p0_table(infercnv_obj) 14 | 15 | dropout_logistic_params <- .get_logistic_params(mean_p0_table) 16 | } 17 | 18 | return(.get_simulated_cell_matrix_using_meanvar_trend_helper(gene_means, mean_var_table, num_cells, dropout_logistic_params)) 19 | } 20 | 21 | ## Simulates a genes x num_cells matrix: one value per gene per cell is drawn by .sim_expr_val_mean_var_no_dropout() 22 | ## using a spline of log(variance+1) on log(mean+1); dropout is applied afterwards only if dropout_logistic_params is not NULL. 23 | .get_simulated_cell_matrix_using_meanvar_trend_helper <- function(gene_means, mean_var_table, num_cells, dropout_logistic_params=NULL) { 24 | 25 | ngenes = length(gene_means) 26 | # log(x + 1) transforms of the table's m and v columns 27 | logm = log(mean_var_table$m + 1) 28 | logv = log(mean_var_table$v + 1) 29 | 30 | mean_var_spline = smooth.spline(logv ~ logm) 31 | 32 | 33 | spike_cell_names = paste0('sim_cell_', seq_len(num_cells)) 34 | # zero-filled result matrix: rows are named after gene_means, columns are the simulated cells 35 | sim_cell_matrix = matrix(rep(0,ngenes*num_cells), nrow=ngenes) 36 | rownames(sim_cell_matrix) = names(gene_means) 37 | colnames(sim_cell_matrix) = spike_cell_names 38 | # simulate a single expression value for the gene at index gene_idx 39 | sim_expr_vals <- function(gene_idx) { 40 | m = gene_means[gene_idx] 41 | return(.sim_expr_val_mean_var_no_dropout(m, mean_var_spline)) 42 | } 43 | # fill the matrix column by column, i.e. one simulated cell at a time 44 | for (i in seq_len(num_cells)) { 45 | newvals = sapply(seq_len(ngenes), FUN=sim_expr_vals) 46 | sim_cell_matrix[,i] = newvals 47 | } 48 | 49 | ## apply dropout 50 | if (!is.null(dropout_logistic_params)) { 51 | sim_cell_matrix <- .apply_dropout(sim_cell_matrix, dropout_logistic_params) 52 | } 53 | 54 | return(sim_cell_matrix) 55 | } 56 | ## Variant that derives the mean/variance table (and, optionally, the dropout model) from a supplied normal counts matrix. 57 | .get_simulated_cell_matrix_using_meanvar_trend_given_normal_matrix <- function(gene_means, normal_counts_matrix, num_cells, include.dropout=TRUE, cell_groupings=NULL) { 58 | 59 | mean_var_table <- .get_mean_var_given_matrix(normal_counts_matrix, cell_groupings) 60 | 61 | dropout_logistic_params <- NULL 62 | if (include.dropout) { 63 | mean_vs_p0_table <- .get_mean_vs_p0_table_from_matrix(normal_counts_matrix, cell_groupings) 64 | 
dropout_logistic_params <- .get_logistic_params(mean_vs_p0_table) 65 | } 66 | 67 | sim_matrix <- .get_simulated_cell_matrix_using_meanvar_trend_helper(gene_means, mean_var_table, num_cells, dropout_logistic_params) 68 | 69 | return(sim_matrix) 70 | } 71 | 72 | ## Draws one simulated count for a gene with mean m: variance comes from the mean/variance spline, then an optional dropout draw may zero the value. 73 | ##' @keywords internal 74 | ##' @noRd 75 | ##' 76 | 77 | .sim_expr_val_mean_var <- function(m, mean_var_spline, dropout_logistic_params) { 78 | 79 | # include drop-out prediction 80 | 81 | val = 0 82 | if (m > 0) { 83 | logm = log(m+1) 84 | pred_log_var = predict(mean_var_spline, logm)$y 85 | 86 | var = max(exp(pred_log_var)-1, 0) 87 | 88 | val = round(max(rnorm(n=1, mean=m, sd=sqrt(var)), 0)) 89 | # both operands are scalars, so use the short-circuit && 90 | if ( (! is.null(dropout_logistic_params)) && val > 0) { 91 | 92 | dropout_prob <- predict(dropout_logistic_params$spline, log(val))$y[1] 93 | 94 | if (runif(1) <= dropout_prob) { 95 | ## a drop-out 96 | val = 0 97 | } 98 | } 99 | } 100 | 101 | return(val) 102 | } 103 | 104 | ## Same draw as .sim_expr_val_mean_var() but without the dropout step. 105 | .sim_expr_val_mean_var_no_dropout <- function(m, mean_var_spline) { 106 | 107 | val = 0 108 | if (m > 0) { 109 | logm = log(m+1) 110 | pred_log_var = predict(mean_var_spline, logm)$y 111 | 112 | var = max(exp(pred_log_var)-1, 0) 113 | 114 | val = round(max(rnorm(n=1, mean=m, sd=sqrt(var)), 0)) 115 | 116 | } 117 | 118 | return(val) 119 | } 120 | 121 | 122 | .apply_dropout <- function(counts.matrix, dropout_logistic_params) { 123 | ## For each row, zero out additional entries so that the zero fraction approaches the dropout 124 | ## probability predicted from the row mean; the per-row results are transposed back before returning. 125 | 126 | counts.matrix <- apply(counts.matrix, 1, function(x) { 127 | if (all(x == 0)) { return(x) } # all-zero row: nothing left to drop out; also avoids log(0) and the division by nremaining == 0 below 128 | mean.val = mean(x) 129 | dropout_prob <- predict(dropout_logistic_params$spline, log(mean.val))$y[1] 130 | 131 | nzeros = sum(x==0) 132 | ntotal = length(x) 133 | nremaining = ntotal - nzeros 134 | 135 | # padj = ( (pzero*total) - (current_nzero) ) / remaining 136 | 137 | padj = ( (dropout_prob * ntotal) - (nzeros) ) / nremaining 138 | padj = max(padj, 0) 139 | 140 | flog.debug(sprintf("mean.val: %g, dropout_prob: %g, adj_dropout_prob: %g", 141 | mean.val, 142 | dropout_prob, 143 | padj)) 144 | 145 | x.adj = sapply(x, 
function(y) { 146 | if(runif(1) <= padj) { 147 | return(0) 148 | } else { 149 | return(y) 150 | } 151 | 152 | } ) 153 | 154 | x.adj 155 | 156 | }) 157 | 158 | return(t(counts.matrix)) 159 | 160 | 161 | } 162 | 163 | 164 | 165 | ##' .get_mean_var_table() 166 | ##' 167 | ##' Computes the gene mean/variance table based on all defined cell groupings (reference and observations) 168 | ##' 169 | ##' @param infercnv_obj An infercnv object populated with raw count data 170 | ##' 171 | ##' @return data.frame with 3 columns: g (group name), m (gene mean), v (gene variance) 172 | ##' 173 | ##' 174 | ##' @keywords internal 175 | ##' @noRd 176 | ##' 177 | 178 | .get_mean_var_table <- function(infercnv_obj) { 179 | 180 | group_indices = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices) 181 | 182 | mean_variance_table <- .get_mean_var_given_matrix(infercnv_obj@expr.data, group_indices) 183 | 184 | return(mean_variance_table) 185 | 186 | } 187 | ## Builds the per-group gene mean/variance table (columns g, m, v) for an expression matrix. 188 | ## cell_cluster_groupings is a named list of column indices; NULL treats all cells as a single group named "allcells". 189 | .get_mean_var_given_matrix <- function(expr.matrix, cell_cluster_groupings=NULL) { 190 | 191 | if (is.null(cell_cluster_groupings)) { 192 | ## use all cells 193 | cell_cluster_groupings = list(allcells=seq_len(ncol(expr.matrix))) 194 | } 195 | 196 | mean_var_table <- NULL 197 | 198 | for (group_name in names(cell_cluster_groupings)) { 199 | # drop=FALSE keeps a matrix even for a single-cell group, so rowMeans()/apply() still work (variance is then NA) 200 | expr.data = expr.matrix[, cell_cluster_groupings[[ group_name ]], drop=FALSE ] 201 | m = rowMeans(expr.data) 202 | v = apply(expr.data, 1, var) 203 | if (is.null(mean_var_table)) { 204 | mean_var_table = data.frame(g=group_name, m=m, v=v) 205 | } else { 206 | mean_var_table = rbind(mean_var_table, data.frame(g=group_name, m=m, v=v)) 207 | } 208 | } 209 | 210 | return(mean_var_table) 211 | } 212 | 213 | ##' .get_spike_in_average_bounds() 214 | ##' 215 | ##' return mean bounds for expression of all cells in the spike-in 216 | ##' 217 | ##' @param infercnv_obj An infercnv object populated with raw count data 218 | ##' 219 | ##' @return c(left_bound, right_bound) 220 | ##' 221 | ##' 
@keywords internal 222 | ##' @noRd 223 | ##' 224 | 225 | -------------------------------------------------------------------------------- /R/noise_reduction.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title apply_median_filtering 3 | #' 4 | #' @description Apply a median filtering to the expression matrix within each tumor bounds 5 | #' 6 | #' @param infercnv_obj infercnv_object 7 | #' 8 | #' @param window_size Size of the window side centered on the data point to filter (default = 7). 9 | #' 10 | #' @param on_observations boolean (default=TRUE), run on observations data (tumor cells). 11 | #' 12 | #' @param on_references boolean (default=TRUE), run on references (normal cells). 13 | #' 14 | #' @return infercnv_obj with median filtering applied to observations 15 | #' 16 | #' @export 17 | #' 18 | #' @examples 19 | #' # data(infercnv_data_example) 20 | #' # data(infercnv_annots_example) 21 | #' # data(infercnv_genes_example) 22 | #' 23 | #' # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 24 | #' # gene_order_file=infercnv_genes_example, 25 | #' # annotations_file=infercnv_annots_example, 26 | #' # ref_group_names=c("normal")) 27 | #' 28 | #' # infercnv_object_example <- infercnv::run(infercnv_object_example, 29 | #' # cutoff=1, 30 | #' # out_dir=tempfile(), 31 | #' # cluster_by_groups=TRUE, 32 | #' # denoise=TRUE, 33 | #' # HMM=FALSE, 34 | #' # num_threads=2, 35 | #' # no_plot=TRUE) 36 | #' 37 | #' data(infercnv_object_example) 38 | #' 39 | #' infercnv_object_example <- infercnv::apply_median_filtering(infercnv_object_example) 40 | #' # plot result object 41 | #' 42 | 43 | apply_median_filtering <- function(infercnv_obj, 44 | window_size=7, 45 | on_observations=TRUE, 46 | on_references=TRUE) { 47 | 48 | if (window_size%%2 != 1 | window_size < 2) { 49 | flog.error("::apply_median_filtering: Error, window_size is an even or < 2. 
Please specify an odd number >= 3."); stop("::apply_median_filtering: window_size must be an odd number >= 3.") # flog.error only logs; abort so an invalid window_size never reaches the filter 50 | } 51 | 52 | half_window = (window_size - 1) / 2 53 | 54 | gene_chr_listing = infercnv_obj@gene_order[[C_CHR]] 55 | chrs = unlist(unique(gene_chr_listing)) 56 | # observations are filtered per subcluster and per chromosome 57 | if (on_observations) { 58 | for (tumor_type in names(infercnv_obj@observation_grouped_cell_indices)) { 59 | # NOTE(review): if tumor_subclusters is unset this lookup yields NULL and the group is silently skipped -- confirm intended 60 | tumor_indices_list = infercnv_obj@tumor_subclusters[["subclusters"]][[ tumor_type ]] 61 | 62 | for (tumor_indices in tumor_indices_list) { 63 | for (chr in chrs) { 64 | chr_genes_indices = which(gene_chr_listing == chr) 65 | working_data = infercnv_obj@expr.data[chr_genes_indices, tumor_indices, drop=FALSE] 66 | 67 | infercnv_obj@expr.data[chr_genes_indices, tumor_indices] = .median_filter(data=working_data, 68 | window_size=window_size, 69 | half_window=half_window) 70 | } 71 | } 72 | } 73 | } 74 | # references are filtered per reference group and per chromosome 75 | if (on_references) { 76 | for (ref_indices in infercnv_obj@reference_grouped_cell_indices) { 77 | for (chr in chrs) { 78 | chr_genes_indices = which(gene_chr_listing == chr) 79 | working_data = infercnv_obj@expr.data[chr_genes_indices, ref_indices, drop=FALSE] 80 | 81 | infercnv_obj@expr.data[chr_genes_indices, ref_indices] = .median_filter(data=working_data, 82 | window_size=window_size, 83 | half_window=half_window) 84 | } 85 | } 86 | } 87 | 88 | return(infercnv_obj) 89 | } 90 | 91 | ## Replaces every cell of `data` with the median of a 2-D neighbourhood (rows and columns) clipped at the matrix edges. 92 | .median_filter <- function(data, 93 | window_size, 94 | half_window) { 95 | 96 | xdim = dim(data)[1] 97 | ydim = dim(data)[2] 98 | results = data 99 | # NOTE(review): the bounds below reach half_window + 1 positions on each side, i.e. an effective window of window_size + 2 -- confirm intended 100 | # if (xdim >= window_size & ydim >= window_size) { 101 | for (posx in seq_len(xdim)) { 102 | posxa <- ifelse(posx <= (half_window + 1), 1, (posx - (half_window + 1))) 103 | posxb <- ifelse(posx >= (xdim - (half_window + 1)), xdim, (posx + (half_window + 1))) 104 | for (posy in seq_len(ydim)) { 105 | posya <- ifelse(posy <= (half_window + 1), 1, (posy - (half_window + 1))) 106 | posyb <- ifelse(posy >= (ydim - (half_window + 1)), ydim, (posy + (half_window + 1))) 107 | results[posx, posy] = 
median(data[posxa:posxb, posya:posyb]) 108 | } 109 | } 110 | #} 111 | 112 | return(results) 113 | } 114 | 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Subclustering 2 | 3 | Subclustering resolution is one of the primary settings that will need to be adjusted in most runs to avoid oversplitting. The tutorial below explains how it works; further details can be found on the [wiki](https://github.com/broadinstitute/infercnv/wiki/infercnv-tumor-subclusters#tumor-subclustering-by-leiden-clustering-preferred). 4 | 5 | # Documentation 6 | ### Full documentation 7 | 8 | Visit the project [wiki](https://github.com/broadinstitute/inferCNV/wiki) for InferCNV documentation. 9 | 10 | 11 | ### Infercnv video tutorial 12 | 13 | A **video** tutorial giving an overview of infercnv features and how to run an analysis can be found below **(click on the image)**: 14 | 15 | [![Tutorial: Running infercnv](http://img.youtube.com/vi/-qOcHAavZT8/0.jpg)](http://www.youtube.com/watch?v=-qOcHAavZT8 "Tutorial: Running infercnv") 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /Rstudio_helpers/Examine_and_Filter_Cells_and_Genes.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Examine and Filter Cells and Genes" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | # Read in data matrix 11 | ```{r} 12 | data = read.table("Glioblastoma_expressed_genes.txt", header=T, row.names=1) ## CHANGE TO YOUR INPUT MATRIX 13 | 14 | ``` 15 | 16 | # Examine distributions of counts of genes and cells 17 | ```{r} 18 | reads_per_cell = colSums(data) 19 | reads_per_gene = rowSums(data) 20 | genes_per_cell = colSums(data>0) 21 | cells_per_gene = rowSums(data>0) 22 | 23 | 
hist(log10(reads_per_cell+1),main='reads per cell',col='wheat') 24 | hist(log10(genes_per_cell+1), main='genes per cell', col='wheat') 25 | plot(reads_per_cell, genes_per_cell, log='xy', col='wheat') 26 | hist(log10(reads_per_gene+1),main='reads per gene',col='wheat') 27 | ``` 28 | 29 | ```{r} 30 | plot(sort(genes_per_cell), xlab='cell', log='y', main='genes per cell (ordered)') 31 | ``` 32 | # Cell filtering criteria: define min and max genes per cell 33 | 34 | ```{r} 35 | ################################################## 36 | ## ********* USER DEFINED SECTION *************** 37 | ################################################## 38 | 39 | # set upper and lower thresholds for genes per cell: 40 | MIN_GENES_PER_CELL = 350 ## user-defined setting 41 | MAX_GENES_PER_CELL = 1800 ## user-defined setting 42 | 43 | # now replot with the thresholds being shown: 44 | plot(sort(genes_per_cell), xlab='cell', log='y', main='genes per cell (ordered)') 45 | abline(h=MIN_GENES_PER_CELL, col='green') # lower threshold 46 | abline(h=MAX_GENES_PER_CELL, col='green') # upper threshold 47 | ``` 48 | 49 | 50 | # Examine percent mitochondrial read content 51 | ```{r} 52 | # define the mitochondrial genes 53 | mito_genes = grep("^mt-", rownames(data) , ignore.case=T, value=T) 54 | print(mito_genes) 55 | ``` 56 | 57 | ```{r} 58 | # compute pct mito 59 | mito_gene_read_counts = colSums(data[mito_genes,]) 60 | pct_mito = mito_gene_read_counts / reads_per_cell * 100 61 | plot(sort(pct_mito)) 62 | ``` 63 | 64 | # Decide on maximum allowed percent mitochondrial reads: 65 | ```{r} 66 | ################################################## 67 | ## ********* USER DEFINED SECTION *************** 68 | ################################################## 69 | 70 | MAX_PCT_MITO = 10 ## user-defined setting 71 | 72 | plot(sort(pct_mito)) 73 | abline(h=MAX_PCT_MITO, col='red') 74 | ``` 75 | 76 | 77 | # Cell selection as per Peter Kharchenko - the Pagoda way 78 | 79 | ```{r} 80 | df = 
data.frame(reads_per_cell=reads_per_cell, genes_per_cell=genes_per_cell) 81 | head(df) 82 | ``` 83 | 84 | 85 | # Plot genes_per_cell vs. reads_per_cell, define outliers 86 | 87 | ```{r} 88 | library(MASS) 89 | df = df[order(df$reads_per_cell),] # order by reads_per_cell 90 | plot(df, log='xy') 91 | m <- rlm(genes_per_cell~reads_per_cell,data=df) # robust linear model, not sensitive to outliers 92 | p.level = 1e-3 93 | # predict genes_per_cell based on observed reads_per_cell 94 | suppressWarnings(pb <- data.frame(predict(m, interval='prediction', 95 | level = 1-p.level, # define conf interval 96 | type="response"))) 97 | polygon(c(df$reads_per_cell, rev(df$reads_per_cell)), 98 | c(pb$lwr, rev(pb$upr)), col=adjustcolor(2,alpha=0.1), border = NA) 99 | 100 | # identify outliers as having observed genes_per_cell outside the prediction confidence interval 101 | outliers <- rownames(df)[df$genes_per_cell > pb$upr | df$genes_per_cell < pb$lwr]; 102 | points(df[outliers,],col=2,cex=0.6) 103 | ``` 104 | 105 | # Before pruning cells, let's make a backup copy of the original matrix: 106 | ```{r} 107 | data.prefiltered = data 108 | ``` 109 | 110 | # Now, let's do some pruning to remove 'bad' cells 111 | ```{r} 112 | filtered_data = data.prefiltered # just in case we re-run this block using different thresholds. 113 | 114 | ############################################################### 115 | # prune genes, require a gene to be expressed in at least 3 cells 116 | 117 | filtered_data.prefiltered = filtered_data 118 | filtered_data = filtered_data[cells_per_gene >= 3,] ## user can change this if needed. 
119 | 120 | ############################################################### 121 | # prune cells 122 | valid_cells = colnames(filtered_data) # all cells 123 | message('starting with: ', length(valid_cells), ' cells') # number starting with 124 | 125 | ## remove cells based on gene count criteria: 126 | valid_cells = valid_cells[genes_per_cell >= MIN_GENES_PER_CELL & genes_per_cell <= MAX_GENES_PER_CELL] # set values based on your evaluation above 127 | message('after filtering low and high gene count outliers: ', length(valid_cells), ' cells') # number after filtering based on gene count thresholds 128 | 129 | ## remove cells having excessive mito read content 130 | valid_cells = valid_cells[valid_cells %in% names(pct_mito)[pct_mito <= MAX_PCT_MITO]] 131 | message('after removing high-mito cells: ', length(valid_cells), ' cells') # number remaining after high-mito cells removed 132 | 133 | ## remove cells identified as outliers via the Kharchenko method 134 | valid_cells = valid_cells[ ! valid_cells %in% outliers] 135 | message('after removing final outliers: ', length(valid_cells), ' cells') # number surviving outlier detection 136 | 137 | ## update the count matrix to contain only the valid cells 138 | filtered_data = filtered_data[,valid_cells] 139 | 140 | write.table(filtered_data, file="filtered_data.counts.matrix", quote=F, sep="\t") 141 | ``` 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /WDL/infercnv.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow infercnv { 4 | input { 5 | File raw_counts_matrix # the matrix of genes (rows) vs. cells (columns) 6 | File gene_order_file # data file containing the positions of each gene along each chromosome in the genome 7 | File annotations_file # a description of the cells, indicating the cell type classifications. 
8 | String additional_args = "" 9 | Int cpu = 1 10 | String memory = "12G" 11 | String docker = "trinityctat/infercnv:1.11.1" 12 | Int preemptible = 2 13 | Int extra_disk_space = 10 14 | } 15 | 16 | call run_infercnv { 17 | input: 18 | raw_counts_matrix = raw_counts_matrix, 19 | gene_order_file = gene_order_file, 20 | annotations_file = annotations_file, 21 | additional_args = additional_args, 22 | cpu = cpu, 23 | memory = memory, 24 | extra_disk_space = extra_disk_space, 25 | docker = docker, 26 | preemptible = preemptible 27 | } 28 | 29 | output { 30 | Array[File] infercnv_figures = run_infercnv.infercnv_figures # PNG figures globbed from the infercnv/ output directory 31 | Array[File] infercnv_outputs = run_infercnv.infercnv_outputs # text/dat result tables globbed from the infercnv/ output directory 32 | File infercnv_full_outputs = run_infercnv.infercnv_full_outputs # tarball of the whole infercnv/ output directory 33 | } 34 | } 35 | 36 | task run_infercnv { 37 | input { 38 | File raw_counts_matrix 39 | File gene_order_file 40 | File annotations_file 41 | String memory 42 | Int cpu 43 | String docker 44 | Int preemptible 45 | String additional_args 46 | Int extra_disk_space 47 | } 48 | 49 | command { 50 | set -e 51 | 52 | mkdir infercnv 53 | 54 | inferCNV.R \ 55 | --raw_counts_matrix ${raw_counts_matrix} \ 56 | --annotations_file ${annotations_file} \ 57 | --gene_order_file ${gene_order_file} \ 58 | --num_threads ${cpu} \ 59 | --out_dir infercnv \ 60 | ${additional_args} 61 | 62 | tar -cvzf infercnv_full_outputs.tar.gz infercnv 63 | } 64 | 65 | output { 66 | File infercnv_full_outputs = "infercnv_full_outputs.tar.gz" 67 | Array[File] infercnv_figures = glob("infercnv/*.png") 68 | Array[File] infercnv_outputs = glob("infercnv/infercnv.*.txt infercnv/top_*.txt infercnv/*pred_cnv_*.dat") 69 | } 70 | 71 | runtime { 72 | docker: docker 73 | memory: memory 74 | bootDiskSizeGb: 12 75 | disks: "local-disk " + ceil(size(raw_counts_matrix, "GB")*2 + extra_disk_space) + " HDD" 76 | cpu: cpu 77 | preemptible: preemptible 78 | } 79 | } 80 | 81 | -------------------------------------------------------------------------------- /data/HMM_states.rda: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/HMM_states.rda -------------------------------------------------------------------------------- /data/infercnv_annots_example.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/infercnv_annots_example.rda -------------------------------------------------------------------------------- /data/infercnv_data_example.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/infercnv_data_example.rda -------------------------------------------------------------------------------- /data/infercnv_genes_example.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/infercnv_genes_example.rda -------------------------------------------------------------------------------- /data/infercnv_object_example.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/infercnv_object_example.rda -------------------------------------------------------------------------------- /data/mcmc_obj.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/data/mcmc_obj.rda -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # 
Docker file for inferCNV 2 | FROM bioconductor/bioconductor_docker:devel 3 | 4 | LABEL org.label-schema.license="BSD-3-Clause" \ 5 | org.label-schema.vendor="Broad Institute" \ 6 | maintainer="Christophe Georgescu " 7 | 8 | RUN apt-get update && apt-get -y install curl libssl-dev libcurl4-openssl-dev \ 9 | libxml2-dev git python3 jags \ 10 | r-cran-rjags time && \ 11 | apt-get clean && rm -rf /var/tmp/* \ 12 | /tmp/* /var/lib/apt/lists/* 13 | 14 | # Install R and Bioconductor packages 15 | RUN echo "options(repos = c(CRAN = 'https://cran.rstudio.com'))" >.Rprofile 16 | RUN R -e "BiocManager::install('infercnv')" 17 | #RUN R -e "install.packages(c('cluster', 'Seurat', 'parallelDist', 'optparse'), repos = 'http://cran.us.r-project.org')" 18 | RUN R -e "install.packages(c('cluster', 'Seurat', 'optparse', 'igraph', 'reshape2'), repos = 'http://cran.us.r-project.org')" 19 | #RUN R -e "install.packages('phyclust', repos = 'http://cran.us.r-project.org')" 20 | 21 | # RUN pip3 install numpy igraph pandas leidenalg 22 | # ENV RETICULATE_PYTHON=/usr/bin/python3 23 | 24 | # Checkout and install infercnv 25 | # update to version bump commit 26 | RUN git clone https://github.com/broadinstitute/infercnv && cd infercnv && \ 27 | git checkout master && git checkout 1b46b48303bac4a882bcb758e78fcf7f832fdefb && \ 28 | R CMD INSTALL . 
29 | 30 | ENV PATH=${PATH}:/infercnv/scripts 31 | 32 | CMD inferCNV.R --help 33 | 34 | -------------------------------------------------------------------------------- /example/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | all: 4 | ./run.R 5 | 6 | clean: 7 | rm -rf ./output_dir 8 | 9 | 10 | 11 | debug: 12 | ../scripts/plot_hspike.by_num_cells.R --infercnv_obj output_dir/run.final.infercnv_obj 13 | ../scripts/plot_hspike.R --infercnv_obj output_dir/run.final.infercnv_obj 14 | ../scripts/run_HMM_on_hspike.R --infercnv_obj output_dir/run.final.infercnv_obj 15 | ../scripts/plot_hspike_vs_sample_chrs.R --infercnv_obj output_dir/preliminary.infercnv_obj 16 | 17 | 18 | 19 | 20 | i3: 21 | cat run.R | sed s/HMM=TRUE/HMM=TRUE,HMM_type=\'i3\'/ > run.i3.R 22 | Rscript ./run.i3.R 23 | 24 | Bayes: 25 | cat run.R | sed s/HMM=TRUE/HMM=TRUE,BayesMaxPNormal=0.35/ > run.Bayes.R 26 | Rscript ./run.Bayes.R 27 | 28 | -------------------------------------------------------------------------------- /example/README.txt: -------------------------------------------------------------------------------- 1 | This example uses an abridged version of the gencode annotations. You do not want to use that file with your own data. It's abridged here only to reduce space in R packaging. 
2 | 3 | The complete gencode annotation file can be found here: 4 | https://github.com/broadinstitute/inferCNV_examples/tree/master/__gene_position_data 5 | 6 | -------------------------------------------------------------------------------- /example/__alt_exec_modes/run.no_spike.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library("infercnv") 4 | 5 | # create the infercnv object from the example data shipped in the package's extdata directory (same inputs as example/run.R) 6 | infercnv_obj = CreateInfercnvObject(raw_counts_matrix=system.file("extdata", "oligodendroglioma_expression_downsampled.counts.matrix.gz", package = "infercnv"), 7 | annotations_file=system.file("extdata", "oligodendroglioma_annotations_downsampled.txt", package = "infercnv"), 8 | delim="\t", 9 | gene_order_file=system.file("extdata", "gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt", package = "infercnv"), 10 | ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)")) 11 | 12 | out_dir="output_dir.no_spike" 13 | # perform infercnv operations to reveal cnv signal 14 | infercnv_obj = infercnv::run(infercnv_obj, 15 | cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics 16 | out_dir=out_dir, 17 | cluster_by_groups=TRUE, 18 | plot_steps=FALSE, 19 | include.spike=FALSE # used for final scaling to fit range (0,2) centered at 1. 
20 | ) 21 | 22 | -------------------------------------------------------------------------------- /example/__alt_exec_modes/run.set_num_ref_groups.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library("infercnv") 4 | 5 | # create the infercnv object from the example data shipped in the package's extdata directory (same inputs as example/run.R) 6 | infercnv_obj = CreateInfercnvObject(raw_counts_matrix=system.file("extdata", "oligodendroglioma_expression_downsampled.counts.matrix.gz", package = "infercnv"), 7 | annotations_file=system.file("extdata", "oligodendroglioma_annotations_downsampled.txt", package = "infercnv"), 8 | delim="\t", 9 | gene_order_file=system.file("extdata", "gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt", package = "infercnv"), 10 | ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)")) 11 | 12 | 13 | out_dir="output_dir_ref_grps_4" 14 | 15 | # perform infercnv operations to reveal cnv signal 16 | infercnv_obj = infercnv::run(infercnv_obj, 17 | cutoff=1, 18 | out_dir=out_dir, 19 | cluster_by_groups=TRUE, 20 | plot_steps=TRUE, 21 | num_ref_groups=4 22 | ) 23 | 24 | # generate final plot 25 | plot_cnv(infercnv_obj, 26 | out_dir=out_dir, 27 | cluster_by_groups=TRUE, 28 | color_safe_pal=FALSE, 29 | x.center=1, 30 | x.range=c(0.6,1.4), 31 | title="inferCNV", 32 | obs_title="Observations (Cells)", 33 | ref_title="References (Cells)", 34 | output_filename="infercnv") 35 | 36 | 37 | -------------------------------------------------------------------------------- /example/__alt_exec_modes/run.use_zscores.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library("infercnv") 4 | 5 | # create the infercnv object from the example data shipped in the package's extdata directory (same inputs as example/run.R) 6 | infercnv_obj = CreateInfercnvObject(raw_counts_matrix=system.file("extdata", "oligodendroglioma_expression_downsampled.counts.matrix.gz", package = "infercnv"), 7 | annotations_file=system.file("extdata", "oligodendroglioma_annotations_downsampled.txt", package = "infercnv"), 8 | delim="\t", 9 | gene_order_file=system.file("extdata", "gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt", package = "infercnv"), 10 | ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)")) 11 | 12 | 13 | out_dir="output_dir_use_zscores" 14 | 15 | # perform infercnv operations to reveal cnv 
signal 16 | infercnv_obj = infercnv::run(infercnv_obj, 17 | cutoff=1, 18 | out_dir=out_dir, 19 | cluster_by_groups=T, 20 | plot_steps=T, 21 | use_zscores=T, 22 | include.spike=T 23 | ) 24 | 25 | -------------------------------------------------------------------------------- /example/run.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(error = function() traceback(2)) 4 | 5 | library("infercnv") 6 | 7 | # create the infercnv object 8 | infercnv_obj = CreateInfercnvObject(raw_counts_matrix=system.file("extdata", "oligodendroglioma_expression_downsampled.counts.matrix.gz", package = "infercnv"), 9 | annotations_file=system.file("extdata", "oligodendroglioma_annotations_downsampled.txt", package = "infercnv"), 10 | delim="\t", 11 | gene_order_file=system.file("extdata", "gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt", package = "infercnv"), 12 | ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)")) 13 | 14 | out_dir="output_dir" 15 | # perform infercnv operations to reveal cnv signal 16 | infercnv_obj = infercnv::run(infercnv_obj, 17 | cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics 18 | out_dir=out_dir, 19 | cluster_by_groups=TRUE, 20 | analysis_mode="subclusters", 21 | plot_steps=FALSE, 22 | denoise=TRUE, 23 | sd_amplifier=2, 24 | HMM=TRUE 25 | ) 26 | 27 | -------------------------------------------------------------------------------- /example/run_memory_profiling_per_step.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in `seq 1 21`; do 4 | gtime -v Rscript run_test.R $i > profiling/up_to_step_${i}_1.log 2> profiling/up_to_step_${i}_1.times 5 | gtime -v Rscript run_test.R $i > profiling/up_to_step_${i}_2.log 2> profiling/up_to_step_${i}_2.times 6 | done 7 | -------------------------------------------------------------------------------- 
/example/run_test.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Runs infercnv::run() on a saved infercnv object, stopping at the step number given as the single command-line argument. 3 | args = commandArgs(trailingOnly=TRUE) 4 | if (length(args) != 1) { 5 | stop("usage: run_test.R UP_TO_STEP (exactly one argument: the last infercnv step to run)") 6 | } 7 | 8 | as.numeric(args[1]) # result is not stored here; the conversion is repeated in the run() call below 9 | 10 | 11 | options(error = function() traceback(2)) 12 | 13 | library("infercnv") 14 | 15 | # create the infercnv object 16 | # infercnv_obj = CreateInfercnvObject(raw_counts_matrix=system.file("extdata", "oligodendroglioma_expression_downsampled.counts.matrix.gz", package = "infercnv"), 17 | # annotations_file=system.file("extdata", "oligodendroglioma_annotations_downsampled.txt", package = "infercnv"), 18 | # delim="\t", 19 | # gene_order_file=system.file("extdata", "gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt", package = "infercnv"), 20 | # ref_group_names=NULL) 21 | 22 | infercnv_obj <- readRDS("default_input_infercnv_object.rds") 23 | 24 | out_dir="output_dir_memory_test" 25 | # perform infercnv operations to reveal cnv signal 26 | infercnv_obj = infercnv::run(infercnv_obj, 27 | cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics 28 | out_dir=out_dir, 29 | cluster_by_groups=TRUE, 30 | plot_steps=FALSE, 31 | no_plot=TRUE, 32 | denoise=TRUE, 33 | debug=TRUE, 34 | HMM=TRUE, 35 | up_to_step=as.numeric(args[1])) 36 | 37 | -------------------------------------------------------------------------------- /inst/BUGS_Mixture_Model: -------------------------------------------------------------------------------- 1 | model { 2 | ## Likelihood 3 | ## Single cell 4 | for( j in 1:C ) { # for each cell j 5 | for ( i in 1:G ) { # for each gene i 6 | ## Likelihood 7 | ## generative distribution of the observed data 8 | gexp[i, j] ~ dnorm(mu.1[j], tau.1[j]) 9 | } 10 | 11 | ## mu and tau are Fixed Effects dependent on the cluster/state assignment 12 | ## result in cluster specific means and precision 13 | ## gamma is group specific Random Effect 14 | 15 | mu.1[j] <- mu[1] * 
(equals(epsilon[j], 1 )) + 16 | mu[2] * (equals(epsilon[j], 2 )) + 17 | mu[3] * (equals(epsilon[j], 3 )) + 18 | mu[4] * (equals(epsilon[j], 4 )) + 19 | mu[5] * (equals(epsilon[j], 5 )) + 20 | mu[6] * (equals(epsilon[j], 6 )) 21 | 22 | tau.1[j] <- sig[1] * (equals( epsilon[j], 1 )) + 23 | sig[2] * (equals( epsilon[j], 2 )) + 24 | sig[3] * (equals( epsilon[j], 3 )) + 25 | sig[4] * (equals( epsilon[j], 4 )) + 26 | sig[5] * (equals( epsilon[j], 5 )) + 27 | sig[6] * (equals( epsilon[j], 6 )) 28 | 29 | # PRIOR 30 | ## Epsilons hold our cluster/state assignment 31 | ## theta are the mixture probabilities for states 32 | ## cell specific 33 | 34 | epsilon[j] ~ dcat(theta[]) 35 | } 36 | 37 | 38 | # HYPERPARAMETERS 39 | ## hyperparameter for gamma, a flat gamma distribution 40 | sigma ~ dgamma(1,1) # NOTE(review): sigma is not referenced by any other node in this model - confirm it is still needed 41 | 42 | # Dirichlet with equal probabilities for each state, equivalent to a uniform 43 | # provides the probability distribution of states 44 | # alpha can be 1 or (1/number of states) 45 | 46 | 47 | ## Hyperparameter for epsilon, 48 | ## This is the mixing property! 
49 | 50 | theta[1:6] ~ ddirich(alpha[]) 51 | 52 | # HYPERHYPERPARAMETER 53 | 54 | for(i in 1:6){ 55 | alpha[i] <- 1 56 | } 57 | } -------------------------------------------------------------------------------- /inst/BUGS_Mixture_Model_i3: -------------------------------------------------------------------------------- 1 | model { 2 | ## Likelihood 3 | ## Single cell 4 | for( j in 1:C ) { # for each cell j 5 | for ( i in 1:G ) { # for each gene i 6 | ## Likelihood 7 | ## generative distribution of the observed data 8 | gexp[i, j] ~ dnorm(mu.1[j], tau.1[j]) 9 | } 10 | 11 | ## mu and tau are Fixed Effects dependent on the cluster/state assignment 12 | ## result in cluster specific means and precision 13 | ## gamma is group specific Random Effect 14 | 15 | mu.1[j] <- mu[1] * (equals(epsilon[j], 1 )) + 16 | mu[2] * (equals(epsilon[j], 2 )) + 17 | mu[3] * (equals(epsilon[j], 3 )) 18 | 19 | tau.1[j] <- sig[1] * (equals( epsilon[j], 1 )) + 20 | sig[2] * (equals( epsilon[j], 2 )) + 21 | sig[3] * (equals( epsilon[j], 3 )) 22 | 23 | # PRIOR 24 | ## Epsilons hold our cluster/state assignment 25 | ## theta are the mixture probabilities for states 26 | ## cell specific 27 | 28 | epsilon[j] ~ dcat(theta[]) 29 | } 30 | 31 | 32 | # HYPERPARAMETERS 33 | ## hyperparameter for gamma, a flat gamma distribution 34 | sigma ~ dgamma(1,1) # NOTE(review): sigma is not referenced by any other node in this model - confirm it is still needed 35 | 36 | # Dirichlet with equal probabilities for each state, equivalent to a uniform 37 | # provides the probability distribution of states 38 | # alpha can be 1 or (1/number of states) 39 | 40 | 41 | ## Hyperparameter for epsilon, 42 | ## This is the mixing property! 
43 | 44 | theta[1:3] ~ ddirich(alpha[]) 45 | 46 | # HYPERHYPERPARAMETER 47 | 48 | for(i in 1:3){ 49 | alpha[i] <- 1 50 | } 51 | } -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite the inferCNV package in publications, please use:") 2 | 3 | citEntry(entry = "manual", 4 | title = "inferCNV of the Trinity CTAT Project.", 5 | author = personList( 6 | person("Timothy", "Tickle", email = "ttickle@broadinstitute.org", role = c("aut", "cre")), 7 | person("Itay", "Tirosh", email = "tirosh@broadinstitute.org", role = "aut"), 8 | person("Christophe", "Georgescu", email = "cgeorges@broadinstitute.org", role = "aut"), 9 | person("Maxwell", "Brown", email = "mbrown@broadinstitute.org", role = "aut"), 10 | person("Brian", "Haas", email = "bhaas@broadinstitute.org", role = "aut") 11 | ), 12 | organization = "Klarman Cell Observatory, Broad Institute of MIT and Harvard", 13 | address = "Cambridge, MA, USA", 14 | year = 2019, 15 | url = "https://github.com/broadinstitute/inferCNV", 16 | textVersion = "inferCNV of the Trinity CTAT Project. 
https://github.com/broadinstitute/inferCNV" 17 | ) 18 | 19 | #citEntry(entry = "article", 20 | # title = "", 21 | # author = personList( 22 | # person(), 23 | # person() 24 | # ), 25 | # journal = "", 26 | # year = "2018", 27 | # volume = "", 28 | # pages = "", 29 | # textVersion = paste("authors", "title", "journal") 30 | #) 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /inst/extdata/oligodendroglioma_annotations_downsampled.txt: -------------------------------------------------------------------------------- 1 | MGH54_P2_C12 Microglia/Macrophage 2 | MGH36_P6_F03 Microglia/Macrophage 3 | MGH53_P4_H08 Microglia/Macrophage 4 | MGH53_P2_E09 Microglia/Macrophage 5 | MGH36_P5_E12 Microglia/Macrophage 6 | MGH54_P2_H07 Microglia/Macrophage 7 | MGH36_P4_H06 Microglia/Macrophage 8 | MGH53_P1_C01 Microglia/Macrophage 9 | MGH36_P4_A10 Microglia/Macrophage 10 | MGH36_P3_D10 Microglia/Macrophage 11 | MGH54_P2_F09 Microglia/Macrophage 12 | MGH36_P7_H06 Microglia/Macrophage 13 | MGH54_P2_H03 Microglia/Macrophage 14 | MGH36_P8_A02 Microglia/Macrophage 15 | MGH53_P2_C08 Microglia/Macrophage 16 | MGH53_P2_A07 Microglia/Macrophage 17 | MGH53_P1_F10 Microglia/Macrophage 18 | MGH36_P3_H06 Microglia/Macrophage 19 | MGH54_P2_F03 Microglia/Macrophage 20 | MGH54_P16_F12 Oligodendrocytes (non-malignant) 21 | MGH54_P12_C10 Oligodendrocytes (non-malignant) 22 | MGH54_P11_C11 Oligodendrocytes (non-malignant) 23 | MGH54_P15_D06 Oligodendrocytes (non-malignant) 24 | MGH54_P16_A03 Oligodendrocytes (non-malignant) 25 | MGH53_P7_B09 Oligodendrocytes (non-malignant) 26 | MGH54_P10_G04 Oligodendrocytes (non-malignant) 27 | MGH53_P2_A02 Oligodendrocytes (non-malignant) 28 | MGH53_P7_F07 Oligodendrocytes (non-malignant) 29 | MGH53_P5_G02 Oligodendrocytes (non-malignant) 30 | MGH53_P11_H03 Oligodendrocytes (non-malignant) 31 | MGH53_P1_A10 Oligodendrocytes (non-malignant) 32 | MGH53_P5_H09 Oligodendrocytes (non-malignant) 33 | MGH53_P11_E03 
Oligodendrocytes (non-malignant) 34 | MGH53_P10_F11 Oligodendrocytes (non-malignant) 35 | MGH53_P1_D07 Oligodendrocytes (non-malignant) 36 | MGH53_P2_G04 Oligodendrocytes (non-malignant) 37 | MGH53_P2_G09 Oligodendrocytes (non-malignant) 38 | MGH53_P5_F04 Oligodendrocytes (non-malignant) 39 | MGH53_P11_F08 Oligodendrocytes (non-malignant) 40 | MGH53_P8_F03 Oligodendrocytes (non-malignant) 41 | MGH53_P6_B11 Oligodendrocytes (non-malignant) 42 | MGH53_P6_H06 Oligodendrocytes (non-malignant) 43 | MGH36_P1_B02 malignant_MGH36 44 | MGH36_P1_H10 malignant_MGH36 45 | MGH36_P3_A09 malignant_MGH36 46 | MGH36_P3_B02 malignant_MGH36 47 | MGH36_P3_C04 malignant_MGH36 48 | MGH36_P3_E06 malignant_MGH36 49 | MGH36_P4_B09 malignant_MGH36 50 | MGH36_P4_D11 malignant_MGH36 51 | MGH36_P4_G03 malignant_MGH36 52 | MGH36_P6_C04 malignant_MGH36 53 | MGH36_P6_G08 malignant_MGH36 54 | MGH36_P7_B04 malignant_MGH36 55 | MGH36_P7_D03 malignant_MGH36 56 | MGH36_P7_F04 malignant_MGH36 57 | MGH36_P7_G04 malignant_MGH36 58 | MGH36_P5_B08 malignant_MGH36 59 | MGH36_P5_F05 malignant_MGH36 60 | MGH36_P5_F11 malignant_MGH36 61 | MGH36_P5_H05 malignant_MGH36 62 | MGH36_P10_B08 malignant_MGH36 63 | MGH36_P10_C10 malignant_MGH36 64 | MGH36_P10_E07 malignant_MGH36 65 | MGH36_P10_F09 malignant_MGH36 66 | MGH36_P8_E05 malignant_MGH36 67 | MGH36_P8_H09 malignant_MGH36 68 | MGH36_P9_B01 malignant_MGH36 69 | MGH36_P9_B11 malignant_MGH36 70 | MGH36_P9_H03 malignant_MGH36 71 | MGH36_P2_A08 malignant_MGH36 72 | MGH36_P2_C02 malignant_MGH36 73 | MGH36_P2_G01 malignant_MGH36 74 | MGH36_P2_G02 malignant_MGH36 75 | MGH36_P2_H06 malignant_MGH36 76 | MGH53_P5_A08 malignant_MGH53 77 | MGH53_P5_D02 malignant_MGH53 78 | MGH53_P6_F03 malignant_MGH53 79 | MGH53_P6_H04 malignant_MGH53 80 | MGH53_P7_B10 malignant_MGH53 81 | MGH53_P7_C03 malignant_MGH53 82 | MGH53_P7_E02 malignant_MGH53 83 | MGH53_P7_G11 malignant_MGH53 84 | MGH53_P7_H03 malignant_MGH53 85 | MGH53_P8_A07 malignant_MGH53 86 | MGH53_P8_C11 malignant_MGH53 87 | 
MGH53_P8_E05 malignant_MGH53 88 | MGH53_P8_E10 malignant_MGH53 89 | MGH53_P8_H04 malignant_MGH53 90 | MGH53_P1_B04 malignant_MGH53 91 | MGH53_P12_A01 malignant_MGH53 92 | MGH53_P12_B09 malignant_MGH53 93 | MGH53_P12_C02 malignant_MGH53 94 | MGH53_P12_C09 malignant_MGH53 95 | MGH53_P12_D12 malignant_MGH53 96 | MGH53_P12_E03 malignant_MGH53 97 | MGH53_P10_B02 malignant_MGH53 98 | MGH53_P10_C09 malignant_MGH53 99 | MGH53_P10_E09 malignant_MGH53 100 | MGH53_P10_H08 malignant_MGH53 101 | MGH53_P11_A03 malignant_MGH53 102 | MGH53_P11_B02 malignant_MGH53 103 | MGH53_P11_B11 malignant_MGH53 104 | MGH53_P11_F12 malignant_MGH53 105 | MGH53_P11_H12 malignant_MGH53 106 | MGH53_P9_A09 malignant_MGH53 107 | MGH53_P9_C12 malignant_MGH53 108 | MGH53_P4_C03 malignant_MGH53 109 | MGH53_P4_F01 malignant_MGH53 110 | 97_P3_G07 malignant_97 111 | 97_P3_E04 malignant_97 112 | 97_P3_D10 malignant_97 113 | 97_P3_E01 malignant_97 114 | 97_P3_E03 malignant_97 115 | 97_P3_B10 malignant_97 116 | 97_P3_B04 malignant_97 117 | 97_P3_B01 malignant_97 118 | 97_P3_B03 malignant_97 119 | 97_P3_D01 malignant_97 120 | 97_P3_D04 malignant_97 121 | 97_P3_D12 malignant_97 122 | 97_P3_F12 malignant_97 123 | 97_P3_E12 malignant_97 124 | 97_P5_D09 malignant_97 125 | 97_P6_H01 malignant_97 126 | 97_P5_C10 malignant_97 127 | 97_P6_E07 malignant_97 128 | 97_P5_D02 malignant_97 129 | 97_P6_G10 malignant_97 130 | 97_P5_G05 malignant_97 131 | 97_P6_B09 malignant_97 132 | 97_P5_H08 malignant_97 133 | 97_P5_F04 malignant_97 134 | 97_P5_D01 malignant_97 135 | 97_P6_F05 malignant_97 136 | 97_P6_A06 malignant_97 137 | 97_P5_A07 malignant_97 138 | 97_P6_E01 malignant_97 139 | 97_P6_D09 malignant_97 140 | 97_P5_G06 malignant_97 141 | 97_P5_E12 malignant_97 142 | 97_P6_A07 malignant_97 143 | 97_P6_G12 malignant_97 144 | 97_P6_H06 malignant_97 145 | 93_P3_B02 malignant_93 146 | 93_P3_G05 malignant_93 147 | 93_P3_H04 malignant_93 148 | 93_P3_A10 malignant_93 149 | 93_P3_C04 malignant_93 150 | 93_P3_D07 malignant_93 151 | 
93_P3_G07 malignant_93 152 | 93_P3_E09 malignant_93 153 | 93_P3_G11 malignant_93 154 | 93_P3_A11 malignant_93 155 | 93_P6_H11 malignant_93 156 | 93_P5_H06 malignant_93 157 | 93_P5_C12 malignant_93 158 | 93_P6_A02 malignant_93 159 | 93_P5_D07 malignant_93 160 | 93_P6_C07 malignant_93 161 | 93_P9_C04 malignant_93 162 | 93_P9_E04 malignant_93 163 | 93_P9_H01 malignant_93 164 | 93_P8_B06 malignant_93 165 | 93_P10_E05 malignant_93 166 | 93_P9_B10 malignant_93 167 | 93_P8_G11 malignant_93 168 | 93_P9_F02 malignant_93 169 | 93_P10_F03 malignant_93 170 | 93_P9_G11 malignant_93 171 | 93_P8_E09 malignant_93 172 | 93_P8_C11 malignant_93 173 | 93_P9_A03 malignant_93 174 | 93_P10_G11 malignant_93 175 | 93_P9_B11 malignant_93 176 | 93_P9_D06 malignant_93 177 | 93_P8_B02 malignant_93 178 | 93_P8_C09 malignant_93 179 | 93_P9_H03 malignant_93 180 | 93_P10_D04 malignant_93 181 | 93_P8_G09 malignant_93 182 | 93_P10_B10 malignant_93 183 | 93_P9_C07 malignant_93 184 | 93_P8_A12 malignant_93 185 | -------------------------------------------------------------------------------- /inst/extdata/oligodendroglioma_expression_downsampled.counts.matrix.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/infercnv/624feae9727dff74926aecd0d8945a21d61b572b/inst/extdata/oligodendroglioma_expression_downsampled.counts.matrix.gz -------------------------------------------------------------------------------- /inst/script/README.txt: -------------------------------------------------------------------------------- 1 | This example uses an abridged version of the gencode annotations. You do not want to use that file with your own data. It's abridged here only to reduce space in R packaging. 
2 | 3 | The complete gencode annotation file can be found here: 4 | https://github.com/broadinstitute/inferCNV_examples/tree/master/__gene_position_data 5 | 6 | -------------------------------------------------------------------------------- /man/CreateInfercnvObject.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inferCNV.R 3 | \name{CreateInfercnvObject} 4 | \alias{CreateInfercnvObject} 5 | \title{CreateInfercnvObject} 6 | \usage{ 7 | CreateInfercnvObject( 8 | raw_counts_matrix, 9 | gene_order_file, 10 | annotations_file, 11 | ref_group_names, 12 | delim = "\\t", 13 | max_cells_per_group = NULL, 14 | min_max_counts_per_cell = c(100, +Inf), 15 | chr_exclude = c("chrX", "chrY", "chrM") 16 | ) 17 | } 18 | \arguments{ 19 | \item{raw_counts_matrix}{the matrix of genes (rows) vs. cells (columns) containing the raw counts 20 | If a filename is given, it'll be read via read.table() 21 | otherwise, if matrix or Matrix, will use the data directly.} 22 | 23 | \item{gene_order_file}{data file containing the positions of each gene along each chromosome in the genome.} 24 | 25 | \item{annotations_file}{a description of the cells, indicating the cell type classifications} 26 | 27 | \item{ref_group_names}{a vector containing the classifications of the reference (normal) cells to use for infering cnv} 28 | 29 | \item{delim}{delimiter used in the input files} 30 | 31 | \item{max_cells_per_group}{maximun number of cells to use per group. Default=NULL, using all cells defined in the annotations_file. This option is useful for randomly subsetting the existing data for a quicker preview run, such as using 50 cells per group instead of hundreds.} 32 | 33 | \item{min_max_counts_per_cell}{minimum and maximum counts allowed per cell. Any cells outside this range will be removed from the counts matrix. default=(100, +Inf) and uses all cells. 
If used, should be set as c(min_counts, max_counts)} 34 | 35 | \item{chr_exclude}{list of chromosomes in the reference genome annotations that should be excluded from analysis. Default = c('chrX', 'chrY', 'chrM')} 36 | } 37 | \value{ 38 | infercnv 39 | } 40 | \description{ 41 | Creation of an infercnv object. This requires the following inputs: 42 | A more detailed description of each input is provided below: 43 | 44 | The raw_counts_matrix: 45 | 46 | MGH54_P16_F12 MGH53_P5_C12 MGH54_P12_C10 MGH54_P16_F02 MGH54_P11_C11 ... 47 | DDX11L1 0.0000000 0.000000 0.000000 0.000000 0.0000000 48 | WASH7P 0.0000000 2.231939 7.186235 5.284944 0.9650009 49 | FAM138A 0.1709991 0.000000 0.000000 0.000000 0.0000000 50 | OR4F5 0.0000000 0.000000 0.000000 0.000000 0.0000000 51 | OR4F29 0.0000000 0.000000 0.000000 0.000000 0.0000000 52 | ... 53 | 54 | The gene_order_file, contains chromosome, start, and stop position for each gene, tab-delimited: 55 | 56 | chr start stop 57 | DDX11L1 chr1 11869 14412 58 | WASH7P chr1 14363 29806 59 | FAM138A chr1 34554 36081 60 | OR4F5 chr1 69091 70008 61 | OR4F29 chr1 367640 368634 62 | OR4F16 chr1 621059 622053 63 | ... 64 | 65 | The annotations_file, containing the cell name and the cell type classification, tab-delimited. 66 | 67 | V1 V2 68 | 1 MGH54_P2_C12 Microglia/Macrophage 69 | 2 MGH36_P6_F03 Microglia/Macrophage 70 | 3 MGH53_P4_H08 Microglia/Macrophage 71 | 4 MGH53_P2_E09 Microglia/Macrophage 72 | 5 MGH36_P5_E12 Oligodendrocytes (non-malignant) 73 | 6 MGH54_P2_H07 Oligodendrocytes (non-malignant) 74 | ... 75 | 179 93_P9_H03 malignant 76 | 180 93_P10_D04 malignant 77 | 181 93_P8_G09 malignant 78 | 182 93_P10_B10 malignant 79 | 183 93_P9_C07 malignant 80 | 184 93_P8_A12 malignant 81 | ... 
82 | 83 | 84 | and the ref_group_names vector might look like so: c("Microglia/Macrophage","Oligodendrocytes (non-malignant)") 85 | } 86 | \examples{ 87 | data(infercnv_data_example) 88 | data(infercnv_annots_example) 89 | data(infercnv_genes_example) 90 | 91 | infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 92 | gene_order_file=infercnv_genes_example, 93 | annotations_file=infercnv_annots_example, 94 | ref_group_names=c("normal")) 95 | 96 | } 97 | -------------------------------------------------------------------------------- /man/HMM_states.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{HMM_states} 5 | \alias{HMM_states} 6 | \title{infercnv object result of the processing of run() in the HMM example, to be used for other examples.} 7 | \format{ 8 | An infercnv object containing HMM predictions 9 | } 10 | \usage{ 11 | HMM_states 12 | } 13 | \description{ 14 | infercnv object result of the processing of run() in the HMM example, to be used for other examples. 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/MCMC_inferCNV-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inferCNV_BayesNet.R 3 | \docType{class} 4 | \name{MCMC_inferCNV-class} 5 | \alias{MCMC_inferCNV-class} 6 | \alias{MCMC_inferCNV} 7 | \title{MCMC_inferCNV class} 8 | \description{ 9 | Uses Markov Chain Monte Carlo (MCMC) and Gibbs sampling to estimate the posterior 10 | probability of being in one of six Copy Number Variation states (states: 0, 0.5, 1, 1.5, 2, 3) for CNV's identified by 11 | inferCNV's HMM. 
Posterior probabilities are found for the entire CNV cluster and each individual 12 | cell line in the CNV. 13 | } 14 | \section{Slots}{ 15 | 16 | \describe{ 17 | \item{\code{bugs_model}}{BUGS model.} 18 | 19 | \item{\code{sig}}{fitted values for cell lines, 1/standard deviation to be used for determining the distribution of each cell line} 20 | 21 | \item{\code{mu}}{Mean values to be used for determining the distribution of each cell line} 22 | 23 | \item{\code{group_id}}{ID's given to the cell clusters.} 24 | 25 | \item{\code{cell_gene}}{List containing the Cells and Genes that make up each CNV.} 26 | 27 | \item{\code{cnv_probabilities}}{Probabilities of each CNV belonging to a particular state from 0 (least likely)to 1 (most likely).} 28 | 29 | \item{\code{cell_probabilities}}{Probabilities of each cell being in a particular state, from 0 (least likely)to 1 (most likely).} 30 | 31 | \item{\code{args}}{Input arguments given by the user} 32 | 33 | \item{\code{cnv_regions}}{ID for each CNV found by the HMM} 34 | }} 35 | 36 | \keyword{classes} 37 | -------------------------------------------------------------------------------- /man/add_to_seurat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/seurat_interaction.R 3 | \name{add_to_seurat} 4 | \alias{add_to_seurat} 5 | \title{add_to_seurat()} 6 | \usage{ 7 | add_to_seurat( 8 | seurat_obj = NULL, 9 | assay_name = "RNA", 10 | infercnv_output_path, 11 | top_n = 10, 12 | bp_tolerance = 2e+06, 13 | column_prefix = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{seurat_obj}{Seurat object to add meta.data to (default: NULL)} 18 | 19 | \item{assay_name}{Name of the assay in the Seurat object if provided. 
(default: "RNA")} 20 | 21 | \item{infercnv_output_path}{Path to the output folder of the infercnv run to use} 22 | 23 | \item{top_n}{How many of the largest CNA (in number of genes) to get.} 24 | 25 | \item{bp_tolerance}{How many bp of tolerance to have around feature start/end positions for top_n largest CNVs.} 26 | 27 | \item{column_prefix}{String to add as a prefix to the Seurat metadata columns. Only applied to the seurat_obj, if supplied. Default is NULL} 28 | } 29 | \value{ 30 | seurat_obj 31 | } 32 | \description{ 33 | Add meta.data about CNAs to a Seurat object from an infercnv_obj 34 | } 35 | -------------------------------------------------------------------------------- /man/apply_median_filtering.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/noise_reduction.R 3 | \name{apply_median_filtering} 4 | \alias{apply_median_filtering} 5 | \title{apply_median_filtering} 6 | \usage{ 7 | apply_median_filtering( 8 | infercnv_obj, 9 | window_size = 7, 10 | on_observations = TRUE, 11 | on_references = TRUE 12 | ) 13 | } 14 | \arguments{ 15 | \item{infercnv_obj}{infercnv_object} 16 | 17 | \item{window_size}{Size of the window side centered on the data point to filter (default = 7).} 18 | 19 | \item{on_observations}{boolean (default=TRUE), run on observations data (tumor cells).} 20 | 21 | \item{on_references}{boolean (default=TRUE), run on references (normal cells).} 22 | } 23 | \value{ 24 | infercnv_obj with median filtering applied to observations 25 | } 26 | \description{ 27 | Apply a median filtering to the expression matrix within each tumor bounds 28 | } 29 | \examples{ 30 | # data(infercnv_data_example) 31 | # data(infercnv_annots_example) 32 | # data(infercnv_genes_example) 33 | 34 | # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 35 | # gene_order_file=infercnv_genes_example, 36 | # 
annotations_file=infercnv_annots_example, 37 | # ref_group_names=c("normal")) 38 | 39 | # infercnv_object_example <- infercnv::run(infercnv_object_example, 40 | # cutoff=1, 41 | # out_dir=tempfile(), 42 | # cluster_by_groups=TRUE, 43 | # denoise=TRUE, 44 | # HMM=FALSE, 45 | # num_threads=2, 46 | # no_plot=TRUE) 47 | 48 | data(infercnv_object_example) 49 | 50 | infercnv_object_example <- infercnv::apply_median_filtering(infercnv_object_example) 51 | # plot result object 52 | 53 | } 54 | -------------------------------------------------------------------------------- /man/color.palette.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inferCNV_ops.R 3 | \name{color.palette} 4 | \alias{color.palette} 5 | \title{Helper function allowing greater control over the steps in a color palette.} 6 | \usage{ 7 | color.palette(steps, between = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{steps}{Vector of colors to use in the palette} 11 | 12 | \item{between}{Steps where gradients change} 13 | 14 | \item{...}{Additional arguments of colorRampPalette} 15 | } 16 | \value{ 17 | Color palette 18 | } 19 | \description{ 20 | Helper function allowing greater control over the steps in a color palette. 
21 | Source: http://menugget.blogspot.com/2011/11/define-color-steps-for- 22 | colorramppalette.html#more 23 | } 24 | \examples{ 25 | color.palette(c("darkblue", "white", "darkred"), 26 | c(2, 2)) 27 | 28 | } 29 | -------------------------------------------------------------------------------- /man/filterHighPNormals.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inferCNV_BayesNet.R 3 | \name{filterHighPNormals} 4 | \alias{filterHighPNormals} 5 | \title{filterHighPNormals: Filter the HMM identified CNV's by the CNV's posterior probability 6 | of belonging to a normal state.} 7 | \usage{ 8 | filterHighPNormals(MCMC_inferCNV_obj, HMM_states, BayesMaxPNormal, useRaster) 9 | } 10 | \arguments{ 11 | \item{MCMC_inferCNV_obj}{MCMC inferCNV object.} 12 | 13 | \item{HMM_states}{InferCNV object with HMM states in expression data.} 14 | 15 | \item{BayesMaxPNormal}{Option to filter CNV or cell lines by some probability threshold.} 16 | 17 | \item{useRaster}{Option to use rasterization when plotting} 18 | } 19 | \value{ 20 | Returns a list of (MCMC_inferCNV_obj, HMM_states) with removed CNV's. 21 | } 22 | \description{ 23 | The following function will filter the HMM identified CNV's by the CNV's posterior 24 | probability of belonging to a normal state identified by the function inferCNVBayesNet(). Will filter 25 | CNV's based on a user desired threshold probability. Any CNV with a probability of being normal above 26 | the threshold will be removed. 
27 | } 28 | \examples{ 29 | data(mcmc_obj) 30 | 31 | mcmc_obj_hmm_states_list <- infercnv::filterHighPNormals( MCMC_inferCNV_obj = mcmc_obj, 32 | HMM_states = HMM_states, 33 | BayesMaxPNormal = 0.5) 34 | 35 | } 36 | -------------------------------------------------------------------------------- /man/inferCNVBayesNet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inferCNV_BayesNet.R 3 | \name{inferCNVBayesNet} 4 | \alias{inferCNVBayesNet} 5 | \title{inferCNVBayesNet: Run Bayesian Network Mixture Model To Obtain Posterior Probabilities For HMM Predicted States} 6 | \usage{ 7 | inferCNVBayesNet( 8 | file_dir, 9 | infercnv_obj, 10 | HMM_states, 11 | out_dir, 12 | resume_file_token, 13 | model_file = NULL, 14 | CORES = 1, 15 | postMcmcMethod = NULL, 16 | plotingProbs = TRUE, 17 | quietly = TRUE, 18 | diagnostics = FALSE, 19 | HMM_type = HMM_type, 20 | k_obs_groups = k_obs_groups, 21 | cluster_by_groups = cluster_by_groups, 22 | reassignCNVs = TRUE, 23 | no_plot = no_plot, 24 | useRaster 25 | ) 26 | } 27 | \arguments{ 28 | \item{file_dir}{Location of the directory of the inferCNV outputs.} 29 | 30 | \item{infercnv_obj}{InferCNV object.} 31 | 32 | \item{HMM_states}{InferCNV object with HMM states in expression data.} 33 | 34 | \item{out_dir}{(string) Path to where the output file should be saved to.} 35 | 36 | \item{resume_file_token}{(string) String token that contains some info on settings used to name files.} 37 | 38 | \item{model_file}{Path to the BUGS Model file.} 39 | 40 | \item{CORES}{Option to run parallel by specifying the number of cores to be used. (Default: 1)} 41 | 42 | \item{postMcmcMethod}{What actions to take after finishing the MCMC.} 43 | 44 | \item{plotingProbs}{Option for adding plots of Cell and CNV probabilities. (Default: TRUE)} 45 | 46 | \item{quietly}{Option to print descriptions along each step. 
(Default: TRUE)} 47 | 48 | \item{diagnostics}{Option to plot Diagnostic plots and tables. (Default: FALSE)} 49 | 50 | \item{HMM_type}{The type of HMM that was run, either 'i3' or 'i6'. Determines how many states were predicted by the HMM.} 51 | 52 | \item{k_obs_groups}{Number of groups in which to break the observations. (default: 1)} 53 | 54 | \item{cluster_by_groups}{If observations are defined according to groups (i.e. patients), each group 55 | of cells will be clustered separately. (default=FALSE, instead will use k_obs_groups setting)} 56 | 57 | \item{reassignCNVs}{(boolean) Given the CNV associated probability of belonging to each possible state, 58 | reassign the state assignments made by the HMM to the state that has the highest probability. (default: TRUE)} 59 | 60 | \item{no_plot}{(boolean) Option set by infercnv::run() for producing visualizations.} 61 | 62 | \item{useRaster}{Option to use rasterization when plotting} 63 | } 64 | \value{ 65 | Returns a MCMC_inferCNV_obj and posterior probability of being in one of six Copy Number Variation states 66 | (states: 0, 0.5, 1, 1.5, 2, 3) for CNV's identified by inferCNV's HMM. 67 | } 68 | \description{ 69 | Uses Markov Chain Monte Carlo (MCMC) and Gibbs sampling to estimate the posterior 70 | probability of being in one of six Copy Number Variation states (states: 0, 0.5, 1, 1.5, 2, 3) for CNV's identified by 71 | inferCNV's HMM. Posterior probabilities are found for the entire CNV cluster and each individual 72 | cell line in the CNV. 
73 | } 74 | \examples{ 75 | data(infercnv_data_example) 76 | data(infercnv_annots_example) 77 | data(infercnv_genes_example) 78 | data(HMM_states) 79 | 80 | infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 81 | gene_order_file=infercnv_genes_example, 82 | annotations_file=infercnv_annots_example, 83 | ref_group_names=c("normal")) 84 | 85 | out_dir = tempfile() 86 | infercnv_object_example <- infercnv::run(infercnv_object_example, 87 | cutoff=1, 88 | out_dir=out_dir, 89 | cluster_by_groups=TRUE, 90 | analysis_mode="samples", 91 | denoise=TRUE, 92 | HMM=TRUE, 93 | num_threads=2, 94 | no_plot=TRUE) 95 | mcmc_obj <- infercnv::inferCNVBayesNet(infercnv_obj = infercnv_object_example, 96 | HMM_states = HMM_states, 97 | file_dir = out_dir, 98 | postMcmcMethod = "removeCNV", 99 | out_dir = out_dir, 100 | resume_file_token = "HMMi6.hmm_mode-samples", 101 | quietly = TRUE, 102 | CORES = 2, 103 | plotingProbs = FALSE, 104 | diagnostics = FALSE, 105 | HMM_type = 'i6', 106 | k_obs_groups = 1, 107 | cluster_by_groups = FALSE, 108 | reassignCNVs = FALSE, 109 | no_plot = TRUE) 110 | 111 | } 112 | -------------------------------------------------------------------------------- /man/infercnv-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inferCNV.R 3 | \docType{class} 4 | \name{infercnv-class} 5 | \alias{infercnv-class} 6 | \alias{infercnv} 7 | \title{The infercnv Class} 8 | \description{ 9 | An infercnv object encapsulates the expression data and gene chromosome ordering information 10 | that is leveraged by infercnv for data exploration. The infercnv object is passed among the 11 | infercnv data processing and plotting routines. 
12 | } 13 | \details{ 14 | Slots in the infercnv object include: 15 | } 16 | \section{Slots}{ 17 | 18 | \describe{ 19 | \item{\code{expr.data}}{ the count or expression data matrix, manipulated throughout infercnv ops} 20 | 21 | \item{\code{count.data}}{ retains the original count data, but shrinks along with expr.data when genes are removed.} 22 | 23 | \item{\code{gene_order}}{ chromosomal gene order} 24 | 25 | \item{\code{reference_grouped_cell_indices}}{ mapping [['group_name']] to c(cell column indices) for reference (normal) cells} 26 | 27 | \item{\code{observation_grouped_cell_indices}}{ mapping [['group_name']] to c(cell column indices) for observation (tumor) cells} 28 | 29 | \item{\code{tumor_subclusters}}{ stores subclustering of tumors if requested} 30 | 31 | \item{\code{options}}{ stores the options relevant to the analysis in itself (in contrast with options relevant to plotting or paths)} 32 | 33 | \item{\code{.hspike}}{a hidden infercnv object populated with simulated spiked-in data} 34 | }} 35 | 36 | -------------------------------------------------------------------------------- /man/infercnv-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inferCNV.R 3 | \docType{package} 4 | \name{infercnv-package} 5 | \alias{infercnv-package} 6 | \alias{_PACKAGE} 7 | \title{infercnv: Infer Copy Number Variation from Single-Cell RNA-Seq Data} 8 | \description{ 9 | Using single-cell RNA-Seq expression to visualize CNV in cells. 10 | } 11 | \details{ 12 | The main functions you will need to use are CreateInfercnvObject() and run(infercnv_object). 13 | For additional details on running the analysis step by step, please refer to the example vignette. 
14 | } 15 | \seealso{ 16 | Useful links: 17 | \itemize{ 18 | \item \url{https://github.com/broadinstitute/inferCNV/wiki} 19 | \item Report bugs at \url{https://github.com/broadinstitute/inferCNV/issues} 20 | } 21 | 22 | } 23 | \author{ 24 | \strong{Maintainer}: Christophe Georgescu \email{cgeorges@broadinstitute.org} 25 | 26 | Authors: 27 | \itemize{ 28 | \item Timothy Tickle \email{ttickle@broadinstitute.org} 29 | \item Itay Tirosh \email{tirosh@broadinstitute.org} 30 | \item Maxwell Brown \email{mbrown@broadinstitute.org} 31 | \item Brian Haas \email{bhaas@broadinstitute.org} 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /man/infercnv_annots_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{infercnv_annots_example} 5 | \alias{infercnv_annots_example} 6 | \title{Generated classification for 10 normal cells and 10 tumor cells.} 7 | \format{ 8 | A data frame with 20 rows (cells) and 1 columns (classification) 9 | } 10 | \usage{ 11 | infercnv_annots_example 12 | } 13 | \description{ 14 | Generated classification for 10 normal cells and 10 tumor cells. 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/infercnv_data_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{infercnv_data_example} 5 | \alias{infercnv_data_example} 6 | \title{Generated SmartSeq2 expression data with 10 normal cells and 10 tumor cells. 
7 | This is only to demonstrate how to use methods, not actual data to be used in an analysis.} 8 | \format{ 9 | A data frame with 8252 rows (genes) and 20 columns (cells) 10 | } 11 | \usage{ 12 | infercnv_data_example 13 | } 14 | \description{ 15 | Generated SmartSeq2 expression data with 10 normal cells and 10 tumor cells. 16 | This is only to demonstrate how to use methods, not actual data to be used in an analysis. 17 | } 18 | \keyword{datasets} 19 | -------------------------------------------------------------------------------- /man/infercnv_genes_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{infercnv_genes_example} 5 | \alias{infercnv_genes_example} 6 | \title{Downsampled gene coordinates file from GrCh37} 7 | \format{ 8 | A data frame with 10338 rows (genes) and 3 columns (chr, start, end) 9 | } 10 | \usage{ 11 | infercnv_genes_example 12 | } 13 | \description{ 14 | Downsampled gene coordinates file from GrCh37 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/infercnv_object_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{infercnv_object_example} 5 | \alias{infercnv_object_example} 6 | \title{infercnv object result of the processing of run() in the example, to be used for other examples.} 7 | \format{ 8 | An infercnv object 9 | } 10 | \usage{ 11 | infercnv_object_example 12 | } 13 | \description{ 14 | infercnv object result of the processing of run() in the example, to be used for other examples. 
15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/mcmc_obj.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{mcmc_obj} 5 | \alias{mcmc_obj} 6 | \title{infercnv object result of the processing of inferCNVBayesNet in the example, to be used for other examples.} 7 | \format{ 8 | An infercnv object containing posterior probability of CNV states 9 | } 10 | \usage{ 11 | mcmc_obj 12 | } 13 | \description{ 14 | infercnv object result of the processing of inferCNVBayesNet in the example, to be used for other examples. 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/plot_cnv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inferCNV_heatmap.R 3 | \name{plot_cnv} 4 | \alias{plot_cnv} 5 | \title{Plot the matrix as a heatmap, with cells as rows and genes as columns, ordered according to chromosome} 6 | \usage{ 7 | plot_cnv( 8 | infercnv_obj, 9 | out_dir = ".", 10 | title = "inferCNV", 11 | obs_title = "Observations (Cells)", 12 | ref_title = "References (Cells)", 13 | cluster_by_groups = TRUE, 14 | cluster_references = TRUE, 15 | plot_chr_scale = FALSE, 16 | chr_lengths = NULL, 17 | k_obs_groups = 1, 18 | contig_cex = 1, 19 | x.center = mean(infercnv_obj@expr.data), 20 | x.range = "auto", 21 | hclust_method = "ward.D", 22 | custom_color_pal = NULL, 23 | color_safe_pal = FALSE, 24 | output_filename = "infercnv", 25 | output_format = "png", 26 | png_res = 300, 27 | dynamic_resize = 0, 28 | ref_contig = NULL, 29 | write_expr_matrix = FALSE, 30 | write_phylo = FALSE, 31 | useRaster = TRUE 32 | ) 33 | } 34 | \arguments{ 35 | \item{infercnv_obj}{infercnv object} 36 | 
37 | \item{out_dir}{Directory in which to save pdf and other output.} 38 | 39 | \item{title}{Plot title.} 40 | 41 | \item{obs_title}{Title for the observations matrix.} 42 | 43 | \item{ref_title}{Title for the reference matrix.} 44 | 45 | \item{cluster_by_groups}{Whether to cluster observations by their annotations or not. Using this ignores k_obs_groups.} 46 | 47 | \item{cluster_references}{Whether to cluster references within their annotations or not. (dendrogram not displayed)} 48 | 49 | \item{plot_chr_scale}{Whether to scale the chromosome width on the heatmap based on their actual size rather than just the number of expressed genes.} 50 | 51 | \item{chr_lengths}{A named list of chromosome lengths to use when plot_chr_scale=TRUE, or else chromosome size is assumed to be the last chromosome's stop position + 10k bp} 52 | 53 | \item{k_obs_groups}{Number of groups to break observation into.} 54 | 55 | \item{contig_cex}{Contig text size.} 56 | 57 | \item{x.center}{Value on which to center expression.} 58 | 59 | \item{x.range}{vector containing the extreme values in the heatmap (e.g. c(-3,4) )} 60 | 61 | \item{hclust_method}{Clustering method to use for hclust.} 62 | 63 | \item{custom_color_pal}{Specify a custom set of colors for the heatmap. 64 | Has to be in the shape color.palette(c("darkblue", "white", "darkred"), 65 | c(2, 2))} 66 | 67 | \item{color_safe_pal}{Logical indication of using a color blindness safe palette.} 68 | 69 | \item{output_filename}{Filename to save the figure to.} 70 | 71 | \item{output_format}{format for heatmap image file (default: 'png'), options('png', 'pdf', NA) 72 | If set to NA, will print graphics natively} 73 | 74 | \item{png_res}{Resolution for png output.} 75 | 76 | \item{dynamic_resize}{Factor (>= 0) by which to scale the dynamic resize of the observation 77 | heatmap and the overall plot based on how many cells there are. 78 | Default is 0, which disables the scaling. 
Try 1 first if you want to enable.} 79 | 80 | \item{ref_contig}{If given, will focus cluster on only genes in this contig.} 81 | 82 | \item{write_expr_matrix}{Includes writing a matrix file containing the expression data that is plotted in the heatmap.} 83 | 84 | \item{write_phylo}{Write newick strings of the dendrograms displayed on the left side of the heatmap to file.} 85 | 86 | \item{useRaster}{Whether to use rasterization for drawing heatmap. Only disable if it produces an error as it is much faster than not using it.} 87 | } 88 | \value{ 89 | A list of all relevant settings used for the plotting to be able to reuse them in another plot call while keeping consistent plotting settings, most importantly x.range. 90 | } 91 | \description{ 92 | Formats the data and sends it for plotting. 93 | } 94 | \examples{ 95 | # data(infercnv_data_example) 96 | # data(infercnv_annots_example) 97 | # data(infercnv_genes_example) 98 | 99 | # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 100 | # gene_order_file=infercnv_genes_example, 101 | # annotations_file=infercnv_annots_example, 102 | # ref_group_names=c("normal")) 103 | 104 | # infercnv_object_example <- infercnv::run(infercnv_object_example, 105 | # cutoff=1, 106 | # out_dir=tempfile(), 107 | # cluster_by_groups=TRUE, 108 | # denoise=TRUE, 109 | # HMM=FALSE, 110 | # num_threads=2, 111 | # no_plot=TRUE) 112 | 113 | data(infercnv_object_example) 114 | 115 | plot_cnv(infercnv_object_example, 116 | out_dir=tempfile(), 117 | obs_title="Observations (Cells)", 118 | ref_title="References (Cells)", 119 | cluster_by_groups=TRUE, 120 | x.center=1, 121 | x.range="auto", 122 | hclust_method='ward.D', 123 | color_safe_pal=FALSE, 124 | output_filename="infercnv", 125 | output_format="png", 126 | png_res=300, 127 | dynamic_resize=0 128 | ) 129 | 130 | } 131 | -------------------------------------------------------------------------------- /man/plot_per_group.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/infercnv_sampling.R 3 | \name{plot_per_group} 4 | \alias{plot_per_group} 5 | \title{plot_per_group} 6 | \usage{ 7 | plot_per_group( 8 | infercnv_obj, 9 | on_references = TRUE, 10 | on_observations = TRUE, 11 | sample = FALSE, 12 | n_cells = 1000, 13 | every_n = NULL, 14 | above_m = 1000, 15 | k_obs_groups = 1, 16 | base_filename = "infercnv_per_group", 17 | output_format = "png", 18 | write_expr_matrix = TRUE, 19 | save_objects = FALSE, 20 | png_res = 300, 21 | dynamic_resize = 0, 22 | useRaster = TRUE, 23 | out_dir 24 | ) 25 | } 26 | \arguments{ 27 | \item{infercnv_obj}{infercnv_object} 28 | 29 | \item{on_references}{boolean (default=TRUE), plot references (normal cells).} 30 | 31 | \item{on_observations}{boolean (default=TRUE), plot observations data (tumor cells).} 32 | 33 | \item{sample}{Whether unique groups of cells should be sampled from or not. (see other parameters for how sampling is done) (Default: FALSE)} 34 | 35 | \item{n_cells}{Number of cells that should be sampled per group if sampling is enabled (default = 1000).} 36 | 37 | \item{every_n}{Sample 1 cell every_n cells for each group that has above_m cells, if sampling is enabled. 38 | If subclusters are defined, this will make sure that at least one cell per subcluster is sampled. 39 | Requires above_m to be set to work, overriding n_cells parameter. (Default: NULL)} 40 | 41 | \item{above_m}{Sample only groups that have at least above_m cells if sampling is enabled. (default: 1000) 42 | Does not require every_n to be set.} 43 | 44 | \item{k_obs_groups}{Number of groups to break each group in with cutree (in the color bars on the left side of the plot only). (Default: 1)} 45 | 46 | \item{base_filename}{Base prefix for the output file names. 47 | Will be followed by OBS/REF to indicate the type of the group, and the group name. 
(Default: "infercnv_per_group")} 48 | 49 | \item{output_format}{Output format for the figure. Choose between "png", "pdf" and NA. NA means to only write the text outputs without generating the figure itself. (default: "png")} 50 | 51 | \item{write_expr_matrix}{Includes writing a matrix file containing the expression data that is plotted in the heatmap. (default: TRUE)} 52 | 53 | \item{save_objects}{Whether to save the infercnv objects generated for each group as RDS. (default: FALSE)} 54 | 55 | \item{png_res}{Resolution for png output. (Default: 300)} 56 | 57 | \item{dynamic_resize}{Factor (>= 0) by which to scale the dynamic resize of the observation 58 | heatmap and the overall plot based on how many cells there are. 59 | Default is 0, which disables the scaling. Try 1 first if you want to enable. (Default: 0)} 60 | 61 | \item{useRaster}{Whether to use rasterization for drawing heatmap. Only disable if it produces an error as it is much faster than not using it.} 62 | 63 | \item{out_dir}{Directory in which to save plots and other outputs.} 64 | } 65 | \value{ 66 | void 67 | } 68 | \description{ 69 | Takes an infercnv object and subdivides it into one object per group of cells 70 | to allow plotting of each group on a separate plot. If references are selected, they will appear 71 | on the observation heatmap area as it is larger. 
72 | } 73 | \examples{ 74 | # data(infercnv_data_example) 75 | # data(infercnv_annots_example) 76 | # data(infercnv_genes_example) 77 | 78 | # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 79 | # gene_order_file=infercnv_genes_example, 80 | # annotations_file=infercnv_annots_example, 81 | # ref_group_names=c("normal")) 82 | 83 | # infercnv_object_example <- infercnv::run(infercnv_object_example, 84 | # cutoff=1, 85 | # out_dir=tempfile(), 86 | # cluster_by_groups=TRUE, 87 | # denoise=TRUE, 88 | # HMM=FALSE, 89 | # num_threads=2, 90 | # no_plot=TRUE) 91 | 92 | data(infercnv_object_example) 93 | 94 | infercnv::plot_per_group(infercnv_object_example, out_dir=tempfile()) 95 | 96 | } 97 | -------------------------------------------------------------------------------- /man/plot_subclusters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inferCNV_tumor_subclusters.R 3 | \name{plot_subclusters} 4 | \alias{plot_subclusters} 5 | \title{Plot a heatmap of the data in the infercnv object with the subclusters being displayed as annotations.} 6 | \usage{ 7 | plot_subclusters( 8 | infercnv_obj, 9 | out_dir, 10 | output_filename = "subcluster_as_annotations" 11 | ) 12 | } 13 | \arguments{ 14 | \item{infercnv_obj}{infercnv object} 15 | 16 | \item{out_dir}{Directory in which to output.} 17 | 18 | \item{output_filename}{Filename to save the figure to.} 19 | } 20 | \value{ 21 | infercnv_obj the modified infercnv object that was plotted where subclusters are assigned as annotation groups 22 | } 23 | \description{ 24 | Formats the data and sends it for plotting. 
25 | } 26 | \examples{ 27 | # data(infercnv_data_example) 28 | # data(infercnv_annots_example) 29 | # data(infercnv_genes_example) 30 | 31 | # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 32 | # gene_order_file=infercnv_genes_example, 33 | # annotations_file=infercnv_annots_example, 34 | # ref_group_names=c("normal")) 35 | 36 | # infercnv_object_example <- infercnv::run(infercnv_object_example, 37 | # cutoff=1, 38 | # out_dir=tempfile(), 39 | # cluster_by_groups=TRUE, 40 | # denoise=TRUE, 41 | # HMM=FALSE, 42 | # num_threads=2, 43 | # no_plot=TRUE) 44 | 45 | data(infercnv_object_example) 46 | 47 | plot_subclusters(infercnv_object_example, 48 | out_dir=tempfile(), 49 | output_filename="subclusters_as_annotations" 50 | ) 51 | 52 | } 53 | -------------------------------------------------------------------------------- /man/sample_object.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/infercnv_sampling.R 3 | \name{sample_object} 4 | \alias{sample_object} 5 | \title{sample_object} 6 | \usage{ 7 | sample_object( 8 | infercnv_obj, 9 | n_cells = 100, 10 | every_n = NULL, 11 | above_m = NULL, 12 | on_references = TRUE, 13 | on_observations = TRUE 14 | ) 15 | } 16 | \arguments{ 17 | \item{infercnv_obj}{infercnv_object} 18 | 19 | \item{n_cells}{Number of cells that should be sampled per group (default = 100).} 20 | 21 | \item{every_n}{Sample 1 cell every_n cells for each group. If subclusters are defined, 22 | this will make sure that at least one cell per subcluster is sampled. 23 | Requires above_m to be set to work, overriding n_cells parameter.} 24 | 25 | \item{above_m}{Sample groups that have at least above_m cells. 
26 | Requires every_n to be set to work, overriding n_cells parameter} 27 | 28 | \item{on_references}{boolean (default=TRUE), sample references (normal cells).} 29 | 30 | \item{on_observations}{boolean (default=TRUE), sample observations data (tumor cells).} 31 | } 32 | \value{ 33 | sampled infercnv_obj 34 | } 35 | \description{ 36 | Apply sampling on an infercnv object to reduce the number of cells in it 37 | and allow faster plotting or have all groups take up the same height on the heatmap 38 | } 39 | \examples{ 40 | # data(infercnv_data_example) 41 | # data(infercnv_annots_example) 42 | # data(infercnv_genes_example) 43 | 44 | # infercnv_object_example <- infercnv::CreateInfercnvObject(raw_counts_matrix=infercnv_data_example, 45 | # gene_order_file=infercnv_genes_example, 46 | # annotations_file=infercnv_annots_example, 47 | # ref_group_names=c("normal")) 48 | 49 | # infercnv_object_example <- infercnv::run(infercnv_object_example, 50 | # cutoff=1, 51 | # out_dir=tempfile(), 52 | # cluster_by_groups=TRUE, 53 | # denoise=TRUE, 54 | # HMM=FALSE, 55 | # num_threads=2, 56 | # no_plot=TRUE) 57 | 58 | data(infercnv_object_example) 59 | 60 | infercnv_object_example <- infercnv::sample_object(infercnv_object_example, n_cells=5) 61 | # plot result object 62 | 63 | } 64 | -------------------------------------------------------------------------------- /man/validate_infercnv_obj.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inferCNV.R 3 | \name{validate_infercnv_obj} 4 | \alias{validate_infercnv_obj} 5 | \title{validate_infercnv_obj()} 6 | \usage{ 7 | validate_infercnv_obj(infercnv_obj) 8 | } 9 | \arguments{ 10 | \item{infercnv_obj}{infercnv_object} 11 | } 12 | \value{ 13 | none 14 | } 15 | \description{ 16 | validate an infercnv_obj 17 | ensures that order of genes in the @gene_order slot match up perfectly with the gene rows in the @expr.data 
matrix. 18 | Otherwise, throws an error and stops execution. 19 | } 20 | -------------------------------------------------------------------------------- /scripts/HB_example_to_inferCNV_obj.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(HoneyBADGER) 4 | library(infercnv) 5 | 6 | data(gexp) ## tumor cells, dim: [6082,75] 7 | data(ref) ## reference, length: 6082 8 | 9 | 10 | 11 | raw.data = cbind(gexp, data.frame('GTEX'=ref)) 12 | 13 | cell.annots = data.frame(cell=colnames(gexp), type='tumor') 14 | cell.annots = rbind(cell.annots, data.frame(cell='GTEX', type='normal')) 15 | 16 | write.table(raw.data, file="hb.example.matrix", quote=F, sep="\t") 17 | write.table(cell.annots, file='hb.example.cell_annots', quote=F, sep="\t", col.names=F, row.names=F) 18 | 19 | -------------------------------------------------------------------------------- /scripts/KS_matrix_comparison.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | set.seed(1234) 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | library(tidyverse) 8 | 9 | parser = ArgumentParser() 10 | parser$add_argument("--matrix1", required=T, nargs=1) 11 | parser$add_argument("--matrix2", required=T, nargs=1) 12 | parser$add_argument("--log", required=F, default=FALSE, action="store_true") 13 | parser$add_argument("--output", required=T, nargs=1, help="output filename pdf") 14 | 15 | args = parser$parse_args() 16 | 17 | 18 | #' learn distribution parameters: 19 | data1 = as.matrix(read.table(args$matrix1, header=T, row.names=1)) 20 | data2 = as.matrix(read.table(args$matrix2, header=T, row.names=1)) 21 | 22 | 23 | png(args$output) 24 | if (args$log) { 25 | data1 = log(data1+1) 26 | data2 = log(data2+1) 27 | } 28 | 29 | 30 | ## plotting ideas borrowed from 31 | ## https://stackoverflow.com/questions/39162178/kolmogorov-smirnov-plot-in-r-ggplot 32 | 33 | 34 | 
m1_ecdf = ecdf(data1) 35 | m2_ecdf = ecdf(data2) 36 | val_range = range(data1, data2) 37 | step = (val_range[2] - val_range[1])/100 38 | vals = seq(val_range[1], val_range[2], step) 39 | 40 | 41 | m1_cdf = m1_ecdf(vals) 42 | m2_cdf = m2_ecdf(vals) 43 | 44 | cdfs = data.frame(vals, 45 | m1_cdf, 46 | m2_cdf) 47 | 48 | ks_point = which.max(abs(cdfs$m1_cdf - cdfs$m2_cdf)) 49 | ks_point_info = cdfs[ks_point,] 50 | ##message("KS point info: ", paste(ks_point_info, collapse=', ')) 51 | 52 | cdfs = cdfs %>% gather('m1_cdf', 'm2_cdf', key='type', value='cdf') 53 | 54 | 55 | ggplot(cdfs, aes(x=vals, y=cdf)) + 56 | geom_line(aes(color=type, linetype=type)) + 57 | geom_segment(aes(x=ks_point_info$vals, 58 | y=ks_point_info$m1_cdf, 59 | xend=ks_point_info$vals, 60 | yend=ks_point_info$m2_cdf), color='magenta', size=2) + 61 | ggtitle(sprintf("%s vs. %s KS", args$matrix1, args$matrix2)) + xlab("number") + ylab("cdf") 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /scripts/KS_matrix_comparison.use_infercnv_obj.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | set.seed(1234) 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | library(tidyverse) 8 | 9 | parser = ArgumentParser() 10 | parser$add_argument("--matrix1", required=T, nargs=1) 11 | parser$add_argument("--infercnv_obj", required=T, nargs=1) 12 | parser$add_argument("--log", required=F, default=FALSE, action="store_true") 13 | parser$add_argument("--output", required=T, nargs=1, help="output filename pdf") 14 | 15 | args = parser$parse_args() 16 | 17 | 18 | #' learn distribution parameters: 19 | data1 = as.matrix(read.table(args$matrix1, header=T, row.names=1)) 20 | 21 | 22 | 23 | infercnv_obj_file = args$infercnv_obj 24 | infercnv_obj = readRDS(infercnv_obj_file) 25 | data2 = as.matrix(infercnv_obj@expr.data[, unlist(infercnv_obj@reference_grouped_cell_indices)]) 26 | 27 | 28 | 
png(args$output) 29 | if (args$log) { 30 | data1 = log(data1+1) 31 | data2 = log(data2+1) 32 | } 33 | 34 | 35 | ## plotting ideas borrowed from 36 | ## https://stackoverflow.com/questions/39162178/kolmogorov-smirnov-plot-in-r-ggplot 37 | 38 | 39 | m1_ecdf = ecdf(data1) 40 | m2_ecdf = ecdf(data2) 41 | val_range = range(data1, data2) 42 | step = (val_range[2] - val_range[1])/100 43 | vals = seq(val_range[1], val_range[2], step) 44 | 45 | 46 | m1_cdf = m1_ecdf(vals) 47 | m2_cdf = m2_ecdf(vals) 48 | 49 | cdfs = data.frame(vals, 50 | m1_cdf, 51 | m2_cdf) 52 | 53 | ks_point = which.max(abs(cdfs$m1_cdf - cdfs$m2_cdf)) 54 | ks_point_info = cdfs[ks_point,] 55 | ##message("KS point info: ", paste(ks_point_info, collapse=', ')) 56 | 57 | cdfs = cdfs %>% gather('m1_cdf', 'm2_cdf', key='type', value='cdf') 58 | 59 | 60 | ggplot(cdfs, aes(x=vals, y=cdf)) + 61 | geom_line(aes(color=type, linetype=type)) + 62 | geom_segment(aes(x=ks_point_info$vals, 63 | y=ks_point_info$m1_cdf, 64 | xend=ks_point_info$vals, 65 | yend=ks_point_info$m2_cdf), color='magenta', size=2) + 66 | ggtitle(sprintf("%s vs. 
%s KS", args$matrix1, args$infercnv_obj)) + xlab("number") + ylab("cdf") 67 | ## title names the two inputs this script actually takes (--matrix1, --infercnv_obj); there is no --matrix2 here 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /scripts/QQ_matrix_comparison.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | set.seed(1234) 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | parser = ArgumentParser() 8 | parser$add_argument("--matrix1", required=T, nargs=1) 9 | parser$add_argument("--matrix2", required=T, nargs=1) 10 | parser$add_argument("--log", required=F, default=FALSE, action="store_true") 11 | parser$add_argument("--output", required=T, nargs=1, help="output filename png") 12 | 13 | args = parser$parse_args() 14 | 15 | 16 | #' learn distribution parameters: 17 | data1 = as.matrix(read.table(args$matrix1, header=T, row.names=1)) 18 | data2 = as.matrix(read.table(args$matrix2, header=T, row.names=1)) 19 | 20 | 21 | png(args$output) 22 | if (args$log) { 23 | data1 = log(data1+1) 24 | data2 = log(data2+1) 25 | } 26 | qqplot(data1, data2) 27 | abline(a=0,b=1, col='red') 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /scripts/apply_median_filtering.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1) 7 | parser$add_argument("--window_size", help="window size", required=FALSE, type='integer', default=11) 8 | args = parser$parse_args() 9 | 10 | library(infercnv) 11 | library(ggplot2) 12 | 13 | infercnv_obj_file = args$infercnv_obj 14 | 15 | infercnv_obj = readRDS(infercnv_obj_file) 16 | 17 | infercnv_obj = infercnv:::.subcluster_tumors_general(infercnv_obj) 18 | 19 | mf_infercnv_obj = infercnv:::.apply_heatmap_median_filtering(infercnv_obj, 
window_size=args$window_size) 20 | 21 | saveRDS(mf_infercnv_obj, file=sprintf("%s-median_filtered.W%d.obj", infercnv_obj_file, args$window_size) ) 22 | 23 | plot_cnv(mf_infercnv_obj, output_filename=paste0(infercnv_obj_file, sprintf(".mf.W%d", args$window_size))) 24 | 25 | 26 | -------------------------------------------------------------------------------- /scripts/boxplot_cell_exprs.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1) 7 | parser$add_argument("--log", help="log(x+1) transform expr", action='store_true', default=FALSE) 8 | 9 | args = parser$parse_args() 10 | 11 | library(infercnv) 12 | library(ggplot2) 13 | library(tidyverse) 14 | 15 | infercnv_obj_file = args$infercnv_obj 16 | 17 | infercnv_obj = readRDS(infercnv_obj_file) 18 | 19 | expr.data = infercnv_obj@expr.data 20 | 21 | if (args$log) { 22 | expr.data = log(expr.data+1) 23 | } 24 | 25 | ## build df of expr values. 
26 | cell_groups = c(infercnv_obj@reference_grouped_cell_indices, infercnv_obj@observation_grouped_cell_indices) 27 | 28 | cell_group_names = names(cell_groups) 29 | 30 | 31 | pngname = sprintf("%s-boxplot.png", infercnv_obj_file) 32 | png(pngname) 33 | 34 | expr_tibble = do.call(rbind, lapply(cell_group_names, function(cell_group_name) { 35 | cell_group_expr = expr.data[, cell_groups[[ cell_group_name ]] ] 36 | 37 | cell_group_expr = as.tibble(cell_group_expr) 38 | 39 | cell_group_expr = cell_group_expr %>% gather(key='cellname', value='expr') 40 | 41 | cell_group_expr = cell_group_expr %>% mutate(group_name=cell_group_name) 42 | })) 43 | 44 | 45 | 46 | p = expr_tibble %>% ggplot(aes(y=expr, x=cellname, color=group_name)) + geom_boxplot(outlier.shape=NA) + facet_wrap(~group_name, scales='free_x') 47 | 48 | plot(p) 49 | 50 | saveRDS(expr_tibble, 'my.tibble.obj') 51 | 52 | -------------------------------------------------------------------------------- /scripts/check_matrix_format.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | """Converts a square expression matrix to an R-format compatible expression matrix 5 | """ 6 | 7 | 8 | # Import statements 9 | import argparse 10 | import csv 11 | import os 12 | 13 | __author__ = 'Jon Bistline' 14 | __copyright__ = 'Copyright 2018' 15 | __credits__ = ["Jon Bistline"] 16 | __license__ = 'BSD-3' 17 | __maintainer__ = 'Jon Bistline' 18 | __email__ = 'bistline@broadinstitute.org' 19 | __status__ = 'Development' 20 | 21 | def convert_matrix_format(input_matrix, delimiter, output_name): 22 | """ Convert input expression matrix to R-compatible expression matrix (header line is 1 cell shorter than data lines) 23 | 24 | :param input_matrix: Path to input expression matrix 25 | :type input_matrix: String 26 | :param delimiter: delimiter to parse input matrix with (tab, comma, etc.) 
27 | :type delimiter: String 28 | 29 | """ 30 | 31 | if not input_matrix or not os.path.exists(input_matrix): 32 | print("".join(["check_matrix_format.py:: ", 33 | "Could not find input matrix : " + input_matrix])) 34 | 35 | # read first line 36 | with open(input_matrix, "r") as exp_matrix: 37 | print("".join(["Opening input matrix and checking header format: ", input_matrix])) 38 | print("".join(["Using delimiter: ", delimiter])) 39 | rewrite_file = False 40 | matrix = csv.reader(exp_matrix, delimiter=delimiter) 41 | header_list = next(matrix) 42 | # check if first value in header_list needs to be removed 43 | headers_to_remove = ['GENE', 'gene', ''] 44 | if header_list[0] in headers_to_remove: 45 | print("Input matrix is being converted to R format.") 46 | rewrite_file = True 47 | header_list.pop(0) 48 | with open(output_name, 'w+') as new_expression_matrix: 49 | writer = csv.writer(new_expression_matrix, delimiter=delimiter) 50 | writer.writerow(header_list) 51 | for line in matrix: 52 | writer.writerow(line) 53 | 54 | if rewrite_file is True: 55 | print("".join(["Conversion complete, new output file: ", output_name])) 56 | else: 57 | os.rename(input_matrix, output_name) 58 | print("".join(["No conversion necessary, input matrix is in R format already, renamed to new output file: ", output_name])) 59 | 60 | if __name__ == "__main__": 61 | 62 | # Parse arguments 63 | prsr_arguments = argparse.ArgumentParser(prog='check_matrix_format.py', 64 | description=__doc__, # Use text from file summary up top 65 | formatter_class=argparse.RawDescriptionHelpFormatter) 66 | # Add positional argument 67 | prsr_arguments.add_argument("--input_matrix", 68 | metavar="input_matrix", 69 | help="Path to the input expression matrix") 70 | prsr_arguments.add_argument("--delimiter", 71 | metavar="delimiter", 72 | default="\t", 73 | help="delimiter to parse input matrix with (tab, comma, etc.)") 74 | prsr_arguments.add_argument("--output_name", 75 | metavar="output_name", 76 | 
default="expression.r_format.txt", 77 | help="path to output expression matrix") 78 | args = prsr_arguments.parse_args() 79 | 80 | # Run Script 81 | convert_matrix_format(args.input_matrix, args.delimiter, args.output_name) 82 | -------------------------------------------------------------------------------- /scripts/cross_cell_scaling_normalization.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1) 7 | parser$add_argument("--log", help="log transform expr", action='store_true', default=FALSE) 8 | 9 | args = parser$parse_args() 10 | 11 | library(infercnv) 12 | 13 | infercnv_obj_file = args$infercnv_obj 14 | 15 | infercnv_obj = readRDS(infercnv_obj_file) 16 | 17 | expr.data = infercnv_obj@expr.data 18 | 19 | 20 | ## do upper quartile normalization 21 | upper_quart = apply(expr.data, 2, quantile, probs=0.75) 22 | mean_upper_quart = mean(upper_quart) 23 | revised.expr.data = sweep(expr.data, 2, mean_upper_quart/upper_quart, "*") 24 | 25 | new_upper_quart = apply(revised.expr.data, 2, quantile, probs=0.75) 26 | 27 | print(new_upper_quart) 28 | 29 | infercnv_obj@expr.data = revised.expr.data 30 | 31 | saveRDS(infercnv_obj, 'rescaled.obj') 32 | 33 | -------------------------------------------------------------------------------- /scripts/dropout_matrix_comparison.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | set.seed(1234) 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | library(tidyverse) 8 | library(infercnv) 9 | 10 | parser = ArgumentParser() 11 | parser$add_argument("--matrix1", required=T, nargs=1) 12 | parser$add_argument("--matrix2", required=T, nargs=1) 13 | parser$add_argument("--output", required=T, nargs=1, help="output filename 
pdf") 14 | 15 | args = parser$parse_args() 16 | 17 | 18 | #' learn distribution parameters: 19 | data1 = as.matrix(read.table(args$matrix1, header=T, row.names=1)) 20 | data2 = as.matrix(read.table(args$matrix2, header=T, row.names=1)) 21 | ## median library size is taken over the cells of both matrices pooled together 22 | ## total sum normalize each: 23 | median_cs = median(c(colSums(data1), colSums(data2))) 24 | data1 <- infercnv:::.normalize_data_matrix_by_seq_depth(data1, median_cs) 25 | data2 <- infercnv:::.normalize_data_matrix_by_seq_depth(data2, median_cs) 26 | 27 | 28 | pdf(args$output) 29 | 30 | 31 | data1.mean_vs_p0 <- infercnv:::.get_mean_vs_p0_from_matrix(data1) 32 | data2.mean_vs_p0 <- infercnv:::.get_mean_vs_p0_from_matrix(data2) 33 | 34 | plot_mean_vs_p0_with_data <- function(title='title', mean_vs_p0_table) { 35 | 36 | logm <- log(mean_vs_p0_table$m + 1) 37 | p0 <- mean_vs_p0_table$p0 38 | 39 | plot(logm, p0, pch='.', main=title) 40 | 41 | x_approx_mid <- median(logm[which(p0>0.2 & p0 < 0.8)]) 42 | 43 | x <- logm 44 | y <- p0 45 | df <- data.frame(x,y) 46 | 47 | fit <- nls(y ~ infercnv:::.logistic(x, x0 = x0, k = k), data = df, 48 | start = list(x0 = x_approx_mid, k = -1)) 49 | 50 | logistic_x <- x 51 | logistic_y <- predict(fit, newdata=x) 52 | points(x, logistic_y, col='green') 53 | 54 | ## also try fitting a spline 55 | spline.fit <- smooth.spline(x,y) 56 | spline.pts = predict(spline.fit, newdata=x) 57 | points(spline.pts$x, spline.pts$y, col='magenta') 58 | legend('topright', c('logistic', 'spline'), col=c('green', 'magenta'), pch=1) 59 | ## every element is named with '=' so it can be read back as ret$<name> 60 | ret = list(logistic_x = logistic_x, 61 | logistic_y = logistic_y, 62 | spline_x = spline.pts$x, 63 | spline_y = spline.pts$y) 64 | 65 | 66 | return(ret) 67 | } 68 | 69 | 70 | p1 <- plot_mean_vs_p0_with_data(args$matrix1, data1.mean_vs_p0) 71 | p2 <- plot_mean_vs_p0_with_data(args$matrix2, data2.mean_vs_p0) 72 | 73 | 74 | ## plot both logistics in a single plot 75 | plot(p1$logistic_x, p1$logistic_y, col='blue') 76 | points(p2$logistic_x, p2$logistic_y, col='magenta') 77 | 
legend('topright', c(args$matrix1, args$matrix2), col=c('blue', 'magenta'), pch=1) 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /scripts/examine_dropout_logistic.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | args<-commandArgs(TRUE) 4 | 5 | if (length(args) == 0) { 6 | stop("Error, require params: infercnv.obj"); 7 | } 8 | 9 | infercnv_obj_file = args[1] 10 | 11 | pdf(paste0(infercnv_obj_file, '.dropout.pdf')) 12 | 13 | infercnv_obj = readRDS(infercnv_obj_file) 14 | 15 | 16 | library(edgeR) 17 | library(fitdistrplus) 18 | library(infercnv) 19 | 20 | # borrowing some code from splatter 21 | 22 | get_parameters <- function(group_name, expr.matrix) { 23 | 24 | params = list() 25 | params[['group_name']] = group_name 26 | 27 | # estimate gamma for genes 28 | lib.sizes <- colSums(expr.matrix) 29 | lib.med <- median(lib.sizes) 30 | norm.counts <- t(t(expr.matrix) / lib.sizes * lib.med) 31 | norm.counts <- norm.counts[rowSums(norm.counts > 0) > 1, ] 32 | 33 | 34 | # estimate dropout params 35 | mean_vs_p0_table = infercnv:::.get_mean_vs_p0_from_matrix(expr.matrix) 36 | logistic_params = infercnv:::.get_logistic_params(mean_vs_p0_table) 37 | 38 | params[['dropout.logistic.midpt']] = logistic_params$midpt 39 | params[['dropout.logistic.slope']] = logistic_params$slope 40 | 41 | 42 | 43 | mean_vs_p0_table = cbind(mean_vs_p0_table, logm=log(mean_vs_p0_table$m + 1)) 44 | smoothScatter(mean_vs_p0_table$logm, mean_vs_p0_table$p0, main=group_name) 45 | points(mean_vs_p0_table$logm, 46 | infercnv:::.logistic(mean_vs_p0_table$logm, logistic_params$midpt, logistic_params$slope), col='red') 47 | 48 | 49 | midpt_use = mean(mean_vs_p0_table$logm[mean_vs_p0_table$p0>0.48 & mean_vs_p0_table$p0<0.52]) 50 | 51 | points(mean_vs_p0_table$logm, 52 | infercnv:::.logistic(mean_vs_p0_table$logm, midpt_use, logistic_params$slope), col='magenta') 53 | 54 | 55 | s = 
smooth.spline(mean_vs_p0_table$logm, mean_vs_p0_table$p0) 56 | r = range(mean_vs_p0_table$logm) 57 | x=seq(r[1], r[2], 0.1) 58 | points(x, predict(s, x)$y, col='orange') 59 | 60 | 61 | return(params) 62 | 63 | } 64 | 65 | 66 | 67 | 68 | # examine each group 69 | all_groups = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices) 70 | all_groups[['combined_normal']] <- unlist(infercnv_obj@reference_grouped_cell_indices) 71 | 72 | for (group in names(all_groups)) { 73 | 74 | group_idxs = all_groups[[ group ]] 75 | expr.data = infercnv_obj@expr.data[, group_idxs] 76 | 77 | params = get_parameters(group, expr.data) 78 | params = t(as.data.frame(params)) 79 | 80 | print(params) 81 | 82 | } 83 | 84 | -------------------------------------------------------------------------------- /scripts/examine_infercnv_data_params.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | args<-commandArgs(TRUE) 4 | 5 | if (length(args) == 0) { 6 | stop("Error, require params: infercnv.obj"); 7 | } 8 | 9 | infercnv_obj_file = args[1] 10 | 11 | infercnv_obj = readRDS(infercnv_obj_file) 12 | 13 | 14 | library(edgeR) 15 | library(fitdistrplus) 16 | library(infercnv) 17 | library(Matrix) 18 | 19 | # borrowing some code from splatter 20 | 21 | get_parameters <- function(group_name, expr.matrix) { 22 | 23 | message(sprintf("getting params for: %s", group_name)) 24 | params = list() 25 | params[['group_name']] = group_name 26 | 27 | # estimate gamma for genes 28 | lib.sizes <- colSums(expr.matrix) 29 | lib.med <- median(lib.sizes) 30 | norm.counts <- t(t(expr.matrix) / lib.sizes * lib.med) 31 | norm.counts <- norm.counts[rowSums(norm.counts > 0) > 1, ] 32 | 33 | ## note, fitting the gamma is done differently in splatter... 
using method = "mge", gof = "CvM", and first winsorizing the data at q=0.1 34 | means <- rowMeans(norm.counts) 35 | means.fit <- fitdistrplus::fitdist(means, "gamma", method = "mme") 36 | mean.shape = unname(means.fit$estimate["shape"]) 37 | mean.rate = unname(means.fit$estimate["rate"]) 38 | 39 | params[[ 'gamma.mean.shape' ]] = mean.shape 40 | params[[ 'gamma.mean.rate' ]] = mean.rate 41 | 42 | 43 | # estimate dropout params 44 | mean_vs_p0_table = infercnv:::.get_mean_vs_p0_from_matrix(expr.matrix) 45 | logistic_params = infercnv:::.get_logistic_params(mean_vs_p0_table) 46 | 47 | params[['dropout.logistic.midpt']] = logistic_params$midpt 48 | params[['dropout.logistic.slope']] = logistic_params$slope 49 | 50 | 51 | # estimate common dispersion 52 | design <- matrix(1, ncol(expr.matrix), 1) 53 | disps <- edgeR::estimateDisp(expr.matrix, design = design) 54 | 55 | params[[ 'common.dispersion' ]] = disps$common.dispersion 56 | 57 | 58 | return(params) 59 | 60 | } 61 | 62 | 63 | 64 | # examine each group 65 | all_groups = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices) 66 | all_groups[['combined_normal']] <- unlist(infercnv_obj@reference_grouped_cell_indices) 67 | 68 | for (group in names(all_groups)) { 69 | 70 | group_idxs = all_groups[[ group ]] 71 | expr.data = infercnv_obj@expr.data[, group_idxs] 72 | 73 | params = get_parameters(group, expr.data) 74 | params = t(as.data.frame(params)) 75 | 76 | print(params) 77 | 78 | } 79 | 80 | -------------------------------------------------------------------------------- /scripts/examine_infercnv_data_params.just_dispersion.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | args<-commandArgs(TRUE) 4 | 5 | if (length(args) == 0) { 6 | stop("Error, require params: infercnv.obj"); 7 | } 8 | 9 | infercnv_obj_file = args[1] 10 | 11 | infercnv_obj = readRDS(infercnv_obj_file) 12 | 13 | 14 | library(edgeR) 15 | 
library(fitdistrplus) 16 | library(infercnv) 17 | library(Matrix) 18 | 19 | # borrowing some code from splatter 20 | 21 | get_parameters <- function(group_name, expr.matrix) { 22 | 23 | message(sprintf("getting params for: %s", group_name)) 24 | params = list() 25 | params[['group_name']] = group_name 26 | 27 | 28 | # estimate common dispersion 29 | design <- matrix(1, ncol(expr.matrix), 1) 30 | disps <- edgeR::estimateDisp(expr.matrix, design = design) 31 | 32 | params[[ 'common.dispersion' ]] = disps$common.dispersion 33 | 34 | 35 | return(params) 36 | 37 | } 38 | 39 | 40 | 41 | # examine each group 42 | all_groups = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices) 43 | all_groups[['combined_normal']] <- unlist(infercnv_obj@reference_grouped_cell_indices) 44 | 45 | for (group in names(all_groups)) { 46 | 47 | group_idxs = all_groups[[ group ]] 48 | expr.data = infercnv_obj@expr.data[, group_idxs] 49 | 50 | params = get_parameters(group, expr.data) 51 | params = t(as.data.frame(params)) 52 | 53 | print(params) 54 | 55 | } 56 | 57 | -------------------------------------------------------------------------------- /scripts/examine_normal_cutoffs_vs_KS.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1) 7 | parser$add_argument("--scale", help="scale", action='store_true', default=FALSE) 8 | parser$add_argument("--subtract", help="subtract", action='store_true', default=FALSE) 9 | parser$add_argument("--smooth", help="smooth", action='store_true', default=TRUE) 10 | parser$add_argument("--show_tumor", help="show tumor instead of normal", action='store_true', default=FALSE) 11 | parser$add_argument("--output", help="name of output png file", required=TRUE) 12 | 13 | args = 
parser$parse_args() 14 | 15 | library(infercnv) 16 | library(tidyverse) 17 | library(futile.logger) 18 | 19 | infercnv_obj_file = args$infercnv_obj 20 | 21 | infercnv_obj = readRDS(infercnv_obj_file) 22 | 23 | if (! infercnv:::has_reference_cells(infercnv_obj)) { 24 | stop("Error, cannot tune parameters without reference 'normal' cells defined") 25 | } 26 | 27 | if (args$scale) { 28 | infercnv_obj <- infercnv:::scale_infercnv_expr(infercnv_obj) 29 | } 30 | 31 | if (args$subtract) { 32 | infercnv_obj <- subtract_ref_expr_from_obs(infercnv_obj, inv_log=FALSE) 33 | } 34 | 35 | 36 | if (args$smooth) { 37 | infercnv_obj <- smooth_by_chromosome(infercnv_obj, window_length=101, smooth_ends=TRUE) 38 | } 39 | 40 | if (args$show_tumor) { 41 | expr_vals <- infercnv_obj@expr.data[, unlist(infercnv_obj@observation_grouped_cell_indices)] 42 | } else { 43 | expr_vals <- infercnv_obj@expr.data[, unlist(infercnv_obj@reference_grouped_cell_indices)] 44 | } 45 | 46 | 47 | mu = mean(expr_vals) 48 | sigma = sd(expr_vals) 49 | 50 | data.want = data.frame(vals=as.numeric(expr_vals)) 51 | 52 | mean_delta = infercnv:::determine_mean_delta_via_Z(sigma, p=0.05) 53 | KS_delta = infercnv:::get_HoneyBADGER_setGexpDev(gexp.sd=sigma, alpha=0.05) 54 | 55 | 56 | png(args$output) 57 | 58 | message("plotting ncells distribution") 59 | 60 | message("mean delta: ", mean_delta) 61 | message("KS_delta: ", KS_delta) 62 | 63 | p = data.want %>% ggplot(aes(vals)) + 64 | geom_density(alpha=0.3) 65 | 66 | p = p + 67 | stat_function(fun=dnorm, color='black', args=list('mean'=mu,'sd'=sigma)) 68 | 69 | 70 | ## add Z-based 71 | 72 | p = p + 73 | stat_function(fun=dnorm, color='blue', args=list('mean'=mu-mean_delta,'sd'=sigma)) + 74 | stat_function(fun=dnorm, color='blue', args=list('mean'=mu+mean_delta,'sd'=sigma)) 75 | 76 | ## add KS-based 77 | 78 | p = p + 79 | stat_function(fun=dnorm, color='magenta', args=list('mean'=mu-KS_delta,'sd'=sigma)) + 80 | stat_function(fun=dnorm, color='magenta', 
args=list('mean'=mu+KS_delta,'sd'=sigma)) 81 | 82 | plot(p) 83 | -------------------------------------------------------------------------------- /scripts/examine_normal_sampling_distributions.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1) 7 | args = parser$parse_args() 8 | 9 | library(infercnv) 10 | library(tidyverse) 11 | library(futile.logger) 12 | 13 | infercnv_obj_file = args$infercnv_obj 14 | 15 | infercnv_obj = readRDS(infercnv_obj_file) 16 | 17 | if (! infercnv:::has_reference_cells(infercnv_obj)) { 18 | stop("Error, cannot tune parameters without reference 'normal' cells defined") 19 | } 20 | 21 | expr_vals <- infercnv_obj@expr.data 22 | mu = mean(expr_vals) 23 | sigma = sd(expr_vals) 24 | nrounds = 1000 25 | sds = c() 26 | ngenes = nrow(expr_vals) 27 | 28 | normal_samples = infercnv_obj@reference_grouped_cell_indices 29 | 30 | num_normal_samples = length(normal_samples) 31 | 32 | mean_vals_df = NULL; 33 | z_p_val = 0.05 34 | 35 | num_cells_to_empirical_sd = list() 36 | 37 | ncells_partitions = seq (1,100,5) 38 | for (ncells in ncells_partitions) { 39 | means = c() 40 | 41 | message(sprintf("num cells: %g", ncells)) 42 | 43 | cells_counted = 0; 44 | 45 | for(i in 1:nrounds) { 46 | ## pick a random gene 47 | rand.gene = sample(1:ngenes, size=1) 48 | 49 | ## pick a random normal cell type 50 | rand.sample = sample(1:num_normal_samples, size=1) 51 | #rand.sample=1 52 | ## size=1 above: without it sample() returns a full permutation, not a single gene / reference group 53 | vals = sample(expr_vals[rand.gene, normal_samples[[rand.sample]] ], size=ncells, replace=T) 54 | m_val = mean(vals) 55 | means = c(means, m_val) 56 | 57 | cells_counted = cells_counted + length(vals) 58 | 59 | 60 | } 61 | my.sd = sd(means) 62 | sds = c(sds, my.sd) 63 | 64 | num_cells_to_empirical_sd[[ ncells ]] = my.sd 65 | 66 | df = data.frame(num_cells=ncells, 
vals=means) 67 | #print(df) 68 | if(is.null(mean_vals_df)) { 69 | mean_vals_df = df 70 | } else { 71 | mean_vals_df = rbind(mean_vals_df, df) 72 | } 73 | 74 | } 75 | 76 | ## fit linear model 77 | num_cells = ncells_partitions 78 | 79 | write.table(data.frame(num_cells=num_cells, sds=sds), file='num_cells_vs_sds.table.dat', quote=F, sep="\t") 80 | 81 | 82 | fit = lm(log(sds) ~ log(num_cells)) #note, hbadger does something similar, but not for the hmm cnv state levels 83 | 84 | my.spline = smooth.spline(log(num_cells), log(sds)) 85 | 86 | message("plotting log(sd) vs. log(num_cells)") 87 | 88 | plot(log(num_cells), log(sds), main='log(sd) vs. log(num_cells)') 89 | 90 | plot(num_cells, sds, main='sd vs. num_cells') 91 | 92 | my.spline2 = smooth.spline(num_cells, sds) 93 | 94 | ## store mean_delta for the single gene for convenience sake 95 | mean_delta = qnorm(p=1-z_p_val, sd=sigma, mean=0) 96 | 97 | normal_sd_trend = list(mu=mu, 98 | sigma=sigma, 99 | fit=fit, 100 | spline=my.spline, 101 | mean_delta=mean_delta) 102 | 103 | 104 | 105 | ### do some plotting 106 | 107 | 108 | for (ncells in ncells_partitions) { 109 | 110 | message(sprintf("plotting ncells distribution: %g", ncells)) 111 | 112 | data.want = mean_vals_df %>% filter(num_cells == ncells) 113 | 114 | 115 | p = data.want %>% ggplot(aes(vals, fill=num_cells)) + 116 | geom_density(alpha=0.3) 117 | 118 | sigma <- exp(predict(normal_sd_trend$fit, 119 | newdata=data.frame(num_cells=ncells))[[1]]) 120 | 121 | message("ncells:", ncells, " sigma: ", sigma) 122 | 123 | p = p + 124 | stat_function(fun=dnorm, color='black', args=list('mean'=1,'sd'=sigma)) + 125 | ggtitle(sprintf("num_cells: %g, sd: %g", ncells, sigma)) 126 | 127 | p = p + 128 | stat_function(fun=dnorm, color='magenta', args=list('mean'=1,'sd'=num_cells_to_empirical_sd[[ ncells]] )) 129 | 130 | 131 | pval=0.01 132 | 133 | left_mean = 1 - 2 * (1-qnorm(p=pval, mean=1, sd=sigma)) 134 | message("left_mean: ", left_mean) 135 | p = p + 136 | 
stat_function(fun=dnorm, color='blue', args=list('mean'=left_mean,'sd'=sigma)) 137 | 138 | 139 | right_mean = 1 + 2 * (qnorm(p=1-pval, mean=1, sd=sigma)-1) 140 | message("right_mean: ", right_mean) 141 | p = p + 142 | stat_function(fun=dnorm, color='blue', args=list('mean'=right_mean,'sd'=sigma)) 143 | 144 | 145 | 146 | 147 | 148 | if (FALSE) { 149 | 150 | spline.sd = exp(predict(my.spline, x=log(ncells))$y) 151 | 152 | 153 | p = p + 154 | stat_function(fun=dnorm, color='green', args=list('mean'=1,'sd'=spline.sd)) 155 | 156 | spline2.sd = predict(my.spline2, x=ncells)$y 157 | 158 | message(spline2.sd) 159 | 160 | p = p + 161 | stat_function(fun=dnorm, color='orange', args=list('mean'=1,'sd'=spline2.sd)) 162 | } 163 | 164 | plot(p) 165 | } 166 | 167 | -------------------------------------------------------------------------------- /scripts/examine_normal_sampling_distributions.i3.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1) 7 | args = parser$parse_args() 8 | 9 | library(infercnv) 10 | library(tidyverse) 11 | library(futile.logger) 12 | 13 | infercnv_obj_file = args$infercnv_obj 14 | 15 | infercnv_obj = readRDS(infercnv_obj_file) 16 | 17 | 18 | expr_vals <- infercnv_obj@expr.data 19 | 20 | 21 | sd_trend_info = infercnv:::.i3HMM_get_sd_trend_by_num_cells_fit(infercnv_obj) 22 | 23 | 24 | mu = sd_trend_info$mu 25 | sigma = sd_trend_info$sigma 26 | 27 | sds = c() 28 | ngenes = nrow(expr_vals) 29 | 30 | tumor_samples = infercnv_obj@observation_grouped_cell_indices 31 | 32 | print(tumor_samples) 33 | 34 | num_tumor_samples = length(tumor_samples) 35 | 36 | print(num_tumor_samples) 37 | 38 | mean_vals_df = NULL; 39 | z_p_val = 0.05 40 | 41 | 42 | num_cells_to_empirical_sd = list() 43 | 44 | nrounds=100 45 | 46 | 
ncells_partitions = seq (1,100,5) 47 | for (ncells in ncells_partitions) { 48 | means = c() 49 | 50 | message(sprintf("num cells: %g", ncells)) 51 | 52 | cells_counted = 0; 53 | 54 | for(i in 1:nrounds) { 55 | ## pick a random gene 56 | rand.gene = sample(1:ngenes, size=1) 57 | 58 | ## pick a random normal cell type 59 | rand.sample = sample(1:num_tumor_samples, size=1) 60 | #rand.sample=1 61 | #print(rand.sample) 62 | 63 | vals = sample(expr_vals[rand.gene, tumor_samples[[rand.sample]] ], size=ncells, replace=T) 64 | m_val = mean(vals) 65 | means = c(means, m_val) 66 | 67 | cells_counted = cells_counted + length(vals) 68 | 69 | } 70 | means.sd = sd(means) 71 | means.mean = mean(means) 72 | 73 | num_cells_to_empirical_sd[[ ncells ]] = means.sd 74 | 75 | df = data.frame(num_cells=ncells, vals=means) 76 | 77 | message(sprintf("plotting ncells distribution: %g", ncells)) 78 | 79 | data.want = df 80 | 81 | 82 | p = data.want %>% ggplot(aes(vals, fill=num_cells)) + 83 | geom_density(alpha=0.3) + 84 | ggtitle(sprintf("num_cells: %g", ncells)) 85 | 86 | ## draw parameterized distribution 87 | p = p + 88 | stat_function(fun=dnorm, color='black', args=list('mean'=means.mean,'sd'=means.sd)) 89 | 90 | 91 | alpha=0.05 92 | ks_delta = infercnv:::get_HoneyBADGER_setGexpDev(gexp.sd=sd_trend_info$sigma, k_cells=ncells, alpha=alpha, plot=T) 93 | 94 | left_mean = means.mean - ks_delta 95 | message("left_mean: ", left_mean) 96 | p = p + 97 | stat_function(fun=dnorm, color='blue', args=list('mean'=left_mean,'sd'=means.sd)) 98 | 99 | 100 | right_mean = means.mean + ks_delta 101 | message("right_mean: ", right_mean) 102 | p = p + 103 | stat_function(fun=dnorm, color='blue', args=list('mean'=right_mean,'sd'=means.sd)) 104 | 105 | 106 | plot(p) 107 | } 108 | 109 | -------------------------------------------------------------------------------- /scripts/examine_simulated_vs_observed_dispersion.R: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env Rscript

## For a range of target common dispersions, simulate counts from the
## reference cells of an infercnv object (first without, then with
## zero-inflation) and record the common dispersion edgeR estimates back.
##
## usage:   <this script> infercnv_obj_file
## outputs: <infercnv_obj_file>.dispersion_estimation.pdf
##          <infercnv_obj_file>.dispersion_estimation.dat

cli_args <- commandArgs(TRUE)

if (length(cli_args) == 0) {
    stop("Error, require params: infercnv.obj")
}

infercnv_obj_file <- cli_args[1]

pdf(paste0(infercnv_obj_file, '.dispersion_estimation.pdf'))

infercnv_obj <- readRDS(infercnv_obj_file)

library(edgeR)
library(fitdistrplus)
library(infercnv)

## pool the cells of all reference groups
normal_grp_idx <- unlist(infercnv_obj@reference_grouped_cell_indices)
expr.matrix <- infercnv_obj@expr.data[, normal_grp_idx]

## estimate dropout params
mean_vs_p0_table <- infercnv:::.get_mean_vs_p0_from_matrix(expr.matrix)
logistic_params <- infercnv:::.get_logistic_params(mean_vs_p0_table)

iterations <- 1
dispersion_params <- c(0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10)

## one row per (target dispersion, iteration)
resultset <- matrix(0, ncol=3, nrow=iterations*length(dispersion_params),
                    dimnames=list(NULL, c('target', 'before_Zinf', 'after_Zinf')))

row <- 0

for (common.dispersion in dispersion_params) {
    message(sprintf("Exploring common.dispersion set at: %g", common.dispersion))

    for (iter in seq_len(iterations)) {
        message(sprintf("\titer: %d", iter))

        row <- row + 1

        ## simulate w/o zero-inflation and estimate the common dispersion
        counts_plain <- infercnv:::.get_simulated_cell_matrix(mean_vs_p0_table$m, NULL, 100, common_dispersion=common.dispersion)
        design <- matrix(1, ncol(counts_plain), 1)
        disp_before <- edgeR::estimateDisp(counts_plain, design = design)$common.dispersion

        ## same again, including zero-inflation (design is reused)
        counts_zinf <- infercnv:::.get_simulated_cell_matrix(mean_vs_p0_table$m, mean_vs_p0_table, 100,
                                                             common_dispersion=common.dispersion)
        disp_after <- edgeR::estimateDisp(counts_zinf, design = design)$common.dispersion

        resultset[row, ] <- c(common.dispersion, disp_before, disp_after)
    }
}

resultset <- as.data.frame(resultset)
print(resultset)
write.table(resultset, file=paste0(infercnv_obj_file, ".dispersion_estimation.dat"), quote=F, sep="\t")

## examples:
## 10x:     0.221 + 1.05 * (true_dispersion)   # colon single sample
##          0.223 + 1.05 * (true_dispersion)   # multiple colon samples

## smrtSeq: 0.95 + 1.56 * (true_dispersion)    # oligodendro
##          1.073 + 1.628 * (true_dispersion)  # melanoma

## linear fit of the zero-inflated estimate against the target dispersion
res.lm <- lm(resultset[,3] ~ resultset[,1])

print(res.lm)

coeff <- res.lm$coefficients

plot(resultset[,1], resultset[,3], main=sprintf("y=%g + %g * x", coeff[1], coeff[2]), col='green')
points(resultset[,1], resultset[,2])
#!/usr/bin/env Rscript

## Same exploration as examine_simulated_vs_observed_dispersion.R, but the
## expression matrix is read from a table file instead of an infercnv object.
##
## usage:   <this script> normal_cells.matrix
## outputs: <matrix file>.dispersion_estimation.pdf
##          <matrix file>.dispersion_estimation.dat

cli_args <- commandArgs(TRUE)

if (length(cli_args) == 0) {
    stop("Error, require params: normal_cells.matrix")
}

matrix.file <- cli_args[1]

pdf(paste0(matrix.file, '.dispersion_estimation.pdf'))

library(edgeR)
library(fitdistrplus)
library(infercnv)

expr.matrix <- read.table(matrix.file)

## estimate dropout params
mean_vs_p0_table <- infercnv:::.get_mean_vs_p0_from_matrix(expr.matrix)
logistic_params <- infercnv:::.get_logistic_params(mean_vs_p0_table)

iterations <- 1
dispersion_params <- c(0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10)

## one row per (target dispersion, iteration)
resultset <- matrix(0, ncol=3, nrow=iterations*length(dispersion_params),
                    dimnames=list(NULL, c('target', 'before_Zinf', 'after_Zinf')))

row <- 0

for (common.dispersion in dispersion_params) {
    message(sprintf("Exploring common.dispersion set at: %g", common.dispersion))

    for (iter in seq_len(iterations)) {
        message(sprintf("\titer: %d", iter))

        row <- row + 1

        ## simulate w/o zero-inflation and estimate the common dispersion
        counts_plain <- infercnv:::.get_simulated_cell_matrix(mean_vs_p0_table$m, NULL, 100, common_dispersion=common.dispersion)
        design <- matrix(1, ncol(counts_plain), 1)
        disp_before <- edgeR::estimateDisp(counts_plain, design = design)$common.dispersion

        ## same again, including zero-inflation (design is reused)
        counts_zinf <- infercnv:::.get_simulated_cell_matrix(mean_vs_p0_table$m, mean_vs_p0_table, 100,
                                                             common_dispersion=common.dispersion)
        disp_after <- edgeR::estimateDisp(counts_zinf, design = design)$common.dispersion

        resultset[row, ] <- c(common.dispersion, disp_before, disp_after)
    }
}

resultset <- as.data.frame(resultset)
print(resultset)
write.table(resultset, file=paste0(matrix.file, ".dispersion_estimation.dat"), quote=F, sep="\t")

## examples:
## 10x:     0.221 + 1.05 * (true_dispersion)   # colon single sample
##          0.223 + 1.05 * (true_dispersion)   # multiple colon samples

## smrtSeq: 0.95 + 1.56 * (true_dispersion)    # oligodendro
##          1.073 + 1.628 * (true_dispersion)  # melanoma

## linear fit of the zero-inflated estimate against the target dispersion
res.lm <- lm(resultset[,3] ~ resultset[,1])

print(res.lm)

coeff <- res.lm$coefficients

plot(resultset[,1], resultset[,3], main=sprintf("y=%g + %g * x", coeff[1], coeff[2]), col='green')
points(resultset[,1], resultset[,2])
#!/usr/bin/env Rscript

## Exploratory / debugging driver: for each chromosome, averages expression
## over the cells of a tumor subcluster, builds a HiddenMarkov dthmm object
## from the hspike-derived state parameters, and decodes it with a local,
## modified Viterbi implementation that also dumps its score matrices to
## nu.txt, emissions.txt and emissions_pre.txt in the working directory.
##
## usage: explore_HMM_exec.R --infercnv_obj <obj file> [--chr <chr>]

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
parser$add_argument("--chr", help='restrict to chr', required=FALSE, nargs=1, default=NULL)
args = parser$parse_args()

library(infercnv)
library(futile.logger)
library(HiddenMarkov)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

## state parameters are derived from the object's .hspike slot
cnv_mean_sd=infercnv:::get_spike_dists(infercnv_obj@.hspike)
cnv_level_to_mean_sd_fit=infercnv:::get_hspike_cnv_mean_sd_trend_by_num_cells_fit(infercnv_obj@.hspike)
transition_out_p=1e-6
p_val=0.05
## NOTE(review): hclust_method is assigned but not referenced again in this script
hclust_method='ward.D2'


flog.info(sprintf("predict_CNV_via_HMM_on_tumor_subclusters(p_val=%g)", p_val))
HMM_info <- infercnv:::.get_HMM(cnv_mean_sd, transition_out_p)
chrs = unique(infercnv_obj@gene_order$chr)
expr.data = infercnv_obj@expr.data
gene_order = infercnv_obj@gene_order
## hmm.data has the same shape as expr.data and receives the decoded states
hmm.data = expr.data
hmm.data[,] = -1 #init to invalid state

## flatten the per-group subcluster lists one level; fall back to the
## observation groups when no subclusters are stored
tumor_subclusters <- unlist(infercnv_obj@tumor_subclusters[["subclusters"]], recursive=F)
if (is.null(tumor_subclusters)) {
    message("No subclusters defined, running per-sample instead")
    tumor_subclusters <- infercnv_obj@observation_grouped_cell_indices
}

## optional restriction to the single chromosome given on the command line
if (! is.null(args$chr)) {
    chrs = c(args$chr)
}


##########################################
#chrs = c('chr1')
##########################################


##############################################
## From HiddenMarkovPackage

## Returns x with every element replaced by its j-th entry; NULL is passed
## through.  Only referenced from the commented-out code further down.
getj <- function (x, j) {
    if (is.null(x))
        return(NULL)
    n <- length(x)
    for (i in 1:n) x[[i]] <- x[[i]][j]
    return(x)
}


## Modified Viterbi decoding of a dthmm-style object.
##
## Reads object$x (observations), object$Pi (transition matrix),
## object$delta (start probabilities) and object$pm$mean / object$pm$sd
## (per-state parameters).  Returns the vector of decoded state indices,
## one per observation.
##
## Differences from a plain density-based Viterbi, as coded below:
##  - every state uses the largest of the state sds;
##  - the emission score of a state is 1 / -log(upper-tail p of |x - mean| / sd),
##    normalized to sum to one across states, then logged;
##  - nu, emissions and the raw |x - mean| / sd values are written to
##    nu.txt, emissions.txt and emissions_pre.txt on every call.
##
## The loops start at 2 and n - 1, so at least two observations are needed
## (the driver below skips shorter inputs).
local.Viterbi.dthmm <- function (object, ...) {
    x <- object$x
    ## NOTE(review): dfunc and pseudocount are only used by commented-out code
    dfunc <- HiddenMarkov:::makedensity(object$distn)
    n <- length(x)
    m <- nrow(object$Pi) # transition matrix
    nu <- matrix(NA, nrow = n, ncol = m) # scoring matrix
    y <- rep(NA, n) # final trace
    pseudocount = 1e-20

    ## collapse the per-state sds to the single largest value
    object$pm$sd = max(object$pm$sd)

    emissions <- matrix(NA, nrow = n, ncol = m)
    emissions_pre <- emissions

    ## init first row
    emission <- pnorm(abs(x[1]-object$pm$mean)/object$pm$sd, log=T, lower.tail=F)
    #emissions_pre[1,] <- emission
    emissions_pre[1,] <- abs(x[1]-object$pm$mean)/object$pm$sd

    ## invert the negated log p-values and normalize across states
    emission <- 1 / (-1 * emission)
    emission <- emission / sum(emission)

    emissions[1,] <- log(emission)

    nu[1, ] <- log(object$delta) + # start probabilities
        emissions[1,]

    logPi <- log(object$Pi) # convert transition matrix to log(p)

    for (i in 2:n) {

        ## every column holds the previous row of scores, so that
        ## matrixnu + logPi is (score of from-state) + log P(from -> to)
        matrixnu <- matrix(nu[i - 1, ], nrow = m, ncol = m)

        #nu[i, ] <- apply(matrixnu + logPi, 2, max) +
        #    dfunc(x=x[i], object$pm, getj(object$pn, i),
        #          log=TRUE)


        #emission <- dfunc(x=x[i], object$pm, getj(object$pn, i), log=T)
        ## normalize emission p-values
        ## first add pseudcounts
        #missions[i, ] <- emissions[i, ] + pseudocount
        #emissions[i, ] <- emissions[i, ] / sum(emissions[i, ])

        #emissions[i, ] <- log(emissions[i, ])


        emission <- pnorm(abs(x[i]-object$pm$mean)/object$pm$sd, log=T, lower.tail=F)
        #emissions_pre[i,] <- emission
        emissions_pre[i,] <- abs(x[i]-object$pm$mean)/object$pm$sd

        emission <- 1 / (-1 * emission)
        emission <- emission / sum(emission)

        emissions[i, ] <- log(emission)

        ## best predecessor per target state, plus this position's emission
        nu[i, ] <- apply(matrixnu + logPi, 2, max) + emissions[i, ]

        #print(matrixnu)
        #print(logPi)
    }
    if (any(nu[n, ] == -Inf))
        stop("Problems With Underflow")

    ## debugging dumps, overwritten on every call
    write.table(nu, file='nu.txt', quote=F, sep="\t")
    write.table(emissions, file='emissions.txt', quote=F, sep="\t")
    write.table(emissions_pre, file='emissions_pre.txt', quote=F, sep="\t")

    ## traceback
    y[n] <- which.max(nu[n, ])

    for (i in seq(n - 1, 1, -1))
        y[i] <- which.max(logPi[, y[i + 1]] + nu[i, ])

    return(y)
}


##########################################


for (chr in chrs) {
    print(chr)
    chr_gene_idx = which(gene_order$chr == chr)

    ## run through each cell for this chromosome:
    for (tumor_subcluster_name in names(tumor_subclusters)) {
        print(tumor_subcluster_name)
        tumor_subcluster_cells_idx <- tumor_subclusters[[tumor_subcluster_name]]

        ## one value per gene: mean over the cells of this subcluster
        gene_expr_vals = rowMeans(expr.data[chr_gene_idx,tumor_subcluster_cells_idx,drop=F])
        ##gene_expr_vals = apply(expr.data[chr_gene_idx,tumor_subcluster_cells_idx,drop=F], 1, median)
        ## the local Viterbi needs at least two observations
        if (length(gene_expr_vals) < 2) { next; }
        num_cells = length(tumor_subcluster_cells_idx)

        state_emission_params <- infercnv:::.get_state_emission_params(num_cells, cnv_mean_sd, cnv_level_to_mean_sd_fit)
        print(state_emission_params)
        print(gene_expr_vals)

        hmm <- HiddenMarkov::dthmm(gene_expr_vals,
                                   HMM_info[['state_transitions']],
                                   HMM_info[['delta']],
                                   "norm",
                                   state_emission_params)

        hmm_trace <- local.Viterbi.dthmm(hmm)

        print(hmm_trace)

        ## every cell of the subcluster receives the same per-gene trace
        hmm.data[chr_gene_idx,tumor_subcluster_cells_idx] <- hmm_trace

        ## NOTE(review): leaves the subcluster loop after the first decoded
        ## subcluster of each chromosome -- presumably a debugging shortcut; confirm
        break
    }
}
##############################################
## From HiddenMarkovPackage

## Returns x with every element replaced by its j-th entry.
## NULL is passed through; an empty list is returned unchanged.
getj <- function (x, j) {
    if (is.null(x))
        return(NULL)
    for (i in seq_along(x)) x[[i]] <- x[[i]][j]
    return(x)
}


## Modified Viterbi decoding of a dthmm-style object.
##
## object: list providing
##           x        observations
##           Pi       state transition matrix (from-state rows, to-state columns)
##           delta    start probabilities
##           pm$mean, pm$sd   per-state parameters
## ...   : ignored
##
## Returns the decoded state index for every observation (empty for no
## observations).  The emission score of a state is
## 1 / -log(upper-tail p of |x - mean| / sd), normalized to sum to one
## across states and then logged.
##
## Side effect: the score matrix and the log emissions are written to
## nu.txt and emissions.txt in the working directory on every call.
##
## Stops with "Problems With Underflow" if a final score is -Inf.
local.Viterbi.dthmm <- function (object, ...) {
    x <- object$x
    n <- length(x)
    m <- nrow(object$Pi)

    if (n == 0)
        return(integer(0))

    nu <- matrix(NA, nrow = n, ncol = m)        # scoring matrix
    emissions <- matrix(NA, nrow = n, ncol = m) # log emission scores
    y <- rep(NA, n)                             # final trace

    logPi <- log(object$Pi) # convert transition matrix to log(p)

    ## normalized log emission scores of one observation across all states
    log_emission <- function(obs) {
        tail_logp <- pnorm(abs(obs - object$pm$mean) / object$pm$sd, log.p=TRUE, lower.tail=FALSE)
        score <- 1 / (-1 * tail_logp)
        log(score / sum(score))
    }

    ## init first row from the start probabilities
    emissions[1, ] <- log_emission(x[1])
    nu[1, ] <- log(object$delta) + emissions[1, ]

    ## seq_len(n - 1) + 1 is 2..n, and empty for a single observation
    ## (2:n would count down to 1 and index out of bounds)
    for (i in seq_len(n - 1) + 1) {
        ## every column holds the previous scores, so matrixnu + logPi is
        ## (score of from-state) + log P(from -> to)
        matrixnu <- matrix(nu[i - 1, ], nrow = m, ncol = m)

        emissions[i, ] <- log_emission(x[i])
        nu[i, ] <- apply(matrixnu + logPi, 2, max) + emissions[i, ]
    }

    if (any(nu[n, ] == -Inf))
        stop("Problems With Underflow")

    write.table(nu, file='nu.txt', quote=F, sep="\t")
    write.table(emissions, file='emissions.txt', quote=F, sep="\t")

    ## traceback from the best final state
    y[n] <- which.max(nu[n, ])

    for (i in rev(seq_len(n - 1)))
        y[i] <- which.max(logPi[, y[i + 1]] + nu[i, ])

    return(y)
}
#chrs = c("chr13")

## Decode every chromosome for every cell group of the hspike object and
## store the per-gene state trace in hmm.data for all cells of the group.
for (chr in chrs) {
    print(chr)
    chr_gene_idx = which(gene_order$chr == chr)

    ## run through each cell group for this chromosome:
    for (tumor_subcluster_name in names(tumor_subclusters)) {
        print(tumor_subcluster_name)
        tumor_subcluster_cells_idx <- tumor_subclusters[[tumor_subcluster_name]]

        ## one value per gene: mean over the cells of this group
        gene_expr_vals = rowMeans(expr.data[chr_gene_idx,tumor_subcluster_cells_idx,drop=FALSE])
        ##gene_expr_vals = apply(expr.data[chr_gene_idx,tumor_subcluster_cells_idx,drop=F], 1, median)

        ## The local Viterbi iterates from the second observation onwards and
        ## fails on shorter inputs; skip them, as explore_HMM_exec.R does.
        ## Skipped genes keep the -1 (invalid) state in hmm.data.
        if (length(gene_expr_vals) < 2) { next }

        num_cells = length(tumor_subcluster_cells_idx)

        state_emission_params <- infercnv:::.get_state_emission_params(num_cells, cnv_mean_sd, cnv_level_to_mean_sd_fit)
        print(state_emission_params)
        print(gene_expr_vals)

        hmm <- HiddenMarkov::dthmm(gene_expr_vals,
                                   HMM_info[['state_transitions']],
                                   HMM_info[['delta']],
                                   "norm",
                                   state_emission_params)

        hmm_trace <- local.Viterbi.dthmm(hmm)

        print(hmm_trace)

        hmm.data[chr_gene_idx,tumor_subcluster_cells_idx] <- hmm_trace
    }
}
expr.data = infercnv_obj@expr.data

num_tumor_groups = length(tumor_groups)

windowsizes = c(25,50,75,100)
num_windowsizes = length(windowsizes)
## one panel per window size, stacked on the same page
par(mfrow=c(num_windowsizes, 1))

library(tidyverse)


## Draws one line per group into a single panel.
##   normal_pts, tumor_pts: named lists of numeric vectors (one per group);
##                          the tumor entries are drawn last and the last
##                          name is used in the panel title
##   windowsize:            smoothing window, shown in the title only
plotme <- function(normal_pts, tumor_pts, windowsize) {

    all_pts = c(normal_pts, tumor_pts)
    all_pts_names = names(all_pts)

    my.colors = rainbow(length(all_pts))

    ## shared y range so that all lines fit into the panel
    yrange = range(unlist(all_pts))

    text.adj = 0.7
    for (i in seq_along(all_pts)) {
        if (i == 1) {
            ## the first group opens the panel, the others are added to it
            plot(all_pts[[i]], type='l', col=my.colors[i],
                 main=sprintf("windowsize: %g, tumor: %s", windowsize, all_pts_names[length(all_pts_names)]),
                 ylim=yrange,
                 cex.lab=text.adj, cex.main=text.adj, cex.axis=text.adj)
        } else {
            points(all_pts[[i]], type='l', col=my.colors[i])
        }
    }
    abline(h=0)
    legend('top', legend=all_pts_names, col=my.colors, pch=1, horiz=T, bty='n', cex=text.adj)

}


## Running-mean smooths every cell of a group along the genes, averages the
## smoothed profiles across the cells and centers the result on its median.
##   cell_idx:   column indices of the group's cells in expr.data
##   windowsize: window passed to caTools::runmean as k
## Returns one value per gene (row of expr.data).
get_smoothed <- function(cell_idx, windowsize) {
    ## drop=FALSE keeps a matrix for one-cell groups; apply() needs dimensions
    group_expr_data = expr.data[, cell_idx, drop=FALSE]
    smoothed = apply(group_expr_data, 2, caTools::runmean, k=windowsize)
    smoothed_mean = rowMeans(smoothed)

    ## center it:
    smoothed_mean = smoothed_mean - median(smoothed_mean)

    return(smoothed_mean)
}


## Plots, for every window size, the smoothed profile of one tumor group on
## top of the smoothed profiles of all normal groups.
##   tumor_type: name of an entry of tumor_groups
plot_chr_smooths <- function(tumor_type) {

    for (windowsize in windowsizes) {
        message(sprintf("\t-plotting %s", tumor_type))

        normal_pts = list()
        for (normal_type in names(normal_groups)) {
            normal_pts[[ normal_type ]] <- get_smoothed(normal_groups[[normal_type]], windowsize)
        }

        tumor_pts = list()
        tumor_pts[[ tumor_type ]] = get_smoothed(tumor_groups[[tumor_type]], windowsize)

        plotme(normal_pts, tumor_pts, windowsize)
    }
}


for (tumor_type in names(tumor_groups)) {
    message(sprintf("plotting for %s", tumor_type))
    plot_chr_smooths(tumor_type)
}
def convert_to_positional_file(input_gtf, output_positional, attribute_key):
    """ Convert input GTF file to positional file.

    Consecutive GTF records that share the same value of ``attribute_key``
    are merged into one output row ``name<TAB>chr<TAB>start<TAB>end``, where
    start / end are the smallest / largest coordinate of the run and chr is
    taken from the last record of the run.  A name that shows up again later
    in the file is reported only once, using its first run.  Lines starting
    with ``#`` and blank lines are skipped.

    :param input_gtf: Path to input gtf file
    :type input_gtf: String
    :param output_positional: Path to output positional file
    :type output_positional: String
    :param attribute_key: Key of the GTF attribute to use for feature/row names
    :type attribute_key: String

    :returns: Indicator of success (True) or Failure (False); False is
              returned when the input file cannot be found, in which case
              nothing is written
    :rtype: boolean
    :raises SystemExit: with code 99 when a record lacks ``attribute_key``
    """
    # Local import: only the fatal-exit path needs it, and the builtin
    # ``exit`` is injected by the site module and not always available.
    import sys

    if not input_gtf or not os.path.exists(input_gtf):
        print("".join(["gtf_to_position_file.py:: ",
                       "Could not find input file : " + str(input_gtf)]))
        return False

    all_genes_found = set()

    # Holds lines to output after parsing.
    output_line = []
    previous_gene = None
    previous_chr = None
    gene_positions = []

    # Metrics for the file
    i_comments = 0
    i_duplicate_entries = 0
    i_entries = 0
    i_accepted_entries = 0
    i_written_lines = 0

    def emit_run(gene, chrom, positions):
        """Append the finished run to the output unless it is empty or the
        name was already reported; return the number of rows added."""
        if gene is None or chrom is None or not positions:
            return 0
        if gene in all_genes_found:
            return 0
        output_line.append("\t".join([gene,
                                      chrom,
                                      str(min(positions)),
                                      str(max(positions))]))
        all_genes_found.add(gene)
        return 1

    with open(input_gtf, "r") as gtf:
        gtf_file = csv.reader(gtf, delimiter="\t")
        for gtf_line in gtf_file:
            # csv yields an empty list for blank lines
            if not gtf_line or not gtf_line[0]:
                continue
            if gtf_line[0].startswith("#"):
                i_comments += 1
                continue
            i_entries += 1

            # Clean up the attribute keys and match the one of interest.
            # Each attribute is `key value` (value usually quoted); only the
            # part of the key before any "|" is kept.
            attributes = {}
            for entry in gtf_line[8].split(";"):
                parts = entry.strip().split(None, 1)
                if not parts:
                    continue
                key = parts[0].strip('"').split("|")[0]
                value = parts[1].strip().strip('"') if len(parts) > 1 else ""
                attributes[key] = value

            if attribute_key in attributes:
                gene_name = attributes[attribute_key]
            else:
                print("Could not find an attribute in the GTF with the name '"+attribute_key+"'. Line="+"\t".join(gtf_line))
                sys.exit(99)

            if gene_name != previous_gene:
                # A new run starts: report the finished one and reset.
                i_accepted_entries += emit_run(previous_gene, previous_chr, gene_positions)
                gene_positions = []
            else:
                i_duplicate_entries += 1

            gene_positions += [int(gtf_line[3]), int(gtf_line[4])]
            previous_gene = gene_name
            previous_chr = gtf_line[0]

        # Report the last run; emit_run applies the same already-seen check
        # as above, so a name that re-appears at the end is not duplicated.
        i_accepted_entries += emit_run(previous_gene, previous_chr, gene_positions)

    with open(output_positional, "w") as positional_file:
        i_written_lines += len(output_line)
        positional_file.write("\n".join(output_line))

    # Print metrics
    print("Number of lines read: " + str(i_entries))
    print("Number of comments: " + str(i_comments))
    print("Number of entries: " + str(i_accepted_entries))
    print("Number of duplicate entries: " + str(i_duplicate_entries))
    print("Number of entries written: " + str(i_written_lines))

    return True
## Runs the gene-expression based HoneyBADGER workflow for one tumor group.
##
##   tumor_group_name: used as the HoneyBADGER object name and as prefix of
##                     the output files
##   normal_matrix:    expression matrix of the reference cells; its row
##                     means are used as the normal reference
##   tumor_matrix:     expression matrix of the tumor group's cells
##
## Uses the globals mart.obj and do_scale.
## Writes <tumor_group_name>-hb.pdf and, when CNV regions are found,
## <tumor_group_name>-hb.cnvs.tsv (first seven columns of the summary).
run_hbadger <- function(tumor_group_name, normal_matrix, tumor_matrix) {

    hb <- new('HoneyBADGER', name=tumor_group_name)

    ref_normal <- rowMeans(normal_matrix)

    hb$setGexpMats(tumor_matrix, ref_normal, mart.obj, filter=FALSE, scale=do_scale, verbose=TRUE)

    pdf(sprintf("%s-hb.pdf", tumor_group_name))
    ## close the device on every exit path (early return, error, normal end);
    ## otherwise one device per tumor group stays open until the script ends
    on.exit(dev.off(), add=TRUE)

    hb$plotGexpProfile() ## initial visualization


    hb$setMvFit(verbose=TRUE)
    hb$setGexpDev(verbose=TRUE)
    hb$calcGexpCnvBoundaries(init=TRUE, verbose=FALSE)


    ## double check what CNVs were identified
    bgf <- hb$bound.genes.final
    genes <- hb$genes
    regions.genes <- range(genes[unlist(bgf)])

    print(regions.genes)

    if (length(regions.genes) == 0) {
        message("No cnv regions identified")
        return()
    }

    ## The initial HMM has identified candidate CNVs. Retest all identified
    ## CNVs on all cells to derive the final posterior probability of each
    ## CNV in each cell; cells are then clustered on these posteriors and
    ## shown as a heatmap.

    hb$retestIdentifiedCnvs(retestBoundGenes = TRUE, retestBoundSnps = FALSE, verbose=FALSE)

    ## look at final results
    results <- hb$summarizeResults(geneBased=TRUE, alleleBased=FALSE)
    print(head(results[,1:7]))
    write.table(results[,1:7], sprintf("%s-hb.cnvs.tsv", tumor_group_name), quote=F, sep="\t")


    ## visualize as heatmap
    trees <- hb$visualizeResults(geneBased=TRUE, alleleBased=FALSE, details=TRUE, margins=c(25,15))

    ## order cells
    hc <- trees$hc
    order <- hc$labels[hc$order]
    ## plot all chromosomes
    hb$plotGexpProfile(cellOrder=order)


    ## plot just identified cnvs
    hb$plotGexpProfile(cellOrder=order, region=hb$cnvs[['gene-based']][['amp']])

    hb$plotGexpProfile(cellOrder=order, region=hb$cnvs[['gene-based']][['del']])


}


## all reference cells pooled into one matrix
normal_matrix = infercnv_obj@expr.data[, unlist(infercnv_obj@reference_grouped_cell_indices), drop=FALSE]

tumor_groups = infercnv_obj@observation_grouped_cell_indices

tumor_group_names = names(tumor_groups)

for (tumor_group_name in tumor_group_names) {
    tumor_grp_idx = tumor_groups[[tumor_group_name]]

    ## drop=FALSE keeps a matrix for one-cell groups, as for normal_matrix
    tumor_matrix = infercnv_obj@expr.data[, tumor_grp_idx, drop=FALSE]

    run_hbadger(tumor_group_name, normal_matrix, tumor_matrix)
}
# plot expression density by chromosome for each observation group, reference groups are shown as single 'normal' group.
#
#   infercnv_obj:  infercnv object
#   pdf_filename:  if given, plots are written to this pdf instead of the
#                  current device
#   exclude_range: c(left, right); keep only values outside [left, right]
#   include_range: c(left, right); keep only values inside [left, right];
#                  ignored when exclude_range is given
#   chrs:          chromosomes to plot; defaults to all chromosomes of the
#                  gene ordering.  Chromosomes without genes are skipped
#                  with a warning.
#
# returns a list, named by chromosome, of the (filtered) data frames with
# columns class and vals that were plotted.
plot_density_by_chr <- function(infercnv_obj, pdf_filename=NULL, exclude_range=NULL, include_range = NULL, chrs=NULL) {

    ref_group_cell_indices = infercnv:::get_reference_grouped_cell_indices(infercnv_obj)

    if (is.null(chrs)) {
        chrs = unique(infercnv_obj@gene_order$chr)
    }

    if (! is.null(pdf_filename)) {
        pdf(pdf_filename)
        ## close the pdf even if plotting fails part-way through
        on.exit(dev.off(), add=TRUE)
    }

    chr_expr_vals = list()

    for (chr in chrs) {

        gene_idx = which(infercnv_obj@gene_order$chr == chr)

        if (length(gene_idx) == 0) {
            ## data.frame() cannot combine a label with zero values
            flog.warn(sprintf("No genes found for chr: %s, skipping", chr))
            next
        }

        ref_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx,ref_group_cell_indices])

        df = data.frame(class='normal', vals=ref_data_pts)

        for (tumor in names(infercnv_obj@observation_grouped_cell_indices) ) {

            tumor_cell_idx = infercnv_obj@observation_grouped_cell_indices[[ tumor ]]
            tumor_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx, tumor_cell_idx])

            df = rbind(df, data.frame(class=tumor, vals=tumor_data_pts))
        }

        flog.info(sprintf("Plotting data for chr: %s", chr))

        if (! is.null(exclude_range)) {
            excl_range_left = exclude_range[1]
            excl_range_right = exclude_range[2]

            df = df %>% filter(vals < excl_range_left | vals > excl_range_right)
        } else if (! is.null(include_range)) {
            include_range_left = include_range[1]
            include_range_right = include_range[2]

            df = df %>% filter(vals >= include_range_left & vals <= include_range_right)
        }

        p = df %>% ggplot(aes(vals, fill=class)) + geom_density(alpha=0.3) + scale_y_continuous(trans='log10', limits=c(1,NA)) + ggtitle(chr)
        plot(p)

        chr_expr_vals[[ chr ]] = df

    }

    return(chr_expr_vals)

}
# plot the spike distribution for each specified chromosome in a single density plot
#
#   infercnv_obj: infercnv object with an observation group named 'SPIKE'
#   chrs:         chromosomes shown as separate classes; the genes of all
#                 other chromosomes are pooled into the class 'rest'
plot_spike_dist <- function(infercnv_obj, chrs) {

    spike_cell_idx = infercnv_obj@observation_grouped_cell_indices[[ 'SPIKE' ]]

    ## drop=FALSE keeps a matrix even for a single spike cell, so that the
    ## row subsetting below stays valid
    spike_expr = infercnv_obj@expr.data[ , spike_cell_idx, drop=FALSE ]

    gene_chrs = infercnv_obj@gene_order$chr

    ## A logical mask is used for the complement: negating which() selects
    ## no rows at all when none of the genes is on the requested chromosomes.
    pieces = list()
    pieces[[ 'rest' ]] = as.numeric(spike_expr[ !(gene_chrs %in% chrs), ])
    for (chr in chrs) {
        pieces[[ chr ]] = as.numeric(spike_expr[ which(gene_chrs == chr), ])
    }

    ## classes without any value are left out; data.frame() cannot combine a
    ## label with zero values
    df = NULL
    for (class_name in names(pieces)) {
        if (length(pieces[[ class_name ]]) > 0) {
            df = rbind(df, data.frame(class=class_name, vals=pieces[[ class_name ]]))
        }
    }

    if (is.null(df)) {
        stop("plot_spike_dist: no expression values found for the 'SPIKE' group")
    }

    p = df %>% ggplot(aes(vals, fill=class)) + geom_density(alpha=0.3) + scale_y_continuous(trans='log10', limits=c(1,NA)) + ggtitle('spike')
    plot(p)

}
## examine dist of counts of non-zero valued genes per cell per grouping
##
##   infercnv_obj: infercnv object
##   pdf_filename: if given, plots are written to this pdf instead of the
##                 current device
##   chrs:         chromosomes to plot; defaults to all chromosomes of the
##                 gene ordering
##
## returns a list, named by chromosome, of data frames with one row per cell
## (columns class = group name, gene_counts = number of non-zero genes)
plot_dist_counts_expr_genes_by_chr <- function(infercnv_obj, pdf_filename=NULL, chrs=NULL) {

    group_indices = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices)

    if (is.null(chrs)) {
        chrs = unique(infercnv_obj@gene_order$chr)
    }

    if (! is.null(pdf_filename)) {
        pdf(pdf_filename)
        ## close the pdf even if plotting fails part-way through
        on.exit(dev.off(), add=TRUE)
    }

    gene_counts_dfs = list()

    for (chr in chrs) {
        gene_idx = which(infercnv_obj@gene_order$chr == chr)

        df = NULL
        for (group in names(group_indices)) {
            cell_idx = group_indices[[group]]

            ## drop=FALSE keeps a matrix for single-gene chromosomes and
            ## one-cell groups
            expr.data = infercnv_obj@expr.data[gene_idx, cell_idx, drop=FALSE]

            ## per cell: number of genes with a non-zero value
            gene_counts = colSums(expr.data != 0)

            ## rbind(NULL, x) is x, so the first group needs no special case
            df = rbind(df, data.frame(class=group, gene_counts=gene_counts))
        }
        p = df %>% ggplot(aes(gene_counts, fill=class)) + geom_density(alpha=0.3) + ggtitle(chr)
        plot(p)

        gene_counts_dfs[[ chr ]] = df
    }

    return(gene_counts_dfs)
}
is.null(chr)) { 145 | gene_idx = which(infercnv_obj@gene_order$chr == chr) 146 | expr.data = expr.data[gene_idx,] 147 | } 148 | groupA.expr.data = expr.data[, group_indices[[ groupA ]] ] 149 | groupB.expr.data = expr.data[, group_indices[[ groupB ]] ] 150 | 151 | groupA.gene_mean = rowMeans(groupA.expr.data) 152 | groupB.gene_mean = rowMeans(groupB.expr.data) 153 | 154 | #plot(groupA.gene_mean, groupB.gene_mean) 155 | smoothScatter(groupA.gene_mean, groupB.gene_mean) 156 | abline(a=0, b=1, col='magenta') 157 | 158 | df=data.frame(groupA=groupA.gene_mean, groupB=groupB.gene_mean) 159 | 160 | return(df) 161 | 162 | } 163 | 164 | #' compare spike vs cancer, both to normal 165 | 166 | compare_gene_expr_means_spike_vs_cancer_to_normal <- function(infercnv_obj, tumor_type, chr, xlim=NULL, ylim=NULL) { 167 | 168 | df_normal_vs_spike = compare_gene_expr_means_by_group_pair(infercnv_obj, 'normal', 'SPIKE', chr) 169 | df_tumor_vs_spike = compare_gene_expr_means_by_group_pair(infercnv_obj, 'normal', tumor_type, chr) 170 | 171 | plot(df_tumor_vs_spike[,1], df_tumor_vs_spike[,2], xlab='normal', ylab=tumor_type, xlim=xlim, ylim=ylim) 172 | points(df_normal_vs_spike[,1], df_normal_vs_spike[,2], col='red') 173 | abline(a=0,b=1, col='blue') 174 | 175 | } 176 | 177 | 178 | #' model the mean-to-variance relationship 179 | 180 | get_mean_var <- function(infercnv_obj) { 181 | 182 | group_indices = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices) 183 | 184 | mean_var_table = NULL 185 | 186 | for (group_name in names(group_indices)) { 187 | flog.info(sprintf("processing group: %s", group_name)) 188 | expr.data = infercnv_obj@expr.data[, group_indices[[ group_name ]] ] 189 | m = rowMeans(expr.data) 190 | v = apply(expr.data, 1, var) 191 | if (is.null(mean_var_table)) { 192 | mean_var_table = data.frame(g=group, m=m, v=v) 193 | } else { 194 | mean_var_table = rbind(mean_var_table, data.frame(g=group, m=m, v=v)) 195 | } 196 | } 197 | 198 | 199 | 
200 | return(mean_var_table) 201 | } 202 | 203 | plot_mean_var_table <- function(mvtable) { 204 | s = smooth.spline(log2(mvtable$m+1), log2(mvtable$v+1)) 205 | p = predict(s, log2(mvtable$m+1)) 206 | smoothScatter(log2(mvtable$m+1), log2(mvtable$v+1)) 207 | points(p, col='green', pch='.') 208 | } 209 | -------------------------------------------------------------------------------- /scripts/infercnv_obj_to_input_files.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1) 7 | args = parser$parse_args() 8 | 9 | library(infercnv) 10 | 11 | infercnv_obj_file = args$infercnv_obj 12 | 13 | infercnv_obj = readRDS(infercnv_obj_file) 14 | 15 | ## write counts matrix 16 | write.table(infercnv_obj@count.data, file='sc.counts.matrix', quote=F, sep="\t") 17 | 18 | cellnames = colnames(infercnv_obj@count.data) 19 | 20 | groupings = c(infercnv_obj@reference_grouped_cell_indices, infercnv_obj@observation_grouped_cell_indices) 21 | 22 | ## write cell annotation file 23 | cell.annots = do.call(rbind, lapply(names(groupings), function(groupname) { 24 | cell_idx = groupings[[ groupname ]] 25 | group.cellnames = cellnames[cell_idx] 26 | 27 | return(data.frame(cells=group.cellnames, type=groupname)) 28 | })) 29 | 30 | cell.annots = cell.annots[ cell.annots$cells %in% colnames(infercnv_obj@count.data), ] 31 | 32 | write.table(cell.annots, file="cell_annots.txt", quote=F, row.names=F, col.names=F, sep="\t") 33 | 34 | ## write infercnv runner: 35 | 36 | cat(file='run.infercnv.R', sprintf("#!/usr/bin/env Rscript 37 | 38 | options(error = function() { traceback(2); q(status = 1) } ) 39 | 40 | library(\"infercnv\") 41 | 42 | # create the infercnv object 43 | infercnv_obj = CreateInfercnvObject(raw_counts_matrix=\"sc.counts.matrix\", 44 | 
annotations_file=\"cell_annots.txt\", 45 | delim=\"\t\", 46 | gene_order_file=\"gencode_v19_gene_pos.txt\", 47 | ref_group_names=c(\'%s\')) 48 | 49 | out_dir=\"output_dir\" 50 | # perform infercnv operations to reveal cnv signal 51 | infercnv_obj = infercnv::run(infercnv_obj, 52 | cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics 53 | out_dir=out_dir, 54 | cluster_by_groups=T, 55 | plot_steps=T, 56 | HMM=T, 57 | #HMM_mode='subclusters', 58 | HMM_mode='samples', 59 | sim_method='meanvar' 60 | ) 61 | ", paste(names(infercnv_obj@reference_grouped_cell_indices),collapse="','") ) ) 62 | 63 | Sys.chmod('run.infercnv.R', mode = "0775") 64 | 65 | 66 | -------------------------------------------------------------------------------- /scripts/infercnv_validate.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Script to validate inferCNV docker container instances. 3 | 4 | ##### 5 | # Set up logging 6 | ##### 7 | 8 | library(logging) 9 | # Logging level choices 10 | C_LEVEL_CHOICES <- names(loglevels) 11 | logging::basicConfig(level='INFO') #initialize to info setting. 

#####
# Data sources
#####

## input data for validation (provided in docker image)
infercnv_root <- '/inferCNV/'
validation_input_dir <- paste0(infercnv_root,'example/')
raw_counts_matrix <- paste0(validation_input_dir,
                            'oligodendroglioma_expression_downsampled.counts.matrix')
annotations_file <- paste0(validation_input_dir,
                           'oligodendroglioma_annotations_downsampled.txt')
gene_order_file <- paste0(validation_input_dir,
                          'gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt')
out_dir <- 'output_cli'

## reference output for validation
validation_reference <- paste0(validation_input_dir,'validation/',
                               'reference-infercnv.observations.txt')

# Make sure the reference input data exists
logging::loginfo(paste("Checking for inferCNV validation input files.", sep=""))
if (!file.exists(raw_counts_matrix) ||
    !file.exists(annotations_file) ||
    !file.exists(gene_order_file)){
    logging::logerror(paste("Missing input file(s)", sep=""))
    stop(paste0('Error: expected input files cannot be found.'))
}


#####
# Run inferCNV for validation
#####
logging::loginfo(paste("Running inferCNV on validation input files.", sep=""))
inferCNV_exe <- paste0(infercnv_root, 'scripts/inferCNV.R')
validate_cmd <- paste0(inferCNV_exe,
                       ' --raw_counts_matrix=', raw_counts_matrix,
                       ' --annotations_file=', annotations_file,
                       ' --gene_order_file=', gene_order_file,
                       ' --ref_group_names=',
                       '\"Microglia/Macrophage,Oligodendrocytes (non-malignant)\"',
                       ' --cutoff=1',
                       ' --out_dir=', out_dir,
                       ' --cluster_by_groups',
                       ' --denoise')
logging::loginfo(validate_cmd)
# the exit status is not inspected; success is judged by the output file below
system(validate_cmd)

validation_input <- paste0(out_dir, '/infercnv.observations.txt')

if (!file.exists(validation_input)){
    logging::logerror(paste("Error: expected output file, infercnv.observations.txt, not found.", sep=""))
    stop('Validation aborted - inferCNV analysis on test data failed.\n')
}


#####
# Read in data for validation
#####

ref <- as.matrix(read.csv(validation_reference, header=T, sep = ' '))
obs <- as.matrix(read.csv(validation_input, header=T, sep = ' '))


#####
# Perform validation
#####
# passes when the largest element-wise relative error is below 1e-8
logging::loginfo(paste("Performing validation.", sep=""))
if (max ( abs(obs - ref)/abs(ref)) < 1.0e-8){
    unlink(out_dir, recursive=TRUE)
    logging::loginfo(paste("Successful validation - output passes similarity check.", sep=""))
} else {
    logging::logerror(paste("Error: generated output fails similarity check", sep=""))
    logging::loginfo(paste("Saving validation files in current working directory", sep=""))
    file.copy(validation_reference, "./reference-infercnv.observations.txt")
    file.copy(validation_input, "./infercnv.observations.txt")
    unlink(out_dir, recursive=TRUE)
    stop('Validation failed - max relative error exceeds threshold.\n')
}

--------------------------------------------------------------------------------
/scripts/meanvar_sim_counts.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Simulate a counts matrix from the mean/variance trend of the reference
# (normal) cells of an infercnv object, write it to <output_prefix>.counts.matrix
# and write a KS comparison plot to <output_prefix>.KS.pdf.

suppressPackageStartupMessages(library("argparse"))
# exit non-zero on error so callers (make, shell pipelines) can detect failure
options(error = function() {traceback(2);quit(save = "no", status = 1, runLast = FALSE)})

parser = ArgumentParser()

parser$add_argument("--infercnv_obj", help="total sum normalized infercnv obj", required=TRUE, default=NULL, nargs=1)
parser$add_argument("--ncells", help="number of cells to simulate", required=FALSE, type='integer', nargs=1, default=-1)
parser$add_argument("--ngenes", help="number of genes to simulate", required=FALSE, type='integer', nargs=1, default=-1)
parser$add_argument("--output_prefix", help='prefix for output matrix file', required=TRUE, nargs=1)

args = parser$parse_args()

library(infercnv)
library(SingleCellExperiment)
library("methods")
library(tidyverse)


infercnv_obj_file = args$infercnv_obj

ncells = args$ncells
ngenes = args$ngenes
output_prefix = args$output_prefix

infercnv_obj = readRDS(infercnv_obj_file)

expr.data = infercnv_obj@expr.data[, unlist(infercnv_obj@reference_grouped_cell_indices)]

# a negative value (the default -1) means "same size as the reference matrix"
if (ncells < 0) {
    ncells = ncol(expr.data)
}
if (ngenes < 0) {
    ngenes = nrow(expr.data)
}

## sim using specified gene means
gene_means = rowMeans(expr.data)
gene_means = gene_means[gene_means>0]

gene_means = sample(x=gene_means, size=ngenes, replace=T)

newnames = paste0('gene', 1:ngenes)

names(gene_means) = newnames


sim_matrix <- infercnv:::.get_simulated_cell_matrix_using_meanvar_trend(infercnv_obj, gene_means, ncells, TRUE)


output_filename = paste0(output_prefix, ".counts.matrix")
write.table(sim_matrix, file=output_filename, quote=F, sep='\t')

pdf(paste0(output_prefix, ".KS.pdf"))
infercnv:::KS_plot("meanVarSim", as.numeric(log(expr.data+1)), as.numeric(log(sim_matrix+1)))
# close the device explicitly so the PDF is complete
dev.off()






--------------------------------------------------------------------------------
/scripts/plot_hspike.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Plot the hidden spike-in (.hspike) of an infercnv object: the CNV heatmap
# and the expression distribution per simulated CNV level.

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)
library(ggplot2)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

if (! is.null(infercnv_obj@.hspike)) {
    out_prefix = paste0(infercnv_obj_file, '.hspike')
    plot_cnv(infercnv_obj@.hspike,
             out_dir=dirname(infercnv_obj_file),
             output_filename=basename(out_prefix) )


    hspike_obj = infercnv_obj@.hspike
    hspike_gene_expr_by_cnv <- infercnv:::.get_gene_expr_by_cnv(hspike_obj)
    hspike_cnv_mean_sd <- infercnv:::.get_gene_expr_mean_sd_by_cnv(hspike_gene_expr_by_cnv)
    p = infercnv:::.plot_gene_expr_by_cnv(gene_expr_by_cnv=hspike_gene_expr_by_cnv, cnv_mean_sd=hspike_cnv_mean_sd)
    pdf(paste0(infercnv_obj_file, '.hspike.dist.pdf'))
    plot(p)
    dev.off()

} else {
    message("no hspike to plot")
}



--------------------------------------------------------------------------------
/scripts/plot_hspike.by_num_cells.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# For the hidden spike-in of an infercnv object, show how the distribution of
# the mean expression per CNV level tightens as more values are averaged.
# One page is produced per sample size.

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)
library(ggplot2)
library(dplyr)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

if (! is.null(infercnv_obj@.hspike)) {
    hspike_obj = infercnv_obj@.hspike


    pdf(paste0(infercnv_obj_file, '.hspike.dist_by_numcells.pdf'))




    gene_expr_by_cnv <- infercnv:::.get_gene_expr_by_cnv(hspike_obj)
    cnv_level_to_mean_sd = list()

    for (ncells in c(1,2,3,4,5,10,20,50,100)) {

        cnv_to_means = list()
        cnv_mean_sd = list()

        for (cnv_level in names(gene_expr_by_cnv) ) {
            expr_vals = gene_expr_by_cnv[[ cnv_level ]]
            nrounds = 100

            means = c()

            # bootstrap: mean of `ncells` values drawn with replacement, `nrounds` times
            for(i in 1:nrounds) {
                vals = sample(expr_vals, size=ncells, replace=T)
                m_val = mean(vals)
                means = c(means, m_val)
            }
            cnv_to_means[[ cnv_level ]] = means
            cnv_mean_sd[[ cnv_level ]] = list(sd=sd(means), mean=mean(means))
        }

        ## plot

        df = do.call(rbind, lapply(names(cnv_to_means), function(x) { data.frame(cnv=x, expr=cnv_to_means[[x]]) }))

        p = df %>% ggplot(aes(expr, fill=cnv, colour=cnv)) + geom_density(alpha=0.1)

        # overlay the normal curve fitted to each CNV level's bootstrap means
        p = p +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:0.01"]]$mean,'sd'=cnv_mean_sd[["cnv:0.01"]]$sd)) +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:0.5"]]$mean,'sd'=cnv_mean_sd[["cnv:0.5"]]$sd)) +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:1"]]$mean,'sd'=cnv_mean_sd[["cnv:1"]]$sd)) +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:1.5"]]$mean,'sd'=cnv_mean_sd[["cnv:1.5"]]$sd)) +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:2"]]$mean,'sd'=cnv_mean_sd[["cnv:2"]]$sd)) +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:3"]]$mean,'sd'=cnv_mean_sd[["cnv:3"]]$sd))

        p = p + ggtitle(sprintf("num cells: %g", ncells))

        plot(p)


    }


    dev.off()

} else {
    message("no hspike to plot")
}




--------------------------------------------------------------------------------
/scripts/plot_hspike.diff_normal_tumor.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# For the hidden spike-in of an infercnv object, plot per-gene mean expression
# of reference vs observation cells and their difference (raw and smoothed).

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)
library(ggplot2)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

if (! is.null(infercnv_obj@.hspike)) {
    pdfname = paste0(infercnv_obj_file, '.hspike.diff_normal_tumor.pdf')

    pdf(pdfname)
    hspike = infercnv_obj@.hspike

    normal_matrix = hspike@expr.data[,unlist(hspike@reference_grouped_cell_indices)]
    tumor_matrix = hspike@expr.data[,unlist(hspike@observation_grouped_cell_indices)]

    normal.means = rowMeans(normal_matrix)
    tumor.means = rowMeans(tumor_matrix)

    plot(normal.means, ylim=range(normal.means, tumor.means))
    points(tumor.means, col='green')

    plot(tumor.means - normal.means)
    abline(h=0, col='red')

    # running mean over a 31-gene window
    sm = caTools::runmean(tumor.means - normal.means, k=31)
    points(sm, col='magenta')

    # close the device explicitly so the PDF is complete
    dev.off()

} else {
    message("no hspike to plot")
}



--------------------------------------------------------------------------------
/scripts/plot_hspike_vs_sample_chrs.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)

args = parser$parse_args()

library(infercnv)
library(futile.logger)
library(tidyverse)


infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

# For every cell group and chromosome, plot the expression density of that
# group against all reference cells, overlaid with the normal curves of the
# hidden-spike CNV levels. One PDF is written per group.

gene_order = infercnv_obj@gene_order
gene_order = cbind(gene_order, gene=rownames(gene_order))

cnv_to_expr_vals = list()

expr.data <- infercnv_obj@expr.data

cnv_mean_sd = infercnv:::get_spike_dists(infercnv_obj@.hspike)

chrs = unique(infercnv_obj@gene_order$chr)

groups = c(infercnv_obj@observation_grouped_cell_indices, infercnv_obj@reference_grouped_cell_indices)

samples = names(groups)


for (sample in samples) {
    # gsub (not sub): every character that is unsafe in a file name must be
    # replaced, e.g. both the '/' and the parentheses in group names
    pdf_name = sprintf("%s-%s.cnv_expr_densities_each_chr.pdf", infercnv_obj_file, gsub("[^A-Za-z0-9]", "_", sample, perl=TRUE))
    pdf(pdf_name)

    message(sprintf("plotting sample: %s", sample))

    sample_cells = groups[[ sample ]]

    sample_expr = expr.data[, sample_cells]

    for (chr in chrs) {
        chr_gene_idx = which(infercnv_obj@gene_order$chr == chr)

        sample_gene_expr = sample_expr[chr_gene_idx,]

        normal_gene_expr = expr.data[chr_gene_idx, unlist(infercnv_obj@reference_grouped_cell_indices)]

        df = rbind(data.frame(class='allnormal', vals=as.numeric(normal_gene_expr) ),
                   data.frame(class='sample', vals=as.numeric(sample_gene_expr)) )

        message(sprintf("plotting sample: %s, %s", sample, chr))

        p = df %>% ggplot(aes(vals, fill=class)) + geom_density(alpha=0.3) + ggtitle(sprintf("%s, %s", sample, chr))

        p = p +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:0.01"]]$mean,'sd'=cnv_mean_sd[["cnv:0.01"]]$sd)) +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:0.5"]]$mean,'sd'=cnv_mean_sd[["cnv:0.5"]]$sd)) +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:1"]]$mean,'sd'=cnv_mean_sd[["cnv:1"]]$sd)) +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:1.5"]]$mean,'sd'=cnv_mean_sd[["cnv:1.5"]]$sd)) +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:2"]]$mean,'sd'=cnv_mean_sd[["cnv:2"]]$sd)) +
            stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:3"]]$mean,'sd'=cnv_mean_sd[["cnv:3"]]$sd))



        plot(p)

    }
    dev.off()
}

--------------------------------------------------------------------------------
/scripts/plot_infercnv_obj.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Render the CNV heatmap of a saved infercnv object, using the object's file
# name as the output file name.

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

plot_cnv(infercnv_obj,
         output_filename=basename(infercnv_obj_file))

--------------------------------------------------------------------------------
/scripts/plot_tumor_vs_normal_chr_densities.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Per chromosome, plot expression densities of the reference cells ('normal')
# and of each observation group, overlaid with the normal curves of the
# hidden-spike CNV levels. Output: <infercnv_obj>.chr_expr_densities.pdf

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)
library(ggplot2)
library(futile.logger)
library(dplyr)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

ref_group_cell_indices = infercnv:::get_reference_grouped_cell_indices(infercnv_obj)
pdf_filename = paste0(infercnv_obj_file, ".chr_expr_densities.pdf")

cnv_mean_sd = infercnv:::get_spike_dists(infercnv_obj@.hspike)

pdf(pdf_filename)

chrs = unique(infercnv_obj@gene_order$chr)


for (chr in chrs) {

    gene_idx = which(infercnv_obj@gene_order$chr == chr)

    ref_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx,ref_group_cell_indices])

    df = data.frame(class='normal', vals=ref_data_pts)

    for (tumor in names(infercnv_obj@observation_grouped_cell_indices) ) {

        tumor_cell_idx = infercnv_obj@observation_grouped_cell_indices[[ tumor ]]
        tumor_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx, tumor_cell_idx])

        df = rbind(df, data.frame(class=tumor, vals=tumor_data_pts))
    }

    flog.info(sprintf("Plotting data for chr: %s", chr))

    p = df %>% ggplot(aes(vals, fill=class)) + geom_density(alpha=0.3) + ggtitle(chr) # + scale_y_continuous(trans='log10', limits=c(1,NA))


    p = p +
        stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:0.01"]]$mean,'sd'=cnv_mean_sd[["cnv:0.01"]]$sd)) +
        stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:0.5"]]$mean,'sd'=cnv_mean_sd[["cnv:0.5"]]$sd)) +
        stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:1"]]$mean,'sd'=cnv_mean_sd[["cnv:1"]]$sd)) +
        stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:1.5"]]$mean,'sd'=cnv_mean_sd[["cnv:1.5"]]$sd)) +
        stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:2"]]$mean,'sd'=cnv_mean_sd[["cnv:2"]]$sd)) +
        stat_function(fun=dnorm, color='black', args=list('mean'=cnv_mean_sd[["cnv:3"]]$mean,'sd'=cnv_mean_sd[["cnv:3"]]$sd))



    plot(p)
}

# close the device explicitly so the PDF is complete
dev.off()

--------------------------------------------------------------------------------
/scripts/plot_tumor_vs_normal_chr_densities.i3.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# i3 variant of the per-chromosome density plot: overlays the normal curve
# N(mu, sigma) returned by .i3HMM_get_sd_trend_by_num_cells_fit and the same
# curve shifted by -delta / +delta. Output: <infercnv_obj>.i3.chr_expr_densities.pdf

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)
library(ggplot2)
library(futile.logger)
library(dplyr)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

ref_group_cell_indices = infercnv:::get_reference_grouped_cell_indices(infercnv_obj)
pdf_filename = paste0(infercnv_obj_file, ".i3.chr_expr_densities.pdf")

normal_sd_trend = infercnv:::.i3HMM_get_sd_trend_by_num_cells_fit(infercnv_obj)

mu = normal_sd_trend$mu
sigma = normal_sd_trend$sigma



pdf(pdf_filename)

chrs = unique(infercnv_obj@gene_order$chr)

delta = infercnv:::get_HoneyBADGER_setGexpDev(gexp.sd=sigma, alpha=0.05, k_cells=7)

for (chr in chrs) {

    gene_idx = which(infercnv_obj@gene_order$chr == chr)

    ref_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx,ref_group_cell_indices])

    df = data.frame(class='normal', vals=ref_data_pts)

    for (tumor in names(infercnv_obj@observation_grouped_cell_indices) ) {

        tumor_cell_idx = infercnv_obj@observation_grouped_cell_indices[[ tumor ]]
        tumor_data_pts = as.numeric(infercnv_obj@expr.data[gene_idx, tumor_cell_idx])

        df = rbind(df, data.frame(class=tumor, vals=tumor_data_pts))
    }

    flog.info(sprintf("Plotting data for chr: %s", chr))

    p = df %>% ggplot(aes(vals, fill=class)) + geom_density(alpha=0.3) + ggtitle(chr) # + scale_y_continuous(trans='log10', limits=c(1,NA))


    p = p +
        stat_function(fun=dnorm, color='black', args=list('mean'=mu,'sd'=sigma)) +
        stat_function(fun=dnorm, color='blue', args=list('mean'=mu-delta,'sd'=sigma)) +
        stat_function(fun=dnorm, color='blue', args=list('mean'=mu+delta,'sd'=sigma))


    plot(p)
}

# close the device explicitly so the PDF is complete
dev.off()

--------------------------------------------------------------------------------
/scripts/prepare_sparsematrix.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Read a delimited expression matrix whose first column holds the row names,
# and convert it to a sparse Matrix for an infercnv run.

library(optparse)
library(Matrix)
library(data.table)
library(logging)


logging::basicConfig(level='INFO')

pargs <- optparse::OptionParser(usage=paste("%prog [options]",
                                            "--input data_matrix ",
                                            "--output sparse_matrix ",
                                            "--delim matrix_delimiter"
                                            ))

pargs <- optparse::add_option(pargs, c("--input"),
                              type="character",
                              default=NULL,
                              action="store",
                              dest="input",
                              metavar="input",
                              help=paste("Input raw counts matrix ",
                                         "to prepare for infercnv run."))

pargs <- optparse::add_option(pargs, c("--output"),
                              type="character",
                              default=NULL,
                              action="store",
                              dest="output",
                              metavar="output",
                              help=paste("Output raw counts matrix ",
                                         "as a sparseMatrix for infercnv run."))

pargs <- optparse::add_option(pargs, c("--delim"),
                              type="character",
                              action="store",
                              default="\t",
                              dest="delim",
                              metavar="delim",
                              help=paste("Delimiter for reading expression matrix",
                                         "[Default %default]"))

args <- optparse::parse_args(pargs)

# Both paths are mandatory. Abort here instead of only logging: otherwise a
# missing --output would run the whole conversion and write to a file
# literally named "rds".
if (is.null(args$input) || is.null(args$output)) {
    logging::logerror("Please provide input and output arguments")
    stop("Please provide input and output arguments")
}

logging::loginfo("Reading header.")

# first line only: the column (cell) names
data_head = fread(input=args$input,
                  sep=args$delim,
                  header=FALSE,
                  nrows=1,
                  stringsAsFactors=FALSE,
                  check.names=FALSE,
                  nThread=1,
                  logical01=FALSE,
                  data.table=FALSE)

logging::loginfo("Done reading header.")
logging::loginfo("Reading matrix data.")

# everything after the header line; column 1 still holds the row names
ddata = fread(input=args$input,
              sep=args$delim,
              header=FALSE,
              skip=1,
              stringsAsFactors=FALSE,
              check.names=FALSE,
              nThread=1,
              logical01=FALSE,
              data.table=FALSE)

logging::loginfo("Done reading matrix data.")

logging::loginfo("Backing up rownames.")
# The first column carries the row identifiers: set them aside, then remove
# that column so only the numeric data remains.
row_ids = as.vector(unlist(ddata[, 1]))
ddata = ddata[, -1, drop=FALSE]

# size of the dense data, measured before column names are attached
in_size = object.size(ddata)

colnames(ddata) = as.vector(unlist(data_head))

logging::loginfo("Converting data.frame to Matrix.")
basic_matrix = as.matrix(ddata)
logging::loginfo("Done converting data.frame to Matrix.")
logging::loginfo("Freeing data.frame.")
rm(ddata)  # release the data.frame copy
gc()
logging::loginfo("Converting Matrix to sparseMatrix.")
sparse_matrix = Matrix(basic_matrix, sparse=T)
logging::loginfo("Done converting Matrix to sparseMatrix.")
logging::loginfo("Freeing Matrix.")
rm(basic_matrix)  # release the dense copy
gc()
logging::loginfo("Setting rownames.")
row.names(sparse_matrix) = row_ids

logging::loginfo("Saving sparseMatrix to RDS file.")
rds_path = paste(args$output, "rds", sep=".")
saveRDS(sparse_matrix, file=rds_path)

out_size = object.size(sparse_matrix)

# Record which representation is smaller in memory: the original input path
# or the freshly written RDS path.
smallest_path = if (in_size < out_size) args$input else rds_path
fileConn<-file("prepare_smallest.txt")
writeLines(smallest_path, fileConn)
close(fileConn)

--------------------------------------------------------------------------------
/scripts/recursive_random_tree_height_cutting.random_trees.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript


hclust_method='ward.D2'

num_rand_iters = 100
MAX_PVAL=0.05

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)
library(ggplot2)
library(futile.logger)
library(pheatmap)

infercnv_obj = readRDS(args$infercnv_obj)


pdf("test.recursive_trees.pdf")

adj.obj = infercnv:::define_signif_tumor_subclusters(infercnv_obj, p_val=0.05, hclust_method='ward.D2', partition_method='random_trees')

# close the device explicitly so the PDF is complete
dev.off()





--------------------------------------------------------------------------------
/scripts/recursive_random_tree_height_cutting.sigclust2.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript


hclust_method='ward.D2'

num_rand_iters = 100
MAX_PVAL=0.05

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)
library(ggplot2)
library(futile.logger)
library(pheatmap)

obj = readRDS(args$infercnv_obj)

tumor.expr.data = obj@expr.data[, unlist(obj@observation_grouped_cell_indices)]

gene_order = obj@gene_order
chrs = unique(gene_order$chr)


pdf("test.recursive_trees.pdf")


ALL_CLUSTERS = list()
MIN_CLUSTER_SIZE=3

library(sigclust2)

# Run sigclust2::shc on the cells of expr.matrix (genes x cells), once on all
# genes and once per chromosome, plotting every result.
# Despite its name this variant does not recurse. A matrix with fewer than
# MIN_CLUSTER_SIZE cells is stored in the global ALL_CLUSTERS and not analyzed.
recursive_cluster_cutting <- function(expr.matrix) {

    message("recursive_cluster_cutting()")
    print(dim(expr.matrix))

    if (dim(expr.matrix)[2] < MIN_CLUSTER_SIZE) {
        message("cluster size too small. Storing cluster")
        ALL_CLUSTERS[[length(ALL_CLUSTERS)+1]] <<- colnames(expr.matrix)

        print("Returning")
        return(NULL)
    }

    print("Onward")
    print(dim(expr.matrix))

    t_tumor.expr.data = t(expr.matrix) # cells as rows, genes as cols

    shc_result = shc(t_tumor.expr.data, metric='euclidean', linkage='ward.D2')
    plot(shc_result)

    for(chr in chrs) {
        chr_genes = which(gene_order$chr == chr)

        message(sprintf("plotting %s", chr))

        shc_result = shc(t_tumor.expr.data[,chr_genes], metric='euclidean', linkage='ward.D2')
        plot(shc_result)
    }



}

recursive_cluster_cutting(tumor.expr.data)

dev.off()

print(ALL_CLUSTERS)








--------------------------------------------------------------------------------
/scripts/recursive_random_tree_height_cutting.using_hmms.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript


hclust_method='ward.D2'

num_rand_iters = 100
MAX_PVAL=0.05

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)
library(ggplot2)
library(futile.logger)
library(pheatmap)

obj = readRDS(args$infercnv_obj)

tumor.expr.data = obj@expr.data[, unlist(obj@observation_grouped_cell_indices)]

gene_order = obj@gene_order
chrs = unique(gene_order$chr)

# collapse values to three levels: above 3 -> 4, below 3 -> 2, exactly 3 unchanged
tumor.expr.data[tumor.expr.data>3] <- 4
tumor.expr.data[tumor.expr.data<3] <- 2


pdf("test.recursive_trees.pdf")


ALL_CLUSTERS = list()
MIN_CLUSTER_SIZE=3


# Recursively split the cells (columns) of expr.matrix (genes x cells).
#
# The cells are hierarchically clustered, and the top tree height is compared
# with the maximum heights of num_rand_iters trees built from matrices whose
# gene columns were independently permuted across cells. If the observed
# height is significant (empirical p-value <= MAX_PVAL), the tree is cut
# between its two topmost merges and each resulting group is processed
# recursively; otherwise the cell names are appended to the global ALL_CLUSTERS.
# Matrices with fewer than MIN_CLUSTER_SIZE cells are stored without testing.
recursive_cluster_cutting <- function(expr.matrix) {

    message("recursive_cluster_cutting()")
    print(dim(expr.matrix))

    if (dim(expr.matrix)[2] < MIN_CLUSTER_SIZE) {
        message("cluster size too small. Storing cluster")
        ALL_CLUSTERS[[length(ALL_CLUSTERS)+1]] <<- colnames(expr.matrix)

        print("Returning")
        return(NULL)
    }

    print("Onward")
    print(dim(expr.matrix))

    t_tumor.expr.data = t(expr.matrix) # cells as rows, genes as cols
    d = dist(t_tumor.expr.data)

    h_obs = hclust(d, method=hclust_method)

    # permute by chromosomes
    # (one shared cell permutation per chromosome; currently unused, see the
    #  commented-out call in the loop below)
    permute_chr_col_vals <- function(df) {

        num_cells = nrow(df)

        for(chr in chrs) {
            chr_genes = which(gene_order$chr == chr)

            df[, chr_genes] = df[sample(x=1:num_cells, size=num_cells, replace=F), chr_genes]
        }

        df
    }

    # permute every gene column independently across cells
    permute_col_vals <- function(df) {

        num_cells = nrow(df)
        for (i in 1:ncol(df)) {
            df[,i] = df[sample(x=1:num_cells, size=num_cells, replace=F), i]
        }

        df
    }


    example_rand_matrix <- NULL
    # preallocated instead of grown with c() on every iteration
    max_rand_heights = numeric(num_rand_iters)
    for (i in seq_len(num_rand_iters)) {

        ##rand.tumor.expr.data = permute_chr_col_vals(t_tumor.expr.data)
        rand.tumor.expr.data = permute_col_vals(t_tumor.expr.data)
        example_rand_matrix <- rand.tumor.expr.data
        rand.dist = dist(rand.tumor.expr.data)
        h_rand <- hclust(rand.dist, method=hclust_method)

        max_rand_heights[i] = max(h_rand$height)
    }

    h = h_obs$height

    max_height = max(h)

    message(sprintf("Max Rand Heights(h): %s", paste(max_rand_heights, sep=",", collapse=",")))

    max_rand_height_dens = density(max_rand_heights)
    plot(max_rand_height_dens, xlim=range(max_rand_height_dens$x, max_height))

    e = ecdf(max_rand_heights)
    message(sprintf("pvals(Lengths(h)): %s", paste(1-e(h), sep=",", collapse=",")))

    pval = 1- e(max_height)
    message(sprintf("pval for max obs height: %g = %g", max_height, pval))

    abline(v=max_height, col='red')

    pheatmap(t(expr.matrix), cluster_cols=F)
    pheatmap(example_rand_matrix, cluster_cols=F)


    #stop("stopping")

    # scalar condition: use short-circuit && rather than vectorized &
    if (max_height > 0 && pval <= MAX_PVAL) {
        ## keep on cutting.
        # cut midway between the two topmost merge heights (splits off the last merge)
        cut_height = mean(c(h[length(h)-1], h[length(h)]))
        message(sprintf("cutting at height: %g", cut_height))
        grps = cutree(h_obs, h=cut_height)
        print(grps)
        uniqgrps = unique(grps)
        for (grp in uniqgrps) {
            grp_idx = which(grps==grp)

            message(sprintf("grp: %s contains idx: %s", grp, paste(grp_idx,sep=",", collapse=",")))
            df = expr.matrix[,grp_idx,drop=F]
            recursive_cluster_cutting(df)
        }
    } else {
        message("No cluster pruning")
        ALL_CLUSTERS[[length(ALL_CLUSTERS)+1]] <<- colnames(expr.matrix)
    }

}

recursive_cluster_cutting(tumor.expr.data)

dev.off()

print(ALL_CLUSTERS)








--------------------------------------------------------------------------------
/scripts/run.stub.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

# Template for new analysis scripts: parses --infercnv_obj, loads the object
# and opens a PDF device.

suppressPackageStartupMessages(library("argparse"))

parser = ArgumentParser()
parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1)
args = parser$parse_args()

library(infercnv)
library(ggplot2)
library(futile.logger)

infercnv_obj_file = args$infercnv_obj

infercnv_obj = readRDS(infercnv_obj_file)

pdf('ladeda.pdf')

--------------------------------------------------------------------------------
/scripts/run_BayesNet.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript

suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--prelim_infercnv_obj", help="preliminary infercnv_obj file", required=TRUE, nargs=1) 7 | parser$add_argument("--i6HMM_infercnv_obj", help="i6HMM infercnv_obj file", required=TRUE, nargs=1) 8 | 9 | parser$add_argument("--BayesMaxPNormal", help="BayesMaxPNormal", required=TRUE, nargs=1, type='double') 10 | parser$add_argument("--out_dir", help="output directory", required=TRUE, nargs=1) 11 | 12 | args = parser$parse_args() 13 | 14 | library(infercnv) 15 | library(futile.logger) 16 | 17 | infercnv_obj_prelim = readRDS(args$prelim_infercnv_obj) 18 | 19 | hmm.infercnv_obj = readRDS(args$i6HMM_infercnv_obj) 20 | 21 | 22 | flog.info("Running Bayesian Network Model on HMM predicted CNV's\n") 23 | 24 | hmm.infercnv_obj <- infercnv::inferCNVBayesNet(infercnv_obj = infercnv_obj_prelim, 25 | HMM_obj = hmm.infercnv_obj, 26 | BayesMaxPNormal = args$BayesMaxPNormal, 27 | file_dir = args$out_dir, 28 | postMcmcMethod = "removeCNV", 29 | out_dir = file.path(args$out_dir, "BayesNetOutput"), 30 | quietly = TRUE) 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /scripts/run_HMM_each_cell_separately.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1) 7 | args = parser$parse_args() 8 | 9 | library(infercnv) 10 | library(ggplot2) 11 | library(futile.logger) 12 | 13 | infercnv_obj_file = args$infercnv_obj 14 | 15 | infercnv_obj = readRDS(infercnv_obj_file) 16 | 17 | 18 | infercnv_obj.hmm = infercnv:::predict_CNV_via_HMM_on_indiv_cells(infercnv_obj) 19 | 20 | saveRDS(infercnv_obj.hmm, file=sprintf("%s-HMM-icells.obj", infercnv_obj_file)) 21 | 22 | plot_cnv(infercnv_obj.hmm, 
output_filename=paste0(infercnv_obj_file, "-HMM-icells")) 23 | 24 | -------------------------------------------------------------------------------- /scripts/run_HMM_on_hspike.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1) 7 | args = parser$parse_args() 8 | 9 | library(infercnv) 10 | library(ggplot2) 11 | library(futile.logger) 12 | 13 | infercnv_obj_file = args$infercnv_obj 14 | 15 | infercnv_obj = readRDS(infercnv_obj_file) 16 | 17 | hspike = infercnv_obj@.hspike 18 | 19 | hspike.hmm = infercnv:::predict_CNV_via_HMM_on_tumor_subclusters(infercnv_obj=hspike, 20 | cnv_mean_sd=infercnv:::get_spike_dists(hspike), 21 | cnv_level_to_mean_sd_fit=infercnv:::get_hspike_cnv_mean_sd_trend_by_num_cells_fit(hspike) 22 | ) 23 | 24 | plot_cnv(hspike.hmm, x.center=3, x.range=c(0,6), output_filename=paste0(basename(infercnv_obj_file), ".hspike.hmm"), out_dir=dirname(infercnv_obj_file)) 25 | 26 | saveRDS(hspike.hmm, file=sprintf("%s-HMM.obj", infercnv_obj_file)) 27 | 28 | -------------------------------------------------------------------------------- /scripts/run_HMM_on_subclusters.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1) 7 | args = parser$parse_args() 8 | 9 | library(infercnv) 10 | library(ggplot2) 11 | library(futile.logger) 12 | 13 | infercnv_obj_file = args$infercnv_obj 14 | 15 | infercnv_obj = readRDS(infercnv_obj_file) 16 | 17 | 18 | if (length(infercnv_obj@tumor_subclusters) == 0) { 19 | flog.info("Computing tumor subclusters") 20 | infercnv_obj <- 
infercnv:::.subcluster_tumors_general(infercnv_obj) 21 | } 22 | 23 | 24 | infercnv_obj.hmm = infercnv:::predict_CNV_via_HMM_on_tumor_subclusters(infercnv_obj) 25 | 26 | saveRDS(infercnv_obj.hmm, file=sprintf("%s-HMM.obj", infercnv_obj_file)) 27 | 28 | plot_cnv(infercnv_obj.hmm, output_filename=paste0(infercnv_obj_file, "-HMM")) 29 | 30 | -------------------------------------------------------------------------------- /scripts/run_HMM_per_chr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | 5 | parser = ArgumentParser() 6 | parser$add_argument("--infercnv_obj", help="infercnv_obj file", required=TRUE, nargs=1) 7 | args = parser$parse_args() 8 | 9 | library(infercnv) 10 | library(ggplot2) 11 | library(futile.logger) 12 | 13 | infercnv_obj_file = args$infercnv_obj 14 | 15 | infercnv_obj = readRDS(infercnv_obj_file) 16 | 17 | pdf('ladeda.pdf') 18 | infercnv_obj.hmm = infercnv:::predict_CNV_via_HMM_each_chr_separately(infercnv_obj) 19 | 20 | saveRDS(infercnv_obj.hmm, file=sprintf("%s-HMM.obj", infercnv_obj_file)) 21 | 22 | plot_cnv(infercnv_obj.hmm, output_filename=paste0(infercnv_obj_file, "-HMM")) 23 | 24 | -------------------------------------------------------------------------------- /scripts/run_tests_sampling_and_group_plots.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(error = function() traceback(2)) 4 | options("warning.length" = 8000) 5 | 6 | library("infercnv") 7 | 8 | # create the infercnv object 9 | infercnv_obj = CreateInfercnvObject(raw_counts_matrix=system.file("extdata", "oligodendroglioma_expression_downsampled.counts.matrix.gz", package = "infercnv"), 10 | annotations_file=system.file("extdata", "oligodendroglioma_annotations_downsampled.txt", package = "infercnv"), 11 | delim="\t", 12 | gene_order_file=system.file("extdata", 
"gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt", package = "infercnv"), 13 | ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)")) 14 | 15 | out_dir="../example/output_dir_sampling_testscript" 16 | # perform infercnv operations to reveal cnv signal 17 | infercnv_obj = infercnv::run(infercnv_obj, 18 | cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics 19 | out_dir=out_dir, 20 | cluster_by_groups=TRUE, 21 | plot_steps=FALSE, 22 | denoise=TRUE, 23 | HMM=FALSE, 24 | no_prelim_plot=TRUE 25 | ) 26 | t_out_dir = paste0(out_dir, "/subplots_1") 27 | if(t_out_dir != "." & !file.exists(t_out_dir)){ 28 | dir.create(t_out_dir) 29 | } 30 | infercnv:::plot_per_group(infercnv_obj, out_dir=t_out_dir, png_res=100, sample=TRUE, n_cells=100) 31 | 32 | 33 | t_out_dir = paste0(out_dir, "/subsamples_1") 34 | if(t_out_dir != "." & !file.exists(t_out_dir)){ 35 | dir.create(t_out_dir) 36 | } 37 | sample_obj <- infercnv:::sample_object(infercnv_obj) 38 | 39 | subsample_obj <- infercnv:::sample_object(infercnv_obj, n_cells=10) 40 | 41 | upsubsample_obj <- infercnv:::sample_object(subsample_obj, n_cells=100) 42 | 43 | every_2_object2 <- infercnv:::sample_object(infercnv_obj, every_n=2, above_m=2) 44 | 45 | only_1_per_object <- infercnv:::sample_object(infercnv_obj, every_n=1000, above_m=2) 46 | 47 | only_1_10times_per_object <- infercnv:::sample_object(only_1_per_object, n_cells=10) 48 | 49 | infercnv_obj_filtered <- infercnv::apply_median_filtering(infercnv_obj, window_size=5) 50 | 51 | 52 | infercnv::plot_cnv(sample_obj, 53 | k_obs_groups=2, 54 | cluster_by_groups=TRUE, 55 | out_dir=t_out_dir, 56 | x.center=1, 57 | x.range="auto", 58 | title="infercnv", 59 | output_filename="infercnv_sampled", 60 | png_res=300, 61 | output_format="png", 62 | write_expr_matrix=TRUE) 63 | 64 | infercnv::plot_cnv(subsample_obj, 65 | k_obs_groups=2, 66 | cluster_by_groups=TRUE, 67 | out_dir=t_out_dir, 68 | x.center=1, 69 | x.range="auto", 70 | 
title="infercnv", 71 | output_filename="infercnv_subsampled", 72 | png_res=300, 73 | output_format="png", 74 | write_expr_matrix=TRUE) 75 | 76 | infercnv::plot_cnv(upsubsample_obj, 77 | k_obs_groups=2, 78 | cluster_by_groups=TRUE, 79 | out_dir=t_out_dir, 80 | x.center=1, 81 | x.range="auto", 82 | title="infercnv", 83 | output_filename="infercnv_subsampled_then_upsampled", 84 | png_res=300, 85 | output_format="png", 86 | write_expr_matrix=TRUE) 87 | 88 | 89 | infercnv::plot_cnv(every_2_object2, 90 | k_obs_groups=2, 91 | cluster_by_groups=TRUE, 92 | out_dir=t_out_dir, 93 | x.center=1, 94 | x.range="auto", 95 | title="infercnv", 96 | output_filename="infercnv_sample_every_2", 97 | png_res=300, 98 | output_format="png", 99 | write_expr_matrix=TRUE) 100 | 101 | 102 | 103 | infercnv::plot_cnv(only_1_per_object, 104 | k_obs_groups=2, 105 | cluster_by_groups=TRUE, 106 | out_dir=t_out_dir, 107 | x.center=1, 108 | x.range="auto", 109 | title="infercnv", 110 | output_filename="infercnv_sample_only_1", 111 | png_res=300, 112 | output_format="png", 113 | write_expr_matrix=TRUE) 114 | 115 | 116 | infercnv::plot_cnv(only_1_10times_per_object, 117 | k_obs_groups=2, 118 | cluster_by_groups=TRUE, 119 | out_dir=t_out_dir, 120 | x.center=1, 121 | x.range="auto", 122 | title="infercnv", 123 | output_filename="infercnv_sample_only_1_10_times", 124 | png_res=300, 125 | output_format="png", 126 | write_expr_matrix=TRUE) 127 | 128 | infercnv::plot_cnv(infercnv_obj_filtered, 129 | k_obs_groups=2, 130 | cluster_by_groups=TRUE, 131 | out_dir=out_dir, 132 | x.center=1, 133 | x.range="auto", 134 | title="infercnv", 135 | output_filename="infercnv_sampled_median_filtered", 136 | png_res=300, 137 | output_format = NA, 138 | write_expr_matrix=TRUE) 139 | -------------------------------------------------------------------------------- /scripts/sim_vs_orig_counts.QQplot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 
suppressPackageStartupMessages(library("argparse")) 4 | library(infercnv) 5 | library(tidyverse) 6 | 7 | 8 | parser = ArgumentParser() 9 | parser$add_argument("--counts_matrix", help="raw counts matrix file", required=TRUE, nargs=1) 10 | parser$add_argument("--sim_method", help="simulation method: splatter, simple, meanvar", required=TRUE) 11 | parser$add_argument("--include_dropout", default=FALSE, action='store_true', help='include dropout modeling') 12 | args = parser$parse_args() 13 | 14 | 15 | include.dropout = args$include_dropout 16 | 17 | 18 | data = read.table(args$counts_matrix) 19 | data = as.matrix(data) 20 | 21 | orig.counts = data 22 | 23 | if (! any(args$sim_method %in% c('splatter', 'simple', 'meanvar'))) { 24 | stop(sprintf("Error, not recognizing sim method: %s", args$sim_method)) 25 | } 26 | 27 | 28 | #' normalize first: 29 | cs = colSums(data) 30 | median_cs = median(cs) 31 | data <- sweep(data, STATS=cs, MARGIN=2, FUN="/") 32 | data <- data * median_cs 33 | 34 | gene_means <- rowMeans(data) 35 | 36 | num_cells = ncol(data) 37 | 38 | ## sim the tumor matrix 39 | sim_method = args$sim_method 40 | if (sim_method == 'simple') { 41 | message('-using simple sim') 42 | 43 | mean_p0_table <- NULL 44 | if (include.dropout) { 45 | mean_p0_table <- infercnv:::.get_mean_vs_p0_from_matrix(data) 46 | } 47 | 48 | sim_matrix <- infercnv:::.get_simulated_cell_matrix(gene_means, 49 | mean_p0_table=mean_p0_table, 50 | num_cells=num_cells, 51 | common_dispersion=0.1) 52 | } else if (sim_method == 'splatter') { 53 | message('-using splatter sim') 54 | 55 | params <- infercnv:::.estimateSingleCellParamsSplatterScrape(orig.counts) 56 | 57 | params[['nCells']] <- num_cells 58 | params[['include.dropout']] <- include.dropout 59 | 60 | gene_means[gene_means == 0] <- 1e-3 61 | sim_matrix <- infercnv:::.simulateSingleCellCountsMatrixSplatterScrape(params, gene_means) 62 | sim_matrix <- counts(sim_matrix) 63 | 64 | } else if (sim_method == 'meanvar') { 65 | message('-using 
meanvar sim') 66 | ##tumor_sim_matrix <- infercnv:::.get_simulated_cell_matrix_using_meanvar_trend_given_normal_matrix(gene_means, data, args$num_tumor_cells) 67 | sim_matrix <- infercnv:::.get_simulated_cell_matrix_using_meanvar_trend_given_normal_matrix(gene_means, data, num_cells, include.dropout=include.dropout) 68 | 69 | } else { 70 | stop(sprintf("not recognizing --sim_method: %s", args$sim_method)) 71 | } 72 | 73 | 74 | ## Plotting 75 | if (include.dropout) { 76 | sim_method <- sprintf("%s-With_Dropout", sim_method) 77 | } else { 78 | sim_method <- sprintf("%s-NO_Dropout", sim_method) 79 | } 80 | 81 | rownames(sim_matrix) <- names(gene_means) 82 | colnames(sim_matrix) <- colnames(data) 83 | sim_matrix_filename <- sprintf("sim.%s.counts.matrix", sim_method) 84 | message("-writing matrix") 85 | write.table(sim_matrix, sim_matrix_filename, quote=F, sep="\t") 86 | 87 | ## total sum normalize sim matrix before plotting 88 | sim_matrix <- infercnv:::.normalize_data_matrix_by_seq_depth(sim_matrix, median_cs) 89 | 90 | message("-plotting QQ plot") 91 | png(sprintf("sim_vs_orig_counts.%s.qqplots.png", sim_method)) 92 | qqplot(log(as.numeric(data)+1), log(as.numeric(sim_matrix)+1), main='orig vs. 
full sim') 93 | abline(a=0,b=1,col='red') 94 | 95 | message("-plotting KS plot") 96 | png(sprintf("sim_vs_orig_counts.%s.KS.png", sim_method)) 97 | infercnv:::KS_plot(sprintf("KS, %s", sim_method), log(as.numeric(data)+1), log(as.numeric(sim_matrix)+1), names=c('orig', sim_method)) 98 | 99 | 100 | -------------------------------------------------------------------------------- /scripts/splatterScrape_sim_counts.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("argparse")) 4 | options(error = function() {traceback(2);quit(save = "no", status = 1, runLast = FALSE)}) # exit non-zero on error so calling shells/workflows can detect the failure 5 | 6 | parser = ArgumentParser() 7 | 8 | parser$add_argument("--counts_matrix", help="raw counts matrix file", required=TRUE, default=NULL, nargs=1) 9 | parser$add_argument("--ncells", help="number of cells to simulate", required=TRUE, type='integer', nargs=1) 10 | parser$add_argument("--ngenes", help="number of genes to simulate", required=TRUE, type='integer', nargs=1) 11 | parser$add_argument("--output", help='name of output matrix file', required=TRUE, nargs=1) 12 | 13 | args = parser$parse_args() 14 | 15 | library(infercnv) 16 | library(SingleCellExperiment) 17 | library("methods") 18 | library(splatter) 19 | 20 | 21 | counts_matrix = read.table(args$counts_matrix) 22 | params_file = sprintf("%s.params_obj", args$counts_matrix) 23 | if (file.exists(params_file)) { 24 | message("-note, reusing stored params") 25 | params = readRDS(params_file) 26 | } else { 27 | params <- infercnv:::.estimateSingleCellParamsSplatterScrape(counts_matrix) 28 | saveRDS(params, file=params_file) 29 | } 30 | 31 | ncells = args$ncells 32 | ngenes = args$ngenes 33 | output_filename = args$output 34 | 35 | data = as.matrix(counts_matrix) 36 | 37 | #' normalize first: 38 | cs = colSums(counts_matrix) 39 | median_cs = median(cs) 40 | data <- sweep(counts_matrix, STATS=cs, MARGIN=2, 
FUN="/") 41 | data <- data * median_cs 42 | 43 | ## sim using specified gene means 44 | gene_means = rowMeans(data) 45 | gene_means = gene_means[gene_means>0] 46 | 47 | gene_means = sample(x=gene_means, size=ngenes, replace=T) 48 | 49 | newnames = paste0('gene', 1:ngenes) 50 | 51 | names(gene_means) = newnames 52 | 53 | 54 | params[['nGenes']] = ngenes 55 | params[['nCells']] = ncells 56 | 57 | 58 | sim_matrix <- infercnv:::.simulateSingleCellCountsMatrixSplatterScrape(params, gene_means) 59 | sim_matrix <- counts(sim_matrix) 60 | 61 | write.table(sim_matrix, file=output_filename, quote=F, sep='\t') 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(testthat) 4 | library(infercnv) 5 | 6 | test_check("infercnv") 7 | -------------------------------------------------------------------------------- /vignettes/inferCNV.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Visualizing Large-scale Copy Number Variation in Single-Cell RNA-Seq Expression Data" 3 | author: 4 | - name: Timothy Tickle 5 | affiliation: &kco Klarman Cell Observatory, Broad Institute of MIT and Harvard, Cambridge, MA, USA 6 | - name: Itay Tirosh 7 | affiliation: 8 | - *kco 9 | - Weizmann Institute of Science, Rehovot, Israel 10 | - name: Christophe Georgescu 11 | affiliation: *kco 12 | - name: Maxwell Brown 13 | affiliation: *kco 14 | - name: Brian Haas 15 | affiliation: *kco 16 | date: "`r Sys.Date()`" 17 | output: 18 | BiocStyle::html_document: default 19 | package: infercnv 20 | abstract: > 21 | InferCNV is used to explore tumor single cell RNA-Seq data to identify evidence for large-scale chromosomal copy number variations, such as gains or deletions of entire chromosomes or large segments of chromosomes. 
This is done by exploring expression intensity of genes across positions of the genome in comparison to the average or a set of reference 'normal' cells. A heatmap is generated illustrating the relative expression intensities across each chromosome, and it becomes readily apparent as to which regions of the genome are over-abundant or less-abundant as compared to normal cells (or the average, if reference normal cells are not provided). 22 | vignette: > 23 | %\VignetteIndexEntry{Visualizing Large-scale Copy Number Variation in Single-Cell RNA-Seq Expression Data} 24 | %\VignetteEncoding{UTF-8} 25 | %\VignetteEngine{knitr::rmarkdown} 26 | --- 27 | 28 | 29 | # Installation 30 | ## Required dependencies 31 | 32 | _inferCNV_ uses the _R_ packages `r CRANpkg("ape")`, `r Biocpkg("BiocGenerics")`, `r CRANpkg("binhf")`, `r CRANpkg("caTools")`, `r CRANpkg("coda")`, `r CRANpkg("coin")`, `r CRANpkg("dplyr")`, `r CRANpkg("doParallel")`, `r Biocpkg("edgeR")`, `r CRANpkg("fastcluster")`, `r CRANpkg("fitdistrplus")`, `r CRANpkg("foreach")`, `r CRANpkg("futile.logger")`, `r CRANpkg("future")`, `r CRANpkg("gplots")`, `r CRANpkg("ggplot2")`, `r CRANpkg("HiddenMarkov")`, `r CRANpkg("leiden")`, `r CRANpkg("phyclust")`, `r CRANpkg("RANN")`, `r CRANpkg("reshape")`, `r CRANpkg("rjags")`, `r CRANpkg("RColorBrewer")`, `r Biocpkg("SingleCellExperiment")`, `r Biocpkg("SummarizedExperiment")`, `r CRANpkg("tidyr")` and imports functions from the archived `r CRANpkg("GMD")`. 33 | 34 | 60 | 61 | 62 | ## Installing 63 | ```{r install, eval=FALSE} 64 | if (!requireNamespace("BiocManager", quietly = TRUE)) 65 | install.packages("BiocManager") 66 | BiocManager::install("infercnv") 67 | ``` 68 | 69 | 70 | ## Optional extension 71 | If you want to use the interactive heatmap visualization, please check the add-on _R_ package `r Githubpkg ("broadinstitute/inferCNV_NGCHM")` after installing the packages `r CRANpkg("tibble")`, `r Githubpkg("bmbroom/tsvio")` and `r Githubpkg("bmbroom/NGCHMR")`. 
To install optional packages, type the following in an R command window: 72 | 73 | 78 | 79 | ```{r install-optionals, eval = FALSE} 80 | install.packages("tibble") 81 | 82 | install.packages("devtools") 83 | devtools::install_github("bmbroom/tsvio") 84 | devtools::install_github("bmbroom/NGCHMR", ref="stable") 85 | devtools::install_github("broadinstitute/inferCNV_NGCHM") 86 | 87 | ``` 88 | 89 | And download the NGCHM Java application by typing the following in a regular shell: 90 | ```{bash, eval = FALSE} 91 | wget http://tcga.ngchm.net/NGCHM/ShaidyMapGen.jar 92 | ``` 93 | 94 | 95 | 96 | ```{r setup, include=FALSE} 97 | knitr::opts_chunk$set(echo = TRUE) 98 | library(infercnv) 99 | 100 | ``` 101 | 102 | # Running InferCNV 103 | ## Create the InferCNV Object 104 | 105 | Reading in the raw counts matrix and meta data, populating the infercnv object 106 | 107 | ```{r} 108 | infercnv_obj = CreateInfercnvObject( 109 | raw_counts_matrix="../inst/extdata/oligodendroglioma_expression_downsampled.counts.matrix.gz", 110 | annotations_file="../inst/extdata/oligodendroglioma_annotations_downsampled.txt", 111 | delim="\t", 112 | gene_order_file="../inst/extdata/gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt", 113 | ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)")) 114 | 115 | ``` 116 | 117 | 118 | 119 | ## Running the full default analysis 120 | ```{r, results="hide"} 121 | out_dir = tempfile() 122 | infercnv_obj_default = infercnv::run( 123 | infercnv_obj, 124 | cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics 125 | out_dir=out_dir, 126 | cluster_by_groups=TRUE, 127 | plot_steps=FALSE, 128 | denoise=TRUE, 129 | HMM=FALSE, 130 | no_prelim_plot=TRUE, 131 | png_res=60 132 | ) 133 | 134 | ``` 135 | 136 | Basic output from running inferCNV. 
137 | ```{r, echo=FALSE} 138 | knitr::include_graphics(paste(out_dir, "infercnv.png", sep="/")) 139 | ``` 140 | 141 | 142 | 143 | # Additional Information 144 | ## Online Documentation 145 | 146 | For additional explanations on files, usage, and a tutorial, please visit the [wiki](https://github.com/broadinstitute/inferCNV/wiki). 147 | 148 | 149 | ## TrinityCTAT 150 | This tool is a part of the TrinityCTAT toolkit focused on leveraging the use of RNA-Seq to better understand cancer transcriptomes. To find out more please visit [TrinityCTAT](https://github.com/NCIP/Trinity_CTAT/wiki). 151 | 152 | 153 | ## Applications 154 | 155 | This methodology was used in: 156 | 157 | [Anoop P. Patel et al. Single-cell RNA-seq highlights intratumoral heterogeneity in primary glioblastoma. Science. 2014 Jun 20: 1396-1401](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4123637/) 158 | 159 | [Tirosh I et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science. 2016 Apr 8;352(6282):189-96](http://www.ncbi.nlm.nih.gov/pubmed/27124452) 160 | 161 | 162 | 163 | 164 | # Session info 165 | 166 | ```{r sessioninfo, echo=FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=60), out.width=60} 167 | sessionInfo() 168 | ``` 169 | 170 | --------------------------------------------------------------------------------