├── .gitattributes
├── .gitignore
├── CALDER2.Rproj
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── R
    ├── CALDER_hierarchy_v2.R
    ├── CALDER_main.R
    ├── HRG_MLE_each_compartment_fun.R
    ├── HighResolution2Low.R
    ├── LikelihoodRatioTest_fun.R
    ├── RcppExports.R
    ├── bisecting_kmeans.R
    ├── build_comp_table_opt.R
    ├── call_domains.R
    ├── compartment_PCA.R
    ├── compartment_data_generation_fun.R
    ├── general_functions.R
    ├── generate_compartments_bed_fun.R
    ├── post_processing_nested_domains.R
    ├── prunning.R
    └── zigzag_nested_domain_v2.R
├── README.md
├── docker
    ├── Dockerfile
    └── docker_build.sh
├── img
    ├── CALDER_features.png
    ├── CALDER_methods.png
    ├── Hela_chr11.png
    ├── Hela_chr11_and_RWPE1_chr9_pq.png
    ├── IGV_results.png
    └── RWPE1_chr9_pq.png
├── inst
    └── extdata
    │   ├── hg19_all_sub_compartments.bed
    │   ├── hg38_all_sub_compartments.bed
    │   ├── mat_chr21_10kb_ob.txt.gz
    │   ├── mat_chr22_10kb_ob.txt.gz
    │   ├── mm10_all_sub_compartments.bed
    │   └── mm9_all_sub_compartments.bed
├── man
    └── CALDER-package.Rd
├── scripts
    ├── calder
    ├── clean_test.sh
    └── create_release.sh
├── src
    ├── Makevars
    ├── Makevars.win
    ├── RcppExports.cpp
    ├── matrix_multiplication_fun.cpp
    └── zigzag_loglik_ancestors_v4.5.cpp
└── tests
    ├── create_test_data.sh
    ├── test_cmd_cool.sh
    ├── testthat.R
    └── testthat
        ├── data
            ├── test.chrom.sizes
            ├── test.cool
            ├── test.mcool
            └── test_gene_coverage.bed
        └── test-main.R


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | ### CUSTOM ###
  2 | /data
  3 | total_execution.time
  4 | 
  5 | ### TESTING ###
  6 | _snaps/
  7 | 
  8 | ### R ###
  9 | 
 10 | # History files
 11 | .Rhistory
 12 | .Rapp.history
 13 | 
 14 | # Session Data files
 15 | .RData
 16 | .RDataTmp
 17 | 
 18 | # User-specific files
 19 | .Ruserdata
 20 | 
 21 | # Example code in package build process
 22 | *-Ex.R
 23 | 
 24 | # Output files from R CMD build
 25 | /*.tar.gz
 26 | 
 27 | # Output files from R CMD check
 28 | /*.Rcheck/
 29 | 
 30 | # RStudio files
 31 | .Rproj.user/
 32 | 
 33 | # produced vignettes
 34 | vignettes/*.html
 35 | vignettes/*.pdf
 36 | 
 37 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
 38 | .httr-oauth
 39 | 
 40 | # knitr and R markdown default cache directories
 41 | *_cache/
 42 | /cache/
 43 | 
 44 | # Temporary files created by R markdown
 45 | *.utf8.md
 46 | *.knit.md
 47 | 
 48 | # R Environment Variables
 49 | .Renviron
 50 | 
 51 | # pkgdown site
 52 | docs/
 53 | 
 54 | # translation temp files
 55 | po/*~
 56 | 
 57 | # RStudio Connect folder
 58 | rsconnect/
 59 | 
 60 | ### R.Bookdown Stack ###
 61 | # R package: bookdown caching files
 62 | /*_files/
 63 | 
 64 | ### C++ ###
 65 | # Prerequisites
 66 | *.d
 67 | 
 68 | # Compiled Object files
 69 | *.slo
 70 | *.lo
 71 | *.o
 72 | *.obj
 73 | 
 74 | # Precompiled Headers
 75 | *.gch
 76 | *.pch
 77 | 
 78 | # Compiled Dynamic libraries
 79 | *.so
 80 | *.dylib
 81 | *.dll
 82 | 
 83 | # Fortran module files
 84 | *.mod
 85 | *.smod
 86 | 
 87 | # Compiled Static libraries
 88 | *.lai
 89 | *.la
 90 | *.a
 91 | *.lib
 92 | 
 93 | # Executables
 94 | *.exe
 95 | *.out
 96 | *.app
 97 | 
 98 | 
 99 | ### macOS ###
100 | # General
101 | .DS_Store
102 | .AppleDouble
103 | .LSOverride
104 | 
105 | # Icon must end with two \r
106 | Icon
107 | 
108 | 
109 | # Thumbnails
110 | ._*
111 | 
112 | # Files that might appear in the root of a volume
113 | .DocumentRevisions-V100
114 | .fseventsd
115 | .Spotlight-V100
116 | .TemporaryItems
117 | .Trashes
118 | .VolumeIcon.icns
119 | .com.apple.timemachine.donotpresent
120 | 
121 | # Directories potentially created on remote AFP share
122 | .AppleDB
123 | .AppleDesktop
124 | Network Trash Folder
125 | Temporary Items
126 | .apdisk
127 | 
128 | ### macOS Patch ###
129 | # iCloud generated files
130 | *.icloud
131 | 
132 | 
133 | ### SublimeText ###
134 | # Cache files for Sublime Text
135 | *.tmlanguage.cache
136 | *.tmPreferences.cache
137 | *.stTheme.cache
138 | 
139 | # Workspace files are user-specific
140 | *.sublime-workspace
141 | *.sublime-project
142 | 
143 | # Project files should be checked into the repository, unless a significant
144 | # proportion of contributors will probably not be using Sublime Text
145 | # *.sublime-project
146 | 
147 | # SFTP configuration file
148 | sftp-config.json
149 | sftp-config-alt*.json
150 | 
151 | # Package control specific files
152 | Package Control.last-run
153 | Package Control.ca-list
154 | Package Control.ca-bundle
155 | Package Control.system-ca-bundle
156 | Package Control.cache/
157 | Package Control.ca-certs/
158 | Package Control.merged-ca-bundle
159 | Package Control.user-ca-bundle
160 | oscrypto-ca-bundle.crt
161 | bh_unicode_properties.cache
162 | 
163 | # Sublime-github package stores a github token in this file
164 | # https://packagecontrol.io/packages/sublime-github
165 | GitHub.sublime-settings
166 | 
167 | 
168 | 


--------------------------------------------------------------------------------
/CALDER2.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: CALDER
 2 | Type: Package
 3 | Title: CALDER: a Hi-C analysis tool for inferring non-linear hierarchical organization of the 3D chromosome
 4 | Authors@R: c(
 5 |     person(given = "Yuanlong",
 6 |            family = "LIU",
 7 |            role = c("aut", "cre"),
 8 |            email = "yliueagle@googlemail.com"),
 9 |     person(given = "Luca",
10 |            family = "NANNI",
11 |            role = c("aut"),
12 |            email = "luca.nanni@unil.ch"))
13 | Version: 2.0
14 | Date: 2023-04-04
15 | Maintainer: Who to complain to <yliueagle@googlemail.com>
16 | Description: CALDER is a Hi-C analysis tool that allows: (1) compute chromatin domains from whole chromosome contacts; (2) derive their non-linear hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each chromatin domain from short-range contacts
17 | URL: https://github.com/CSOgroup/CALDER2
18 | BugReports: https://github.com/CSOgroup/CALDER2/issues
19 | License: MIT
20 | Encoding: UTF-8
21 | LinkingTo: Rcpp, RcppArmadillo
22 | Depends:
23 |     R (>= 3.5.2)
24 | biocViews:
25 | Imports: 
26 | 	R.utils (>= 2.9.0),
27 | 	doParallel (>= 1.0.15),
28 | 	ape (>= 5.3),
29 | 	dendextend (>= 1.12.0),
30 | 	fitdistrplus (>= 1.0.14),
31 | 	igraph (>= 1.2.4),
32 | 	Matrix (>= 1.2.17),
33 | 	rARPACK (>= 0.11.0),
34 | 	factoextra (>= 1.0.5),
35 | 	data.table (>= 1.12.2),
36 | 	fields (>= 9.8.3),
37 | 	GenomicRanges (>= 1.36.0),
38 | 	strawr (>= 0.0.9),
39 | 	rhdf5 (>= 2.28.0),
40 | 	ggplot2 (>= 3.3.5),
41 | 	optparse
42 | RoxygenNote: 7.1.2
43 | Suggests: 
44 |     testthat (>= 3.0.0)
45 | Config/testthat/edition: 3
46 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Yuanlong LIU
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | useDynLib(CALDER, .registration=TRUE)
2 | importFrom(Rcpp, evalCpp)
3 | importFrom(foreach, foreach)
4 | exportPattern("^[[:alpha:]]+")
5 | 
6 | 


--------------------------------------------------------------------------------
/R/CALDER_hierarchy_v2.R:
--------------------------------------------------------------------------------
  1 | 		## Compute sub-compartments, V2. This version uses already computed compartments as reference to derive the A/B compartment direction
  2 | 		## Yuanlong LIU, 16-07-2021
  3 | 
  4 | 		## This code converts the compartment into table format
  5 | 
  6 | 		make_comp_tab = function(comp_raw, bin_size)
  7 | 	    {
  8 | 	        # comp_raw[V1=="chrX"]$V1 = "chr23"
  9 | 	        # comp_raw[[1]] = gsub('chr', '', comp_raw[[1]])
 10 | 	        bin_indices_li = apply(comp_raw[, 2:3], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=bin_size)/bin_size; bins[1:(length(bins) - 1)]})
 11 | 	        expand_len = sapply(bin_indices_li, length)
 12 | 
 13 | 	        n_domains = length(unique(comp_raw[[4]]))
 14 | 
 15 | 	        continous_rank = (n_domains + 1 - rank(unique(comp_raw[[4]]))) / n_domains
 16 | 	        names(continous_rank) = unique(comp_raw[[4]])
 17 | 
 18 | 	        comp_tab = data.table::data.table(chr=rep(comp_raw[[1]], expand_len), bin_index=unlist(bin_indices_li), comp_name=rep(comp_raw[[4]], expand_len), comp_rank=rep(comp_raw[[5]], expand_len))
 19 | 	        comp_tab$continous_rank = unname(continous_rank[comp_tab$comp_name])
 20 | 
 21 | 	        comp_tab$pos_start = (comp_tab$bin_index - 1)*bin_size + 1
 22 | 	        comp_tab$pos_end = (comp_tab$bin_index)*bin_size
 23 | 
 24 | 	        return(comp_tab)
 25 | 	    }
 26 | 
 27 | 		make_comp_tab_calder = function(comp_raw, bin_size)
 28 | 		{
 29 | 			`%dopar%` <- foreach::`%dopar%`
 30 | 			`%do%` <- foreach::`%do%`
 31 | 
 32 | 			colnames(comp_raw)[1] = 'chr'
 33 | 		    # comp_raw[V1=="chrX"]$V1 = "chr23"
 34 | 		    # comp_raw[[1]] = as.numeric(gsub('chr', '', comp_raw[[1]]))
 35 | 
 36 | 		    make_comp_tab_calder_chr = function(comp_raw_chr, bin_size)
 37 | 		    {
 38 | 			    bin_indices_li = apply(comp_raw_chr[, 2:3], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=bin_size)/bin_size; bins[1:(length(bins) - 1)]})
 39 | 			    expand_len = sapply(bin_indices_li, length)
 40 | 
 41 | 			    n_domains = length(unique(comp_raw_chr[[4]]))
 42 | 
 43 | 			    continous_rank = (n_domains + 1 - rank(unique(comp_raw_chr[[4]]))) / n_domains
 44 | 			    names(continous_rank) = unique(comp_raw_chr[[4]]) 
 45 | 
 46 | 			    comp_tab = data.table::data.table(chr=rep(comp_raw_chr[[1]], expand_len), bin_index=unlist(bin_indices_li), comp_name=rep(comp_raw_chr[[4]], expand_len), comp_rank=rep(comp_raw_chr[[5]], expand_len))
 47 | 			    comp_tab$continous_rank = unname(continous_rank[comp_tab$comp_name])    
 48 | 
 49 | 			    comp_tab$pos_start = (comp_tab$bin_index - 1)*bin_size + 1
 50 | 			    comp_tab$pos_end = (comp_tab$bin_index)*bin_size 
 51 | 			    return(comp_tab)  
 52 | 		    }
 53 | 
 54 | 		    comp_tab_ALL = foreach::foreach(comp_raw_chr=split(comp_raw, by="chr"), .combine=rbind) %do%
 55 | 		    {
 56 | 		    	make_comp_tab_calder_chr(comp_raw_chr, bin_size)
 57 | 		    }
 58 | 
 59 | 		    return(comp_tab_ALL)
 60 | 		}
 61 | 
 62 | 	    generate_domain_2D_bedpe = function(sub_compartment_file, bin_size, bedpe_file, n_sub=8)
 63 | 		{
 64 | 			generate_domain_2D_bedpe_chr = function(comp_raw)
 65 | 			{
 66 | 				comp_tab = c(comp_raw, bin_size=10E3)
 67 | 
 68 | 				comp_tab$comp_name = substr(comp_tab$comp_name, 1, log2(n_sub)*2 - 1)
 69 | 				n_row = nrow(comp_tab)
 70 | 				comp_tab$is_boudary = c(1*(comp_tab$comp_name[2:n_row] != comp_tab$comp_name[1:(n_row-1)]), 0)
 71 | 
 72 | 				pos_end = union( comp_tab[is_boudary==1]$pos_end, comp_tab[n_row, ]$pos_end )
 73 | 				pos_start = union( comp_tab[1, ]$pos_start, pos_end[1:(length(pos_end) -1)] + 1 )
 74 | 				pos_start = as.character(pos_start)
 75 | 				pos_end = as.character(pos_end)
 76 | 
 77 | 				chr_name = comp_tab$chr[1]
 78 | 				bedpe = data.table::data.table(chr_name, pos_start, pos_end, chr_name, pos_start, pos_end)
 79 | 				return(bedpe)
 80 | 			}
 81 | 			my_skip = as.numeric(strsplit(system(intern=TRUE, sprintf('zgrep -n "chr" %s', sub_compartment_file)), ':')[[1]][1])
 82 | 			comp_raw_li = split(data.table::fread(sub_compartment_file, skip=my_skip), by="chr")
 83 | 			bedpe = do.call(rbind, lapply(comp_raw_li, generate_domain_2D_bedpe_chr))
 84 | 
 85 | 			data.table::fwrite(bedpe, file=bedpe_file, col.names=FALSE, sep="\t")
 86 | 			return(bedpe)
 87 | 		}
 88 | 
 89 | 
 90 | 	
 91 | 
 92 | 
 93 | 		## cool format is not accepted for the current version
 94 | 
 95 | 		# cool2mat <- function(cool_file, chr_num, bin_size) ## convert cool format to long format / pos_1, pos_2, val / modified from https://rdrr.io/bioc/HiCcompare/src/R/hicpro2bedpe.R
 96 | 		# {
 97 | 		# 	dump <- rhdf5::h5dump(cool_file)
 98 | 		# 	if(names(dump)=='resolutions')
 99 | 		# 	{
100 | 		# 		cat('\nYour input contact matrix is in mcool format and contains resolutions of:', names(dump$resolutions), '\n')
101 | 				
102 | 		# 		which2keep = which(bin_size==as.numeric(names(dump$resolutions)))
103 | 		# 		dump = dump$resolutions[[which2keep]]
104 | 		# 	}
105 | 
106 | 		# 	ids <- data.table::data.table(chr = dump$bins$chrom, start = dump$bins$start, id = seq(1, length(dump$bins$chrom), by = 1))
107 | 		# 	ids = ids[chr==chr_num][, c('id', 'start')]
108 | 			
109 | 		# 	# make sparse matrix
110 | 
111 | 		# 	mat <- data.table::data.table(bin1 = dump$pixels$bin1_id, bin2 = dump$pixels$bin2_id, val = dump$pixels$count)
112 | 		# 	mat_chr = mat[(bin1 %in% ids$id) & (bin2 %in% ids$id)] ## keep only cis contacts
113 | 		# 	colnames(ids)[2] = 'pos_1'
114 | 		# 	contact_mat = left_join(mat_chr, ids, by = c('bin1' = 'id'))
115 | 		# 	colnames(ids)[2] = 'pos_2'
116 | 		# 	contact_mat <- left_join(contact_mat, ids, by = c('bin2' = 'id'))
117 | 		# 	contact_mat = contact_mat[, c('pos_1', 'pos_2', 'val')]
118 | 		# 	return(contact_mat)
119 | 		# }
120 | 
121 | 
122 | 		CALDER_CD_hierarchy_v2 = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chr, bin_size_input, bin_size2look, save_dir, save_intermediate_data=FALSE, swap_AB, ref_compartment_file, black_list_bins=NULL, feature_track=NULL)
123 | 		{
124 | 			chr_num = gsub('chr', '', chr, ignore.case=TRUE)
125 | 			chr_name = paste0('chr', chr_num)
126 | 
127 | 		    get_cor_with_ref = function(chr_bin_pc, bin_size, ref_compartment_file=NULL, feature_track=NULL) ## correlation of PC1 with comp. domain rank of genome
128 | 		    {
129 | 		        ext_chr_bin_pc = function(chr_bin_pc) ## spand chr_bin_pc using 5kb bin
130 | 		        {
131 | 		            bin_indices_li = unlist(apply(chr_bin_pc[, 4:5], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=5E3)/5E3; list(bins[1:(length(bins) - 1)])}), recursive=FALSE)
132 | 		            expand_len = sapply(bin_indices_li, length)
133 | 		            chr_bin_pc_ext = data.table::data.table(chr=rep(chr_bin_pc[[1]], expand_len), bin_index=unlist(bin_indices_li),PC1_val=rep(chr_bin_pc$PC1_val, expand_len))
134 | 
135 | 		            chr_bin_pc_ext$pos_start = (chr_bin_pc_ext$bin_index - 1)*5E3 + 1
136 | 		            chr_bin_pc_ext$pos_end = (chr_bin_pc_ext$bin_index)*5E3        
137 | 		            return(chr_bin_pc_ext)
138 | 		        }
139 | 
140 | 		        ## function to generate binning scores // https://divingintogeneticsandgenomics.rbind.io/post/compute-averages-sums-on-granges-or-equal-length-bins/
141 | 			 	# feature_track = data.table::data.table(chr=as.vector(GenomicRanges::seqnames(bw_val)), start=start(bw_val), end=end(bw_val), score=bw_val$score)
142 | 
143 | 
144 | 			 	get_binned_vals = function(feature_track_chr, bin_size=5E3)
145 | 			 	{
146 | 			 		## helper to compute binned average // https://divingintogeneticsandgenomics.rbind.io/post/compute-averages-sums-on-granges-or-equal-length-bins/
147 | 			  		binnedMean <- function(bins, numvar, mcolname)
148 | 					{
149 | 					  stopifnot(is(bins, "GRanges"))
150 | 					  stopifnot(is(numvar, "RleList"))
151 | 					  stopifnot(identical(GenomeInfoDb::seqlevels(bins), names(numvar)))
152 | 					  bins_per_chrom <- split(GenomicRanges::ranges(bins), GenomicRanges::seqnames(bins))
153 | 					  sums_list <- lapply(names(numvar),
154 | 					      function(seqname) {
155 | 					          views <- IRanges::Views(numvar[[seqname]],
156 | 					                         bins_per_chrom[[seqname]])
157 | 					          IRanges::viewMeans(views)
158 | 					      })
159 | 					  new_mcol <- unsplit(sums_list, as.factor(GenomicRanges::seqnames(bins)))
160 | 					  GenomicRanges::mcols(bins)[[mcolname]] <- new_mcol
161 | 					  return(bins)
162 | 					}
163 | 
164 | 
165 | 			 		GR = GenomicRanges::makeGRangesFromDataFrame(feature_track_chr, keep.extra.columns=TRUE)
166 | 			 		GR_chrs = split(GR, GenomicRanges::seqnames(GR))
167 | 			 		seq_lens = sapply(GR_chrs, function(v) max(GenomicRanges::end(v)))
168 | 
169 | 					GR_RleList = GenomicRanges::coverage(GR, weight="score")
170 | 					seq_info = GenomicRanges::seqinfo(GR_RleList)
171 | 					GenomeInfoDb::seqlengths(seq_info) = seq_lens
172 | 
173 | 					bins = GenomicRanges::tileGenome(seq_info, tilewidth=bin_size, cut.last.tile.in.chrom=TRUE)
174 | 					bins = bins[GenomicRanges::width(bins)==bin_size]
175 | 					bin_val_tmp = binnedMean(bins, GR_RleList, "bin_val")
176 | 					bin_val_tmp = data.table::as.data.table(bin_val_tmp)
177 | 					bin_val = data.table::data.table(chr=bin_val_tmp$seqnames, bin_index=bin_val_tmp$end / bin_size, continous_rank=log2(1 + bin_val_tmp$bin_val - min(bin_val_tmp$bin_val)))
178 | 					return(bin_val)
179 | 			 	}
180 | 
181 | 
182 | 		        chr2query = chr_bin_pc$chr[1]
183 | 
184 | 		        if(!is.null(ref_compartment_file)) 
185 | 		        {
186 | 		        	domain_rank = data.table::fread(ref_compartment_file)
187 | 			        colnames(domain_rank)[1] = 'chr'
188 | 			        domain_rank_chr = domain_rank[domain_rank[[1]]==chr2query, ]
189 | 
190 | 			        ref_tab = make_comp_tab_calder(domain_rank_chr, bin_size=5E3) ## 5kb is convenient for most of the bin sizes
191 | 			        ref_tab = ref_tab[, c(1,2,5)]
192 | 		        }
193 | 
194 | 		        if(!is.null(feature_track)) 
195 | 		        {
196 | 		        	colnames(feature_track) = c('chr', 'start', 'end', 'score')
197 | 		        	feature_track$chr = gsub('chr', '', feature_track$chr)
198 | 		        	feature_track_chr = feature_track[feature_track$chr==chr_num, ]
199 | 		        	ref_tab = get_binned_vals(feature_track_chr)
200 | 		        }
201 | 
202 | 
203 | 		        chr_name = gsub("chrchr", "chr", paste0("chr", ref_tab$chr))
204 | 		        # chr_name = ifelse(chr_name=="chr23", "chrX", chr_name)
205 | 		        ref_tab$chr = chr_name
206 | 		        colnames(ref_tab)[2] = "bin_index"
207 | 
208 | 		        ################################# convert chr_bin_pc into 5kb
209 | 
210 | 		        chr_bin_pc_ext = ext_chr_bin_pc(chr_bin_pc)
211 | 		        ref_and_pc = merge(ref_tab, chr_bin_pc_ext, by=c("chr", "bin_index"))
212 | 		        cor_with_ref = cor(method='spearman', ref_and_pc[, c("continous_rank", "PC1_val")])[1,2]
213 | 
214 | 		        return(cor_with_ref)
215 | 		    }
216 | 	
217 | 			### The main function starts here
218 | 	    
219 | 
220 | 	        time0 = Sys.time()
221 | 	        log_file = paste0(save_dir, '/chr', chr_num, '_log.txt')
222 | 	        warning_file = paste0(save_dir, '/WARNING_chr', chr_num, '.txt')
223 | 	        cor_log_file = paste0(save_dir, '/cor_with_ref.txt')
224 | 
225 | 	        cat('>>>> Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE)
226 | 	        
227 | 	        cat('[', as.character(chr),'] Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n')
228 | 	        processed_data = contact_mat_processing_v2(contact_tab_dump, contact_file_dump=contact_file_dump, contact_file_hic=contact_file_hic, chr=chr_num, bin_size_input=bin_size_input, bin_size2look=bin_size2look, black_list_bins=black_list_bins)
229 | 	     
230 | 
231 | 	        mat_dense = processed_data$mat_dense
232 | 	        ccmat_dense_compressed_log_atanh = processed_data$atanh_score
233 | 
234 | 	        cat('[', as.character(chr),'] Finish process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n')
235 | 	        cat('>>>> Finish process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
236 | 
237 | 	        p_thresh = ifelse(bin_size2look < 40000, 0.05, 1)
238 | 	        window.sizes = 3
239 | 	        compartments = vector("list", 2)
240 | 
241 | 	        cat('>>>> Begin compute compartment domains and their hierachy at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
242 | 	        cat('[', as.character(chr),'] Begin compute compartment domains and their hierachy at:', as.character(Sys.time()), '\n')
243 | 
244 | 	        compartments[[2]] = generate_compartments_bed(chr = chr, bin_size = bin_size2look, window.sizes = window.sizes, ccmat_dense_compressed_log_atanh, p_thresh, out_file_name = NULL, stat_window_size = NULL)
245 | 	        topDom_output = compartments[[2]]
246 | 	        bin_names = rownames(mat_dense)
247 | 	        mat_dense = as.matrix(mat_dense)
248 | 	        initial_clusters = apply(topDom_output$domain[, c("from.id", "to.id")], 1, function(v) v[1]:v[2])
249 | 
250 | 	        if (sum(sapply(initial_clusters, length)) != max(unlist(initial_clusters))) {
251 | 	            stop(CELL_LINE, " initial_clusters error in topDom")
252 | 	        }
253 | 
254 | 	        n_clusters = length(initial_clusters)
255 |        		mat_dense_cluster_mean = HighResolution2Low_k_rectangle(mat_dense, initial_clusters, initial_clusters, sum_or_mean = "mean")
256 |         
257 | 			trend_mean_list = lapply( 1:4, function(v) 1*(mat_dense_cluster_mean[, -(1:v)] > mat_dense_cluster_mean[, - n_clusters - 1 + (v:1)]) )
258 | 			trend_mean = do.call(cbind, trend_mean_list)
259 | 			c_trend_mean = cor(t(trend_mean))
260 | 			atanh_c_trend_mean= atanh(c_trend_mean / (1+1E-7))
261 | 
262 | 
263 | 			# if(to_scale)
264 | 			{
265 | 				trend_mean = scale(trend_mean)
266 | 				c_trend_mean = scale(c_trend_mean)
267 | 				atanh_c_trend_mean= scale(atanh_c_trend_mean)
268 | 			}
269 | 
270 | 
271 | 			PC_12_atanh = get_PCs(atanh_c_trend_mean, which=1:10)
272 | 			PC_12_atanh[, 2:10] = PC_12_atanh[, 2:10]/5 ## xx-xx-xxxx: compress PC2
273 | 			rownames(PC_12_atanh) = 1:nrow(PC_12_atanh)
274 | 
275 | 			############################################################
276 | 
277 | 			PC_direction = 1
278 | 
279 | 			## switch PC direction based on correlation with "ground_truth"	
280 | 			{
281 | 				initial_clusters_ori_bins = lapply(initial_clusters, function(v) as.numeric(bin_names[v]))
282 | 				chr_bin_pc = data.table::data.table(chr=chr_name, bin=unlist(initial_clusters_ori_bins), PC1_val=rep(PC_12_atanh[,1], sapply(initial_clusters_ori_bins, length)))
283 | 				chr_bin_pc$start = (chr_bin_pc$bin - 1)*bin_size2look + 1
284 | 				chr_bin_pc$end = chr_bin_pc$bin*bin_size2look
285 | 
286 | 				# chr_bin_pc_range = makeGRangesFromDataFrame(chr_bin_pc, keep.extra.columns=TRUE)
287 | 				# gene_info_chr = subset(gene_info, seqnames==chr_name)
288 | 
289 | 				# refGR = chr_bin_pc_range
290 | 				# testGR = gene_info_chr
291 | 				# hits <- findOverlaps(refGR, testGR)
292 | 			 	# overlaps <- pintersect(refGR[queryHits(hits)], testGR[subjectHits(hits)])
293 | 			 	# overlaps_bins = unique(data.table::data.table(overlap_ratio=width(overlaps)/bin_size, bin=overlaps$bin))
294 | 			 	# bin_pc_gene_coverage = merge(chr_bin_pc, overlaps_bins, all.x=TRUE)
295 | 				# bin_pc_gene_coverage$overlap_ratio[is.na(bin_pc_gene_coverage$overlap_ratio)] = 0
296 | 				
297 | 				# gene_density_cor = cor(method='spearman', subset(bin_pc_gene_coverage, (PC1_val < quantile(PC1_val, 0.25)) | (PC1_val > quantile(PC1_val, 0.75)) , c('PC1_val', 'overlap_ratio')))[1,2]
298 | 				# if(abs(gene_density_cor) < 0.2) warning('correlation between gene density and PC1 is too weak')
299 | 				
300 | 
301 | 				if(!is.null(ref_compartment_file)) cor_with_ref = try(get_cor_with_ref(chr_bin_pc, bin_size2look, ref_compartment_file=ref_compartment_file)) ## get correlation with supplied "ground truth or reference"
302 | 				if(is.null(ref_compartment_file)) cor_with_ref = try(get_cor_with_ref(chr_bin_pc, bin_size2look, feature_track=feature_track))
303 | 
304 | 
305 | 
306 | 				if(class(cor_with_ref)=='try-error') cor_with_ref = 1 ## psudo cor
307 | 	        	if(!is.null(ref_compartment_file)) cat('[', as.character(chr),'] Correlation between PC1 and reference compartment is :', format(abs(cor_with_ref), digits=5), '\n')
308 | 	        	if(is.null(ref_compartment_file)) cat('[', as.character(chr),'] Correlation between PC1 and feature_track is :', format(abs(cor_with_ref), digits=5), '\n')
309 | 
310 | 				PC_direction = PC_direction*sign(cor_with_ref)
311 | 			    if(swap_AB==1) PC_direction = -PC_direction ## force swap PC direction if in some case the A/B direction is reverted
312 | 			    PC_12_atanh = PC_12_atanh*PC_direction
313 | 			}
314 | 
315 | 
316 | 			project_info = project_to_major_axis(PC_12_atanh)
317 | 			x_pro = project_info$x_pro
318 | 			
319 | 			############################################################
320 | 
321 | 			hc_disect_kmeans_PC12 = bisecting_kmeans(PC_12_atanh[, 1:10, drop=FALSE])
322 | 
323 | 			hc_hybrid_PC12 = hc_disect_kmeans_PC12
324 | 
325 | 			{
326 | 				reordered_names = reorder_dendro(hc_hybrid_PC12, x_pro, aggregateFun=mean)
327 | 				hc_hybrid_PC12_reordered = dendextend::rotate(hc_hybrid_PC12, reordered_names)
328 | 				hc_hybrid_x_pro = hc_disect_kmeans_PC12
329 | 				reordered_names_x_pro = get_best_reorder(hc_hybrid_x_pro, x_pro)
330 | 				CALDER_hc = dendextend::rotate(hc_hybrid_x_pro, reordered_names_x_pro)	
331 | 			}
332 | 
333 | 			############################################################
334 |         	parameters = list(bin_size = bin_size2look, p_thresh = p_thresh)
335 | 			res = list( CALDER_hc=CALDER_hc, initial_clusters=initial_clusters, bin_names=bin_names, x_pro=x_pro, parameters=parameters)
336 | 			intermediate_data_file = paste0(save_dir, '/chr', chr, '_intermediate_data.Rds')
337 | 			
338 | 			hc = res$CALDER_hc
339 | 			hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels)
340 | 			bin_comp = data.table::data.table(chr=chr, bin_index=res$bin_names, comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length)))
341 | 
342 | 			rownames(bin_comp) = NULL
343 | 			res$comp = bin_comp
344 | 			res$CDs = lapply(res$initial_clusters, function(v) res$bin_names[v])
345 | 			res$mat = mat_dense
346 | 			res$chr = chr
347 | 			generate_hierachy_bed(chr=chr, res=res, save_dir=save_dir, bin_size=bin_size2look)
348 | 
349 | 
350 | 	        cat('>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
351 | 	        cat('[', as.character(chr),'] Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()), '\n')
352 | 
353 | 	       	if(!is.null(ref_compartment_file)) cat('Correlation between PC1 and reference compartment domain rank on this chr is: ', format(abs(cor_with_ref), 1, 5), '\n', file=log_file, append=TRUE)
354 | 	       	if(is.null(ref_compartment_file)) cat('Correlation between PC1 and feature_track on this chr is: ', format(abs(cor_with_ref), 1, 5), '\n', file=log_file, append=TRUE)	       	
355 | 
356 | 	       	# cat(sprintf("chr%s: %s\n", chr, format(abs(cor_with_ref), 1, 5)), file=cor_log_file, append=TRUE)
357 | 	       	# if(abs(cor_with_ref) < 0.3) cat('WARNING: correlation between PC1 and reference compartment domain rank on this chr is: ', format(cor_with_ref, digits=5), ', which is a bit low. Possible reason could be that this chromosome has some big structural variations (translocation, inversion for example). We suggest to overlay the compartment track with the hic map together with histone modification or gene expression track to double check the reliability of compartment calling on this chr.', '\n', file=warning_file)
358 | 
359 | 	        time1 = Sys.time()
360 | 	        # delta_time  = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0))
361 | 
362 | 	        delta_time <- time1 - time0
363 | 			timediff <- format(round(delta_time, 2), nsmall = 2)
364 | 
365 | 	        cat('\n\n', 'Total time used for computing compartment domains and their hierachy:', timediff, '\n', file=log_file, append=TRUE)
366 | 	       	# if(abs(gene_density_cor) > 0.2) cat('The gene density and PC1 correlation on this chr is: ', substr(gene_density_cor, 1, 4), '\n', file=log_file, append=TRUE)
367 | 
368 | 			############################################################
369 | 			intermediate_data = res
370 | 			if(save_intermediate_data==TRUE) saveRDS(intermediate_data, file=intermediate_data_file)
371 | 			# cat(intermediate_data_file)
372 | 			return(intermediate_data_file)
373 | 		}
374 | 
375 | 
376 | 
377 | 
378 | 
379 | 		project_to_major_axis <- function(PC_12_atanh)
380 | 		{
381 | 			Data = data.frame(x=PC_12_atanh[,1], y=PC_12_atanh[,2])
382 | 			Data = Data[order(Data$x),]
383 | 			loess_fit <- loess(y ~ x, Data)
384 | 
385 | 			more_x = seq(min(PC_12_atanh[,1]), max(PC_12_atanh[,1]), len=10*length(PC_12_atanh[,1]))
386 | 			major_axis = cbind(x=more_x, y=predict(loess_fit, newdata=more_x))
387 | 			new_x_axis = cumsum(c(0, sqrt(diff(major_axis[,1])^2 + diff(major_axis[,2])^2))) ## the new xaxis on the curved major_axis
388 | 
389 | 			dis = fields::rdist(PC_12_atanh[, 1:2], major_axis)
390 | 			projected_x = new_x_axis[apply(dis, 1, which.min)]
391 | 			names(projected_x) = rownames(PC_12_atanh)
392 | 			# projected_x = major_axis[apply(dis, 1, which.min)]
393 | 			project_info = list(x_pro=projected_x, major_axis=major_axis)
394 | 			return(project_info)
395 | 		}
396 | 
397 | 
398 | 		get_best_reorder <- function(hc_hybrid_x_pro, x_pro)
399 | 		{
400 | 			n = length(x_pro)
401 | 			reordered_names_x_pro_list = list()
402 | 
403 | 			reordered_names_x_pro_list[[1]] = reorder_dendro(hc_hybrid_x_pro, (x_pro), aggregateFun=mean) ## here the clusters are assigned into A.1 A.2 B.1 B.2
404 | 
405 | 			best_index = which.max(sapply(reordered_names_x_pro_list, function(v) cor(1:n, unname(rank(x_pro, ties.method='first')[v]))))
406 | 			return(reordered_names_x_pro_list[[1]])
407 | 		}
408 | 
409 | 
410 | 
411 | 		generate_hierachy_bed = function(chr, res, save_dir, bin_size)
412 | 		{
413 | 			chr_name = paste0('chr', chr)
414 | 			# res = reses[[chr_name]][[CELL_LINE]]
415 | 			hc = res$CALDER_hc
416 | 
417 | 			hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels)
418 | 			bin_comp = data.table::data.table(chr=chr, bin_index=as.numeric(res$bin_names), comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length)))
419 | 			chr_bin_domain = bin_comp
420 | 			chr_bin_domain$chr = paste0('chr', chr_bin_domain$chr)
421 | 
422 | 			# chr_bin_domain = chr_bin_domain[order(bin_index)]
423 | 
424 | 			compartment_info_tab = create_compartment_bed_v4(chr_bin_domain, bin_size=bin_size)
425 | 
426 | 			boundaries = unname(sapply(res$initial_clusters, max))
427 | 			boundaries_ori = as.numeric(res$bin_names[boundaries])*bin_size
428 | 
429 | 			compartment_info_tab$is_boundary = 'gap'
430 | 			compartment_info_tab[compartment_info_tab$pos_end %in% boundaries_ori, 'is_boundary'] = 'boundary'
431 | 			
432 | 			colnames(compartment_info_tab)[4] = 'compartment_label'
433 | 			compartments_tsv_file = paste0(save_dir, '/chr', chr, '_domain_hierachy.tsv')
434 | 			compartments_bed_file = paste0(save_dir, '/chr', chr, '_sub_compartments.bed')
435 | 			boundary_bed_file = paste0(save_dir, '/chr', chr, '_domain_boundaries.bed')
436 | 
437 | 			options(scipen=999)
438 | 			write.table( compartment_info_tab, file=compartments_tsv_file, quote=FALSE, sep='\t', row.names=FALSE )
439 | 
440 | 
441 | 			comp_cols = c("#FF0000", "#FF4848", "#FF9191", "#FFDADA", "#DADAFF", "#9191FF", "#4848FF", "#0000FF") 	
442 | 			names(comp_cols) = c('A.1.1', 'A.1.2', 'A.2.1', 'A.2.2', 'B.1.1', 'B.1.2', 'B.2.1', 'B.2.2')
443 | 			comp_val = (8:1)/8
444 | 			names(comp_val) = names(comp_cols)
445 | 
446 | 			comp_8 = substr(compartment_info_tab$compartment_label, 1, 5)
447 | 
448 | 			compartment_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[, 2:4], comp_val[comp_8], '.', compartment_info_tab[, 2:3], comp_cols[comp_8])
449 | 			write.table( compartment_bed, file=compartments_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
450 | 
451 | 			bounday_bed_raw = subset(compartment_info_tab, is_boundary=='boundary')
452 | 			bounday_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[,3], compartment_info_tab[,3], '', '.', compartment_info_tab[,3], compartment_info_tab[,3], '#000000')
453 | 			write.table( bounday_bed, file=boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
454 | 		}
455 | 
456 | 
457 | 		create_compartment_bed_v4 = function(chr_bin_domain, bin_size)
458 | 		{
459 | 			# for( chr in chrs )
460 | 			{
461 | 				v = chr_bin_domain
462 | 				# v$intra_domain = as.character(6 - (as.numeric(v$intra_domain))) ## invert the labeling
463 | 				# v$intra_domain = names(cols)[(as.numeric(v$intra_domain))]
464 | 				v = v[order(v$bin_index), ]
465 | 
466 | 
467 | 				borders_non_consecutive = which(diff(v$bin_index)!=1)
468 | 				borders_domain = cumsum(rle(v$comp)$lengths)
469 | 				borders = sort(union(borders_non_consecutive, borders_domain))
470 | 				bins = v$bin_index
471 | 				to_id = as.numeric(bins[borders])
472 | 				from_id = as.numeric(bins[c(1, head(borders, length(borders)-1)+1)])
473 | 
474 | 				pos_start = (from_id-1)*bin_size + 1
475 | 				pos_end = to_id*bin_size
476 | 				# chr = as.numeric( gsub('chr', '', v$chr) )
477 | 				chr = gsub('chr', '', v$chr) ## no need for as.numeric, also makes it compatible with chrX
478 | 
479 | 				compartment_info_tab = data.frame(chr=rep(unique(chr), length(pos_start)), pos_start=pos_start, pos_end=pos_end, domain=v$comp[borders])
480 | 			}
481 | 			return(compartment_info_tab)
482 | 		}
483 | 
484 | 
485 | 		CALDER_sub_domains = function(intermediate_data_file=NULL, intermediate_data=NULL, chr, save_dir, bin_size)
486 | 		{	
487 | 		    time0 = Sys.time()
488 | 		    log_file = paste0(save_dir, '/chr', chr, '_sub_domains_log.txt')
489 | 
490 | 		   	cat('[', as.character(chr),']Begin compute sub-domains at:', as.character(Sys.time()))
491 | 		   	cat('>>>> Begin compute sub-domains at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE)
492 | 
493 | 			if(is.null(intermediate_data)) intermediate_data = readRDS(intermediate_data_file)
494 | 			{
495 | 
496 | 			    if(intermediate_data$chr!=chr) stop('intermediate_data$chr!=chr; check your input parameters\n') 
497 | 			    if( !setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) ) stop('!setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) \n')     
498 | 			    compartment_segs = generate_compartment_segs( intermediate_data$initial_clusters )
499 | 
500 | 				cat('[', as.character(chr),'] Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n')   			
501 | 				cat('>>>> Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
502 | 
503 | 				sub_domains_raw = HRG_zigzag_compartment_domain_main_fun(intermediate_data$mat, './', compartment_segs, min_n_bins=2)   
504 | 
505 | 			    no_output = post_process_sub_domains(chr, sub_domains_raw, ncores=1, out_dir=save_dir, bin_size=bin_size)
506 | 
507 | 			    cat('>>>> Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
508 | 			    cat('[', as.character(chr),'] Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n')
509 | 
510 | 			   	time1 = Sys.time()
511 | 		        # delta_time  = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0))
512 | 		        delta_time <- time1 - time0
513 | 				timediff <- format(round(delta_time, 2), nsmall = 2)
514 | 
515 | 		        cat('\n\n', 'Total time used for computing sub-domains:', timediff, '\n', file=log_file, append=TRUE)
516 | 			}
517 | 			# return(NULL)
518 | 		}
519 | 
520 | 


--------------------------------------------------------------------------------
/R/CALDER_main.R:
--------------------------------------------------------------------------------
  1 | 
  2 |     CALDER = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chrs, bin_size, save_dir, save_intermediate_data=FALSE, swap_AB=0, genome='others', feature_track=NULL, black_list_bins=NULL, n_cores=1, sub_domains=FALSE, single_binsize_only=FALSE)
  3 |     {
  4 |     		########################### https://stackoverflow.com/questions/30216613/how-to-use-dopar-when-only-import-foreach-in-description-of-a-package
  5 | 
  6 |     		`%dopar%` <- foreach::`%dopar%`
  7 | 			`%do%` <- foreach::`%do%`
  8 | 
  9 |     		###########################
 10 | 
 11 |             chrs = as.character(chrs)
 12 |             n_chrs = length(chrs)
 13 | 
 14 |             if(n_chrs > 1) ## when computing for multiple chrs, contact_tab_dump or contact_file_dump should be a list with elements matching to that of chrs
 15 |             {
 16 |             	if(!is.null(contact_tab_dump))
 17 |             	{
 18 |             		if(class(contact_tab_dump)!='list' | length(contact_tab_dump)!=n_chrs) stop('contact_tab_dump should be a list of contact table with its length equal to the length of chrs you specified\n')
 19 |                     names(contact_tab_dump) = chrs
 20 |             	}
 21 | 
 22 |             	if(!is.null(contact_file_dump))
 23 |             	{
 24 |             		if(class(contact_file_dump)!='list' | length(contact_file_dump)!=n_chrs) stop('contact_file_dump should be a list of contact table with its length equal to the length of chrs you specified\n')
 25 |                     names(contact_file_dump) = chrs
 26 |             	}
 27 |             }
 28 | 
 29 |             ########################### try multiple bin_size parameters and choose the one that generates reliable compartments
 30 | 
 31 |             if(bin_size==5E3) bin_sizes = c(5E3, 10E3, 50E3, 100E3)
 32 |             if(bin_size==10E3) bin_sizes = c(10E3, 50E3, 100E3)
 33 |             if(bin_size==20E3) bin_sizes = c(20E3, 40E3, 100E3)
 34 |             if(bin_size==25E3) bin_sizes = c(25E3, 50E3, 100E3)
 35 |             if(bin_size==40E3) bin_sizes = c(40E3, 80E3)
 36 |             if(bin_size==50E3) bin_sizes = c(50E3, 100E3)
 37 |             if(!(bin_size %in% c(5E3, 10E3, 20E3, 25E3, 40E3, 50E3))) bin_sizes = bin_size
 38 | 
 39 |             if(genome=='others') single_binsize_only = TRUE
 40 |             if(single_binsize_only) bin_sizes = bin_size ## do not try multiple bin_sizes
 41 | 
 42 |             ## choose the already computed good compartment as reference to decide correctly the A/B compartment direction
 43 |             if(genome!='others') ref_compartment_file = system.file("extdata", sprintf('%s_all_sub_compartments.bed', genome), package = 'CALDER')
 44 |             if(genome=='others') ref_compartment_file = NULL
 45 | 
 46 |             ###########################
 47 | 
 48 |             if(is.null(ref_compartment_file) & is.null(feature_track)) stop('Should either specify genome in one of hg19, hg38, mm9, mm10, or provide a feature_track\n')
 49 | 
 50 |             ###########################
 51 | 
 52 |             doParallel::registerDoParallel(cores=n_cores)
 53 | 
 54 |             for(bin_size2look in bin_sizes)
 55 |             {
 56 |             	bin_size_kb = sprintf("%skb", bin_size2look/1E3)
 57 |             	save_dir_binsize = file.path(save_dir, 'intermediate_data/sub_compartments', bin_size_kb)
 58 |             	dir.create(save_dir_binsize, recursive=TRUE, showWarnings=FALSE)
 59 |             }
 60 | 
 61 |             para_tab = data.table::data.table(expand.grid(bin_sizes, chrs))
 62 |             for(i in 1:ncol(para_tab)) para_tab[[i]] = as.vector(para_tab[[i]])
 63 |             colnames(para_tab) = c('bin_size', 'chr')
 64 | 
 65 |             ########################### compute compartment for each bin_size and chr
 66 | 
 67 |             silent_out =  foreach::foreach(i=1:nrow(para_tab)) %dopar%
 68 | 	        {	
 69 | 	        	bin_size2look = para_tab$bin_size[i]
 70 | 	           	chr = para_tab$chr[i]
 71 |            		bin_size_kb = sprintf("%skb", bin_size2look/1E3)
 72 |             	save_dir_binsize = file.path(save_dir, 'intermediate_data/sub_compartments', bin_size_kb)
 73 | 
 74 |             	save_intermediate_data_tmp = save_intermediate_data*(bin_size2look==bin_size)
 75 | 
 76 | 	            CALDER_CD_hierarchy_v2(contact_tab_dump=contact_tab_dump[[chr]], contact_file_dump=contact_file_dump[[chr]], contact_file_hic=contact_file_hic, chr=chr, bin_size_input=bin_size, bin_size2look=bin_size2look, save_dir=save_dir_binsize, save_intermediate_data=save_intermediate_data_tmp, swap_AB=swap_AB, ref_compartment_file=ref_compartment_file, feature_track=feature_track, black_list_bins=black_list_bins)
 77 | 	        }
 78 | 
 79 |             ########################### build optimal compartment from multiple resolutions
 80 | 
 81 |             try(build_comp_table_opt(save_dir=save_dir, chrs=chrs, bin_sizes=bin_sizes, with_ref=!is.null(genome)))
 82 | 
 83 |             ########################### if computing sub-domains
 84 | 
 85 |             if(sub_domains==TRUE) 
 86 |             {
 87 |             	silence_out = foreach::foreach(chr=chrs) %do% ## use one core instead of multi-cores
 88 |             	{
 89 |             		chr_num = gsub('chr', '', chr, ignore.case=TRUE)
 90 |             		intermediate_data_file = sprintf('%s/%skb/chr%s_intermediate_data.Rds', file.path(save_dir, 'intermediate_data/sub_compartments') , bin_size/1E3, chr_num) 
 91 |             		save_dir_sub_domains = file.path(save_dir, 'intermediate_data/sub_domains')
 92 | 					dir.create(save_dir_sub_domains, showWarnings = FALSE)
 93 |             		try(CALDER_sub_domains(intermediate_data_file=intermediate_data_file, chr=chr, save_dir=save_dir_sub_domains, bin_size=bin_size))
 94 |             	}
 95 | 
 96 | 				save_dir_opt = file.path(save_dir, "sub_domains")
 97 | 				dir.create(save_dir_opt, recursive=TRUE, showWarnings=FALSE)
 98 | 				save_opt_file = file.path(save_dir_opt, sprintf("all_nested_boundaries.bed"))
 99 | 
100 | 				opt_sub_domain_beds = sprintf('%s/chr%s_nested_boundaries.bed', file.path(save_dir, 'intermediate_data/sub_domains') , gsub('chr', '', chrs, ignore.case=TRUE)) 
101 | 				cmd_opt = sprintf("cat %s > %s", paste0(opt_sub_domain_beds, collapse=" "), save_opt_file)
102 | 				system(cmd_opt)
103 |             }
104 |     }
105 | 
106 |     
107 | 


--------------------------------------------------------------------------------
/R/HRG_MLE_each_compartment_fun.R:
--------------------------------------------------------------------------------
  1 | 	## Yuanlong LIU
  2 | 	## 12/04/2018
  3 | 	## 23/05/2018
  4 | 	## 29/05/2018
  5 | 	
  6 | 	## A should be already named
  7 | 	## A should be symmetric
  8 | 	## Does not remove 0-rows/columns of A
  9 | 	HRG_MLE_each_compartment_fun <- function(pA_sym, seg, allowed_max_nbins_seq=500:1000, max_nbins_fine=max(allowed_max_nbins_seq), ncores, res_dir, fast, adjust_segmentss_topdom=TRUE)
 10 | 	{
 11 | 		min_n_bins_outer=2
 12 | 		min_n_bins_inner=2
 13 | 		distr = 'lnorm'
 14 | 		A = pA_sym[seg, seg]
 15 | 		if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices')
 16 | 		arg_list = as.list(environment())
 17 | 		res_folder = file.path(res_dir)
 18 | 		dir.create(res_folder, recursive=TRUE, showWarnings = TRUE)
 19 | 		
 20 | 		total_execution_time_file = file.path(res_dir, 'total_execution.time')
 21 | 		cat('n_core used:', ncores, '\n\n', file=total_execution_time_file, append=FALSE)
 22 | 		time_begin = Sys.time()
 23 | 		cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE)
 24 | 	
 25 | 		## max_nbins is defined as the one from allowed_max_nbins_seq has the smallest residue, from which, the smallest is taken
 26 | 		n_A = nrow(A)
 27 | 		
 28 | 		## IF THE SEGMENT IS ALREADY SMALL ENOUGH
 29 | 		if( n_A <= max(allowed_max_nbins_seq) ) 
 30 | 		{
 31 | 			## This will be modified
 32 | 			{
 33 | 				dists = (2*min_n_bins_outer - 1):(nrow(A)-1)
 34 | 				counts = sapply(dists, function(v) n_cells2compute( A, v, min_n_bins_outer ))
 35 | 				split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
 36 | 				chunks = c(dists[1]-1, dists[split_info$break_points])
 37 | 				res_outer = HRG_core( A=A, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast )
 38 | 			}
 39 | 			
 40 | 			## this part gets the hi_tree
 41 | 			{
 42 | 				hi_tree = get_tree_v0( res_outer )
 43 | 				igraph::V(hi_tree)$left_rescaled = igraph::V(hi_tree)$left
 44 | 				igraph::V(hi_tree)$right_rescaled = igraph::V(hi_tree)$right
 45 | 				igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right - igraph::V(hi_tree)$left + 1
 46 | 				igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
 47 | 			}
 48 | 			
 49 | 			full_tree = hi_tree
 50 | 			res_info = list( arg_list=arg_list, trunk=NULL, segmentss_nadj=NULL, segmentss=NULL, res_outer=NULL, res_inner=list(res_outer) )
 51 | 			res_info$full_tree = full_tree
 52 | 			res_folder_final = file.path(res_dir, 'final')
 53 | 			dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
 54 | 			save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
 55 | 	
 56 | 			time_finish = Sys.time()
 57 | 			cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
 58 | 			cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)
 59 | 			return(res_info)
 60 | 		}
 61 | 		
 62 | 		## which one leads to the least residue
 63 | 		max_nbins = allowed_max_nbins_seq[ min(which(n_A%%allowed_max_nbins_seq==min(n_A%%allowed_max_nbins_seq))) ]
 64 | 		residue = n_A %% max_nbins
 65 | 	
 66 | 		residue_mat_info = get_least_residue_matrix(A, max_nbins=max_nbins, allowed_shifts=0)
 67 | 		A_final = residue_mat_info$A_final
 68 | 		n2one = residue_mat_info$n2one
 69 | 
 70 | 		## A_low has no row/col name
 71 | 		A_low = HighResolution2Low_k( A_final, k=max_nbins ) ## k is the dimension of A_final_low
 72 | 		
 73 | 		## Starting from here, the original row/col names are no longer used
 74 | 		
 75 | 		## this part run HRG_core on the outer part
 76 | 		{
 77 | 			dists = (2*min_n_bins_outer - 1):(nrow(A_low)-1)
 78 | 			counts = sapply(dists, function(v) n_cells2compute( A_low, v, min_n_bins_outer ))
 79 | 			split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
 80 | 			chunks = c(dists[1]-1, dists[split_info$break_points])
 81 | 			res_outer = HRG_core( A=A_low, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast )
 82 | 		}
 83 | 
 84 | 		## this part gets the hi_tree
 85 | 		{
 86 | 			hi_tree = get_tree_v0( res_outer )
 87 | 			igraph::V(hi_tree)$left_rescaled = n2one*(igraph::V(hi_tree)$left - 1) + 1
 88 | 			igraph::V(hi_tree)$right_rescaled = n2one*igraph::V(hi_tree)$right
 89 | 			node2adjust = which(igraph::V(hi_tree)$right_rescaled==max(igraph::V(hi_tree)$right_rescaled)) ## append the residue bins here
 90 | 			igraph::V(hi_tree)[node2adjust]$right_rescaled = n_A
 91 | 			
 92 | 			igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right_rescaled - igraph::V(hi_tree)$left_rescaled + 1
 93 | 			igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
 94 | 		}
 95 | 
 96 | 		# segmentss = get_segments(hi_tree, binsize_thresh=max_nbins_bp)	
 97 | 
 98 | 		segmentss_nadj = get_segments(hi_tree, binsize_thresh=max_nbins_fine)
 99 | 		
100 | 		## THIS PARAT ADJUST THE segmentss BASED ON TOPDOM BIN_SIGNAL
101 | 		## 14-05-2018
102 | 		## Also adjust the tree
103 | 		if(adjust_segmentss_topdom==TRUE)
104 | 		{
105 | 			shift = seg[1]-1
106 | 			segmentss = segmentss_adjust_topdom(pA_sym, segmentss_nadj+shift, n2one, ws=5:20) - shift
107 | 			for( i in 1:nrow(segmentss) )
108 | 			{
109 | 				index_left2adj = which(igraph::V(hi_tree)$left_rescaled == segmentss_nadj[i,1])
110 | 				index_right2adj = which(igraph::V(hi_tree)$right_rescaled == segmentss_nadj[i,2])
111 | 				igraph::V(hi_tree)[index_left2adj]$left_rescaled = segmentss[i,1]
112 | 				igraph::V(hi_tree)[index_right2adj]$right_rescaled = segmentss[i,2]
113 | 				igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right_rescaled - igraph::V(hi_tree)$left_rescaled + 1
114 | 				igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
115 | 			}
116 | 		}
117 | 		
118 | 		trunk = hi_tree
119 | 		# save(segmentss, file=file.path(res_dir, 'outer', 'segmentss.Rdata'))
120 | 		
121 | 		res_inner = rep(list(list()), nrow(segmentss))
122 | 		for( i in 1:nrow(segmentss) )
123 | 		{
124 | 			cat(i,'\n')
125 | 			# if(i==2) while(as.numeric(substr(as.character(Sys.time()),12,13))!=20) {}
126 | 			index = segmentss[i,1]:segmentss[i,2]
127 | 			A_part = A[index, index]
128 | 			dists = (2*min_n_bins_inner - 1):(nrow(A_part)-1)
129 | 			counts = sapply(dists, function(v) n_cells2compute( A_part, v, min_n_bins_inner ))
130 | 			split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
131 | 			chunks = c(dists[1]-1, dists[split_info$break_points])
132 | 			
133 | 			## A_part is named
134 | 			res_info_inner = HRG_core( A=A_part, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_inner, distr=distr, fast=fast )
135 | 			res_inner[[i]] = res_info_inner      #minimum size filtering
136 | 			cat('finished', '\n')
137 | 		}
138 | 		
139 | 		res_info = list( arg_list=arg_list, trunk=trunk, segmentss_nadj=segmentss_nadj, segmentss=segmentss, res_outer=res_outer, res_inner=res_inner )
140 | 		res_folder_final = file.path(res_dir, 'final')
141 | 		dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
142 | 		save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
143 | 	
144 | 		time_finish = Sys.time()
145 | 		cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
146 | 		cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)
147 | 
148 | 
149 | 		## xenocfraf:
150 | 		# res_inner = res_inner[sapply(res_inner, length) > 0]
151 | 		branches = lapply( res_inner, get_tree_v0 )
152 | 		for( i in 1:length(branches) ) branches[[i]] = update_branch_name(branches[[i]], root_start=segmentss[i,1])
153 | 		
154 | 		full_tree = xenocraft( trunk, branches )
155 | 		names_tmp = do.call(rbind, strsplit(igraph::V(full_tree)$name, ','))
156 | 		igraph::V(full_tree)$left = substring(names_tmp[,1], 2)
157 | 		igraph::V(full_tree)$right = substring(names_tmp[,2], 1, nchar(names_tmp[,2])-1)
158 | 		if(!is_binary_tree(full_tree)) stop("Trunk + branches do not produce a binary tree")
159 | 		
160 | 		res_info$full_tree = full_tree
161 | 		save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
162 | 	
163 | 		return( res_info )
164 | 	}
165 | 	
166 | 


--------------------------------------------------------------------------------
/R/HighResolution2Low.R:
--------------------------------------------------------------------------------
  1 | 	## These functions try to aggregate bin-wise contact matrix into a domain-wise contact matrix
  2 | 
  3 | 	HighResolution2Low_k_complete <-function( A, max_nbins )
  4 | 	{
  5 | 		nbins = nrow( A )
  6 | 		
  7 | 		max_nbins_bp = max_nbins
  8 | 		max_nbinss = max_nbins + (-5:20)
  9 | 		expand_fold_info = t(sapply(max_nbinss, fold_expand_rate, K=nbins))
 10 | 		index = max(which(expand_fold_info[,'rescale_rate']==min(expand_fold_info[,'rescale_rate'])))
 11 | 		max_nbins = max_nbinss[ index ]
 12 | 		n2one_head = expand_fold_info[index, 'n2one_head']
 13 | 		n2one_last = expand_fold_info[index, 'n2one_last']
 14 | 		
 15 | 		if(n2one_head==1) return(list())
 16 | 		rescale_rate = n2one_last / n2one_head
 17 | 
 18 | 		if(max_nbins!=max_nbins_bp) warning('Value of max_nbins has been adjusted from ', max_nbins_bp, ' to ', max_nbins, '. This results in rescale_rate as: ', rescale_rate, '\n')
 19 | 		
 20 | 		if(rescale_rate!=1) ## only do when rescale is not 1
 21 | 		{
 22 | 			aa_end = n2one_head*(max_nbins-1)
 23 | 			bb_end = nrow(A)
 24 | 			A_aa = A[1:aa_end, 1:aa_end]
 25 | 			A_bb = A[(aa_end+1):bb_end, (aa_end+1):bb_end]
 26 | 			A_ab = A[1:aa_end, (aa_end+1):bb_end, drop=FALSE]
 27 | 			A_aa_low = HighResolution2Low_k(mat=A_aa, k=max_nbins-1)
 28 | 			A_bb_low = sum( A_bb )
 29 | 			
 30 | 			## https://stackoverflow.com/questions/15265512/summing-every-n-points-in-r
 31 | 			v = apply(A_ab, 1, sum)
 32 | 			A_ab_low = unname(tapply(v, (seq_along(v)-1) %/% n2one_head, sum))
 33 | 			
 34 | 			A_ab_low_new = A_ab_low / n2one_last*n2one_head
 35 | 			A_bb_low_new = A_bb_low / n2one_last^2*n2one_head^2
 36 | 			A_low = rbind(cbind(A_aa_low, A_ab_low_new), c( A_ab_low_new, A_bb_low_new ))
 37 | 			colnames(A_low) = NULL
 38 | 		}
 39 | 		
 40 | 		if(rescale_rate==1) A_low = HighResolution2Low_k(mat=A, k=max_nbins) ## k is the dimension of A_final_low
 41 | 		## A_low has no row/col name
 42 | 		res = list(A_low=A_low, n2one_head=n2one_head, n2one_last=n2one_last)
 43 | 		return(res)
 44 | 	}	
 45 | 	
 46 | 	split_info <-function( K, max_nbins )
 47 | 	{
 48 | 		x = floor( K/max_nbins ) + 1
 49 | 		n2 = max_nbins*x - K
 50 | 		n1 = max_nbins - n2
 51 | 		vec = c(rep(x, n1), rep(x-1, n2))
 52 | 		split_vec = split(1:K, rep(1:max_nbins, vec))
 53 | 
 54 | 		expand_fold_ratio = min(n1/n2, n2/n1)
 55 | 		res = list(vec=vec, expand_fold_ratio=expand_fold_ratio, split_vec=split_vec)
 56 | 		return(res)
 57 | 	}
 58 | 	
 59 | 	
 60 | 	## diagonal values are summed twice
 61 | 	HighResolution2Low <-function( A, rescale, which=2 )
 62 | 	{
 63 | 		nbins = nrow(A)
 64 | 		nbins_low = floor(nbins/rescale)
 65 | 		A_low = matrix( , nbins_low, nbins_low)
 66 | 		keep_rows = nbins - nbins%%nbins_low
 67 | 		A_truncated = A[ 1:keep_rows, 1:keep_rows ]
 68 | 		
 69 | 		if(which==1) A_low = matsplitter_sum(A_truncated, keep_rows/nbins_low, keep_rows/nbins_low)
 70 | 		if(which==2) A_low = mat_split_sum(A_truncated, keep_rows/nbins_low, keep_rows/nbins_low)
 71 | 
 72 | 		return( A_low )
 73 | 	}
 74 | 	
 75 | 	## https://stackoverflow.com/questions/24299171/function-to-split-a-matrix-into-sub-matrices-in-r
 76 | 	## Function to split a matrix into sub-matrices in R
 77 | 	matsplitter_sum <- function(M, r, c) 
 78 | 	{
 79 | 		rg <- (row(M)-1)%/%r + 1
 80 | 		cg <- (col(M)-1)%/%c + 1
 81 | 		rci <- (rg-1)*max(cg) + cg
 82 | 		N <- prod(dim(M))/r/c
 83 | 		cv <- unlist(lapply(1:N, function(x) M[rci==x]))
 84 | 		dim(cv)<-c(r, c, N)
 85 | 		res = matrix(apply(cv, 3, sum), nrow(M)/r, byrow=TRUE)
 86 | 		return(res)
 87 | 	}
 88 | 
 89 | 
 90 | 	mat_split_sum <- function(M, r, c)
 91 | 	{
 92 | 	  nr <- ceiling(nrow(M)/r)
 93 | 	  nc <- ceiling(ncol(M)/c)
 94 | 	  newM <- matrix(NA, nr*r, nc*c)
 95 | 	  newM[1:nrow(M), 1:ncol(M)] <- M
 96 | 
 97 | 	  div_k <- kronecker(matrix(seq_len(nr*nc), nr, byrow = TRUE), matrix(1, r, c))
 98 | 	  matlist <- split(newM, div_k)
 99 | 	  res = matrix(sapply(matlist, sum), nrow(M)/r, byrow=TRUE)
100 | 	  return(res)
101 | 	}
102 | 	
103 | 	## this is much faster
104 | 	HighResolution2Low_k <- function(mat, k)
105 | 	{
106 | 		chunk2 <- function(x, n) split(x, cut(seq_along(x), n, labels = FALSE)) 
107 | 		n = nrow(mat)
108 | 		A_low = matrix( , k, k )
109 | 		indices = chunk2( 1:n, k )
110 | 		for(row_index in 1:k)
111 | 		{
112 | 			A_sub = mat[indices[[row_index]], ]
113 | 			for(col_index in 1:k) A_low[row_index, col_index] = sum( A_sub[ , indices[[col_index]]] )
114 | 		}
115 | 		return( A_low )
116 | 	}
117 | 	
118 | 	HighResolution2Low_k_rectangle <- function(mat, row_split, col_split, sum_or_mean=c('sum', 'mean', 'median', 'p_above_one'))
119 | 	{
120 | 		# if(remove_zero==TRUE) mat[mat==0] = NA
121 | 		k_row = length(row_split)
122 | 		k_col = length(col_split)
123 | 		
124 | 		A_low = matrix( , k_row, k_col )
125 | 		
126 | 		if(sum_or_mean=='sum')
127 | 		{
128 | 			for(row_index in 1:k_row)
129 | 			{
130 | 				A_sub = mat[row_split[[row_index]], , drop=FALSE ]
131 | 				for(col_index in 1:k_col) A_low[row_index, col_index] = sum( A_sub[ , col_split[[col_index]]] )
132 | 			}
133 | 		}
134 | 
135 | 		if(sum_or_mean=='mean')
136 | 		{
137 | 			for(row_index in 1:k_row)
138 | 			{
139 | 				A_sub = mat[row_split[[row_index]], , drop=FALSE ]
140 | 				for(col_index in 1:k_col) A_low[row_index, col_index] = mean( A_sub[ , col_split[[col_index]]], na.rm=TRUE )
141 | 			}
142 | 		}
143 | 
144 | 		if(sum_or_mean=='median')
145 | 		{
146 | 			for(row_index in 1:k_row)
147 | 			{
148 | 				A_sub = mat[row_split[[row_index]], , drop=FALSE ]
149 | 				for(col_index in 1:k_col) A_low[row_index, col_index] = median( A_sub[ , col_split[[col_index]]], na.rm=TRUE )
150 | 			}
151 | 		}
152 | 
153 | 		if(sum_or_mean=='p_above_one')
154 | 		{
155 | 			for(row_index in 1:k_row)
156 | 			{
157 | 				A_sub = mat[row_split[[row_index]], , drop=FALSE ]
158 | 				for(col_index in 1:k_col) A_low[row_index, col_index] = mean( A_sub[ , col_split[[col_index]]] > 1 )
159 | 			}
160 | 		}
161 | 		# if(remove_zero==TRUE) A_low[is.na(A_low)] = 0
162 | 		return( A_low )
163 | 	}	
164 | 	
165 | 	## for a n x n matrix, this function generate a nxm matrix, with m*compress_size = n
166 | 	## this function is used for generating the nxm matrix, where the i,jth value is the contact value 
167 | 	## 08-08-2018  
168 | 	compress_mat_fast = function(input_mat, compress_size)
169 | 	{
170 | 		mat_block <- function(n, r) suppressWarnings( matrix(c(rep(1, r), rep(0, n)), n, n/r) )
171 | 		n = ncol(input_mat)
172 | 		mat2prod = mat_block(n, compress_size)
173 | 		# return(t(input_mat%*%mat2prod / compress_size))
174 | 		return(t(matrix_multiplication_cpp(input_mat, mat2prod)) / compress_size)
175 | 	}
176 | 	
177 | 	


--------------------------------------------------------------------------------
/R/LikelihoodRatioTest_fun.R:
--------------------------------------------------------------------------------
  1 | 	## The following code tries to evaluate the performance of different likelihood ratio-based tests
  2 | 	## Yuanlong LIU
  3 | 	## 01-03-2018
  4 | 	
  5 | 	############## model 1: keep the same structure ##############
  6 | 	## In this case, a tree structure should be provided
  7 | 	## Constraint: the top three levels have the same contact intensity parameter
  8 | 
  9 | 	## 	01-03-2018
 10 | 	p_likelihood_ratio <- function( A, head, mid, tail, num )
 11 | 	{
 12 | 		A_whole = A[head:tail, head:tail]
 13 | 		A_a = A[head:mid, head:mid]
 14 | 		A_b = A[(mid+1):tail, (mid+1):tail]
 15 | 		A_ab = A[head:mid, (mid+1):tail]
 16 | 		
 17 | 		tri_Awhole = A_whole[upper.tri(A_whole, diag=FALSE)] 
 18 | 		tri_Aa = A_a[upper.tri(A_a, diag=FALSE)] 
 19 | 		tri_Ab = A_b[upper.tri(A_b, diag=FALSE)] 
 20 | 		# gamma_fit(tri_Awhole, num)
 21 | 		
 22 | 		inner_a = get_prob( tri_Aa )
 23 | 		inner_b = get_prob( tri_Ab )
 24 | 		inter_ab = get_prob( A_ab )
 25 | 		
 26 | 		## negative binomial
 27 | 		# inner_a = get_prob_ng( tri_Aa )
 28 | 		# inner_b = get_prob_ng( tri_Ab )
 29 | 		# inter_ab = get_prob_ng( A_ab )
 30 | 
 31 | 		
 32 | 		## log likelihood of H1
 33 | 		LH1 = inner_a + inner_b + inter_ab
 34 | 		LH0 = get_prob( tri_Awhole )
 35 | 		
 36 | 		Lambda = -2*( LH0 - LH1 )
 37 | 		# cat(Lambda, '\n')
 38 | 		df_h0 = 1*1
 39 | 		df_h1 = 3*1
 40 | 		df = df_h1 - df_h0
 41 | 		
 42 | 		p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
 43 | 		info = list( Lambda=Lambda, p=p )
 44 | 		return(info)
 45 | 	}
 46 | 	
 47 | 	p_likelihood_ratio_nb <- function( A, head, mid, tail )
 48 | 	{
 49 | 		A_whole = A[head:tail, head:tail]
 50 | 		A_a = A[head:mid, head:mid]
 51 | 		A_b = A[(mid+1):tail, (mid+1):tail]
 52 | 		A_ab = A[head:mid, (mid+1):tail]
 53 | 		
 54 | 		tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 
 55 | 		tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 
 56 | 		tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 
 57 | 		
 58 | 		## negative binomial
 59 | 		inner_a = get_prob_nb( tri_Aa )
 60 | 		inner_b = get_prob_nb( tri_Ab )
 61 | 		inter_ab = get_prob_nb( A_ab )
 62 | 
 63 | 		
 64 | 		## log likelihood of H1
 65 | 		LH1 = inner_a + inner_b + inter_ab
 66 | 		LH0 = get_prob_nb( tri_Awhole )
 67 | 		
 68 | 		Lambda = -2*( LH0 - LH1 )
 69 | 		# cat(Lambda, '\n')
 70 | 		n_parameters = 2
 71 | 		df_h0 = 1*n_parameters
 72 | 		df_h1 = 3*n_parameters
 73 | 		df = df_h1 - df_h0
 74 | 		
 75 | 		p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
 76 | 		info = list( Lambda=Lambda, p=p )
 77 | 		return(info)
 78 | 	}
 79 | 
 80 | 	p_likelihood_ratio_norm <- function( A, head, mid, tail )
 81 | 	{
 82 | 		A_whole = A[head:tail, head:tail]
 83 | 		A_a = A[head:mid, head:mid]
 84 | 		A_b = A[(mid+1):tail, (mid+1):tail]
 85 | 		A_ab = A[head:mid, (mid+1):tail]
 86 | 		
 87 | 		tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 
 88 | 		tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 
 89 | 		tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 
 90 | 		
 91 | 		## norm
 92 | 		inner_a = likelihood_norm( tri_Aa )
 93 | 		inner_b = likelihood_norm( tri_Ab )
 94 | 		inter_ab = likelihood_norm( A_ab )
 95 | 
 96 | 		
 97 | 		## log likelihood of H1
 98 | 		LH1 = inner_a + inner_b + inter_ab
 99 | 		LH0 = likelihood_norm( tri_Awhole )
100 | 		
101 | 		Lambda = -2*( LH0 - LH1 )
102 | 		# cat(Lambda, '\n')
103 | 		n_parameters = 2
104 | 		df_h0 = 1*n_parameters
105 | 		df_h1 = 3*n_parameters
106 | 		df = df_h1 - df_h0
107 | 		
108 | 		p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
109 | 		info = list( Lambda=Lambda, p=p )
110 | 		return(info)
111 | 	}
112 | 
113 | 	
114 | 	
115 | 	p_likelihood_ratio_gamma <- function( A, head, mid, tail, n_parameters, imputation )
116 | 	{
117 | 		A_whole = A[head:tail, head:tail]
118 | 		A_a = A[head:mid, head:mid]
119 | 		A_b = A[(mid+1):tail, (mid+1):tail]
120 | 		A_ab = A[head:mid, (mid+1):tail]
121 | 		
122 | 		tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 
123 | 		## added 25-03-2018. If no zero values, no imputation
124 | 		if( length(tri_Awhole==0) == 0 ) imputation = FALSE
125 | 		
126 | 		tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 
127 | 		tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 
128 | 		
129 | 		## norm
130 | 		inner_a = likelihood_gamma_mme( tri_Aa )
131 | 		inner_b = likelihood_gamma_mme( tri_Ab )
132 | 		inter_ab = likelihood_gamma_mme( A_ab )
133 | 		whole = likelihood_gamma_mme( tri_Awhole )
134 | 		
135 | 		if( imputation ) ## zero values are imputed by the estimated distribution based on non random values
136 | 		{
137 | 			inner_a = likelihood_gamma_mme( tri_Aa[tri_Aa!=0] )/length( tri_Aa[tri_Aa!=0] )*length( tri_Aa )
138 | 			inner_b = likelihood_gamma_mme( tri_Ab[tri_Ab!=0] )/length( tri_Ab[tri_Ab!=0] )*length( tri_Ab )
139 | 			inter_ab = likelihood_gamma_mme( A_ab )/length( A_ab[A_ab!=0] )*length( A_ab )
140 | 			whole = likelihood_gamma_mme( tri_Awhole[tri_Awhole!=0] )/length( tri_Awhole[tri_Awhole!=0] )*length( tri_Awhole )
141 | 			n_parameters = n_parameters - 1 ## the mixture parameter of 0 is not taken into account
142 | 		}
143 | 
144 | 		
145 | 		## log likelihood of H1
146 | 		LH1 = inner_a + inner_b + inter_ab
147 | 		LH0 = whole
148 | 		
149 | 		Lambda = -2*( LH0 - LH1 )
150 | 		# cat(Lambda, '\n')
151 | 		df_h0 = 1*n_parameters
152 | 		df_h1 = 3*n_parameters
153 | 		df = df_h1 - df_h0
154 | 		
155 | 		p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
156 | 		info = list( Lambda=Lambda, p=p )
157 | 		return(info)
158 | 	}
159 | 
160 | 	lognormal_mean_test <- function( cA, head, mid, tail )
161 | 	{
162 | 		A_whole = cA[head:tail, head:tail]
163 | 		A_a = cA[head:mid, head:mid]
164 | 		A_b = cA[(mid+1):tail, (mid+1):tail]
165 | 		A_ab = cA[head:mid, (mid+1):tail]
166 | 		
167 | 		tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 
168 | 		tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 
169 | 		
170 | 		tri_Aa_vec = as.vector( tri_Aa )
171 | 		tri_Ab_vec = as.vector( tri_Ab )
172 | 		A_ab_vec = as.vector( A_ab )
173 | 		
174 | 		tri_Aa_vec_p = tri_Aa_vec[tri_Aa_vec!=0]
175 | 		tri_Ab_vec_p = tri_Ab_vec[tri_Ab_vec!=0]
176 | 		A_ab_vec_p = A_ab_vec[A_ab_vec!=0]
177 | 		
178 | 		p_Aa = p_lognormal_mean( tri_Aa_vec_p, A_ab_vec_p )
179 | 		p_Ab = p_lognormal_mean( tri_Ab_vec_p, A_ab_vec_p )
180 | 		return( list(p_Aa=p_Aa, p_Ab=p_Ab) )
181 | 	}
182 | 	
183 | 	## https://www.jstor.org/stable/2533570
184 | 	## google: Methods for Comparing the Means of Two Independent Log-Normal Samples
185 | 	## 03-07-2018
186 | 	p_lognormal_mean <- function( vec_aa, vec_ab )
187 | 	{
188 | 		if( all(vec_aa==0) | all(vec_ab==0) ) return(0)
189 | 		
190 | 		n_aa = length(vec_aa)
191 | 		n_ab = length(vec_ab)
192 | 		fited_info_aa = MASS::fitdistr(vec_aa, 'lognormal') ## intra
193 | 		mu_aa = fited_info_aa$estimate[1]
194 | 		sd_aa = fited_info_aa$estimate[2]
195 | 		s2_aa = sd_aa^2
196 | 
197 | 		fited_info_ab = MASS::fitdistr(vec_ab, 'lognormal') ## inter
198 | 		mu_ab = fited_info_ab$estimate[1]
199 | 		sd_ab = fited_info_ab$estimate[2]
200 | 		s2_ab = sd_ab^2
201 | 		
202 | 		z_score = ( (mu_aa - mu_ab) + (1/2)*(s2_aa - s2_ab) ) / sqrt( s2_aa/n_aa + s2_ab/n_ab + (1/2)*( s2_aa^2/(n_aa-1) + s2_ab^2/(n_ab-1) ) )
203 | 		p = pnorm( z_score, lower.tail=FALSE )
204 | 		return(p)
205 | 	}
206 | 
207 | 	get_corner_xy <- function(A_whole)
208 | 	{
209 | 		n = nrow(A_whole)
210 | 		corner_size = floor(n/2)
211 | 		A_corner = A_whole[1:corner_size, (n - corner_size + 1 ):n]
212 | 		expected_high = A_corner[upper.tri(A_corner, diag=FALSE)]
213 | 		expected_low = A_corner[lower.tri(A_corner, diag=TRUE)]
214 | 		return(list(x=expected_low, y=expected_high))
215 | 	}
216 | 
217 | 	get_half_mat_values <- function(mat)
218 | 	{
219 | 		n1 = nrow(mat)
220 | 		n2 = ncol(mat)
221 | 		delta = n1/n2
222 | 		rows = lapply( 1:n2, function(x) (1+ceiling(x*delta)):n1 )
223 | 		rows = rows[ sapply(rows, function(v) max(v) <= n1) ]
224 | 		flag = which(diff(sapply(rows, length)) > 0)
225 | 		if(length(flag)>0) rows = rows[ 1:min(flag) ]
226 | 		
227 | 		row_col_indices = cbind( unlist(rows), rep(1:length(rows), sapply(rows, length)))
228 | 		x = mat[row_col_indices]
229 | 		mat[row_col_indices] = NA
230 | 		y = na.omit(as.vector(mat))
231 | 		return(list(x=x, y=y))
232 | 	}
233 | 
234 | 
235 | 	get_half_mat_values_v2 <- function(mat)
236 | 	{
237 | 		## https://stackoverflow.com/questions/52990525/get-upper-triangular-matrix-from-nonsymmetric-matrix/52991508#52991508
238 | 		y = mat[nrow(mat) * (2 * col(mat) - 1) / (2 * ncol(mat)) - row(mat) > -1/2]
239 | 		x = mat[nrow(mat) * (2 * col(mat) - 1) / (2 * ncol(mat)) - row(mat) < -1/2]
240 | 		return(list(x=x, y=y))
241 | 	}
242 | 
243 | 	p_wilcox_test_nested <- function( A, head, mid, tail, alternative, is_CD )
244 | 	{
245 | 		test_1 = p_wilcox_test( A, head, mid, tail, alternative, is_CD, only_corner=FALSE ) #: coner + inter
246 | 		if( (tail - head <= 4) | (test_1$p > 0.05) ) ## when > 0.05 or it is already small, do not cosider it as nested
247 | 		{
248 | 			info = list( Lambda=NULL, p=0.555555, mean_diff=0 )
249 | 			return(info)
250 | 		}
251 | 
252 | 		## try_error happens when tad to test is too small. THEREFORE, ASSIGN P=0 TO THE TAD
253 | 		test_left_tad = try(p_wilcox_test( A, head, mid=ceiling((head+mid)/2), mid, alternative, is_CD=FALSE, only_corner=TRUE )) #: coner + inter
254 | 		if( class(test_left_tad)=="try-error" ) test_left_tad = list(p=0)
255 | 		test_right_tad = try(p_wilcox_test( A, mid+1, mid=ceiling((mid+1+tail)/2), tail, alternative, is_CD=FALSE, only_corner=TRUE )) #: coner + inter
256 | 		if( class(test_right_tad)=="try-error" ) test_right_tad = list(p=0)
257 | 
258 | 		info = list( Lambda=NULL, p=max( test_1$p, min(test_left_tad$p, test_right_tad$p) ), mean_diff=0 )
259 | 		return(info)
260 | 	}
261 | 
262 | 
263 | 
264 | 	p_wilcox_test = function( A, head, mid, tail, alternative, is_CD=FALSE, only_corner=FALSE ) ## only_corner tests if a domain is a TAD (no nesting)
265 |         {
266 |                 A_whole = A[head:tail, head:tail]
267 |                 A_a = A[head:mid, head:mid]
268 |                 A_b = A[(mid+1):tail, (mid+1):tail]
269 |                 A_ab = A[head:mid, (mid+1):tail]
270 | 
271 |                 tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] ## need to check whether diag should be false or true
272 | 
273 |                 tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] ## need to check whether diag should be false or true
274 |                 tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] ## need to check whether diag should be false or true
275 | 
276 |                 tri_Aa_vec = as.vector( tri_Aa )
277 |                 tri_Ab_vec = as.vector( tri_Ab )
278 |                 A_ab_vec = as.vector( A_ab )
279 | 
280 |                 corner_mat_info = get_half_mat_values_v2(A_ab)
281 |                 A_ab_corner = as.vector(corner_mat_info$y)
282 |                 A_ab_ncorner = corner_mat_info$x
283 |                 p_corner = wilcox.test(x=A_ab_corner, y=A_ab_ncorner, alternative='greater', exact=F)
284 |                 p_inter = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=A_ab_ncorner, alternative='greater', exact=F)
285 |                 # p_inter = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=c(A_ab_ncorner, A_ab_corner), alternative='greater', exact=F)
286 | 
287 |                 if(is_CD==FALSE) p = max(p_inter$p.value, p_corner$p.value) ## if the tested part is the CD
288 |                 if(is_CD==TRUE) p = p_inter$p.value ## if the tested part is the CD
289 |                 # if(only_corner==TRUE) p = p_corner$p.value
290 |                 mean_diff_inter = mean(A_ab_ncorner) - mean(c(tri_Ab_vec, tri_Aa_vec)) ## negative is good
291 |                 mean_diff_corner = mean(A_ab_ncorner) - mean(A_ab_corner) ## negative is good
292 | 
293 | 
294 |                 if(is_CD==FALSE) mean_diff = min(mean_diff_corner, mean_diff_inter) ## if the tested part is the CD
295 |                 if(is_CD==TRUE) mean_diff = mean_diff_inter
296 | 
297 |                 if(min(length(tri_Ab_vec), length(tri_Ab_vec)) < 10) mean_diff = 100 ## when one of the two twins is too small. 10: dim(4,4)
298 | 
299 |                 # mean_diff = mean(A_ab_corner) - mean(A_ab_ncorner)
300 | 
301 |                 # p_test_Aa =  wilcox.test(x=A_ab_vec, y=tri_Aa_vec, alternative="less", exact=F)
302 |                 # p_test_Ab =  wilcox.test(x=A_ab_vec, y=tri_Ab_vec, alternative="less", exact=F)
303 |                 # p = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=A_ab_vec, alternative=alternative, exact=F)
304 | 
305 |                 # xy = get_corner_xy(A_whole)
306 |                 # p = wilcox.test(x=xy$x, y=xy$y, alternative=alternative, exact=F)
307 | 
308 |                 # p = max(p_test_Aab$p.value, p_test_Ab$p.value)
309 |                 info = list( Lambda=NULL, p=p, mean_diff=mean_diff)
310 |                 return(info)
311 |         }
312 | 
313 | 
314 | 
315 | 	
316 | 	p_likelihood_ratio_lnorm <- function( A, head, mid, tail, n_parameters, imputation, imputation_num=1E2 )
317 | 	{
318 | 		likelihood_fun = likelihood_lnorm_mle
319 | 		A_whole = A[head:tail, head:tail]
320 | 		A_a = A[head:mid, head:mid]
321 | 		A_b = A[(mid+1):tail, (mid+1):tail]
322 | 		A_ab = A[head:mid, (mid+1):tail]
323 | 		
324 | 		tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 
325 | 		## added 25-03-2018. If no zero values, no imputation
326 | 		no_zero_flag = 0
327 | 		if( sum(tri_Awhole==0) == 0 ) { no_zero_flag = 1; imputation = FALSE }
328 | 		
329 | 		tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 
330 | 		tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 
331 | 		
332 | 		tri_Aa_vec = as.vector( tri_Aa )
333 | 		tri_Ab_vec = as.vector( tri_Ab )
334 | 		A_ab_vec = as.vector( A_ab )
335 | 
336 | 		mean_a = mean( tri_Aa_vec[tri_Aa_vec!=0] )
337 | 		mean_b = mean( tri_Ab_vec[tri_Ab_vec!=0] )
338 | 		mean_ab = mean( A_ab_vec[A_ab_vec!=0] )
339 | 		mean_diff = mean_ab - min(mean_a, mean_b)
340 | 		
341 | 		if( (all(tri_Aa_vec==0)) | (all(tri_Ab_vec==0)) | (all(A_ab_vec==0))  )
342 | 		{
343 | 			info = list( Lambda=NA, p=0, mean_diff=mean_diff )
344 | 			return(info)
345 | 		}
346 | 
347 | 		## lnorm
348 | 		if(!imputation)
349 | 		{
350 | 			inner_a = likelihood_fun( tri_Aa )
351 | 			inner_b = likelihood_fun( tri_Ab )
352 | 			inter_ab = likelihood_fun( A_ab )
353 | 			whole = likelihood_fun( tri_Awhole )
354 | 
355 | 			## log likelihood of H1
356 | 			LH1 = inner_a + inner_b + inter_ab
357 | 			LH0 = whole
358 | 			Lambda = -2*( LH0 - LH1 )
359 | 			
360 | 			if( no_zero_flag ) n_parameters = n_parameters - 1 ## no alpha parameter
361 | 			df_h0 = 1*n_parameters ##(theta_1 = theta_2 = theta_3)
362 | 			df_h1 = 3*n_parameters ##(theta_1, theta_2, theta_3)
363 | 			df = df_h1 - df_h0
364 | 		
365 | 			p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
366 | 			info = list( Lambda=Lambda, p=p, mean_diff=mean_diff )
367 | 			return(info)
368 | 		}
369 | 		
370 | 		if( imputation ) ## zero values are imputed by the estimated distribution based on non random values
371 | 		{
372 | 			vec_list = list( tri_Aa=tri_Aa, tri_Ab=tri_Ab, A_ab=A_ab )
373 | 			imputations = imputation_list( vec_list, imputation_num )
374 | 			
375 | 			inner_as = apply(imputations$tri_Aa, 1, likelihood_fun)
376 | 			inner_bs = apply(imputations$tri_Ab, 1, likelihood_fun)
377 | 			inter_abs = apply(imputations$A_ab, 1, likelihood_fun)
378 | 			
379 | 			wholes = apply(do.call( cbind, imputations ), 1, likelihood_fun)
380 | 			LH1s = inner_as + inner_bs + inter_abs
381 | 			LH0s = wholes
382 | 			Lambdas = -2*( LH0s - LH1s )
383 | 			Lambda = mean( Lambdas )
384 | 			n_parameters = n_parameters - 1 ## the mixture parameter is not taken into account
385 | 			
386 | 			# cat(Lambda, '\n')
387 | 			df_h0 = 1*n_parameters
388 | 			df_h1 = 3*n_parameters
389 | 			df = df_h1 - df_h0
390 | 			
391 | 			p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
392 | 			info = list( Lambda=Lambda, p=p, mean_diff=mean_diff )
393 | 			return(info)
394 | 			
395 | 		# if( imputation ) ## zero values are imputed by the estimated distribution based on non random values
396 | 		# {
397 | 			# inner_a = likelihood_lnorm( tri_Aa[tri_Aa!=0] )/length( tri_Aa[tri_Aa!=0] )*length( tri_Aa )
398 | 			# inner_b = likelihood_lnorm( tri_Ab[tri_Ab!=0] )/length( tri_Ab[tri_Ab!=0] )*length( tri_Ab )
399 | 			# inter_ab = likelihood_lnorm( A_ab )/length( A_ab[A_ab!=0] )*length( A_ab )
400 | 			# whole = likelihood_lnorm( tri_Awhole[tri_Awhole!=0] )/length( tri_Awhole[tri_Awhole!=0] )*length( tri_Awhole )
401 | 			# n_parameters = n_parameters - 1 ## the mixture parameter of 0 is not taken into account
402 | 		# }
403 | 		
404 | 		}
405 | 	}
406 | 
407 | 	imputation_list <- function( vec_list, imputation_num )
408 | 	{
409 | 		imputations = lapply( vec_list, function(v) 
410 | 		{
411 | 			if(sum(v!=0)==1) {final_vec = matrix( v[v!=0], imputation_num, length(v) ); return(final_vec)}
412 | 			
413 | 			fit = fit_lnorm(v)
414 | 			set.seed(1)
415 | 			
416 | 			## THERE WILL BE ERROR IF IS.NA SDLOG
417 | 			if(!is.na(fit['sdlog'])) imputation_vec = matrix(rlnorm( sum(v==0)*imputation_num, fit['meanlog'], fit['sdlog'] ), nrow=imputation_num)
418 | 			if(is.na(fit['sdlog'])) stop("In function imputation_list, sdlog=NA encountered")
419 | 
420 | 			ori_vec =  t(replicate(imputation_num, v[v!=0]))
421 | 			final_vec = cbind( ori_vec, imputation_vec )
422 | 		} )
423 | 		return( imputations )
424 | 	}
425 | 
426 | 
427 | 	fit_lnorm <- function(vec)
428 | 	{
429 | 		vec = vec[vec!=0]
430 | 		fit = MASS::fitdistr(vec, 'lognormal')$estimate
431 | 		return(fit)
432 | 	}
433 | 
434 | 	


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | matrix_multiplication_cpp <- function(A, B) {
 5 |     .Call(`_CALDER_matrix_multiplication_cpp`, A, B)
 6 | }
 7 | 
 8 | matrix_multiplication_sym_cpp <- function(A) {
 9 |     .Call(`_CALDER_matrix_multiplication_sym_cpp`, A)
10 | }
11 | 
12 | loglik_lnorm_cpp <- function(sum_ln1, sum_ln2, p, q) {
13 |     .Call(`_CALDER_loglik_lnorm_cpp`, sum_ln1, sum_ln2, p, q)
14 | }
15 | 
16 | loglik_lnorm_cpp_vec <- function(vec_values) {
17 |     .Call(`_CALDER_loglik_lnorm_cpp_vec`, vec_values)
18 | }
19 | 
20 | get_A_len <- function(A) {
21 |     .Call(`_CALDER_get_A_len`, A)
22 | }
23 | 
24 | get_A_ln1 <- function(A) {
25 |     .Call(`_CALDER_get_A_ln1`, A)
26 | }
27 | 
28 | get_A_ln2 <- function(A) {
29 |     .Call(`_CALDER_get_A_ln2`, A)
30 | }
31 | 
32 | loglik_lnorm_cpp_mat <- function(sum_ln1, sum_ln2, ps, qs) {
33 |     .Call(`_CALDER_loglik_lnorm_cpp_mat`, sum_ln1, sum_ln2, ps, qs)
34 | }
35 | 
36 | zigzag_loglik_ancestors_v4_5 <- function(A, k, min_n_bins = 2L) {
37 |     .Call(`_CALDER_zigzag_loglik_ancestors_v4_5`, A, k, min_n_bins)
38 | }
39 | 
40 | compute_L <- function(A, L, k) {
41 |     .Call(`_CALDER_compute_L`, A, L, k)
42 | }
43 | 
44 | 


--------------------------------------------------------------------------------
/R/bisecting_kmeans.R:
--------------------------------------------------------------------------------
  1 | 
  2 |  	## k-means with replicatable seeds
  3 |  	my_kmeans = function(iter.max=1E3, nstart=50, ...)
  4 |  	{
  5 |  		set.seed(1)
  6 |  		res = kmeans(iter.max=iter.max, nstart=nstart, ...)
  7 |  		return(res)
  8 |  	}
  9 | 
 10 |  	## This function tries to adjust the height of each split, in order to generate a valid hclust object and with balanced compartments A.1 A.2 B.1 B.2
 11 |  	## Clusters with more nodes will get bigger height in case of same height
 12 |  	adjust_hs <- function(l_r_h)
 13 |  	{
 14 |  		hs = sapply(l_r_h, function(v) v$h)
 15 |  		all_names = sapply(l_r_h, function(v) paste0(collapse='_', sort(c(v$l, v$r))))
 16 |   		r_names = sapply(l_r_h, function(v) paste0(collapse='_', sort(c(v$r))))
 17 | 		
 18 | 	 	sizes = sapply(l_r_h, function(v) length(v$l) + length(v$r)) ##
 19 | 
 20 | 	 	################ This part deals with duplicated heights
 21 | 	 	hs = hs + sizes*1E-7
 22 | 	 	################ This part tries to make the top-level left and right branch to have similar height, such that to make balanced A.1, A.2, B.1, B.2 compartments
 23 | 	 	## Find the index of second branch, whose number of nodes is n_total - n_left: sizes[1] - sizes[2]
 24 | 	 	l_b = 2 ## left sub-branch
 25 | 	 	# r_b = which(sizes==(sizes[1] - sizes[2]))[1] ## right sub-branch
 26 | 	 	r_b = which(r_names[1]==all_names) ## right sub-branch
 27 | 
 28 | 	 	l_h = hs[l_b]
 29 | 	 	r_h = hs[r_b]
 30 | 	 	max_h = max(l_h, r_h) ## the maximum height of the two branches
 31 | 	 	hs_new = mean(sort(hs, decreasing=TRUE)[2:3]) ## hs_new is the 3rd largest height
 32 | 		hs[l_b] = ifelse(l_h > r_h, max_h, hs_new)
 33 | 		hs[r_b] = ifelse(r_h > l_h, max_h, hs_new)
 34 | 
 35 | 	 	if(any(duplicated(hs))) stop('ERROR: DUPLICATED HEIGHTS exist in bisecting_kmeans')
 36 | 	 	return( hs )
 37 |  	}
 38 | 
 39 |  	bisecting_kmeans <- function(data)
 40 |  	{
 41 |  		dist_mat = as.matrix(stats::dist(data))
 42 |  		indices = 1:nrow(data)
 43 |  		l_r_h <<- list()
 44 | 
 45 | 	 	get_h <- function(l_indices, r_indices)
 46 | 	 	{
 47 | 			combined_indices = c(l_indices, r_indices)
 48 | 			idx <- as.matrix(expand.grid(combined_indices, combined_indices))
 49 | 			max(dist_mat[idx]) ## diameter		
 50 | 	 	}
 51 | 
 52 | 	 	get_sub_tree <- function( indices )
 53 | 	 	{
 54 | 		 	n_nodes = length(indices)
 55 | 
 56 | 		 	if(n_nodes==1) ## if only two nodes
 57 | 		 	{
 58 | 		 		h = NULL
 59 | 		 		# tree = list(h=h, leaf=indices)
 60 | 		 		return()
 61 | 		 	}
 62 | 
 63 | 		 	############# if more than two nodes
 64 | 		 	if(n_nodes==2) cluster=c(1,2) else cluster = my_kmeans(x=data[indices, ], centers=2)$cluster
 65 | 		 	l_indices = indices[cluster==1]
 66 | 		 	r_indices = indices[cluster==2]
 67 | 		 	h = get_h(l_indices, r_indices)
 68 | 		 	l_r_h <<- c(l_r_h, list(list(l=l_indices, r=r_indices, h=h)))
 69 | 
 70 | 		 	# cat(h, '\n')
 71 | 			l_branch = get_sub_tree( l_indices )
 72 | 			r_branch = get_sub_tree( r_indices )
 73 | 		 	# tree = list(h=h, l_branch=l_branch, r_branch=r_branch, l_indices=l_indices, r_indices=r_indices)
 74 | 		 	# return(tree)
 75 | 	 	}
 76 | 
 77 | 	 	get_sub_tree(indices)
 78 | 
 79 | 	 	hs = adjust_hs(l_r_h)
 80 | 
 81 | 	 	r_hs = rank(hs)
 82 | 	 	for( i in 1:length(l_r_h) ) {name=r_hs[i]; names(name)=paste0(collapse='_', sort(c(l_r_h[[i]]$l, l_r_h[[i]]$r))); l_r_h[[i]]$name=name}
 83 | 	 	pos_names = sapply(l_r_h, function(v) v$name)
 84 | 	 	neg_names = -(1:length(indices)); names(neg_names) = 1:length(indices); all_names = c(pos_names, neg_names)
 85 | 	 	for( i in 1:length(l_r_h) ) {l_r_h[[i]]$l_name=unname(all_names[paste0(l_r_h[[i]]$l, collapse='_')]); l_r_h[[i]]$r_name=unname(all_names[paste0(l_r_h[[i]]$r, collapse='_')]) }
 86 | 		
 87 | 		merge_height = data.frame(l=sapply(l_r_h, function(v) v$l_name), r=sapply(l_r_h, function(v) v$r_name), h=hs)
 88 | 		merge_height = merge_height[order(merge_height$h), ]
 89 | 		rownames(merge_height) = NULL
 90 | 		
 91 | 		data_tmp = cbind(c(0,0,1,1), c(0,1,1,0))
 92 | 		hc = hclust(stats::dist(data_tmp), "com")
 93 | 		hc$merge = as.matrix(unname(merge_height[,1:2]))
 94 | 		hc$height = merge_height$h
 95 | 		# hc$order = unname(unlist(res, recursive=TRUE)[grepl('leaf', names(unlist(res, recursive=TRUE)))])
 96 | 		# hc$order = 1:length(indices)
 97 | 		hc$labels = 1:length(indices)
 98 | 		den <- as.dendrogram(hc)
 99 | 		hc_r <- as.hclust(reorder(den, 1:length(indices)))
100 | 		hc_r$method = "complete"
101 | 		hc_r$dist.method = "euclidean"
102 | 		l_r_h <<- list()
103 | 		rm(l_r_h)
104 | 		return(hc_r)
105 |  	}
106 |  
107 | 


--------------------------------------------------------------------------------
/R/build_comp_table_opt.R:
--------------------------------------------------------------------------------
  1 | 	
  2 | 	## function to obtain the optimal compartment calling from various resolutions (bin_sizes)
  3 | 
  4 | 	build_comp_table_opt = function(save_dir, chrs, bin_sizes, with_ref) 
  5 | 	{
  6 | 
  7 |     	`%dopar%` <- foreach::`%dopar%`
  8 | 		`%do%` <- foreach::`%do%`
  9 | 
 10 | 		get_opt_index = function(consist_tab)
 11 | 		{
 12 | 			if(ncol(consist_tab)==2) ## if only one bin_size value
 13 | 			{
 14 | 				opt_index = rep(1, nrow(consist_tab))
 15 | 				return(opt_index)			
 16 | 			}
 17 | 
 18 | 			mins = apply(consist_tab[,-1], 1, max) - 0.05 ## allow at most 0.05 smaller than the maximum
 19 | 			opt_index = apply(1*((consist_tab[,-1] >= mins)==1), 1, function(v) min(which(v==1)))
 20 | 			return(opt_index)
 21 | 		}
 22 | 		
 23 | 		################################
 24 | 
 25 | 		identities = 'bulk'
 26 | 
 27 | 		{
 28 | 			consist_tab_li = foreach::foreach(identity=identities) %do%
 29 | 			{
 30 | 				# consist_tab = foreach::foreach(bin_size=bin_sizes, .combine=merge) %do%
 31 | 				# {
 32 | 				# 	bin_size_kb = sprintf("%skb", bin_size/1E3)
 33 | 				# 	save_dir_binsize = file.path(save_dir, bin_size_kb)
 34 | 				#     cor_log_file = paste0(save_dir_binsize, '/cor_with_ref.txt')
 35 | 				#     log_file = paste0(save_dir_binsize, '/chr', chr, '_log.txt')
 36 | 
 37 | 				#     as.numeric(strsplit(readLines(log_file)[5], 'this chr is:')[[1]][2])
 38 | 
 39 | 				#     cor_tab = data.table::fread(cor_log_file)
 40 | 				#     colnames(cor_tab) = c("chr", bin_size_kb)
 41 | 				#     cor_tab
 42 | 				# }
 43 | 
 44 | 
 45 | 				consist_tab = foreach::foreach(bin_size=bin_sizes, .combine=merge) %do%
 46 | 				{
 47 | 					bin_size_kb = sprintf("%skb", bin_size/1E3)
 48 |             		save_dir_binsize = file.path(save_dir, 'intermediate_data/sub_compartments', bin_size_kb)
 49 | 					consist_tab_tmp = data.table::data.table(chr=paste0('chr', chrs), val=0)
 50 | 					consist_tab_tmp$val = foreach::foreach(chr=chrs, .combine=c) %do%
 51 | 					{
 52 | 					    log_file = paste0(save_dir_binsize, '/chr', chr, '_log.txt')
 53 | 					    # print(log_file)
 54 | 					    cor_val = as.numeric(strsplit(readLines(log_file)[5], 'this chr is:')[[1]][2])
 55 | 					    # print(cor_val)
 56 | 					    cor_val
 57 | 					}
 58 | 					colnames(consist_tab_tmp)[2] = bin_size_kb
 59 | 					consist_tab_tmp
 60 | 				}
 61 | 
 62 | 				s = gsub('chr', '', consist_tab[['chr']])
 63 | 
 64 | 				x <- suppressWarnings(as.numeric(s)) ## https://stackoverflow.com/questions/70080294/sort-column-in-r-strings-first-alphabetically-then-numbers-numerically
 65 | 				consist_tab = consist_tab[order(x, 'is.na<-'(s, !is.na(x))), ]
 66 | 				# print((consist_tab))
 67 | 
 68 | 
 69 | 			}
 70 | 
 71 | 			names(consist_tab_li) = identities
 72 | 
 73 | 			min_consist = (sapply(consist_tab_li, function(v) min(v[,2])))
 74 | 
 75 | 			################################ Choose the best bin size (as small as possible, and save the chosen comp to the opt dir)
 76 | 
 77 | 			# silent_out = foreach::foreach(identity=identities, .combine=merge) %do% ## do not use dopar
 78 | 			{
 79 | 
 80 | 				save_dir_opt = file.path(save_dir, "sub_compartments")
 81 | 				dir.create(save_dir_opt, recursive=TRUE, showWarnings=FALSE)
 82 | 				
 83 | 				consist_tab = consist_tab_li[[identity]]		
 84 | 				opt_index = get_opt_index(consist_tab)
 85 | 				names(opt_index) = gsub(":", "", consist_tab$chr)
 86 | 				opt_bin_tab = data.table::data.table(chr=names(opt_index),  opt_binsize=(colnames(consist_tab)[-1])[opt_index])
 87 | 
 88 | 				consist_tab_tmp = cbind( consist_tab, opt_bin_tab[, 'opt_binsize'])
 89 | 				colnames(consist_tab_tmp)[ncol(consist_tab_tmp)] = 'opt_binsize'
 90 | 
 91 | 				# consist_tab_tmp$chr_num = gsub(':', '', gsub("chr", "", consist_tab_tmp$chr))
 92 | 				# consist_tab_tmp[chr=="chrX:"]$chr_num = 23
 93 | 				# consist_tab_tmp = consist_tab_tmp[order(chr_num)]
 94 | 
 95 | 				cor_all_file = file.path(save_dir_opt, "cor_with_ref.ALL.txt")
 96 | 				data.table::fwrite(consist_tab_tmp, file=cor_all_file, col.names=TRUE, sep="\t")
 97 | 
 98 | 
 99 | 
100 | 				cor_with_ref = foreach::foreach(j=1:nrow(opt_bin_tab), .combine=c) %do%
101 | 				{
102 | 					chr_query = opt_bin_tab$chr[j]
103 | 					opt_binsize = opt_bin_tab$opt_binsize[j]
104 | 
105 | 					row_index = which(consist_tab$chr == chr_query)
106 | 					col_index = which(colnames(consist_tab)==opt_binsize)
107 | 					as.data.frame(consist_tab)[row_index, col_index] ## some wired behavior when using data.table. Convert to data.frame
108 | 				}
109 | 
110 | 				
111 | 				cor_with_ref_tab = data.table::data.table(chr=consist_tab$chr, cor=cor_with_ref, chr_num=gsub("chr", "", opt_bin_tab$chr))
112 | 
113 | 				s = cor_with_ref_tab$chr_num
114 | 				x <- suppressWarnings(as.numeric(s)) ## https://stackoverflow.com/questions/70080294/sort-column-in-r-strings-first-alphabetically-then-numbers-numerically
115 | 				cor_with_ref_tab = cor_with_ref_tab[order(x, 'is.na<-'(s, !is.na(x))), ]
116 | 
117 | 
118 | 				cor_opt_file = file.path(save_dir_opt, "cor_with_ref.txt")
119 | 				data.table::fwrite(cor_with_ref_tab[, 1:2], file=cor_opt_file, col.names=TRUE, sep="\t")
120 | 
121 | 
122 | 				################################ make plot
123 | 
124 | 				cor_with_ref_tab$index = 1:nrow(cor_with_ref_tab)
125 | 				cor_opt_plot_file = file.path(save_dir_opt, "cor_with_ref.pdf")
126 | 
127 | 
128 | 			    pdf(cor_opt_plot_file, width=8, height=6)
129 | 			    p = ggplot2::ggplot(data=cor_with_ref_tab, ggplot2::aes(x=index, y=cor, label = format(round(cor_with_ref_tab$cor, 2), nsmall = 2))) + ggplot2::geom_line(color="red") + ggplot2::geom_point()
130 | 			    p = p + ggplot2::geom_label() + ggplot2::scale_x_continuous(labels=cor_with_ref_tab$chr_num, breaks=1:nrow(cor_with_ref_tab))
131 | 			    p = p + ggplot2::geom_hline(yintercept=ifelse(with_ref, 0.4, 0.15), linetype=2, color='darkblue')
132 | 			    if(with_ref) p = p + ggplot2::xlab('chr') + ggplot2::ylab('Rho') + ggplot2::ggtitle('Correlation of compartment rank with reference\nRho < 0.4 indicates imprecise compartment calling')
133 | 			    if(!with_ref) p = p + ggplot2::xlab('chr') + ggplot2::ylab('Rho') + ggplot2::ggtitle('Correlation of compartment rank with reference\nRho < 0.15 indicates imprecise compartment calling')
134 | 			    print(p)    
135 | 			    dev.off()			
136 | 
137 | 			    ################################
138 | 
139 | 				opt_sub_comp_beds = foreach::foreach(i=1:nrow(opt_bin_tab)) %do%
140 | 				{
141 | 					opt_bin_size_kb = opt_bin_tab[[2]][i]
142 | 					chr_name = opt_bin_tab[[1]][i]
143 | 					opt_sub_comp_bed_chr = sprintf("%s/%s_sub_compartments.bed", file.path(save_dir, 'intermediate_data/sub_compartments', opt_bin_size_kb), chr_name)
144 | 					opt_sub_comp_bed_chr
145 | 				}
146 | 
147 | 				# save_opt_file = file.path(save_dir_opt, "all_sub_compartments.bed")
148 | 				save_opt_file = file.path(save_dir_opt, sprintf("all_sub_compartments.bed"))
149 | 
150 | 				cmd_opt = sprintf("cat %s > %s", paste0(opt_sub_comp_beds, collapse=" "), save_opt_file)
151 | 				system(cmd_opt)
152 | 
153 | 				# cmd_replaceX = sprintf("sed -i 's/chrNA/chrX/g' %s", save_opt_file) ## the bed file contains NA because of chrX
154 | 				# system(cmd_replaceX)
155 | 
156 | 				comp_raw = data.table::fread(save_opt_file)
157 | 				comp_tab = make_comp_tab(comp_raw, bin_size=10E3)
158 | 
159 | 
160 | 				cols2keep = c('chr', 'pos_start', 'pos_end', 'comp_name', 'comp_rank', 'continous_rank')
161 | 				comp_tab = comp_tab[, c('chr', 'pos_start', 'pos_end', 'comp_name', 'comp_rank', 'continous_rank')]
162 | 				# comp_tab = comp_tab[, mget(cols2keep)] ## mget does not work for some reasons
163 | 
164 | 				save_opt_tab_file = file.path(save_dir_opt, sprintf("all_sub_compartments.tsv"))
165 | 				data.table::fwrite(comp_tab, file=save_opt_tab_file, sep='\t')
166 | 			}
167 | 		}
168 | 	}
169 | 
170 | 	
171 | 	


--------------------------------------------------------------------------------
/R/call_domains.R:
--------------------------------------------------------------------------------
  1 | 	## Yuanlong LIU
  2 | 	## Scripts used to call domains having high inter-domain score while low intra-domain score. Adapted from TopDom with the following authorship
  3 | 
  4 | 	# @author : Hanjun Shin(shanjun@usc.edu)
  5 | 	# @credit : Harris Lazaris(Ph.D Stduent, NYU), Dr. Gangqing Hu(Staff Scientist, NIH)
  6 | 	# @brief : TopDom.R is a software package to identify topological domains for given Hi-C contact matrix. 
  7 | 	# @version 0.0.2
  8 | 
  9 | 	# @fn TopDom
 10 | 	# @param matrixFile : string, matrixFile Address,
 11 | 	# - Format = {chromosome, bin start, bin end, N numbers normalized value }
 12 | 	# - N * (N + 3), where N is the number of bins
 13 | 	# @param window.size :integer, number of bins to extend.
 14 | 	# @param out_binSignal : string, binSignal file address to write
 15 | 	# @param out_ext : string, ext file address to write
 16 | 
 17 | 
 18 | 
 19 | 	remove_small_domains = function(domain_size_min, binSignal, domains) ## remove all domains smaller than 40E3, ADDED by Yuanlong, 2019-July-09
 20 | 	{
 21 | 		# binSignal = res_input_mat$binSignal
 22 | 		# domains = domains_bp
 23 | 		# domains = domains[, c('from.id', 'to.id', 'size')]
 24 | 		while(1)
 25 | 		{
 26 | 			i = min(setdiff(which(domains$size < domain_size_min/10E3), 1)) ## do not include the first one
 27 | 			if(i==Inf) break
 28 | 			ids = domains[(i-1):i, 'to.id']
 29 | 			index2rm = ((i-1):i)[which.max(binSignal[ids, 'mean.cf'])]
 30 | 			domains = domains[-index2rm, ]
 31 | 			domains[-1, 'from.id'] = domains[-nrow(domains), 'to.id'] + 1
 32 | 			domains$size = domains[, 'to.id'] - domains[, 'from.id'] + 1
 33 | 			cat(nrow(domains), '\n')
 34 | 			if(nrow(domains) < 10) break ## avoid exceptions
 35 | 		}
 36 | 		# domains[, 'chr'] = domains_bp[1, 'chr']
 37 | 		domains[, 'from.coord'] = domains[, 'from.id'] - 1
 38 | 		domains[, 'to.coord'] = domains[, 'to.id']
 39 | 		rownames(domains) = 1:nrow(domains)
 40 | 		# print(head(domains))
 41 | 		return(domains)
 42 | 	}
 43 | 
 44 | 	## modified input file format and added several potentially useful parameter options, Yuanlong LIU
 45 | 	TopDom_v2 <- function( A_extended, window.size, outFile=NULL, statFilter=T, p_thresh=0.05, return_peak_and_binsignal=FALSE, window.sizes=NULL, stat_window_size=NULL, domain_size_min=NULL)
 46 | 	{
 47 | 	  if(!is.null(window.sizes)) window.size = window.sizes[1]
 48 | 
 49 | 	  # print("#########################################################################")
 50 | 	  # print("Step 0 : File Read ")
 51 | 	  # print("#########################################################################")
 52 | 	  window.size = as.numeric(window.size)
 53 | 	  matdf = A_extended
 54 | 	  
 55 | 	  if( ncol(matdf) - nrow(matdf) == 3) {
 56 | 		colnames(matdf) <- c("chr", "from.coord", "to.coord")
 57 | 	  } else if( ncol(matdf) - nrow(matdf) ==4 ) {
 58 | 		colnames(matdf) <- c("id", "chr", "from.coord", "to.coord")
 59 | 	  } else {
 60 | 		print("Unknwon Type of matrix file")
 61 | 		return(0)
 62 | 	  }
 63 | 	  n_bins = 1*nrow(matdf)
 64 | 	  mean.cf <- rep(0, n_bins)
 65 | 	  pvalue <- rep(1, n_bins)
 66 | 	  
 67 | 	  local.ext = rep(-0.5, n_bins)
 68 | 	  
 69 | 	  bins <- data.frame(id=1:n_bins, 
 70 | 						 chr=matdf[, "chr"], 
 71 | 						 from.coord=matdf[, "from.coord"], 
 72 | 						 to.coord=matdf[, "to.coord"] )
 73 | 	  
 74 | 	  matrix.data <- as.matrix( matdf[, (ncol(matdf) - nrow(matdf)+1 ):ncol(matdf)] )
 75 | 	  
 76 | 
 77 | 	  ptm <- proc.time()
 78 | 	  
 79 | 	  ## Only compute for one track of signal
 80 | 	  if(is.null(window.sizes)){
 81 | 	  	  for(i in 1:n_bins)
 82 | 		  {
 83 | 			diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size)
 84 | 			mean.cf[i] = mean(diamond)
 85 | 		  }
 86 | 	  }
 87 | 
 88 | 
 89 | 	  if(!is.null(window.sizes)) ## compute multiple tracks of signal
 90 | 	  {
 91 | 	  	 mean.cfs = matrix(, n_bins, length(window.sizes))
 92 | 	  	 for(k in 1:length(window.sizes) )
 93 | 	  	 {
 94 | 	  	 	window.size.iter = window.sizes[k]
 95 | 	  	    for(i in 1:n_bins)
 96 | 			  {
 97 | 				diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size.iter)
 98 | 				mean.cfs[i, k] = mean(diamond)
 99 | 			  }  
100 | 	  	 }
101 | 	  	 mean.cf = apply(mean.cfs, 1, mean)
102 | 	  }
103 | 
104 | 	  
105 | 	  eltm = proc.time() - ptm
106 | 	  # print(paste("Step 1 Running Time : ", eltm[3]))
107 | 	  # print("Step 1 : Done !!")
108 | 	  
109 | 	  # print("#########################################################################")
110 | 	  # print("Step 2 : Detect TD boundaries based on binSignals")
111 | 	  # print("#########################################################################")
112 | 	  
113 | 	  ptm = proc.time()
114 | 	  #gap.idx = Which.Gap.Region(matrix.data=matrix.data)
115 | 	  #gap.idx = Which.Gap.Region2(mean.cf)
116 | 	  gap.idx = Which.Gap.Region2(matrix.data=matrix.data, window.size)
117 | 	  
118 | 	  proc.regions = Which.process.region(rmv.idx=gap.idx, n_bins=n_bins, min.size=3)
119 | 	  
120 | 	  #print(proc.regions)
121 | 	  
122 | 	  for( i in 1:nrow(proc.regions))
123 | 	  {
124 | 		start = proc.regions[i, "start"]
125 | 		end = proc.regions[i, "end"]
126 | 		
127 | 		# print(paste("Process Regions from ", start, "to", end))
128 | 		  
129 | 		local.ext[start:end] = Detect.Local.Extreme(x=mean.cf[start:end])
130 | 	  }
131 | 	  
132 | 	  local.ext_bp = local.ext
133 | 	  
134 | 	  eltm = proc.time() - ptm
135 | 	  # print(paste("Step 2 Running Time : ", eltm[3]))
136 | 	  # print("Step 2 : Done !!")
137 | 	  
138 | 	  if(statFilter)
139 | 	  {
140 | 		# print("#########################################################################")
141 | 		# print("Step 3 : Statistical Filtering of false positive TD boundaries")
142 | 		# print("#########################################################################")
143 | 	  	
144 | 	  	if(is.null(stat_window_size)) stat_window_size=window.size
145 | 		ptm = proc.time()  
146 | 		# print("-- Matrix Scaling....")
147 | 		scale.matrix.data = matrix.data
148 | 
149 | 		## This is to normalize each off diag values -- 2018-09-20
150 | 		# for( i in 1:(2*window.size) )
151 | 		for( i in 1:(2*stat_window_size) )
152 | 		{
153 | 		  #diag(scale.matrix.data[, i:n_bins] ) = scale( diag( matrix.data[, i:n_bins] ) )
154 | 		  scale.matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] = scale( matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] )
155 | 		}
156 | 		
157 | 		# print("-- Compute p-values by Wilcox Ranksum Test")
158 | 		for( i in 1:nrow(proc.regions))
159 | 		{
160 | 		  start = proc.regions[i, "start"]
161 | 		  end = proc.regions[i, "end"]
162 | 		  
163 | 		  # print(paste("Process Regions from ", start, "to", end))
164 | 		  # pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=window.size, scale=1)
165 | 		  
166 | 		  pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=stat_window_size, scale=1)
167 | 		}
168 | 		# print("-- Done!")
169 | 		
170 | 		# print("-- Filtering False Positives")
171 | 		local.ext[intersect( union(which( local.ext==-1), which(local.ext==-1)), which(pvalue<p_thresh))] = -2
172 | 		local.ext[which(local.ext==-1)] = 0
173 | 		local.ext[which(local.ext==-2)] = -1
174 | 		# print("-- Done!")
175 | 		
176 | 		eltm = proc.time() - ptm
177 | 		# print(paste("Step 3 Running Time : ", eltm[3]))
178 | 		# print("Step 3 : Done!")
179 | 	  } else pvalue = 0
180 | 
181 | 	  domains = Convert.Bin.To.Domain.TMP(bins=bins, 
182 | 									  signal.idx=which(local.ext==-1), 
183 | 									  gap.idx=which(local.ext==-0.5), 
184 | 									  pvalues=pvalue, 
185 | 									  pvalue.cut=p_thresh)
186 | 
187 | 	  domains = subset(domains, size!=0) ## ADDED 06-April-2019, Yuanlong
188 | 	  # print(head(domains))
189 | 	  # stop('HELLO')
190 | 	  if( sum(domains$size)!=nrow(A_extended) ) stop('In TopDom, some rows are missing')
191 | 
192 | 
193 | 	  binSignal = cbind(bins, 
194 | 				   local.ext = local.ext,
195 | 				   mean.cf = mean.cf, 
196 | 				   pvalue = pvalue)
197 | 	  bins = binSignal
198 | 
199 | 	  if(!is.null(domain_size_min))
200 | 	  {
201 | 	  	domains = remove_small_domains(domain_size_min=domain_size_min, binSignal=binSignal, domains=domains)
202 | 	  }
203 | 	  
204 | 	  
205 | 	  bedform = domains[, c("chr", "from.coord", "to.coord", "tag")]
206 | 	  colnames(bedform) = c("chrom", "chromStart", "chromEnd", "name")
207 | 	  
208 | 	  if( !is.null(outFile) ) {
209 | 		# print("#########################################################################")
210 | 		# print("Writing Files")
211 | 		# print("#########################################################################")
212 | 		
213 | 		outBinSignal =  paste(outFile, ".binSignal", sep="")
214 | 		# print(paste("binSignal File :", outBinSignal) )
215 | 		write.table(bins, file=outBinSignal, quote=F, row.names=F, col.names=T, sep="\t")  
216 | 	  
217 | 	  
218 | 		outDomain = paste(outFile, ".domain", sep="")
219 | 		# print(paste("Domain File :", outDomain) )
220 | 		write.table( domains, file=outDomain, quote=F, row.names=F, col.names=T, sep="\t")
221 | 		
222 | 		outBed = paste(outFile, ".bed", sep="")
223 | 		# print(paste("Bed File : ", outBed))
224 | 		write.table( bedform, file=outBed, quote=F, row.names=F, col.names=F, sep="\t")
225 | 	  }
226 | 	  
227 | 	  # print("Done!!")
228 | 	  
229 | 	  # print("Chromatin domains generated !")
230 | 	  if(return_peak_and_binsignal==TRUE) return( list( local_ext=local.ext_bp, mean_cf=mean.cf, domains=domains ) )
231 | 	  return(list(binSignal=bins, domain=domains, bed=bedform))
232 | 	}
233 | 
234 | 	# @fn Get.Diamond.Matrix
235 | 	# @param mat.data : N by N matrix, where each element indicate contact frequency
236 | 	# @param i :integer, bin index
237 | 	# @param size : integer, window size to expand from bin
238 | 	# @retrun : matrix.
239 | 	Get.Diamond.Matrix <- function(mat.data, i, size)
240 | 	{
241 | 	  n_bins = nrow( mat.data )
242 | 	  if(i==n_bins) return(NA)
243 | 	  
244 | 	  lowerbound = max( 1, i-size+1 )
245 | 	  upperbound = min( i+size, n_bins)
246 | 	  
247 | 	  return( mat.data[lowerbound:i, (i+1):upperbound] )
248 | 	}
249 | 
250 | 	# @fn Which.process.region
251 | 	# @param rmv.idx : vector of idx, remove index vector
252 | 	# @param n_bins : total number of bins
253 | 	# @param min.size : minimum size of bins
254 | 	# @retrun : data.frame of proc.regions
255 | 	Which.process.region <- function(rmv.idx, n_bins, min.size=3)
256 | 	{
257 | 	  gap.idx = rmv.idx
258 | 	  
259 | 	  proc.regions = data.frame(start=numeric(0), end=numeric(0))
260 | 	  proc.set = setdiff(1:n_bins, gap.idx)
261 | 	  n_proc.set = length(proc.set)
262 | 	  
263 | 	  i=1
264 | 	  while(i < n_proc.set )
265 | 	  {
266 | 		start = proc.set[i]
267 | 		j = i+1
268 | 		
269 | 		while(j <= n_proc.set)
270 | 		{
271 | 		  if( proc.set[j] - proc.set[j-1] <= 1) j = j + 1
272 | 		  else {
273 | 			proc.regions = rbind(proc.regions, c(start=start, end=proc.set[j-1]) )
274 | 			i = j
275 | 			break
276 | 		  }
277 | 		}
278 | 		
279 | 		if(j >= n_proc.set ) {
280 | 		  proc.regions = rbind(proc.regions, c(start=start, end=proc.set[j-1]) )
281 | 		  break
282 | 		}
283 | 	  }
284 | 	  
285 | 	  colnames(proc.regions) = c("start", "end")
286 | 	  proc.regions <- proc.regions[ which( abs(proc.regions[,"end"] - proc.regions[, "start"]) >= min.size ), ]
287 | 	  
288 | 	  return(proc.regions)
289 | 	}
290 | 
291 | 	# @fn Which.Gap.Region
292 | 	# @breif version 0.0.1 used
293 | 	# @param matrix.data : n by n matrix
294 | 	# @return gap index
295 | 	Which.Gap.Region <- function(matrix.data)
296 | 	{
297 | 	  n_bins = nrow(matrix.data)
298 | 	  gap = rep(0, n_bins)
299 | 	  
300 | 	  i=1
301 | 	  while(i < n_bins)
302 | 	  {
303 | 		j = i + 1
304 | 		while( j <= n_bins)
305 | 		{
306 | 		  if( sum( matrix.data[i:j, i:j]) == 0 ) {
307 | 			gap[i:j] = -0.5
308 | 			j = j+1  
309 | 			#if(j-i > 1) gap[i:j]=-0.5
310 | 			#j=j+1
311 | 		  } else break
312 | 		}
313 | 		
314 | 		i = j  
315 | 	  }
316 | 	  
317 | 	  idx = which(gap == -0.5)
318 | 	  return(idx)
319 | 	}
320 | 
321 | 	# @fn Which.Gap.Region3
322 | 	# @param matrix.data : n by n matrix
323 | 	# @return gap index
324 | 	Which.Gap.Region3 <- function(mean.cf)
325 | 	{
326 | 	  n_bins = length(mean.cf)
327 | 	  gapidx = which(mean.cf==0)
328 | 	  
329 | 	  return(gapidx)
330 | 	}
331 | 
332 | 	# @fn Which.Gap.Region2
333 | 	# @breif version 0.0.2 used
334 | 	# @param matrix.data : n by n matrix
335 | 	# @return gap index
336 | 	Which.Gap.Region2 <- function(matrix.data, w)
337 | 	{
338 | 	  n_bins = nrow(matrix.data)
339 | 	  gap = rep(0, n_bins)
340 | 	  
341 | 	  for(i in 1:n_bins)
342 | 	  {
343 | 		if( sum( matrix.data[i, max(1, i-w):min(i+w, n_bins)] ) == 0 ) gap[i]=-0.5
344 | 	  }
345 | 	  
346 | 	  idx = which(gap == -0.5)
347 | 	  return(idx)
348 | 	}
349 | 
350 | 	# @fn Detect.Local.Extreme
351 | 	# @param x : original signal to find local minima
352 | 	# @return vector of local extrme, -1 if the index is local minimum, 1 if the index is local maxima, 0 otherwise.
353 | 	Detect.Local.Extreme <- function(x)
354 | 	{
355 | 	  n_bins = length(x)
356 | 	  ret = rep(0, n_bins)
357 | 	  x[is.na(x)]=0
358 | 	  
359 | 	  if(n_bins <= 3)
360 | 	  {
361 | 		ret[which.min(x)]=-1
362 | 		ret[which.max(x)]=1
363 | 		
364 | 		return(ret)
365 | 	  }
366 | 	  # Norm##################################################3
367 | 	  new.point = Data.Norm(x=1:n_bins, y=x)
368 | 	  x=new.point$y
369 | 	  ##################################################
370 | 	  cp = Change.Point(x=1:n_bins, y=x)
371 | 	  
372 | 	  if( length(cp$cp) <= 2 ) return(ret)
373 | 	  if( length(cp$cp) == n_bins) return(ret)
374 | 	  for(i in 2:(length(cp$cp)-1))
375 | 	  {
376 | 		if( x[cp$cp[i]] >= x[cp$cp[i]-1] && x[cp$cp[i]] >= x[cp$cp[i]+1] ) ret[cp$cp[i]] = 1
377 | 		else if(x[cp$cp[i]] < x[cp$cp[i]-1] && x[cp$cp[i]] < x[cp$cp[i]+1]) ret[cp$cp[i]] = -1
378 | 		
379 | 		min.val = min( x[ cp$cp[i-1] ], x[ cp$cp[i] ] )
380 | 		max.val = max( x[ cp$cp[i-1] ], x[ cp$cp[i] ] )
381 | 		
382 | 		if( min( x[cp$cp[i-1]:cp$cp[i]] ) < min.val ) ret[ cp$cp[i-1] - 1 + which.min( x[cp$cp[i-1]:cp$cp[i]] ) ] = -1
383 | 		if( max( x[cp$cp[i-1]:cp$cp[i]] ) > max.val ) ret[ cp$cp[i-1] - 1 + which.max( x[cp$cp[i-1]:cp$cp[i]] ) ] = 1
384 | 	  }
385 | 
386 | 	  return(ret)
387 | 	}
388 | 
389 | 	# @fn Data.Norm
390 | 	# @param x : x axis vector
391 | 	# @param x : y axis vector
392 | 	# @return list of normalized x and y
393 | 	Data.Norm <- function(x, y)
394 | 	{
395 | 	  ret.x = rep(0, length(x))
396 | 	  ret.y = rep(0, length(y))
397 | 	  
398 | 	  ret.x[1] = x[1]
399 | 	  ret.y[1] = y[1]
400 | 	  
401 | 	  diff.x = diff(x)
402 | 	  diff.y = diff(y)
403 | 	  
404 | 	  scale.x = 1 / mean( abs(diff(x) ) )
405 | 	  scale.y = 1 / mean( abs( diff(y) ) )
406 | 	  
407 | 	  #print(scale.x)
408 | 	  #print(scale.y)
409 | 	  
410 | 	  for(i in 2:length(x))
411 | 	  {
412 | 		ret.x[i] = ret.x[i-1] + (diff.x[i-1]*scale.x)
413 | 		ret.y[i] = ret.y[i-1] + (diff.y[i-1]*scale.y)
414 | 	  }
415 | 	  
416 | 	  return(list(x=ret.x, y=ret.y))
417 | 	}
418 | 
419 | 	# @fn Change.Point
420 | 	# @param x : x axis vector
421 | 	# @param x : y axis vector
422 | 	# @return change point index in x vector,
423 | 	# Note that the first and the last point will be always change point
424 | 	Change.Point <- function( x, y )
425 | 	{
426 | 	  if( length(x) != length(y)) 
427 | 	  {
428 | 		print("ERROR : The length of x and y should be the same")
429 | 		return(0)
430 | 	  }
431 | 	  
432 | 	  n_bins <- length(x)
433 | 	  Fv <- rep(NA, n_bins)
434 | 	  Ev <- rep(NA, n_bins)
435 | 	  cp <- 1
436 | 	  
437 | 	  i=1
438 | 	  Fv[1]=0
439 | 	  while( i < n_bins )
440 | 	  {
441 | 		j=i+1
442 | 		Fv[j] = sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 )
443 | 		
444 | 		while(j<n_bins)
445 | 		{
446 | 		  j=j+1
447 | 		  k=(i+1):(j-1)
448 | 		  Ev[j] = ( sum( abs( (y[j]-y[i] )*x[k] - (x[j] -x[i])*y[k] - (x[i]*y[j]) + (x[j]*y[i]) ) ) / sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 ) )
449 | 		  Fv[j] = sqrt( (x[j]-x[i])^2 + (y[j] - y[i])^2 ) - ( sum( abs( (y[j]-y[i] )*x[k] - (x[j] -x[i])*y[k] - (x[i]*y[j]) + (x[j]*y[i]) ) ) / sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 ) )
450 | 		  
451 | 		  #################################################
452 | 		  #Not Original Code
453 | 		  if( is.na(Fv[j]) || is.na(Fv[j-1]) ) {
454 | 			j = j-1
455 | 			cp <- c(cp, j)
456 | 			break
457 | 		  }
458 | 		  ####################################################3
459 | 		  if(Fv[j] < Fv[j-1] ) {
460 | 			j = j - 1
461 | 			cp <- c(cp, j )
462 | 			break
463 | 		  }
464 | 		}
465 | 		i=j
466 | 	  }
467 | 	  
468 | 	  cp <- c(cp, n_bins)
469 | 	  
470 | 	  return(list(cp=cp, objF=Fv, errF=Ev))
471 | 	}
472 | 
473 | 	# @fn Get.Pvalue
474 | 	# @param matrix.data : matrix
475 | 	# @param size : size to extend
476 | 	# @param scale : scale parameter if necessary. deprecated parameter
477 | 	# @return computed p-value vector
478 | 	Get.Pvalue <- function( matrix.data, size, scale=1 )
479 | 	{
480 | 	  n_bins = nrow(matrix.data)
481 | 	  pvalue <- rep(1, n_bins)
482 | 	  
483 | 	  for( i in 1:(n_bins-1) )
484 | 	  {
485 | 		dia = as.vector( Get.Diamond.Matrix2(matrix.data, i, size=size) )
486 | 		ups = as.vector( Get.Upstream.Triangle(matrix.data, i, size=size) )
487 | 		downs = as.vector( Get.Downstream.Triangle(matrix.data, i, size=size) )
488 | 		
489 | 		wil.test =  wilcox.test(x=dia*scale, y=c(ups, downs), alternative="less", exact=F)
490 | 		pvalue[i] = wil.test$p.value  
491 | 		
492 | 		#print(paste(i, "=", wil.test$p.value) )  
493 | 	  }
494 | 	  
495 | 	  pvalue[ is.na(pvalue) ] = 1
496 | 	  return(pvalue)
497 | 	}
498 | 
499 | 	# @fn Get.Upstream.Triangle
500 | 	# @param mat.data : matrix data
501 | 	# @param i : bin index
502 | 	# @param size : size of window to extend
503 | 	# @return upstream triangle matrix
504 | 	Get.Upstream.Triangle <- function(mat.data, i, size)
505 | 	{
506 | 	  n_bins = nrow(mat.data)
507 | 	  
508 | 	  lower = max(1, i-size)
509 | 	  tmp.mat = mat.data[lower:i, lower:i]
510 | 	  return( tmp.mat[ upper.tri( tmp.mat, diag=F ) ] )
511 | 	}
512 | 
513 | 	# @fn Get.Downstream.Triangle
514 | 	# @param mat.data : matrix data
515 | 	# @param i : bin index
516 | 	# @param size : size of window to extend
517 | 	# @return downstream triangle matrix
518 | 	Get.Downstream.Triangle <- function(mat.data, i, size)
519 | 	{
520 | 	  n_bins = nrow(mat.data)
521 | 	  if(i==n_bins) return(NA)
522 | 	  
523 | 	  upperbound = min(i+size, n_bins)
524 | 	  tmp.mat = mat.data[(i+1):upperbound, (i+1):upperbound]
525 | 	  return( tmp.mat[ upper.tri( tmp.mat, diag=F ) ] )
526 | 	}
527 | 
528 | 	# @fn Get.Diamond.Matrix2
529 | 	# @param mat.data : matrix data
530 | 	# @param i : bin index
531 | 	# @param size : size of window to extend
532 | 	# @return diamond matrix
533 | 	Get.Diamond.Matrix2 <- function(mat.data, i, size)
534 | 	{
535 | 	  n_bins = nrow(mat.data)
536 | 	  new.mat = matrix(rep(NA, size*size), nrow=size, ncol=size)
537 | 	  
538 | 	  for(k in 1:size)
539 | 	  {
540 | 		if(i-(k-1) >= 1 && i < n_bins)
541 | 		{
542 | 		  lower = min(i+1, n_bins)
543 | 		  upper = min(i+size, n_bins)
544 | 		  
545 | 		  new.mat[size-(k-1), 1:(upper-lower+1)] = mat.data[i-(k-1), lower:upper]
546 | 		}
547 | 	  }
548 | 	  
549 | 	  return(new.mat)
550 | 	}
551 | 	  
552 | 	# @fn Convert.Bin.To.Domain
553 | 	# @param bins : bin information
554 | 	# @param signal.idx : signal index
555 | 	# @param signal.idx : gap index
556 | 	# @param pvalues : pvalue vector
557 | 	# @param pvalue.cut : pvalue threshold
558 | 	# @return dataframe storing domain information
559 | 	Convert.Bin.To.Domain <- function(bins, signal.idx, gap.idx, pvalues=NULL, pvalue.cut=NULL)
560 | 	{
561 | 	  n_bins = nrow(bins)
562 | 	  ret = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
563 | 	  levels( x=ret[, "tag"] ) = c("domain", "gap", "boundary")
564 | 	  
565 | 	  rmv.idx = setdiff(1:n_bins, gap.idx)
566 | 	  proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
567 | 	  from.coord = bins[proc.region[, "start"], "from.coord"]
568 | 	  n_procs = nrow(proc.region)
569 | 	  gap = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("gap", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
570 | 	  
571 | 	  rmv.idx = union(signal.idx, gap.idx)
572 | 	  proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
573 | 	  n_procs = nrow(proc.region)
574 | 	  from.coord = bins[proc.region[, "start"], "from.coord"]
575 | 	  domain = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("domain", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
576 | 	  
577 | 	  rmv.idx = setdiff(1:n_bins, signal.idx)
578 | 	  proc.region = as.data.frame( Which.process.region(rmv.idx, n_bins, min.size=1) )
579 | 	  n_procs = nrow(proc.region)
580 | 	  if(n_procs>0)
581 | 	  {
582 | 		from.coord = bins[proc.region[, "start"]+1, "from.coord"]  
583 | 		boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
584 | 		ret = rbind(ret, boundary)
585 | 	  }
586 | 	  
587 | 	  ret = rbind(gap, domain)
588 | 	  ret = ret[order(ret[,3]), ]
589 | 	  
590 | 	  ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"])
591 | 	  ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] )
592 | 	  ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"])
593 | 	  ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"] ## HERE THE SIZE IS COMPUTED / Yuanlong LIU
594 | 	  
595 | 	  if(!is.null(pvalues) && !is.null(pvalue.cut))
596 | 	  {
597 | 		for(i in 1:nrow(ret))
598 | 		{
599 | 		  if(ret[i, "tag"]=="domain")
600 | 		  {
601 | 			domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"]
602 | 			p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut )
603 | 			
604 | 			if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary"
605 | 		  }
606 | 		}
607 | 	  }
608 | 	  
609 | 	  new.bdr.set = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
610 | 	  stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
611 | 	  
612 | 	  i=1
613 | 	  while(i <= nrow(ret))
614 | 	  {
615 | 		if( ret[i, "tag"] == "boundary" )
616 | 		{
617 | 		  stack.bdr = rbind(stack.bdr, ret[i, ])
618 | 		} else if(nrow(stack.bdr)>0) {
619 | 		  new.bdr = data.frame(chr=bins[1, "chr"], 
620 | 							   from.id = min( stack.bdr[, "from.id"]), 
621 | 							   from.coord=min(stack.bdr[, "from.coord"]),
622 | 							   to.id = max( stack.bdr[, "to.id"]),
623 | 							   to.coord=max(stack.bdr[, "to.coord"]),
624 | 							   tag="boundary",
625 | 							   size=max(stack.bdr[, "to.coord"]) - min(stack.bdr[, "from.coord"]))
626 | 		  new.bdr.set = rbind(new.bdr.set, new.bdr)
627 | 		  stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
628 | 		}
629 | 		
630 | 		i = i + 1
631 | 	  }
632 | 	  
633 | 	  ret = rbind( ret[ ret[, "tag"]!="boundary", ], new.bdr.set )
634 | 	  ret = ret[order(ret[, "to.coord"]), ]
635 | 	  
636 | 	  return(ret)
637 | 	}
638 | 
639 | 
640 | 	# @fn Convert.Bin.To.Domain
641 | 	# @param bins : bin information
642 | 	# @param signal.idx : signal index
643 | 	# @param signal.idx : gap index
644 | 	# @param pvalues : pvalue vector
645 | 	# @param pvalue.cut : pvalue threshold
646 | 	# @return dataframe storing domain information
647 | 	Convert.Bin.To.Domain.TMP <- function(bins, signal.idx, gap.idx, pvalues=NULL, pvalue.cut=NULL)
648 | 	{
649 | 	  n_bins = nrow(bins)
650 | 	  ret = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
651 | 	  levels( x=ret[, "tag"] ) = c("domain", "gap", "boundary")
652 | 	  
653 | 	  rmv.idx = setdiff(1:n_bins, gap.idx)
654 | 	  proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
655 | 	  from.coord = bins[proc.region[, "start"], "from.coord"]
656 | 	  n_procs = nrow(proc.region)
657 | 	  gap = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("gap", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
658 | 	  
659 | 	  rmv.idx = union(signal.idx, gap.idx)
660 | 	  proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
661 | 	  n_procs = nrow(proc.region)
662 | 	  from.coord = bins[proc.region[, "start"], "from.coord"]
663 | 	  domain = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("domain", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
664 | 	  
665 | 	  rmv.idx = setdiff(1:n_bins, signal.idx)
666 | 	  proc.region = as.data.frame( Which.process.region(rmv.idx, n_bins, min.size=1) )
667 | 	  n_procs = nrow(proc.region)
668 | 	  if(n_procs>0)
669 | 	  {
670 | 		from.coord = bins[proc.region[, "start"]+1, "from.coord"]  
671 | 		boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
672 | 		ret = rbind(ret, boundary)
673 | 	  }
674 | 	  
675 | 	  ret = rbind(gap, domain)
676 | 	  ret = ret[order(ret[,3]), ]
677 | 	  
678 | 	  ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"])
679 | 	  ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] )
680 | 	  ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"])
681 | 	  ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"]
682 | 	  
683 | 	  if(!is.null(pvalues) && !is.null(pvalue.cut))
684 | 	  {
685 | 		for(i in 1:nrow(ret))
686 | 		{
687 | 		  if(ret[i, "tag"]=="domain")
688 | 		  {
689 | 			domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"]
690 | 			p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut )
691 | 			
692 | 			if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary"
693 | 		  }
694 | 		}
695 | 	  }
696 | 	  
697 | 	  return(ret)
698 | 	}


--------------------------------------------------------------------------------
/R/compartment_PCA.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 	get_PCs = function( mat, which ) ## fast way to compute PCs
 3 | 	{
 4 | 		PC_mat = crossprod(mat)
 5 | 		res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=max(which), which = "LM" )
 6 | 		if(any(res_eigs_sym$values <0)) stop('Non-positive eigenvalues for A^2')
 7 |  		PCs = mat%*%(res_eigs_sym$vectors)
 8 | 		return( PCs[, which] )
 9 | 	}
10 | 
11 | 	
12 | 	PC_compartment <- function(A_oe) ## compute PC and define A/B compartment. Not used currently
13 | 	{
14 | 		cA_oe = fast_cor(A_oe)
15 | 		PC_mat = crossprod(cA_oe)
16 | 		res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=2, which = "LM" )
17 | 		PC1 = cA_oe%*%res_eigs_sym$vectors[,1]
18 | 		PC2 = cA_oe%*%res_eigs_sym$vectors[,2]
19 | 
20 | 		borders = which(diff(1*(PC1 > 0))!=0)
21 | 		to_id = as.numeric(rownames(A_oe)[borders])
22 | 		from_id = as.numeric(rownames(A_oe)[c(1, head(borders, length(borders)-1)+1)])
23 | 			
24 | 		start_poses = (from_id-1)*bin_size + 1
25 | 		end_poses = to_id*bin_size
26 | 
27 | 		compartment_AB = data.frame(chr=paste0('chr', chr), start_poses=start_poses, end_poses=end_poses, A_or_B=NA, zero=0, dot='.', start_poses_2=start_poses, end_poses_2=end_poses, col=NA)
28 | 		compartment_AB[which((1:nrow(compartment_AB))%%2==0), 'A_or_B'] = 'A'
29 | 		compartment_AB[which((1:nrow(compartment_AB))%%2==1), 'A_or_B'] = 'B'
30 | 			
31 | 		compartment_AB[which((1:nrow(compartment_AB))%%2==0), 'col'] = '112,128,144'
32 | 		compartment_AB[which((1:nrow(compartment_AB))%%2==1), 'col'] = '255,255,0'
33 | 
34 | 		compartments_bed_files = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB_2', '.bed')
35 | 		write.table( compartment_AB, file=compartments_bed_files, quote=FALSE, row.names=FALSE, col.names=FALSE, sep=' ' )
36 | 	}
37 | 
38 | 	PC_compartment_slim <- function(A_oe, downsratio=NULL) ## compute and save the PC values only. Not used currently
39 | 	{
40 | 		cA_oe = fast_cor(A_oe)
41 | 		PC_mat = crossprod(cA_oe)
42 | 		res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=2, which = "LM" )
43 | 		PC1 = cA_oe%*%res_eigs_sym$vectors[,1]
44 | 		PC2 = cA_oe%*%res_eigs_sym$vectors[,2]
45 | 
46 | 		compartment_AB = data.frame(PC1=PC1, bin_names=rownames(A_oe))
47 | 
48 | 		if( is.null(downsratio) ) compartments_AB_file = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB.Rdata')
49 | 		if( !is.null(downsratio) ) compartments_AB_file = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB_downsratio_', downsratio, '.Rdata')
50 | 		
51 | 		save( compartment_AB, file=compartments_AB_file )
52 | 	}
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/R/compartment_data_generation_fun.R:
--------------------------------------------------------------------------------
  1 | 	
  2 | 	compress_mat_fast_tmp = function(input_mat, compress_size)
  3 | 	{
  4 | 	    mat_block <- function(n, r) suppressWarnings( matrix(c(rep(1, r), rep(0, n)), n, n/r) )
  5 | 	    n = ncol(input_mat)
  6 | 	    mat2prod = mat_block(n, compress_size)
  7 | 	    # return(t(input_mat%*%mat2prod / compress_size))
  8 | 	    return(t(matrix_multiplication_cpp(input_mat, mat2prod)))
  9 | 	}
 10 | 
 11 | 	mat_10to40kb = function(mat2compress, bin_size2look, bin_size_input) ## small bin size to bigger bin_size
 12 | 	{
 13 | 		compress_size = bin_size2look / bin_size_input
 14 | 		len = nrow(mat2compress) - nrow(mat2compress)%%compress_size
 15 | 		mat2compress = mat2compress[1:len, 1:len]
 16 | 		c_mat_compressed = compress_mat_fast_tmp( as.matrix(mat2compress), compress_size=compress_size )
 17 | 		mat_compressed = compress_mat_fast_tmp( c_mat_compressed, compress_size=compress_size )
 18 | 		rownames(mat_compressed) = colnames(mat_compressed) = as.character( 1:nrow(mat_compressed) )
 19 | 		return(mat_compressed)
 20 | 	}
 21 | 
 22 | 	contact_mat_processing_v2 = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chr_num, bin_size_input, bin_size2look, black_list_bins=NULL)
 23 | 	{	
 24 | 		compress_size = ifelse(bin_size2look < 40E3, 1, 1)
 25 | 		zero_ratio = 0.01
 26 | 
 27 | 
 28 | 		if(!is.null(contact_tab_dump)) contact_mat_raw = contact_tab_dump ## if contact matrix in data.frame or data.table of strawr format is provided
 29 | 		if(!is.null(contact_file_dump)) contact_mat_raw = data.table::fread(contact_file_dump) ## if contact matrix in strawr format is provided
 30 | 
 31 | 		if(!is.null(contact_file_hic)) ## if contact matrix in hic format is provided
 32 | 		{
 33 | 			bin_sizes_in_hic = strawr::readHicBpResolutions(contact_file_hic)
 34 | 			chrs_in_hic = as.vector(strawr::readHicChroms(contact_file_hic)[["name"]])
 35 | 			chr2query = chrs_in_hic[match(toupper(chr_num), gsub('chr', '', toupper(chrs_in_hic), ignore.case=TRUE))]
 36 | 
 37 | 			if(!(bin_size_input %in% bin_sizes_in_hic)) stop(sprintf('Your provided hic file only contains resolutions of: %s', paste0(bin_sizes_in_hic, collapse=' ')))
 38 | 			if(!(chr2query %in% chrs_in_hic)) stop(sprintf('Your provided hic file only contains chrs of: %s', paste0(chrs_in_hic, collapse=' ')))
 39 | 
 40 | 			## try different normalization to get available dataset
 41 | 
 42 | 			contact_mat_raw = try(strawr::straw("KR", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
 43 | 			if(class(contact_mat_raw)=='try-error' | (class(contact_mat_raw)!='try-error' & nrow(na.omit(contact_mat_raw)) < 100)) contact_mat_raw = try(strawr::straw("VC_SQRT", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
 44 | 			if(class(contact_mat_raw)=='try-error' | (class(contact_mat_raw)!='try-error' & nrow(na.omit(contact_mat_raw)) < 100)) contact_mat_raw = try(strawr::straw("VC", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
 45 | 			if(class(contact_mat_raw)=='try-error' | (class(contact_mat_raw)!='try-error' & nrow(na.omit(contact_mat_raw)) < 100)) stop(sprintf('Your provided hic file does not contain information given the bin_size=%s and any of the normalization method KR/VC/VC_SQRT', bin_size_input))	
 46 | 			contact_mat_raw = data.table::as.data.table(contact_mat_raw)
 47 | 		}
 48 | 
 49 | 		colnames(contact_mat_raw) = c('pos_1', 'pos_2', 'val')	
 50 | 
 51 | 		contact_mat = subset(contact_mat_raw, !is.na(val))
 52 | 		contact_mat[,1] = contact_mat[,1]/bin_size_input
 53 | 		contact_mat[,2] = contact_mat[,2]/bin_size_input
 54 | 
 55 | 		if(!all(contact_mat[[2]] >= contact_mat[[1]])) stop('\nYour provided matrix does not represent an upper triangular matrix!\n\n')
 56 | 
 57 | 		n_bins = max(max(contact_mat[[1]]), max(contact_mat[[2]])) + 1 ## should +1 because contact_mat index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19)
 58 | 		mat_sparse = Matrix::Matrix(0, nrow=n_bins, ncol=n_bins)
 59 | 		mat_sparse[cbind(contact_mat[[1]]+1, contact_mat[[2]]+1)] <- contact_mat[[3]]
 60 | 
 61 | 		rownames(mat_sparse) = colnames(mat_sparse) = as.character( 1:nrow(mat_sparse) )
 62 | 		
 63 | 		########################################################## remove black listed regions
 64 | 
 65 | 		if(length(black_list_bins) > 0)
 66 | 		{
 67 | 			black_list_bins = intersect(as.character(black_list_bins), rownames(mat_sparse))
 68 | 			if(length(black_list_bins) > 0) mat_sparse[black_list_bins, ] = mat_sparse[, black_list_bins] = 0
 69 | 		}
 70 | 
 71 | 		########################################################## remove bins having too sparse contacts
 72 | 
 73 | 		mat_sparse = Matrix::forceSymmetric(mat_sparse, uplo='U')
 74 | 		if(bin_size2look!=bin_size_input) mat_sparse = mat_10to40kb( mat_sparse, bin_size2look, bin_size_input )
 75 | 		mat_dense = remove_blank_cols(mat_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A
 76 | 		if(nrow(mat_dense) < 100) mat_dense = remove_blank_cols(mat_sparse, sparse=TRUE, ratio=0) ## when all are dense
 77 | 		while(min(apply(mat_dense, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0
 78 | 		{
 79 | 			mat_dense = remove_blank_cols(mat_dense, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A
 80 | 			if(nrow(mat_dense) < 1) stop('Error in generating mat_dense at the data generating step')
 81 | 		}
 82 | 
 83 | 		##########################################################
 84 | 
 85 | 		nrow2keep = nrow(mat_dense) - nrow(mat_dense)%%compress_size
 86 | 		mat_dense_2_compress = mat_dense[, 1:nrow2keep]
 87 | 
 88 | 		bin_names = rownames(mat_dense)
 89 | 
 90 | 		mat_dense_compressed = compress_mat_fast( as.matrix(mat_dense_2_compress), compress_size=compress_size )
 91 | 		colnames(mat_dense_compressed) = bin_names
 92 | 		rm(mat_dense_2_compress); gc()
 93 | 		
 94 | 		# range(mat_dense_compressed)
 95 | 		# # sum(mat_dense_compressed > 1000)
 96 | 		# # mat_dense_compressed[mat_dense_compressed > 1000] = 1000
 97 | 		# mat_dense_compressed_sparse = mat_dense_compressed
 98 | 		mat_dense_compressed = as.matrix(mat_dense_compressed)
 99 | 		mat_dense_compressed_log = log2(mat_dense_compressed + 1)
100 | 		
101 | 		# #########################################################
102 | 		# cat('compute correlation matrix ... ')
103 | 
104 | 		cmat_dense_compressed_log = fast_cor(mat_dense_compressed_log)
105 | 		ccmat_dense_compressed_log = fast_cor(cmat_dense_compressed_log)
106 | 
107 | 		# cat('compute correlation matrix done ... ')
108 | 
109 | 		# #########################################################
110 | 		# # ccmat_dense_compressed_atanh = atanh(ccmat_dense_compressed - 1E-7)
111 | 		ccmat_dense_compressed_log_atanh = atanh(ccmat_dense_compressed_log / (1+1E-7))
112 | 
113 | 		# rm(mat_dense_compressed, mat_dense_compressed_sparse, cmat_dense_compressed_log)
114 | 		gc()
115 | 		# #########################################################
116 | 		# cat('ready to compute compartment domains\n')
117 | 
118 | 		out = list(mat_dense=mat_dense, atanh_score=ccmat_dense_compressed_log_atanh)
119 | 
120 | 		return(out)
121 | 	}
122 | 	
123 | 
124 | 
125 | 	# if(!file.exists(contact_mat_file)) contact_mat_file = paste0('/mnt/ndata/Yuanlong/2.Results/1.Juicer/', CELL_LINE, '/contact_mat/mat_chr', chr, '_', bin_size_initial_kb, 'kb_ob.txt.gz')
126 | 
127 | 	## bin_size_initial is the binsize of your input matrix, can be different from the bin_size of your planned analysis
128 | 	# contact_mat_processing = function(contact_mat_file, bin_size, bin_size_initial=bin_size)
129 | 	# {	
130 | 		
131 | 	# 	compress_size = ifelse(bin_size < 40E3, 1, 1)
132 | 	# 	zero_ratio = 0.01
133 | 
134 | 	# 	combined_xk_oe_raw = data.table::fread(contact_mat_file)
135 | 
136 | 	# 	## this code generates the compartment domains
137 | 
138 | 	# 	combined_xk_oe_raw = subset(combined_xk_oe_raw, !is.na(V3))
139 | 	# 	combined_xk_oe_raw[,1] = combined_xk_oe_raw[,1]/bin_size_initial
140 | 	# 	combined_xk_oe_raw[,2] = combined_xk_oe_raw[,2]/bin_size_initial
141 | 	# 	combined_xk_oe = combined_xk_oe_raw
142 | 
143 | 	# 	colnames(combined_xk_oe) = c('pos_1', 'pos_2', 'val')	
144 | 	# 	if(!all(combined_xk_oe[[2]] >= combined_xk_oe[[1]])) stop('\nYou provided matrix does not represent an upper triangular matrix!\n\n')
145 | 
146 | 	# 	oe_size = max(max(combined_xk_oe[[1]]), max(combined_xk_oe[[2]])) + 1 ## should +1 because combined_xk_oe index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19)
147 | 	# 	mat_oe_sparse = Matrix::Matrix(0, nrow=oe_size, ncol=oe_size)
148 | 	# 	mat_oe_sparse[cbind(combined_xk_oe[[1]]+1, combined_xk_oe[[2]]+1)] <- combined_xk_oe[[3]]
149 | 
150 | 	# 	rownames(mat_oe_sparse) = colnames(mat_oe_sparse) = as.character( 1:nrow(mat_oe_sparse) )
151 | 		
152 | 	# 	mat_oe_sparse = Matrix::forceSymmetric(mat_oe_sparse, uplo='U')
153 | 	# 	if(bin_size!=bin_size_initial) mat_oe_sparse = mat_10to40kb( mat_oe_sparse, bin_size, bin_size_initial )
154 | 	# 	A_oe = remove_blank_cols(mat_oe_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A
155 | 	# 	if(nrow(A_oe) < 100) A_oe = remove_blank_cols(mat_oe_sparse, sparse=TRUE, ratio=0) ## when all are dense
156 | 	# 	while(min(apply(A_oe, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0
157 | 	# 	{
158 | 	# 		A_oe = remove_blank_cols(A_oe, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A
159 | 	# 		if(nrow(A_oe) < 1) stop('ERROR IN GENERATING MEANINGFUL A_oe at the data generating step')
160 | 	# 	}
161 | 
162 | 	# 	##########################################################
163 | 
164 | 	# 	len = nrow(A_oe) - nrow(A_oe)%%compress_size
165 | 	# 	A_oe_2_compress = A_oe[, 1:len]
166 | 
167 | 	# 	bin_names = rownames(A_oe)
168 | 
169 | 	# 	A_oe_compressed = compress_mat_fast( as.matrix(A_oe_2_compress), compress_size=compress_size )
170 | 	# 	colnames(A_oe_compressed) = bin_names
171 | 	# 	rm(A_oe_2_compress); gc()
172 | 		
173 | 	# 	range(A_oe_compressed)
174 | 	# 	# # sum(A_oe_compressed > 1000)
175 | 	# 	# # A_oe_compressed[A_oe_compressed > 1000] = 1000
176 | 	# 	A_oe_compressed_sparse = A_oe_compressed
177 | 	# 	A_oe_compressed = as.matrix(A_oe_compressed)
178 | 	# 	A_oe_compressed_log = log2(A_oe_compressed + 1)
179 | 		
180 | 	# 	# #########################################################
181 | 	# 	# cat('compute correlation matrix ... ')
182 | 
183 | 	# 	cA_oe_compressed_log = fast_cor(A_oe_compressed_log)
184 | 	# 	ccA_oe_compressed_log = fast_cor(cA_oe_compressed_log)
185 | 
186 | 	# 	# cat('compute correlation matrix done ... ')
187 | 
188 | 	# 	# #########################################################
189 | 	# 	# # ccA_oe_compressed_atanh = atanh(ccA_oe_compressed - 1E-7)
190 | 	# 	ccA_oe_compressed_log_atanh = atanh(ccA_oe_compressed_log / (1+1E-7))
191 | 
192 | 	# 	# rm(A_oe_compressed, A_oe_compressed_sparse, cA_oe_compressed_log)
193 | 	# 	gc()
194 | 	# 	# #########################################################
195 | 	# 	# cat('ready to compute compartment domains\n')
196 | 
197 | 	# 	out = list(A_oe=A_oe, atanh_score=ccA_oe_compressed_log_atanh)
198 | 
199 | 	# 	return(out)
200 | 	# }
201 | 
202 | 


--------------------------------------------------------------------------------
/R/generate_compartments_bed_fun.R:
--------------------------------------------------------------------------------
 1 | 	## Yuanlong LIU, 09-08-2018
 2 | 
 3 | 	generate_compartments_bed <- function(input_mat, p_thresh, out_file_name, chr, window.sizes=3, stat_window_size=NULL, bin_size)
 4 | 	{	
 5 | 
 6 | 		input_mat_extended = data.frame(chr=paste0('chr', chr), pos_start=0:(nrow(input_mat)-1), pos_end=1:nrow(input_mat), mat=input_mat)
 7 | 
 8 | 		res_input_mat = TopDom_v2(input_mat_extended, window.size=NULL, NULL, T, p_thresh=p_thresh, window.sizes=window.sizes, stat_window_size=stat_window_size, domain_size_min=NULL)
 9 | 		cat('[', as.character(chr),'] Computing compartments\n')
10 | 		# return(res_input_mat)
11 | 		to_id = as.numeric(rownames(input_mat)[res_input_mat$domain$to.id])
12 | 		from_id = as.numeric(rownames(input_mat)[res_input_mat$domain$from.id])
13 | 		
14 | 		start_poses = (from_id-1)*bin_size + 1
15 | 		# end_poses = start_poses + n2one*bin_size
16 | 		end_poses = start_poses + bin_size
17 | 		
18 | 		input_mat_compartments_bed = data.frame(paste0('chr', chr), as.character(start_poses), as.character(end_poses) )
19 | 		if(!is.null(out_file_name)) write.table( input_mat_compartments_bed, file=out_file_name, quote=FALSE, row.names=FALSE, col.names=FALSE, sep=' ' )
20 | 		return( res_input_mat )
21 | 	}
22 | 	


--------------------------------------------------------------------------------
/R/post_processing_nested_domains.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 	get_cluster_index <- function(pos, initial_clusters, bin_names, bin_size)
  3 | 	{
  4 | 		compartment_segs = generate_compartment_segs( initial_clusters )
  5 | 		CDs = get_original_tad_indices( bin_names, compartment_segs, bin_size=bin_size )
  6 | 		cluster_index = which(apply( CDs, 1, function(v) (v[1] < pos)&(v[2]>pos)  )==1)
  7 | 		return(cluster_index)
  8 | 	}
  9 | 
 10 | 	get_original_tad_indices_extra <- function(names_A_final, TADs, bin_size) ## to get the TADs for the extra edges
 11 |     {
 12 |         # start_pos = as.numeric(names_A_final[TADs$start_pos])
 13 |         end_pos = as.numeric(names_A_final[TADs$end_pos])
 14 |         # start_pos_ori = (start_pos - 1)*bin_size + 1
 15 |         end_pos_ori = end_pos*bin_size
 16 |         TADs = data.frame( start_pos_ori=end_pos_ori+1, end_pos_ori=end_pos_ori )
 17 |         return( TADs )
 18 |     }
 19 | 
 20 | 	
 21 | 	LikelihoodRatioTest <- function(sub_domains_raw, ncores, remove_zero=FALSE, distr, n_parameters=3, imputation_num=1E2, A_already_corrected=FALSE)
 22 | 	{
 23 |     	`%dopar%` <- foreach::`%dopar%`
 24 | 		`%do%` <- foreach::`%do%`
 25 | 
 26 | 		pA_sym = sub_domains_raw$pA_sym
 27 | 		if(A_already_corrected==TRUE) cpA_sym = pA_sym
 28 | 		## CAN SPEEDUP correct_A_fast_divide_by_mean BECAUSE ONLY SOME OFF-DIAGNAL LINES NEED TO BE CORRECTED
 29 | 		if(A_already_corrected==FALSE) cpA_sym = correct_A_fast_divide_by_mean(pA_sym, remove_zero=remove_zero) ## corrected pA_sym
 30 | 		trees = foreach::foreach(j=1:length( sub_domains_raw$res_inner )) %do% 
 31 | 		{
 32 | 			name_index = rownames(sub_domains_raw$pA_sym)[sub_domains_raw$segmentss[j,1]:sub_domains_raw$segmentss[j,2]]
 33 | 			sub_domains_raw$res_inner[[j]]$cA = as.matrix(cpA_sym[name_index, name_index]) ## the corrected A. Use this matrix is essential for reducing the distance-based false positves
 34 | 			
 35 | 			## for example, when the number of invovled bins in sub_domains_raw$res_inner[[j]] is too small
 36 | 			# tmp = try(get_tree_decoration( sub_domains_raw$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num ))
 37 | 			tmp = get_tree_decoration( sub_domains_raw$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num )
 38 | 			# if( class(tmp)=="try-error" )
 39 | 			if(class(tmp)!='igraph') ## if(tmp=='bad_tree')
 40 | 			{
 41 | 				root_tree = igraph::graph.empty() + 'root'
 42 | 				igraph::V(root_tree)$width = nrow(sub_domains_raw$res_inner[[j]]$A)
 43 | 				igraph::V(root_tree)$left = 1
 44 | 				igraph::V(root_tree)$right = igraph::V(root_tree)$width
 45 | 				tmp = root_tree ## represented by the length of the segment
 46 | 			}
 47 | 			tmp
 48 | 		}
 49 | 		return(trees)	
 50 | 	}
 51 | 
 52 | 	
 53 | 	# pipline <- function(chr, TADs_raw, TADs_type, just_save_no_nest=FALSE, bin_size) ## TADs_tmp: start end
 54 | 	create_TADs <- function(sub_domains_raw, chr, TADs_raw, TADs_type, just_save_no_nest=FALSE, bin_size) ## TADs_tmp: start end
 55 | 	{
 56 | 		chr_name = paste0('chr', chr)
 57 | 		bin_size_kb = bin_size / 1E3
 58 | 
 59 | 		if(just_save_no_nest)
 60 | 		{
 61 | 			TADs = get_original_tad_indices( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )
 62 | 			TADs = cbind(chr_name, TADs)
 63 | 			save( TADs, file=Tads_R_File )
 64 | 		}
 65 | 		TADs = get_original_tad_indices( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )
 66 | 		# if(TADs_type=='extra') TADs = get_original_tad_indices_extra( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )
 67 | 
 68 | 		# print(TADs)
 69 | 
 70 | 		TADs = cbind(chr_name, TADs)
 71 | 		return(TADs)
 72 | 	}
 73 | 
 74 | 	#####################################################################
 75 | 
 76 | 	## INDEED, WHEN RESOLUTION == 40KB, DO NOT REMOVE
 77 | 	clean_TADs_all = function(TADs_all_raw, CDs, bin_size)
 78 | 	{
 79 | 		TADs_all_end = TADs_all_raw[,2]
 80 | 		CD_end = CDs[,2]
 81 | 		outer_mat = outer(TADs_all_end, CD_end, "-")
 82 | 		min_dist = apply(outer_mat, 1, function(v) min(abs(v)))
 83 | 		# end2rm = TADs_all_end[(min_dist <= 3) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary, at least 40kb		
 84 | 		end2rm = TADs_all_end[(min_dist < 40E3/bin_size) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary, at least 40kb		
 85 | 		## INDEED, WHEN RESOLUTION == 40KB, DO NOT REMOVE
 86 | 
 87 | 
 88 | 		TADs_all_head = TADs_all_raw[,1]
 89 | 		CD_head = CDs[,1]
 90 | 		outer_mat = outer(TADs_all_head, CD_head, "-")
 91 | 		min_dist = apply(outer_mat, 1, function(v) min(abs(v)))
 92 | 		head2rm = TADs_all_head[(min_dist < 40E3/bin_size) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary	
 93 | 		TADs_all = subset(TADs_all_raw, (!(start_pos %in% head2rm)) & (!(end_pos %in% end2rm)) )	
 94 | 		return(TADs_all)
 95 | 	}
 96 | 
 97 | 	post_process_sub_domains = function(chr, sub_domains_raw, ncores, out_dir, bin_size)
 98 | 	{
 99 | 		
100 | 		distr=c('lnorm', 'wilcox')[2]
101 | 		remove_zero = FALSE
102 | 		n_parameters = 3
103 | 		imputation_num = 1E2
104 | 		A_already_corrected = FALSE
105 | 
106 | 		decorated_branches = LikelihoodRatioTest(sub_domains_raw=sub_domains_raw, ncores=ncores, remove_zero=FALSE, distr=distr, n_parameters=n_parameters, imputation_num=imputation_num, A_already_corrected=A_already_corrected)
107 | 		chr_name = paste0('chr', chr)
108 | 
109 | 
110 | 		mean_diff_thresh = -0.1
111 | 		i = 1
112 | 		# for(i in 1:length(p0s))
113 | 		{
114 | 			# cat(i, '\n')
115 | 			# p0 = p0s[i]
116 | 
117 | 			normal_decorated_branches = decorated_branches[sapply(decorated_branches, igraph::vcount) > 1]
118 | 			# ps = sort(unlist(lapply(normal_decorated_branches, function(v) igraph::V(v)$wilcox_p)), decreasing=FALSE) ## fdr correction
119 | 			# p0_adj = ps[min(which(p.adjust(ps, method = 'fdr') > p0))]
120 | 			# p0_adj = ps[min(which(p.adjust(ps, method = 'bonferroni') > p0))]
121 | 			p0_adj = 0.05
122 | 
123 | 			TADs_all_raw = prunning(decorated_branches, to_correct=FALSE, p0=p0_adj, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=TRUE, CD_border_adj=FALSE, top_down=TRUE, mean_diff_thresh=mean_diff_thresh)
124 | 
125 | 			# CDs = prunning(decorated_branches, to_correct=FALSE, p0=p0_adj, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=FALSE, CD_border_adj=FALSE, top_down=TRUE); cat(nrow(CDs), '\n')
126 | 			CDs = prunning(decorated_branches, to_correct=FALSE, p0=-1, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=FALSE, CD_border_adj=FALSE, top_down=TRUE, mean_diff_thresh=mean_diff_thresh)
127 | 			# if(nrow(CDs)!=length(res$initial_clusters)) stop('nrow(CDs)!=length(res$initial_clusters)')
128 | 
129 | 			TADs_all = clean_TADs_all(TADs_all_raw, CDs, bin_size=bin_size)
130 | 
131 | 			# Tad_edges <- sort(unique(c(TADs_tmp[,1], (TADs_tmp[,2]+1))))
132 | 			TADs_all_edges <- sort(unique(TADs_all[,2]))
133 | 			CD_edges <- sort(unique(CDs[,2]))
134 | 			
135 | 			TADs_extra_edges = unique(setdiff(TADs_all_edges, CD_edges))
136 | 			TADs_extra = data.frame(start_pos=TADs_extra_edges, end_pos= TADs_extra_edges)
137 | 
138 | 
139 | 			TADs_pos_all = create_TADs(sub_domains_raw=sub_domains_raw, chr, TADs_all, TADs_type='ALL', bin_size=bin_size)
140 | 			TADs_pos_extra = create_TADs(sub_domains_raw=sub_domains_raw, chr, TADs_extra, TADs_type='extra', bin_size=bin_size)		
141 | 			TADs_pos_CD = create_TADs(sub_domains_raw=sub_domains_raw, chr, CDs, TADs_type='CD', bin_size=bin_size)
142 | 
143 | 			# TADs_info = list(decorated_branches=decorated_branches, TADs_pos_all=TADs_pos_all, TADs_pos_CD=TADs_pos_CD)
144 | 			TADs_info = list(decorated_branches=decorated_branches, TADs_all=TADs_all, CDs=CDs, TADs_pos_all=TADs_pos_all, TADs_pos_CD=TADs_pos_CD, TADs_pos_extra=TADs_pos_extra)
145 | 		}
146 | 
147 | 
148 | 		options(stringsAsFactors=FALSE)
149 | 	    level_5 = TADs_info$TADs_pos_extra[, c('chr_name', 'end_pos_ori')]
150 | 	    colnames(level_5)[1:2] = c('chr', 'nested_boundary')
151 | 	    sub_domain_boundary_bed_file = paste0(out_dir, '/chr', chr, '_nested_boundaries.bed')
152 | 
153 | 	    level_5_bed = data.frame(level_5, level_5[,2], '', '.', level_5, level_5[,2], '#000000')
154 | 	    op <- options(scipen=999)
155 | 	    write.table( level_5_bed, file=sub_domain_boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
156 | 
157 | 	    on.exit(options(op))
158 | 
159 | 		return(NULL)
160 | 	}
161 | 
162 | 
163 | 


--------------------------------------------------------------------------------
/R/prunning.R:
--------------------------------------------------------------------------------
  1 | trim_tree_adaptive_top_down_v2 = function( tree, wilcox_p_thresh, mean_diff_thresh )
  2 |         {
  3 |                 # leaves = get_leaves(tree)
  4 |                 if(igraph::vcount(tree)==1) return(tree)
  5 |                 # cat('I am in trim_tree_adaptive_top_down_v2\n')
  6 |                 nsig_nodes = union( igraph::V(tree)[which(igraph::V(tree)$wilcox_p > wilcox_p_thresh)]$name, igraph::V(tree)[which(igraph::V(tree)$mean_diff > mean_diff_thresh)]$name )
  7 | 
  8 | 
  9 |                 children_of_nsig = names(unlist(igraph::ego(tree, order=1, node=nsig_nodes, mode='out', mindist=1)))
 10 |                 if(length(children_of_nsig)!=0) trimmed_tree = tree - children_of_nsig
 11 |                 if(length(children_of_nsig)==0) trimmed_tree = tree
 12 | 
 13 |                 comps = igraph::decompose(trimmed_tree)
 14 |                 root_index = which(sapply( comps, function(comp) igraph::V(tree)[1]$name %in% igraph::V(comp)$name )==1)
 15 |                 trimmed_tree = comps[[root_index]]
 16 |                 if(!is_binary_tree( trimmed_tree )) stop("trim_tree_adaptive_top_down, the resulted tree is not a binary tree")
 17 |                 return( trimmed_tree )
 18 |         }
 19 | 
 20 | 
 21 | prunning = function(branches, p0, to_correct=FALSE, width_thresh=-Inf, width_thresh_CD=5, boundary_signal_thresh=-Inf, return_which='TADs', top_down=FALSE, all_levels=FALSE, CD_border_adj=FALSE, peak_thresh=NULL, mean_diff_thresh)
 22 | 	{
 23 | 
 24 | 		size2correct = sum(sapply(branches, igraph::vcount)) - sum(sapply(branches, function(v) length(get_leaves(v))))
 25 | 		p_thresh = p0/size2correct
 26 | 		if(to_correct==FALSE) p_thresh=p0
 27 | 		# if(!is.null(p0)) p_thresh = p0
 28 | 
 29 | 		
 30 | 		if(top_down==FALSE)
 31 | 		{
 32 | 			trimmed_branches = lapply( branches, trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=boundary_signal_thresh, peak_thresh=peak_thresh )
 33 | 			# size2correct = sum(sapply(trimmed_branches, igraph::vcount)) - sum(sapply(trimmed_branches, function(v) length(get_leaves(v))))
 34 | 			
 35 | 			# for( i in 1:length( trimmed_branches ) )
 36 | 			# {
 37 | 				# trimmed_branch = trimmed_branches[[i]]
 38 | 				# if(igraph::vcount(trimmed_branch) > 1) trimmed_branches[[i]] = lapply( trimmed_branches[i], trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=-1 )[[1]]
 39 | 			# }
 40 | 			
 41 | 
 42 | 		}
 43 | 		
 44 | 		# if(top_down==TRUE) trimmed_branches = lapply( branches, trim_tree_adaptive_top_down, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=boundary_signal_thresh )
 45 | 		if(top_down==TRUE) trimmed_branches = lapply( branches, function(branch) trim_tree_adaptive_top_down_v2(wilcox_p_thresh=p_thresh, mean_diff_thresh=mean_diff_thresh, tree=branch ))
 46 | 
 47 | 		if(CD_border_adj==TRUE)
 48 | 		{
 49 | 			all_tads = get_adjusted_nested_TADs( trimmed_branches, width_thresh_CD, all_levels )
 50 | 			return( all_tads )
 51 | 		}
 52 | 		
 53 | 		## get all nested TADs in trimmed_branches
 54 | 		if(all_levels==TRUE)
 55 | 		{
 56 | 			all_tads = data.frame(start_pos=numeric(), end_pos=numeric())
 57 | 			widths = c(0, sapply(trimmed_branches, function(v) igraph::V(v)[1]$width))
 58 | 			for(i in 1:length(trimmed_branches)) 
 59 | 			{
 60 | 				all_tads_i = get_all_tads_in_a_trimmed_branch(trimmed_branches[[i]], pos_shift=sum(widths[1:i]))
 61 | 				all_tads = rbind(all_tads, all_tads_i)
 62 | 			}
 63 | 			return( all_tads )
 64 | 		}
 65 | 					
 66 | 		if( return_which=='trimmed_branches' ) return( trimmed_branches )
 67 | 		
 68 | 		tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width )
 69 | 		tad_sizes = unlist(tad_sizes_ind)
 70 | 		# tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes))
 71 | 		end_pos = cumsum(tad_sizes)
 72 | 		start_pos = c(1, 1 + end_pos[-length(end_pos)])
 73 | 		tads = data.frame(start_pos=start_pos, end_pos=end_pos)
 74 | 		return( tads )
 75 | 	}
 76 | 
 77 | 
 78 | 
 79 | 
 80 | 	## This function combines prunning with branches of only one node
 81 | 	prunning_hybrid <- function(branches, ...)
 82 | 	{
 83 | 		names(branches) = as.character(1:length(branches))
 84 | 		normal_branches = branches[sapply( branches, function(v) class(v)=='igraph' )]
 85 | 		unnormal_branches = branches[sapply( branches, function(v) class(v)!='igraph' )] ## that is reprsented as bin_start:bin_end
 86 | 
 87 | 		trimmed_branches = prunning(normal_branches, return_which='trimmed_branches', ...)
 88 | 
 89 | 		normal_tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width )
 90 | 		unormal_tad_sizes_ind = unnormal_branches
 91 | 		tad_sizes_ind = c(normal_tad_sizes_ind, unormal_tad_sizes_ind)
 92 | 		tad_sizes_ind = tad_sizes_ind[names(branches)]
 93 | 		tad_sizes = unlist(tad_sizes_ind)
 94 | 
 95 | 		# tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes))
 96 | 		end_pos = cumsum(tad_sizes)
 97 | 		start_pos = c(1, 1 + end_pos[-length(end_pos)])
 98 | 		tads = data.frame(start_pos=start_pos, end_pos=end_pos)
 99 | 		return( tads )
100 | 	}
101 | 
102 | 
103 | 	get_all_tads_in_a_trimmed_branch <- function(trimmed_branch, pos_shift)
104 | 	{
105 | 		res = data.frame( start_pos=igraph::V(trimmed_branch)$left + pos_shift, end_pos=igraph::V(trimmed_branch)$right + pos_shift )
106 | 		res = res[order(res[,1], res[,2]), ]
107 | 		return(res)
108 | 	}
109 | 	
110 | 	prunning_bottom_up <- function(branches, p0=NULL, width_thresh)
111 | 	{
112 | 		size2correct = sum(sapply(branches, igraph::vcount)) - sum(sapply(branches, function(v) length(get_leaves(v))))
113 | 		
114 | 		p_thresh = 0.05/size2correct
115 | 		if(!is.null(p0)) p_thresh = p0
116 | 		
117 | 		trimmed_branches = lapply( branches, trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh )
118 | 
119 | 		tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width )
120 | 		tad_sizes = unlist(tad_sizes_ind)
121 | 		# tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes))
122 | 		end_pos = cumsum(tad_sizes)
123 | 		start_pos = c(1, 1 + end_pos[-length(end_pos)])
124 | 		tads = data.frame(start_pos=start_pos, end_pos=end_pos)
125 | 		return( tads )
126 | 	}
127 | 	
128 | 	
129 | 	trim_tree_adaptive_bottom_up <- function( tree, which_p='imp_p' )
130 | 	{
131 | 		if(which_p=='imp_p') ps = sort(unique(igraph::V(tree)$imp_p), decreasing=TRUE)
132 | 		# if(which_p=='nimp_p') ps = sort(unique(igraph::V(tree)$nimp_p), decreasing=TRUE)
133 | 		# if(which_p=='both') ps = sort(unique(pmin(igraph::V(tree)$nimp_p, igraph::V(tree)$imp_p)), decreasing=TRUE)
134 | 		
135 | 		trimed_tree_current = tree
136 | 		trimmed_branch_bottom_up = vector('list', length(ps))
137 | 		for(i in 1:length(ps))
138 | 		{
139 | 			trimed_tree_current = trim_tree_adaptive( tree, L_diff_thresh=-Inf, max_imp_p=ps[i], max_nimp_p=Inf, width_thresh=-Inf )
140 | 			trimmed_branch_bottom_up[[i]] = trimed_tree_current
141 | 		}
142 | 		igraph::vcounts = sapply(trimmed_branch_bottom_up, igraph::vcount)
143 | 		ps = ps[!duplicated(igraph::vcounts)]
144 | 		trimmed_branch_bottom_up = trimmed_branch_bottom_up[!duplicated(igraph::vcounts)]
145 | 		res = list(ps=ps, trimmed_branch_bottom_up=trimmed_branch_bottom_up)
146 | 		return( res )
147 | 	}
148 | 	
149 | 	
150 | 	## get adjusted nested TADs
151 | 	get_adjusted_nested_TADs <- function( trimmed_branches, width_thresh_CD, all_levels )
152 | 	{
153 | 			widths = c(0, sapply(trimmed_branches, function(v) igraph::V(v)[1]$width))
154 | 			all_tads_i_list = lapply( 1:length(trimmed_branches), function(i) get_all_tads_in_a_trimmed_branch(trimmed_branches[[i]], pos_shift=sum(widths[1:i])))
155 | 
156 | 			for(i in 1:length(trimmed_branches)) 
157 | 			{
158 | 				all_tads_i = all_tads_i_list[[i]]
159 | 				if( nrow(all_tads_i) <= 1 ) next
160 | 				## move the left-most border a little bit right if needed
161 | 				left_borders = unique(all_tads_i[,1])
162 | 				min_diff_left = left_borders[2] - left_borders[1]
163 | 				if( min_diff_left <= width_thresh_CD  )
164 | 				{
165 | 					all_tads_i[ all_tads_i==left_borders[1] ] = left_borders[2]
166 | 					all_tads_i = all_tads_i[ all_tads_i[,2] > all_tads_i[,1],  ] ## remove "negative" TADs
167 | 					all_tads_i = unique(all_tads_i[order(all_tads_i[,1], all_tads_i[,2]), ]) ## reorder the TADs
168 | 
169 | 					all_tads_i_list[[i]] = all_tads_i
170 | 
171 | 					## need to modify the right border of nested TADs in previous CD if the left border of this CD is modified
172 | 					if(i > 1)
173 | 					{
174 | 						## replace the max value of [i-1], i.e., the right most border, as the min of [i]-1, i.e., the left most border of [i]	
175 | 						all_tads_i_list[[i-1]][ all_tads_i_list[[i-1]]==max(all_tads_i_list[[i-1]]) ] = min(all_tads_i_list[[i]]) - 1
176 | 					}
177 | 				}
178 | 
179 | 				if( nrow(all_tads_i) <= 1 ) next
180 | 				## move the right-most border a little bit left if needed
181 | 				right_borders = unique(rev(all_tads_i[,2]))
182 | 				min_diff_right = right_borders[1] - right_borders[2]
183 | 				if( min_diff_right <= width_thresh_CD  )
184 | 				{
185 | 					all_tads_i[ all_tads_i==right_borders[1] ] = right_borders[2]
186 | 					all_tads_i = all_tads_i[ all_tads_i[,2] > all_tads_i[,1],  ]
187 | 					all_tads_i = unique(all_tads_i[order(all_tads_i[,1], all_tads_i[,2]), ]) ## reorder the TADs
188 | 					
189 | 					all_tads_i_list[[i]] = all_tads_i
190 | 
191 | 					if(i < length(trimmed_branches))
192 | 					{
193 | 						## replace the max value of [i-1], i.e., the right most border, as the min of [i]-1, i.e., the left most border of [i]	
194 | 						all_tads_i_list[[i+1]][ all_tads_i_list[[i+1]]==min(all_tads_i_list[[i+1]]) ] = max(all_tads_i_list[[i]]) + 1
195 | 					}
196 | 				}
197 | 			}
198 | 
199 | 			if(!all_levels) all_tads_i_list = lapply( all_tads_i_list, function(v) data.frame(start_pos=head(v[,1],1), end_pos=tail(v[,2],1)) )
200 | 
201 | 			all_tads = do.call(rbind, all_tads_i_list)
202 | 			colnames(all_tads) = c('start_pos', 'end_pos')
203 | 			return( all_tads )
204 | 	}
205 | 	
206 | 	
207 | 	
208 | 	


--------------------------------------------------------------------------------
/R/zigzag_nested_domain_v2.R:
--------------------------------------------------------------------------------
 1 | 	## Yuanlong LIU
 2 | 	## 12/04/2018
 3 | 	## 06/06/2018
 4 | 	## 16/06/2018
 5 | 	## This code runs zigzag search on each compartment domain as a whole without dividing into 400
 6 | 
 7 | 	## 09-10-2018
 8 | 	
 9 | 	## A should be already named
10 | 	## "zigzag" resembles computational path of finding the optimum  
11 | 	# HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq, max_nbins_fine, chr, min_n_bins=2)
12 | 	HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq=NULL, max_nbins_fine=NULL, min_n_bins=2)
13 | 	{
14 | 
15 | 		`%dopar%` <- foreach::`%dopar%`
16 | 			`%do%` <- foreach::`%do%`
17 | 			
18 | 		arg_list = as.list(environment())
19 | 		
20 | 		res_folder = file.path(res_dir)
21 | 		dir.create(res_folder, recursive=TRUE, showWarnings = FALSE)
22 | 		
23 | 		total_execution_time_file = file.path(res_dir, 'total_execution.time')
24 | 		time_begin = Sys.time()
25 | 		cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE)
26 | 	
27 | 		## check whether your input matrix is symmetric
28 | 		# A_sym = as.matrix( Matrix::forceSymmetric(data.matrix(A), uplo='U') )
29 | 		# A_sym = Matrix::forceSymmetric(data.matrix(A), uplo='U') ## keep sparse, 2018-11-11
30 | 		A_sym = Matrix::forceSymmetric(A, uplo='U') ## keep sparse, 2018-11-11
31 | 
32 | 		tol = 100 * .Machine$double.eps
33 | 		max_diff = max(abs(A_sym - A))
34 | 		notSym_flag = max_diff > tol
35 | 		if( notSym_flag ) warning('Your input contact matrix is not symmetric. The maximum difference between symmetric values is: ', max_diff, '\nBy default, the contact profile used for downstream analysis is taken from the upper triangular part. To use the lower triangular part you can transpose the input contact matrix first')
36 | 		
37 | 		if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices')
38 | 		
39 | 		# pA_sym = rm_zeros(A_sym) ## pA_sym: positive A
40 | 		pA_sym = as.matrix(remove_blank_cols(A_sym, sparse=TRUE, ratio=0))	
41 | 		n_zero_rows = nrow(A_sym) - nrow(pA_sym)
42 | 		zero_rows_flag = n_zero_rows > 0
43 | 		if( zero_rows_flag )
44 | 		{
45 | 			warning('There are ', n_zero_rows, ' rows/columns in your input contact matrix that have all their values being 0. These rows/columns are removed for downstream analysis')
46 | 			original_row_names = rownames(A_sym)
47 | 			kept_row_names = rownames(pA_sym)
48 | 		}
49 | 
50 | 		# res_inner = rep(list(), nrow(compartment_segs)) ## for each compartment domain
51 | 		# for( i in 1:nrow(compartment_segs) )
52 | 		# {
53 | 		# 	seg = compartment_segs[i,1]:compartment_segs[i,2]
54 | 		# 	cat('Compute seg:', i, 'of length:', length(seg), '\n')
55 | 
56 | 		# 	A_seg = as.matrix(pA_sym[seg, seg])
57 | 		# 	res_zigzag = zigzag_loglik_ancestors_v4(A_seg, nrow(A_seg))
58 | 		# 	res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors)
59 | 		# 	res_inner[[i]] = res_outer
60 | 		# 	cat('finished', '\n')
61 | 		# }
62 | 
63 | 		## changed to paralell, 2018-11-11
64 | 
65 | 		res_inner = foreach::foreach(i=1:nrow(compartment_segs)) %do%
66 | 		{
67 | 			seg = compartment_segs[i,1]:compartment_segs[i,2]
68 | 			cat('\r', sprintf('Find sub-domains in %d of %d CDs | length of current CD: %d bins\n', i, nrow(compartment_segs), length(seg)))
69 | 
70 | 			A_seg = pA_sym[seg, seg]
71 | 			res_zigzag = zigzag_loglik_ancestors_v4_5(A_seg, nrow(A_seg), min_n_bins=min_n_bins)
72 | 			res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors, min_n_bins=min_n_bins)
73 | 			res_outer
74 | 			# res_inner[[i]] = res_outer
75 | 		}
76 | 
77 | 		cat('\n')
78 | 		
79 | 		segmentss = compartment_segs
80 | 		res_info = list( arg_list=arg_list, pA_sym=pA_sym, A_final=pA_sym, segmentss=segmentss, res_inner=res_inner )
81 | 		# res_folder_final = file.path(res_dir, 'final')
82 | 		# dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
83 | 		# save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
84 | 	
85 | 		time_finish = Sys.time()
86 | 		cat('Execution finishes:', as.character(time_finish), '\n', file=total_execution_time_file, append=TRUE)
87 | 		cat('Total execution time:', capture.output( time_finish - time_begin ), '\n', file=total_execution_time_file, append=TRUE)
88 | 	
89 | 		return( res_info )
90 | 	}
91 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/r-calder2/README.html)
  2 | [![Docker Pulls](https://img.shields.io/docker/pulls/lucananni93/calder2?label=Docker%20image&logo=Docker&style=flat-square)](https://hub.docker.com/r/lucananni93/calder2)
  3 | [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg?labelColor=000000)](https://www.nextflow.io/)
  4 | [![nf-core](https://img.shields.io/badge/nf--core-calder2-green)](https://nf-co.re/modules/calder2)
  5 | 
  6 | # CALDER user manual
  7 | 
  8 | CALDER is a Hi-C analysis tool that allows: (1) compute chromatin domains from whole chromosome contacts; (2) derive their non-linear hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each chromatin domain from short-range contacts. CALDER is currently implemented in R.
  9 | 
 10 | * Overview of the CALDER method:
 11 | ![Alt text](./img/CALDER_methods.png "CALDER methods")
 12 | 
 13 | * Calder connects chromatin 3D organization to genomic function:
 14 | ![Alt text](./img/CALDER_features.png "CALDER features")
 15 | 
 16 | ## (A note on the performance of Calder vs PC-based approach)
 17 | 
 18 | * PC1 (sometimes PC2) of the correlation matrix was typically used to define A/B compartment. We found Calder demonstrates superior robustness over PC-based approach in identifying meaningful compartments, particularly when faced with complex chromosomal structural variations (figure on the left) or loose interaction between the p and q arms (figure on the right)
 19 | 
 20 | ![Alt text](./img/Hela_chr11_and_RWPE1_chr9_pq.png "Hela_chr11_and_RWPE1_chr9_pq")
 21 | 
 22 | 
 23 | ## Multiple new features were added in version 2.0
 24 | 
 25 | * Support for hg19, hg38, mm9, mm10 and other genomes
 26 | * Support input in .hic format generated by Juicer tools (https://github.com/aidenlab/juicer)
 27 | * Optimized bin_size selection for more reliable compartment identification
 28 | * Aggregated all chromosome output into a single file for easier visualization in IGV
 29 | * Added output in tabular .txt format at bin level for easier downstream analysis
 30 | 
 31 | Below we introduce two main updates:
 32 | 
 33 | ### (1) Optimized `bin_size` selection
 34 | 
 35 | 
 36 | Due to reasons such as low data quality or large scale structural variation, compartments can be unreliably called at one `bin_size` (equivalent to `resolution` in the literature) but properly called at another `bin_size`. We added an optimized `bin_size` selection strategy to call reliable compartments. This strategy is based on the observation from our large scale compartment analysis (https://www.nature.com/articles/s41467-021-22666-3), that although compartments can change between different conditions, their overall correlation `cor(compartment_rank_1, compartment_rank_2)` is high (> 0.4).
 37 | <br>
 38 | <br>
 39 | **The strategy**: given a `bin_size` specified by user, we call compartments with extended `bin_sizes` and choose the smallest `bin_size` such that no bigger `bin_size` can increase the compartment correlation with a reference compartment more than 0.05. For example, if correlation for `bin_size=10000` is 0.2 while for `bin_size=50000` is 0.6, we are more confident that the latter is more reliable; if correlation for `bin_size=10000` is 0.5 while for `bin_size=50000` is 0.52, we would choose the former as it has higher resolution.
 40 | <br>
 41 | <br>
 42 | `bin_size` is extended in the following way thus contact matrices at any larger `bin_sizes` can be aggregated from the input contact matrices directly:
 43 | ```
 44 | if(bin_size==5E3) bin_sizes = c(5E3, 10E3, 50E3, 100E3)
 45 | if(bin_size==10E3) bin_sizes = c(10E3, 50E3, 100E3)
 46 | if(bin_size==20E3) bin_sizes = c(20E3, 40E3, 100E3)
 47 | if(bin_size==25E3) bin_sizes = c(25E3, 50E3, 100E3)
 48 | if(bin_size==40E3) bin_sizes = c(40E3, 80E3)
 49 | if(bin_size==50E3) bin_sizes = c(50E3, 100E3)
 50 | ```
 51 | Note that this strategy is currently only available for `hg19`, `hg38`, `mm9` and `mm10` genome for which we generated high quality reference compartments using Hi-C data from: GSE63525 for `hg19`, https://data.4dnucleome.org/files-processed/4DNFI1UEG1HD for `hg38`, GSM3959427 for `mm9`, http://hicfiles.s3.amazonaws.com/external/bonev/CN_mapq30.hic for `mm10`.
 52 | 
 53 | ### (2) Support for other genomes
 54 | 
 55 | Although CALDER was mainly tested on human and mouse dataset, it can be applied to dataset from other genomes. One additional information is required in such case: a `feature_track` presumably positively correlated with compartment score (thus higher values in A than in B compartment). This information will be used for correctly determining the `A/B` direction. Some suggested tracks are gene density, H3K27ac, H3K4me1, H3K4me2, H3K4me3, H3K36me3 (or negative transform of H3K9me3) signals. Note that this information will not alter the hierarchical compartment/TAD structure, and can come from any external study with matched genome. An example of `feature_track` is given in the **Usage** section.
 56 | 
 57 | # Installation
 58 | 
 59 | ## Installing from conda
 60 | The easiest way to get the package is to install from [Bioconda](https://bioconda.github.io/index.html):
 61 | ```
 62 | conda install --channel bioconda r-calder2
 63 | ```
 64 | 
 65 | ## Installing from source
 66 | 
 67 | ### Make sure all dependencies have been installed:
 68 | 
 69 | * R.utils (>= 2.9.0),
 70 | * doParallel (>= 1.0.15),
 71 | * ape (>= 5.3),
 72 | * dendextend (>= 1.12.0),
 73 | * fitdistrplus (>= 1.0.14),
 74 | * igraph (>= 1.2.4.1),
 75 | * Matrix (>= 1.2.17),
 76 | * rARPACK (>= 0.11.0),
 77 | * factoextra (>= 1.0.5),
 78 | * data.table (>= 1.12.2),
 79 | * fields (>= 9.8.3),
 80 | * GenomicRanges (>= 1.36.0)
 81 | * ggplot2 (>= 3.3.5)
 82 | * strawr (>= 0.0.9)
 83 | 
 84 | ### Clone its repository and install it from source:
 85 | 
 86 | On the command line:
 87 | ```
 88 | git clone https://github.com/CSOgroup/CALDER2.git
 89 | cd CALDER2
 90 | ```
 91 | Then, once inside of the `R` interpreter:
 92 | ```
 93 | install.packages(".", repos = NULL, type="source") # install from the cloned source file
 94 | ```
 95 | 
 96 | ### Install CALDER and dependencies automaticly:
 97 | One can also install directly from Github, together with the dependencies as follows:
 98 | 
 99 | ```
100 | if (!requireNamespace("BiocManager", quietly = TRUE))
101 |     install.packages("BiocManager")
102 | 
103 | BiocManager::install("GenomicRanges")
104 | install.packages("remotes")
105 | remotes::install_github("CSOgroup/CALDER2.0")
106 | ```
107 | 
108 | Please contact yliueagle@googlemail.com for any questions about installation.
109 | 
110 | ## Use as a docker container
111 | We provide a [Docker image](https://hub.docker.com/r/lucananni93/calder2) complete with all dependencies to run CALDER workflows.
112 | 
113 | ```
114 | # Pull the docker image from Dockerhub
115 | docker pull lucananni93/calder2
116 | 
117 | # Run the image
118 | docker run -it lucananni93/calder2
119 | 
120 | # Once inside the image we can run the command line Calder tool
121 | calder [options]
122 | 
123 | # or we can just enter R 
124 | R
125 | 
126 | # and load Calder
127 | library(CALDER)	
128 | ```
129 | 
130 | 
131 | # Usage
132 | 
133 | CALDER contains three modules: (1) compute chromatin domains; (2) derive their hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each compartment domain.
134 | 
135 | ### Input data format
136 | 
137 | CALDER works on contact matrices compatible with that generated by Juicer tools (https://github.com/aidenlab/juicer), either a .hic file, or three-column `dump` table retrieved by the juicer dump (or straw) command (https://github.com/aidenlab/juicer/wiki/Data-Extraction):	
138 | 
139 | 	16050000	16050000	10106.306
140 | 	16050000	16060000	2259.247
141 | 	16060000	16060000	7748.551
142 | 	16050000	16070000	1251.3663
143 | 	16060000	16070000	4456.1245
144 | 	16070000	16070000	4211.7393
145 | 	16050000	16080000	522.0705
146 | 	16060000	16080000	983.1761
147 | 	16070000	16080000	1996.749
148 | 	...
149 | 
150 | `feature_track` should be a data.frame or data.table of 4 columns (chr, start, end, score), and can be generated directly from conventional format such as bed or wig, see the example:
151 | 
152 | ```
153 | library(rtracklayer)
154 | feature_track  = import('ENCFF934YOE.bigWig') ## from ENCODE https://www.encodeproject.org/files/ENCFF934YOE/@@download/ENCFF934YOE.bigWig
155 | feature_track = data.table::as.data.table(feature_track)[, c(1:3, 6)]
156 | ```
157 | 	> feature_track
158 | 	chr	start	end	score
159 | 	chr1	534179	534353	2.80512
160 | 	chr1	534354	572399	0
161 | 	chr1	572400	572574	2.80512
162 | 	chr1	572575	628400	0
163 | 	...	...	...	...
164 | 	chrY	59031457	59032403	0
165 | 	chrY	59032404	59032413	0.92023
166 | 	chrY	59032414	59032415	0.96625
167 | 	chrY	59032416	59032456	0.92023
168 | 	chrY	59032457	59032578	0.78875
169 | 
170 | ### Example usage (1): use contact matrix file in dump format as input
171 | ```
172 | chrs = c(21:22)
173 | 
174 | ## demo contact matrices in dump format
175 | contact_file_dump = as.list(system.file("extdata", sprintf("mat_chr%s_10kb_ob.txt.gz", chrs),
176 | 			package='CALDER'))
177 | names(contact_file_dump) = chrs
178 | 
179 | ## Run CALDER to compute compartments but not nested sub-domains
180 | CALDER(contact_file_dump=contact_file_dump, 
181 | 			chrs=chrs, 
182 | 			bin_size=10E3,
183 | 			genome='hg19',
184 | 			save_dir=save_dir,
185 | 			save_intermediate_data=FALSE,
186 | 			n_cores=2,
187 | 			sub_domains=FALSE)
188 | 
189 | ## Run CALDER to compute compartments and nested sub-domains / will take more time
190 | CALDER(contact_file_dump=contact_file_dump, 
191 | 			chrs=chrs, 
192 | 			bin_size=10E3,
193 | 			genome='hg19',
194 | 			save_dir=save_dir,
195 | 			save_intermediate_data=TRUE,
196 | 			n_cores=2,
197 | 			sub_domains=TRUE)
198 | ```
199 | 
200 | ### Example (2): use contact matrices stored in an R list 
201 | ```
202 | chrs = c(21:22)
203 | contact_file_dump = as.list(system.file("extdata", sprintf("mat_chr%s_10kb_ob.txt.gz", chrs),
204 | 			package='CALDER'))
205 | names(contact_file_dump) = chrs
206 | contact_tab_dump = lapply(contact_file_dump, data.table::fread)
207 | 
208 | 
209 | CALDER(contact_tab_dump=contact_tab_dump, 
210 | 			chrs=chrs, 
211 | 			bin_size=10E3,
212 | 			genome='hg19',
213 | 			save_dir=save_dir,
214 | 			save_intermediate_data=FALSE,
215 | 			n_cores=2,
216 | 			sub_domains=FALSE)
217 | ```
218 | 
219 | ### Example (3): use .hic file as input
220 | ```
221 | chrs = c(21:22)
222 | hic_file = 'HMEC_combined_30.hic' ## can be downloaded from https://ftp.ncbi.nlm.nih.gov/geo/series/GSE63nnn/GSE63525/suppl/GSE63525_HMEC_combined_30.hic
223 | 
224 | CALDER(contact_file_hic=hic_file, 
225 | 			chrs=chrs, 
226 | 			bin_size=10E3,
227 | 			genome='hg19',
228 | 			save_dir=save_dir,
229 | 			save_intermediate_data=FALSE,
230 | 			n_cores=2,
231 | 			sub_domains=FALSE)
232 | ```
233 | 
234 | ### Example (4): run CALDER on other genomes
235 | ```
236 | ## prepare feature_track
237 | library(rtracklayer)
238 | feature_track_raw  = import('ENCFF934YOE.bigWig') ## from ENCODE https://www.encodeproject.org/files/ENCFF934YOE/@@download/ENCFF934YOE.bigWig
239 | feature_track = data.table::as.data.table(feature_track_raw)[, c(1:3, 6)]
240 | 
241 | ## Run CALDER
242 | chrs = c(21:22)
243 | contact_file_dump = as.list(system.file("extdata", sprintf("mat_chr%s_10kb_ob.txt.gz", chrs),
244 | 			package='CALDER'))
245 | names(contact_file_dump) = chrs
246 | 
247 | CALDER(contact_file_dump=contact_file_dump, 
248 | 			chrs=chrs, 
249 | 			bin_size=10E3,
250 | 			genome='others',
251 | 			save_dir=save_dir,
252 | 			feature_track=feature_track,
253 | 			save_intermediate_data=FALSE,
254 | 			n_cores=2,
255 | 			sub_domains=FALSE)
256 | ```
257 | 
258 | 
259 | 
260 | ### Parameters:
261 | 
262 | | Name              | Description |  
263 | | --------------------- | ----------------------- |
264 | | **chrs**                | A vector of chromosome names to be analyzed, with or without 'chr'. Chromosome names should be consistent with those in `contact_file_hic` and `feature_track` if such files are provided
265 | | **contact_file_dump**                |A list of contact files in dump format, named by `chrs`. Each contact file stores the contact information of the corresponding `chr`. Only one of `contact_file_dump`, `contact_tab_dump`, `contact_file_hic` should be provided
266 | | **contact_tab_dump**                | A list of contact table in dump format, named by `chrs`, stored as an R object. Only one of `contact_file_dump`, `contact_tab_dump`, `contact_file_hic` should be provided
267 | | **contact_file_hic**                | A hic file generated by Juicer tools. It should contain all chromosomes in `chrs`. Only one of `contact_file_dump`, `contact_tab_dump`, `contact_file_hic` should be provided
268 | | **genome**                | One of 'hg19', 'hg38', 'mm9', 'mm10', 'others' (default). These compartments will be used as reference compartments for optimized bin_size selection. If `genome = 'others'`, a `feature_track` should be provided (see below) and no optimized bin_size selection will be performed 
269 | | **save_dir**             | the directory to be created for saving outputs
270 | | **bin_size**         | The bin_size (resolution) to run CALDER. `bin_size` should be consistent with the data resolution in `contact_file_dump` or `contact_tab_dump` if these files are provided as input, otherwise `bin_size` should exist in `contact_file_hic`. Recommended `bin_size` is between **10000 to 100000**
271 | | **single_binsize_only**     |  logical. If TRUE, CALDER will compute compartments only using the bin_size specified by the user and not do bin size optimization
272 | | **feature_track**                | A genomic feature track in `data.frame` or `data.table` format with 4 columns (chr, start, end, score). This track will be used for determining the A/B compartment direction when `genome='others'` and should presumably have higher values in A than in B compartment. Some suggested tracks can be gene density, H3K27ac, H3K4me1, H3K4me2, H3K4me3, H3K36me3 (or negative transform of H3K9me3 signals)
273 | | **save_intermediate_data**  | logical. If TRUE, an intermediate_data will be saved. This file can be used for computing nested sub-domains later on
274 | | **n_cores**     |  integer. Number of cores to be registered for running CALDER in parallel
275 | | **sub_domains**     |  logical, whether to compute nested sub-domains
276 | 
277 | 
278 | 
279 | ## Output Structure
280 | The output of the workflow is stored in the folder specified by `save_dir` and will look like this:
281 | ```
282 | save_dir/
283 | |-- sub_domains
284 | |   `-- all_nested_boundaries.bed
285 | |-- sub_compartments
286 | |   |-- cor_with_ref.txt
287 | |   |-- cor_with_ref.pdf
288 | |   |-- cor_with_ref.ALL.txt
289 | |   |-- all_sub_compartments.tsv
290 | |   `-- all_sub_compartments.bed
291 | `-- intermediate_data
292 |     |-- sub_domains
293 |     |   |-- chr22_sub_domains_log.txt
294 |     |   |-- chr22_nested_boundaries.bed
295 |     |   |-- chr21_sub_domains_log.txt
296 |     |   `-- chr21_nested_boundaries.bed
297 |     `-- sub_compartments
298 |         |-- 50kb
299 |         |   |-- chr22_sub_compartments.bed
300 |         |   |-- chr22_log.txt
301 |         |   |-- chr22_domain_hierachy.tsv
302 |         |   |-- chr22_domain_boundaries.bed
303 |         |   |-- chr21_sub_compartments.bed
304 |         |   |-- chr21_log.txt
305 |         |   |-- chr21_domain_hierachy.tsv
306 |         |   `-- chr21_domain_boundaries.bed
307 |         |-- 10kb
308 |         |   |-- chr22_sub_compartments.bed
309 |         |   |-- chr22_log.txt
310 |         |   |-- chr22_intermediate_data.Rds
311 |         |   |-- chr22_domain_hierachy.tsv
312 |         |   |-- chr22_domain_boundaries.bed
313 |         |   |-- chr21_sub_compartments.bed
314 |         |   |-- chr21_log.txt
315 |         |   |-- chr21_intermediate_data.Rds
316 |         |   |-- chr21_domain_hierachy.tsv
317 |         |   `-- chr21_domain_boundaries.bed
318 |         `-- 100kb
319 |             |-- chr22_sub_compartments.bed
320 |             |-- chr22_log.txt
321 |             |-- chr22_domain_hierachy.tsv
322 |             |-- chr22_domain_boundaries.bed
323 |             |-- chr21_sub_compartments.bed
324 |             |-- chr21_log.txt
325 |             |-- chr21_domain_hierachy.tsv
326 |             `-- chr21_domain_boundaries.bed
327 | ```
328 | 
329 | ### File description:
330 | 
331 | * The `sub_domains` folder stores the nested boundaries
332 | * The `sub_compartments` folder stores the optimal compartment information, including files:
333 | 
334 | | Name              | Description |  
335 | | --------------------- | ----------------------- |
336 | | **all_sub_compartments.bed**                | a .bed file containing the optimal compartments for all `chrs`, that can be visualized in IGV. Different colors were used to distinguish compartments (at the resolution of 8 sub-compartments)
337 | | **all_sub_compartments.tsv**                | optimal compartments stored in tabular text format. Each row represents one 10kb region
338 | | **cor_with_ref.ALL.txt**                | a plot of correlation between compartment rank and the reference compartment rank for each of extended `bin_sizes`, and the optimal `bin_size` that is finally selected
339 | | **cor_with_ref.pdf**                | correlation of compartment rank with the reference compartment rank using the optimal `bin_size`
340 | 
341 | 
342 | 
343 | * The `intermediate_data` folder stores information at each of the extended `bin_sizes` for each chromosome
344 | 
345 | 
346 | | Name              | Description |  
347 | | --------------------- | ----------------------- |
348 | | **chrxx_domain_hierachy.tsv**                | information of compartment domain and their hierarchical organization. The hierarchical structure is fully represented by `compartment_label`, for example, `B.2.2.2` and `B.2.2.1` are two sub-branches of `B.2.2`. The `pos_end` column specifies all compartment domain borders, except when it is marked as `gap`, which indicates it is the border of a gap chromosome region that has too few contacts and was excluded from the analysis (e.g., due to low mappability, deletion, technique flaw) 
349 | | **chrxx_sub_compartments.bed**                | a .bed file containing the compartment information, that can be visualized in IGV. Different colors were used to distinguish compartments (at the resolution of 8 sub-compartments)
350 | | **chrxx_domain_boundaries.bed**                | a .bed file containing the chromatin domains boundaries, that can be visualized in IGV
351 | | **chrxx_nested_boundaries.bed**                | a .bed file containing the nested sub-domain boundaries, that can be visualized in IGV
352 | | **chrxx_intermediate_data.Rds**                | an Rds file storing the intermediate_data that can be used to compute nested sub-domains
353 | | **chrxx_log.txt, chrxx_sub_domains_log.txt**                | log files storing the status and running time of each step
354 | 
355 | 
356 | All .bed files can be view directly through IGV:
357 | 
358 | ![Alt text](./img/IGV_results.png "IGV")
359 | 
360 | ## Running time:
361 | For the computational requirement, running CALDER on the GM12878 Hi-C dataset at bin size of 40kb took **36 minutes** to derive the chromatin domains and their hierarchy for all chromosomes (i.e., CALDER Step1 and Step2); **13 minutes** to derive the nested sub-domains (i.e., CALDER Step3). At the bin size of 10kb, it took **1 h 44 minutes and 55 minutes** correspondingly (server information: 40 cores, 64GB Ram, Intel(R) Xeon(R) Silver 4210 CPU @ 2.20GHz). The evaluation was done using a single core although CALDER can be run in a parallel manner.
362 | 
363 | # Citation
364 | 
365 | If you use CALDER in your work, please cite: https://www.nature.com/articles/s41467-021-22666-3
366 | 
367 | 
368 | # Contact information
369 | 
370 | * Author: Yuanlong LIU
371 | * Affiliation: Computational Systems Oncology group, Department of Computational Biology, University of Lausanne, Switzerland
372 | * Email: yliueagle@googlemail.com
373 | 


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # We need Bioconductor, so we inherit from their repositorycd .
 2 | FROM bioconductor/bioconductor_docker
 3 | 
 4 | ARG tag_name
 5 | ARG repo_name
 6 | 
 7 | ENV github_tag=$tag_name
 8 | ENV github_repo=$repo_name
 9 | 
10 | RUN echo "Building repo ${github_repo} with tag: ${github_tag}"
11 | 
12 | ################
13 | # CALDER SETUP #
14 | ################
15 | 
16 | # Get latest release tag
17 | RUN wget https://github.com/${github_repo}/archive/refs/tags/${github_tag}.tar.gz
18 | # Untar the release
19 | RUN tar xvfs ${github_tag}.tar.gz
20 | # Adding calder script to bin
21 | ENV calder_cmd=CALDER2-${github_tag}/scripts/calder
22 | RUN ln -s $(realpath ${calder_cmd}) /bin/
23 | # Removing tar file
24 | RUN rm ${github_tag}.tar.gz
25 | 
26 | # # Install the package
27 | RUN R -e "remotes::install_local('CALDER2-${github_tag}', dependencies=T); if (!library(CALDER, logical.return=T)) quit(status=10)"
28 | 
29 | # # Install other packages
30 | RUN pip install cooler 
31 | 
32 | CMD ["bash"]
33 | 


--------------------------------------------------------------------------------
/docker/docker_build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DOCKERHUB_USERNAME="lucananni93"
 4 | PACKAGE_NAME="calder2"
 5 | GITHUB_REPO="CSOgroup/CALDER2"
 6 | DOCKER_FILE="docker/Dockerfile"
 7 | LATEST_TAG=$(curl --silent "https://api.github.com/repos/${GITHUB_REPO}/releases/latest" | grep '"tag_name"' | cut -d':' -f2 | tr -d '," ')
 8 | 
 9 | echo "Building Docker for ${GITHUB_REPO} at ${DOCKERHUB_USERNAME}/${PACKAGE_NAME}:${LATEST_TAG}"
10 | 
11 | docker build . \
12 |             -t ${DOCKERHUB_USERNAME}/${PACKAGE_NAME}:${LATEST_TAG} \
13 |             -t ${DOCKERHUB_USERNAME}/${PACKAGE_NAME}:latest \
14 |             -f ${DOCKER_FILE} \
15 |             --build-arg tag_name=${LATEST_TAG} \
16 |             --build-arg repo_name=${GITHUB_REPO}
17 | 
18 | docker push ${DOCKERHUB_USERNAME}/${PACKAGE_NAME}:${LATEST_TAG}
19 | docker push ${DOCKERHUB_USERNAME}/${PACKAGE_NAME}:latest


--------------------------------------------------------------------------------
/img/CALDER_features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/CALDER_features.png


--------------------------------------------------------------------------------
/img/CALDER_methods.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/CALDER_methods.png


--------------------------------------------------------------------------------
/img/Hela_chr11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/Hela_chr11.png


--------------------------------------------------------------------------------
/img/Hela_chr11_and_RWPE1_chr9_pq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/Hela_chr11_and_RWPE1_chr9_pq.png


--------------------------------------------------------------------------------
/img/IGV_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/IGV_results.png


--------------------------------------------------------------------------------
/img/RWPE1_chr9_pq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/RWPE1_chr9_pq.png


--------------------------------------------------------------------------------
/inst/extdata/mat_chr21_10kb_ob.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/inst/extdata/mat_chr21_10kb_ob.txt.gz


--------------------------------------------------------------------------------
/inst/extdata/mat_chr22_10kb_ob.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/inst/extdata/mat_chr22_10kb_ob.txt.gz


--------------------------------------------------------------------------------
/man/CALDER-package.Rd:
--------------------------------------------------------------------------------
 1 | \name{CALDER-package}
 2 | \alias{CALDER-package}
 3 | \alias{CALDER}
 4 | \docType{package}
 5 | \title{
 6 | \packageTitle{CALDER}
 7 | }
 8 | \description{
 9 | \packageDescription{CALDER}
10 | }
11 | \details{
12 | 
13 | The DESCRIPTION file:
14 | \packageDESCRIPTION{CALDER}
15 | \packageIndices{CALDER}
16 | ~~ An overview of how to use the package, including the most important ~~
17 | ~~ functions ~~
18 | }
19 | \author{
20 | \packageAuthor{CALDER}
21 | 
22 | Maintainer: \packageMaintainer{CALDER}
23 | }
24 | \references{
25 | ~~ Literature or other references for background information ~~
26 | }
27 | ~~ Optionally other standard keywords, one per line, from file KEYWORDS in ~~
28 | ~~ the R documentation directory ~~
29 | \keyword{ package }
30 | \seealso{
31 | ~~ Optional links to other man pages, e.g. ~~
32 | ~~ \code{\link[<pkg>:<pkg>-package]{<pkg>}} ~~
33 | }
34 | \examples{
35 | ~~ simple examples of the most important functions ~~
36 | }
37 | 


--------------------------------------------------------------------------------
/scripts/calder:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressPackageStartupMessages(library(optparse))
  4 | suppressPackageStartupMessages(library(CALDER))
  5 | 
  6 | AVAILABLE_REFERENCE_TRACKS_GENOMES <- c("hg19", "hg38", "mm9", "mm10")
  7 | INPUT_TYPES <- c("hic", "cool")
  8 | CHROMS_TO_REMOVE <- c("ALL", "M", "chrM", "MT", "chrMT", "Y", "chrY")
  9 | 
 10 | 
 11 | parse_arguments <- function(){
 12 | 	# Creating the argument parsing options
 13 | 	option_list = list(
 14 | 	  make_option(c("-i", "--input"), action="store", default=NA, type='character',
 15 | 	              help="Input Hi-C contacts"),
 16 | 	  make_option(c("-t", "--type"), action="store", default='hic', type='character',
 17 | 	              help="The type of input: hic or cool [default %default]"),
 18 | 	  make_option(c("-b", "--bin_size"), action="store", default=50000, type='integer',
 19 | 	              help="Bin size to use for the analysis [default %default]"),
 20 | 	  make_option(c("-g", "--genome"), action="store", default="hg19", type='character',
 21 | 	              help="Genome assembly to use [default %default]"),
 22 | 	  make_option(c("-f", "--feature_track"), action="store", default=NA, type='character',
 23 | 	              help="Genomic feature track to be used to determine A/B compartment direction
 24 | 	              when genome == 'others'. The track should presumably have higher values
 25 | 	              in A than in B compartmnets. [default %default]"),
 26 | 	  make_option(c("-c", "--chromosomes"), action='store', default='all', type='character',
 27 | 	  			  help="Chromosomes to analyze, separated by comma. [default %default]"),
 28 | 	  make_option(c("-p", "--nproc"), action="store", default=1, type='integer',
 29 | 	              help="Number of cores to use [default %default]"),
 30 | 	  make_option(c("-o", "--outpath"), action="store", default=NA, type='character',
 31 | 	              help="Path to the output folder"),
 32 | 	  make_option(c("-k", "--keep_intermediate"), action="store_true", default=FALSE, type='logical',
 33 | 	              help="Keep intermediate data after done [default %default]"),
 34 | 	  make_option(c("-a", "--adaptive"), action="store_true", default=FALSE, type='logical',
 35 | 	              help="Use adaptive resolution choice [default %default]")
 36 | 	)
 37 | 	parser <- OptionParser(usage = "%prog [options]", option_list=option_list)
 38 | 	opt <- parse_args(parser)
 39 | 
 40 | 	# Checking if input path exists
 41 | 	if(is.na(opt$input)){
 42 | 		print_help(parser)
 43 | 		stop(paste0("Input path (", opt$input,") does not exist"))
 44 | 	}
 45 | 
 46 | 	# Checking if output path is provided
 47 | 	if(is.na(opt$outpath)){
 48 | 		stop("Output path was not provided")
 49 | 	}
 50 | 
 51 | 	# Check that the input type is one of the possible ones
 52 | 	if(!(opt$type %in% INPUT_TYPES)){
 53 | 		stop(paste0("Input type ", opt$input_type, " not available"))
 54 | 	}
 55 | 
 56 | 	# Check if the provided genome is in the list of available reference genomes 
 57 | 	# or if a feature track is provided
 58 | 	if((!(opt$genome %in% AVAILABLE_REFERENCE_TRACKS_GENOMES)) || (file.exists(opt$feature_track))){
 59 | 		# in this case, we just assign it the name 'others'
 60 | 		opt$genome = "others"
 61 | 	}
 62 | 
 63 | 	writeLines(c(
 64 | 		"*******************************",
 65 | 		"*            CALDER           *",
 66 | 		"*******************************",
 67 | 		paste0("[Parameters] Input: ", opt$input),
 68 | 		paste0("[Parameters] Input type: ", opt$type),
 69 | 		paste0("[Parameters] Bin size: ", opt$bin_size),
 70 | 		paste0("[Parameters] Genome: ", opt$genome),
 71 | 		paste0("[Parameters] Feature Track: ", opt$feature_track),
 72 | 		paste0("[Parameters] Chromosomes: ", opt$chromosomes),
 73 | 		paste0("[Parameters] N. cores: ", opt$nproc),
 74 | 		paste0("[Parameters] Output: ", opt$outpath),
 75 | 		paste0("[Parameters] Keep Intermediate data: ", opt$keep_intermediate),
 76 | 		paste0("[Parameters] Use adaptive resolution: ", opt$adaptive)
 77 | 	))
 78 | 
 79 | 	if(file.exists(opt$feature_track)){
 80 | 		opt$feature_track <- read.table(opt$feature_track)
 81 | 	}
 82 | 
 83 | 	return(opt)
 84 | }
 85 | 
 86 | sanitize_chroms <- function(chroms){
 87 | 	res <- lapply(chroms, function(x){
 88 | 		if(startsWith(x, "chr")){
 89 | 			return(substring(x, 4))
 90 | 		} else{
 91 | 			return(x)
 92 | 		}
 93 | 	})
 94 | 	return(res)
 95 | }
 96 | 
 97 | handle_input_hic <- function(opt){
 98 | 	suppressPackageStartupMessages(library(strawr))
 99 | 	chromsizes <- readHicChroms(opt$input)
100 | 	if(opt$chromosomes == "all"){
101 | 		chroms <- chromsizes[!(toupper(chromsizes$name) %in% toupper(CHROMS_TO_REMOVE)), "name"]
102 | 	}
103 | 	else{
104 | 		chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
105 | 		chroms <- chromsizes[chromsizes$name %in% chrom_list, "name"]
106 | 	}
107 | 	chroms <- sanitize_chroms(chroms)
108 | 	CALDER(contact_file_hic = opt$input,
109 | 		   chrs = chroms,
110 | 		   bin_size = opt$bin_size,
111 | 		   genome = opt$genome,
112 | 		   save_dir=opt$outpath,
113 | 		   save_intermediate_data=TRUE,
114 | 		   feature_track=opt$feature_track,
115 | 		   single_binsize_only=!opt$adaptive,
116 | 		   n_cores = opt$nproc,
117 | 		   sub_domains=TRUE)
118 | }
119 | 
120 | handle_input_cool <- function(opt){
121 | 	intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
122 | 	dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE)
123 | 
124 | 	system(paste0("cooler dump --table chroms --out ", 
125 | 				  file.path(intermediate_data_dir, "chroms.txt"), 
126 | 				  " --header ", 
127 | 				  opt$input))
128 | 	chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE)
129 | 	if(opt$chromosomes == "all"){
130 | 		chroms <- chroms[!( toupper(chroms$name) %in% toupper(CHROMS_TO_REMOVE) ), "name"]
131 | 	}
132 | 	else{
133 | 		chrom_list <- strsplit(opt$chromosomes, ",")[[1]]
134 | 		chroms <- chroms[chroms$name %in% chrom_list, "name"]
135 | 	}
136 | 	
137 | 	dump_paths <- list()
138 | 	for(chrom in chroms){
139 | 		cat(paste0("[Pre-processing] Dumping ", chrom, "\n"))
140 | 		chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt"))
141 | 		dump_paths <- c(dump_paths, chrom_dump_path)
142 | 		if(! file.exists(chrom_dump_path)){
143 | 			system(paste0("cooler dump --table pixels --range ", 
144 | 						  chrom, 
145 | 						  " --join --balanced ",
146 | 						  opt$input,
147 | 						  " | cut -f2,5,8 | awk '{if ($3) print;}' > ",
148 | 						  chrom_dump_path))
149 | 		}
150 | 	}
151 | 
152 | 	chroms <- sanitize_chroms(chroms)
153 | 	names(dump_paths) <- chroms
154 | 
155 | 	CALDER(contact_file_dump=dump_paths, 
156 | 		   chrs=chroms, 
157 | 		   bin_size=opt$bin_size,
158 | 		   genome=opt$genome,
159 | 		   save_dir=opt$outpath,
160 | 		   feature_track=opt$feature_track,
161 | 		   single_binsize_only=!opt$adaptive,
162 | 		   save_intermediate_data=TRUE,
163 | 		   n_cores=opt$nproc,
164 | 		   sub_domains=TRUE)
165 | 
166 | }
167 | 
168 | opt <- parse_arguments()
169 | 
170 | 
171 | if(opt$type == "hic"){
172 | 	handle_input_hic(opt)
173 | } else if(opt$type == "cool"){
174 | 	handle_input_cool(opt)
175 | } else {
176 | 	stop("Unknown input type")
177 | }
178 | 
179 | # Cleaning the output
180 | intermediate_data_dir = file.path(opt$outpath, "intermediate_data")
181 | if(dir.exists(intermediate_data_dir) && (!opt$keep_intermediate)){
182 | 	cat('[Post-processing] Removing intermediate data\n')
183 | 	unlink(intermediate_data_dir, recursive=TRUE)
184 | }
185 | exec_time_file = "./total_execution.time"
186 | if(file.exists(exec_time_file)){
187 | 	cat("[Post-processing] Removing total_execution.time\n")
188 | 	file.remove(exec_time_file)
189 | }
190 | 
191 | 


--------------------------------------------------------------------------------
/scripts/clean_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | source ~/miniconda3/etc/profile.d/conda.sh
 4 | 
 5 | TEST_ENV_NAME="R-Test"
 6 | 
 7 | mamba create --yes --name ${TEST_ENV_NAME} --channel bioconda --channel conda-forge \
 8 | 		r-essentials r-devtools cooler r-nloptr
 9 | conda activate ${TEST_ENV_NAME}
10 | 
11 | R -e "devtools::install('.', dependencies=TRUE, Ncpus = 20)"
12 | R -e "devtools::test()"
13 | 
14 | # conda deactivate 
15 | mamba env remove --yes --name ${TEST_ENV_NAME}
16 | 


--------------------------------------------------------------------------------
/scripts/create_release.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | release_name="0.3"
 5 | release_description="Dockerization"
 6 | 
 7 | 
 8 | release_notes="CALDER2 release: ${release_description} (${release_name})"
 9 | 
10 | 
11 | latest_tag=$(git describe --tags --abbrev=0)
12 | 
13 | if [[ "${release_name}" == "${latest_tag}" ]]
14 | then
15 | 	echo "Overwriting exiting release (${release_name})"
16 | 	echo "---------------------------------------------"
17 | 
18 | 	# Detelting Github release (should prompt a confirmation)
19 | 	gh release delete ${release_name}
20 | 	# Deleting local tag
21 | 	git tag -d ${release_name}
22 | 	# Pushing the removal of the tag to Github
23 | 	git push origin :${release_name}
24 | fi
25 | 
26 | echo "Uploading new release (${release_name})"
27 | echo "---------------------------------------------"
28 | # Creating new tag
29 | git tag -a ${release_name} -m ${release_description}
30 | # Pushing new tag
31 | git push origin ${release_name}
32 | # Creating release from tag
33 | gh release create ${release_name} --title "${release_description}" --notes "${release_notes}"


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
 1 | 
 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 
 3 | ## enable compilation with C++11 (where available)
 4 | ##
 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider
 6 | ## availability of the package we do not yet enforce this here.  It is however
 7 | ## recommended for client packages to set it.
 8 | ##
 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP
10 | ## support within Armadillo prefers / requires it
11 | CXX_STD = CXX11
12 | 
13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 
14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
15 | 


--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
 1 | 
 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 
 3 | ## enable compilation with C++11 (where available)
 4 | ##
 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider
 6 | ## availability of the package we do not yet enforce this here.  It is however
 7 | ## recommended for client packages to set it.
 8 | ##
 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP
10 | ## support within Armadillo prefers / requires it
11 | CXX_STD = CXX11
12 | 
13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 
14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
15 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
  1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
  2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | #include <RcppArmadillo.h>
  5 | #include <Rcpp.h>
  6 | 
  7 | using namespace Rcpp;
  8 | 
  9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
 10 | Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
 11 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
 12 | #endif
 13 | 
 14 | // matrix_multiplication_cpp
 15 | arma::mat matrix_multiplication_cpp(arma::mat A, arma::mat B);
 16 | RcppExport SEXP _CALDER_matrix_multiplication_cpp(SEXP ASEXP, SEXP BSEXP) {
 17 | BEGIN_RCPP
 18 |     Rcpp::RObject rcpp_result_gen;
 19 |     Rcpp::RNGScope rcpp_rngScope_gen;
 20 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
 21 |     Rcpp::traits::input_parameter< arma::mat >::type B(BSEXP);
 22 |     rcpp_result_gen = Rcpp::wrap(matrix_multiplication_cpp(A, B));
 23 |     return rcpp_result_gen;
 24 | END_RCPP
 25 | }
 26 | // matrix_multiplication_sym_cpp
 27 | arma::mat matrix_multiplication_sym_cpp(arma::mat A);
 28 | RcppExport SEXP _CALDER_matrix_multiplication_sym_cpp(SEXP ASEXP) {
 29 | BEGIN_RCPP
 30 |     Rcpp::RObject rcpp_result_gen;
 31 |     Rcpp::RNGScope rcpp_rngScope_gen;
 32 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
 33 |     rcpp_result_gen = Rcpp::wrap(matrix_multiplication_sym_cpp(A));
 34 |     return rcpp_result_gen;
 35 | END_RCPP
 36 | }
 37 | // loglik_lnorm_cpp
 38 | double loglik_lnorm_cpp(double sum_ln1, double sum_ln2, double p, double q);
 39 | RcppExport SEXP _CALDER_loglik_lnorm_cpp(SEXP sum_ln1SEXP, SEXP sum_ln2SEXP, SEXP pSEXP, SEXP qSEXP) {
 40 | BEGIN_RCPP
 41 |     Rcpp::RObject rcpp_result_gen;
 42 |     Rcpp::RNGScope rcpp_rngScope_gen;
 43 |     Rcpp::traits::input_parameter< double >::type sum_ln1(sum_ln1SEXP);
 44 |     Rcpp::traits::input_parameter< double >::type sum_ln2(sum_ln2SEXP);
 45 |     Rcpp::traits::input_parameter< double >::type p(pSEXP);
 46 |     Rcpp::traits::input_parameter< double >::type q(qSEXP);
 47 |     rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp(sum_ln1, sum_ln2, p, q));
 48 |     return rcpp_result_gen;
 49 | END_RCPP
 50 | }
 51 | // loglik_lnorm_cpp_vec
 52 | double loglik_lnorm_cpp_vec(arma::vec vec_values);
 53 | RcppExport SEXP _CALDER_loglik_lnorm_cpp_vec(SEXP vec_valuesSEXP) {
 54 | BEGIN_RCPP
 55 |     Rcpp::RObject rcpp_result_gen;
 56 |     Rcpp::RNGScope rcpp_rngScope_gen;
 57 |     Rcpp::traits::input_parameter< arma::vec >::type vec_values(vec_valuesSEXP);
 58 |     rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp_vec(vec_values));
 59 |     return rcpp_result_gen;
 60 | END_RCPP
 61 | }
 62 | // get_A_len
 63 | arma::mat get_A_len(arma::mat A);
 64 | RcppExport SEXP _CALDER_get_A_len(SEXP ASEXP) {
 65 | BEGIN_RCPP
 66 |     Rcpp::RObject rcpp_result_gen;
 67 |     Rcpp::RNGScope rcpp_rngScope_gen;
 68 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
 69 |     rcpp_result_gen = Rcpp::wrap(get_A_len(A));
 70 |     return rcpp_result_gen;
 71 | END_RCPP
 72 | }
 73 | // get_A_ln1
 74 | arma::mat get_A_ln1(arma::mat A);
 75 | RcppExport SEXP _CALDER_get_A_ln1(SEXP ASEXP) {
 76 | BEGIN_RCPP
 77 |     Rcpp::RObject rcpp_result_gen;
 78 |     Rcpp::RNGScope rcpp_rngScope_gen;
 79 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
 80 |     rcpp_result_gen = Rcpp::wrap(get_A_ln1(A));
 81 |     return rcpp_result_gen;
 82 | END_RCPP
 83 | }
 84 | // get_A_ln2
 85 | arma::mat get_A_ln2(arma::mat A);
 86 | RcppExport SEXP _CALDER_get_A_ln2(SEXP ASEXP) {
 87 | BEGIN_RCPP
 88 |     Rcpp::RObject rcpp_result_gen;
 89 |     Rcpp::RNGScope rcpp_rngScope_gen;
 90 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
 91 |     rcpp_result_gen = Rcpp::wrap(get_A_ln2(A));
 92 |     return rcpp_result_gen;
 93 | END_RCPP
 94 | }
 95 | // loglik_lnorm_cpp_mat
 96 | arma::mat loglik_lnorm_cpp_mat(arma::mat sum_ln1, arma::mat sum_ln2, arma::mat ps, arma::mat qs);
 97 | RcppExport SEXP _CALDER_loglik_lnorm_cpp_mat(SEXP sum_ln1SEXP, SEXP sum_ln2SEXP, SEXP psSEXP, SEXP qsSEXP) {
 98 | BEGIN_RCPP
 99 |     Rcpp::RObject rcpp_result_gen;
100 |     Rcpp::RNGScope rcpp_rngScope_gen;
101 |     Rcpp::traits::input_parameter< arma::mat >::type sum_ln1(sum_ln1SEXP);
102 |     Rcpp::traits::input_parameter< arma::mat >::type sum_ln2(sum_ln2SEXP);
103 |     Rcpp::traits::input_parameter< arma::mat >::type ps(psSEXP);
104 |     Rcpp::traits::input_parameter< arma::mat >::type qs(qsSEXP);
105 |     rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp_mat(sum_ln1, sum_ln2, ps, qs));
106 |     return rcpp_result_gen;
107 | END_RCPP
108 | }
109 | // zigzag_loglik_ancestors_v4_5
110 | List zigzag_loglik_ancestors_v4_5(arma::mat A, int k, int min_n_bins);
111 | RcppExport SEXP _CALDER_zigzag_loglik_ancestors_v4_5(SEXP ASEXP, SEXP kSEXP, SEXP min_n_binsSEXP) {
112 | BEGIN_RCPP
113 |     Rcpp::RObject rcpp_result_gen;
114 |     Rcpp::RNGScope rcpp_rngScope_gen;
115 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
116 |     Rcpp::traits::input_parameter< int >::type k(kSEXP);
117 |     Rcpp::traits::input_parameter< int >::type min_n_bins(min_n_binsSEXP);
118 |     rcpp_result_gen = Rcpp::wrap(zigzag_loglik_ancestors_v4_5(A, k, min_n_bins));
119 |     return rcpp_result_gen;
120 | END_RCPP
121 | }
122 | // compute_L
123 | List compute_L(arma::mat A, arma::mat L, int k);
124 | RcppExport SEXP _CALDER_compute_L(SEXP ASEXP, SEXP LSEXP, SEXP kSEXP) {
125 | BEGIN_RCPP
126 |     Rcpp::RObject rcpp_result_gen;
127 |     Rcpp::RNGScope rcpp_rngScope_gen;
128 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
129 |     Rcpp::traits::input_parameter< arma::mat >::type L(LSEXP);
130 |     Rcpp::traits::input_parameter< int >::type k(kSEXP);
131 |     rcpp_result_gen = Rcpp::wrap(compute_L(A, L, k));
132 |     return rcpp_result_gen;
133 | END_RCPP
134 | }
135 | 
136 | static const R_CallMethodDef CallEntries[] = {
137 |     {"_CALDER_matrix_multiplication_cpp", (DL_FUNC) &_CALDER_matrix_multiplication_cpp, 2},
138 |     {"_CALDER_matrix_multiplication_sym_cpp", (DL_FUNC) &_CALDER_matrix_multiplication_sym_cpp, 1},
139 |     {"_CALDER_loglik_lnorm_cpp", (DL_FUNC) &_CALDER_loglik_lnorm_cpp, 4},
140 |     {"_CALDER_loglik_lnorm_cpp_vec", (DL_FUNC) &_CALDER_loglik_lnorm_cpp_vec, 1},
141 |     {"_CALDER_get_A_len", (DL_FUNC) &_CALDER_get_A_len, 1},
142 |     {"_CALDER_get_A_ln1", (DL_FUNC) &_CALDER_get_A_ln1, 1},
143 |     {"_CALDER_get_A_ln2", (DL_FUNC) &_CALDER_get_A_ln2, 1},
144 |     {"_CALDER_loglik_lnorm_cpp_mat", (DL_FUNC) &_CALDER_loglik_lnorm_cpp_mat, 4},
145 |     {"_CALDER_zigzag_loglik_ancestors_v4_5", (DL_FUNC) &_CALDER_zigzag_loglik_ancestors_v4_5, 3},
146 |     {"_CALDER_compute_L", (DL_FUNC) &_CALDER_compute_L, 3},
147 |     {NULL, NULL, 0}
148 | };
149 | 
150 | RcppExport void R_init_CALDER(DllInfo *dll) {
151 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
152 |     R_useDynamicSymbols(dll, FALSE);
153 | }
154 | 


--------------------------------------------------------------------------------
/src/matrix_multiplication_fun.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | 	//http://arma.sourceforge.net/docs.html#stats_fns
 3 | 	#include "RcppArmadillo.h"
 4 | 	using namespace Rcpp;
 5 | 	
 6 | 	// [[Rcpp::export]]
 7 | 	arma::mat matrix_multiplication_cpp(arma::mat A, arma::mat B)
 8 | 	{
 9 | 		return A*B;
10 | 	}
11 | 
12 | 	// [[Rcpp::export]]
13 | 	arma::mat matrix_multiplication_sym_cpp(arma::mat A)
14 | 	{
15 | 		return A*A;
16 | 	}
17 | 
18 | 	
19 | 	
20 | 


--------------------------------------------------------------------------------
/src/zigzag_loglik_ancestors_v4.5.cpp:
--------------------------------------------------------------------------------
  1 |     // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; indent-tabs-mode: nil; -*-
  2 | 
  3 |     #include "RcppArmadillo.h"
  4 |     using namespace Rcpp;
  5 |     using namespace std;
  6 |     // #define M_PI  3.141592653589793238462643383280    /* pi */                    
  7 | 
  8 |     // [[Rcpp::export]]
  9 |     double loglik_lnorm_cpp( double sum_ln1, double sum_ln2, double p, double q ) // several times faster than fitdistr
 10 |     {
 11 |         if( sum_ln2 < 0 ) cout << "sum_ln2 not valid in loglik_lnorm_cpp\n";
 12 |         if( p < 0 ) cout << "p not valid in loglik_lnorm_cpp\n";
 13 |         if( q < 0 ) cout << "q not valid in loglik_lnorm_cpp\n";
 14 |             
 15 |         if( q <=1 ) return 0; // p = sum(x==0): the number of zero-values
 16 |         double alpha = 1.0*p/(p+q);
 17 |         double mu = sum_ln1/q;
 18 | 
 19 |         double sigma2 = sum_ln2/q + pow(mu, 2) - 2*mu*sum_ln1/q; // %: element-wise product 
 20 |         if( abs(sigma2) <=1E-10 ) 
 21 |         {
 22 |             return 0;
 23 |         }
 24 |         double loglik = -sum_ln1 - 0.5*q*(log(2*M_PI) + log(sigma2)) - (sum_ln2 + q*pow(mu, 2) - 2*mu*sum_ln1)/2/sigma2; 
 25 |         if( p==0 ) {return loglik;} else {return p*log(alpha) + q*log(1-alpha) + loglik;}
 26 |     }
 27 | 
 28 |     // [[Rcpp::export]]
 29 |     double loglik_lnorm_cpp_vec( arma::vec vec_values ) //log-likelihood of vec_values
 30 |     {
 31 |         int p, q;
 32 |         int n = vec_values.n_elem;
 33 |         if( n < 2 ) return 0; // added on 19-07-2018
 34 | 
 35 |         double sum_ln1, sum_ln2;
 36 |         arma::vec positive_vec = vec_values.elem(find(vec_values > 0));
 37 |         q = positive_vec.n_elem; // the number of positive values
 38 |         p = n - q; // the number of zeros
 39 |         if( q <= 1 ) return 0;
 40 |         
 41 |         sum_ln1 = sum(log(positive_vec));
 42 |         sum_ln2 = sum(pow(log(positive_vec), 2));
 43 |         
 44 |         return loglik_lnorm_cpp( sum_ln1, sum_ln2, p, q );
 45 |     }
 46 |     
 47 |     // [[Rcpp::export]]
 48 |     arma::mat get_A_len(arma::mat A) // get matrix A_len: A_len := A*(A>0). This is used for computing the number of positive elements in a rectangle region
 49 |     {
 50 |         int n_row = A.n_rows;
 51 |         arma::mat A_len=arma::zeros<arma::mat>(n_row, n_row); // for test
 52 |         arma::uvec ids = find(A > 0);
 53 |         arma::vec new_values = arma::ones<arma::vec>(ids.n_elem);
 54 |         A_len.elem(ids) = new_values;
 55 |         return A_len;
 56 |     }
 57 |     
 58 |     // [[Rcpp::export]]
 59 |     arma::mat get_A_ln1(arma::mat A) // log(A_ij)
 60 |     {
 61 |         int n_row = A.n_rows;
 62 |         arma::mat A_ln1=arma::zeros<arma::mat>(n_row, n_row); // for test
 63 |         arma::uvec ids = find(A > 0);
 64 |         arma::vec new_values = log(A.elem(ids));
 65 |         A_ln1.elem(ids) = new_values;
 66 |         return A_ln1;
 67 |     }
 68 |     
 69 |     // [[Rcpp::export]]
 70 |     arma::mat get_A_ln2(arma::mat A) // log(A_ij)^2
 71 |     {
 72 |         int n_row = A.n_rows;
 73 |         arma::mat A_ln2=arma::zeros<arma::mat>(n_row, n_row); // for test
 74 |         arma::uvec ids = find(A > 0);
 75 |         arma::vec new_values = pow(log(A.elem(ids)), 2);
 76 |         A_ln2.elem(ids) = new_values;
 77 |         return A_ln2;
 78 |     }
 79 |  
 80 |      // compute the loglik matrix   
 81 |     // [[Rcpp::export]]
 82 |     arma::mat loglik_lnorm_cpp_mat( arma::mat sum_ln1, arma::mat sum_ln2, arma::mat ps, arma::mat qs ) // several times faster than fitdistr
 83 |     {
 84 |         int n_row = sum_ln1.n_rows;
 85 |         int n_col = sum_ln1.n_cols;
 86 |         arma::mat loglik(n_row, n_col);
 87 |         for(int i=0; i<n_row; i++)
 88 |             for(int j=0; j<n_col; j++)
 89 |                 loglik(i,j) = loglik_lnorm_cpp( sum_ln1(i,j), sum_ln2(i,j), ps(i,j), qs(i,j) );
 90 |         return loglik;
 91 |     }
 92 |     
 93 |     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 94 |     // compute the blockwise sum
 95 |     // [[Rcpp::export]] // use the same version name v4 instead of v4_5
 96 |     List zigzag_loglik_ancestors_v4_5( arma::mat A, int k, int min_n_bins=2 ){
 97 |         int n_row = A.n_rows;
 98 |         // int min_n_bins = 2;
 99 |         int p, q, max_mid_index, max_mid;
100 |         arma::mat ps, loglik_tmp, loglik;
101 |         arma::vec n_cells;
102 |         arma::mat A_len = get_A_len(A); // A_len = A*(A > 0)
103 |         arma::mat A_ln1 = get_A_ln1(A); // log(A_ij)
104 |         arma::mat A_ln2 = get_A_ln2(A); // log(A_ij)^2
105 |         StringMatrix ancestors(n_row, n_row);
106 |         arma::mat L=arma::zeros<arma::mat>(n_row, n_row); // for test
107 |         Rcpp::List res;
108 | 
109 |         for( int k= min_n_bins-1; k<=(2*min_n_bins - 2); k++ ) // other values of L are 0
110 |         {
111 |             for( int v=1; v<= (n_row - k); v++ )
112 |             {
113 |                 arma::mat tmp_mat=A.submat(arma::span(v-1, v+k-1), arma::span(v-1, v+k-1)); // span(0,1) := 1:2 in R
114 |                 arma::vec upper_tri_vec = tmp_mat.elem(find(trimatu(tmp_mat)));
115 |                 L(v-1, v-1+k) = loglik_lnorm_cpp_vec( upper_tri_vec );
116 |             }
117 |         } // Checked to be the same in R
118 |         // cout << "Finished initialize L\n";
119 |         
120 |         // initialize the rad_mat as the first off-diagonal values
121 |         arma::mat rad_mat_current_ln1(n_row-1, 1);
122 |         arma::mat rad_mat_current_ln2(n_row-1, 1);
123 |         arma::mat rad_mat_current_len(n_row-1, 1);
124 |         
125 |         for(int i=0; i<(n_row-1); i++) // The first off-diagonal values
126 |         {
127 |             rad_mat_current_ln1(i, 0) = A_ln1(i, i+1);
128 |             rad_mat_current_ln2(i, 0) = A_ln2(i, i+1);
129 |             rad_mat_current_len(i, 0) = A_len(i, i+1);
130 |         }
131 |         
132 |         // initialized to be two vertical cells (2 rows)
133 |         arma::mat vertical_columns_next_ln1(2, n_row-2);
134 |         arma::mat vertical_columns_next_ln2(2, n_row-2);
135 |         arma::mat vertical_columns_next_len(2, n_row-2);
136 | 
137 |         for(int i=0; i<(n_row-2); i++) 
138 |         {
139 |             vertical_columns_next_ln1.col(i) = A_ln1( arma::span(i, i+1), i+2 );
140 |             vertical_columns_next_ln2.col(i) = A_ln2( arma::span(i, i+1), i+2 );
141 |             vertical_columns_next_len.col(i) = A_len( arma::span(i, i+1), i+2 );
142 |         }
143 |         
144 |         for(int i=1; i<2; i++) // cumsum of the two vertical cells (i=1:1)
145 |         {
146 |             vertical_columns_next_ln1.row(i) = vertical_columns_next_ln1.row(i) + vertical_columns_next_ln1.row(i-1); //cumsum
147 |             vertical_columns_next_ln2.row(i) = vertical_columns_next_ln2.row(i) + vertical_columns_next_ln2.row(i-1); //cumsum
148 |             vertical_columns_next_len.row(i) = vertical_columns_next_len.row(i) + vertical_columns_next_len.row(i-1); //cumsum
149 |         }
150 |         
151 |         arma::mat rad_mat_next_ln1 = rad_mat_current_ln1;
152 |         arma::mat rad_mat_next_ln2 = rad_mat_current_ln2;
153 |         arma::mat rad_mat_next_len = rad_mat_current_len;
154 | 
155 |         arma::mat vertical_columns_current_ln1 = vertical_columns_next_ln1; // this line just create the vertical_columns_current_ln
156 |         arma::mat vertical_columns_current_ln2 = vertical_columns_next_ln2; // this line just create the vertical_columns_current_ln
157 |         arma::mat vertical_columns_current_len = vertical_columns_next_len; // this line just create the vertical_columns_current_ln
158 | 
159 |         // Rcout << L << "\n";
160 |         // cout << "Begin iteration:\n";
161 |         
162 |         // time complexity of this part: n^3
163 |         // for(int shift=3; shift<=n_row; shift++)
164 |         // each row of rad_mat represent one off-diagonal point
165 |         for(int shift=3; shift<=k; shift++)
166 |         {
167 |             rad_mat_current_ln1 = rad_mat_next_ln1;
168 |             rad_mat_current_ln2 = rad_mat_next_ln2;
169 |             rad_mat_current_len = rad_mat_next_len;
170 | 
171 |             rad_mat_next_ln1 = arma::mat( n_row-shift+1, shift-1);
172 |             rad_mat_next_ln2 = arma::mat( n_row-shift+1, shift-1);
173 |             rad_mat_next_len = arma::mat( n_row-shift+1, shift-1);
174 |             
175 |             n_cells = arma::zeros<arma::vec>(shift-1); // size of each rectangle
176 |             for( int i=0; i< (shift-1); i++ ) n_cells(i) = (i+1)*(shift-i-1); 
177 |             arma::mat rad_mat_next_len_all = (arma::ones<arma::mat>(n_row-shift+1, shift-1))*(arma::diagmat(n_cells)); // In R: rep(vec, n_row times) // Schur product: element-wise multiplication of two objects
178 |             
179 |             for(int i=1; i<=(n_row-shift+1); i++)
180 |             {
181 |                 // next = current + vertical_columns_next_ln values
182 |                 rad_mat_next_ln1.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_ln1( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_ln1( arma::span(0, shift-2-1), i-1 ).t();
183 |                 rad_mat_next_ln2.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_ln2( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_ln2( arma::span(0, shift-2-1), i-1 ).t();
184 |                 rad_mat_next_len.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_len( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_len( arma::span(0, shift-2-1), i-1 ).t();
185 | 
186 |                 rad_mat_next_ln1(i-1, shift-1-1) = vertical_columns_next_ln1(shift-1-1, i-1); // the last new element
187 |                 rad_mat_next_ln2(i-1, shift-1-1) = vertical_columns_next_ln2(shift-1-1, i-1); // the last new element
188 |                 rad_mat_next_len(i-1, shift-1-1) = vertical_columns_next_len(shift-1-1, i-1); // the last new element
189 |             }
190 |             
191 | 
192 |             
193 |             ////////////////////////////////////// compute the vertical_columns_next values
194 |             if(shift < n_row) //stop when shift=n
195 |             {
196 |                 vertical_columns_current_ln1 = vertical_columns_next_ln1;
197 |                 vertical_columns_current_ln2 = vertical_columns_next_ln2;
198 |                 vertical_columns_current_len = vertical_columns_next_len;
199 | 
200 |                 arma::mat first_row_ln1(1, n_row-shift);
201 |                 arma::mat first_row_ln2(1, n_row-shift);
202 |                 arma::mat first_row_len(1, n_row-shift);
203 | 
204 |                 for(int i=0; i<(n_row-shift); i++) 
205 |                 {
206 |                     first_row_ln1(0, i) = A_ln1(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln
207 |                     first_row_ln2(0, i) = A_ln2(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln
208 |                     first_row_len(0, i) = A_len(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln
209 |                 }
210 |                 
211 |                 vertical_columns_next_ln1 = vertical_columns_current_ln1.submat(0, 1, shift-2, n_row-shift); // drop the first column
212 |                 vertical_columns_next_ln2 = vertical_columns_current_ln2.submat(0, 1, shift-2, n_row-shift); // drop the first column
213 |                 vertical_columns_next_len = vertical_columns_current_len.submat(0, 1, shift-2, n_row-shift); // drop the first column
214 | 
215 |                 vertical_columns_next_ln1 = arma::join_cols(first_row_ln1, vertical_columns_next_ln1);
216 |                 vertical_columns_next_ln2 = arma::join_cols(first_row_ln2, vertical_columns_next_ln2);
217 |                 vertical_columns_next_len = arma::join_cols(first_row_len, vertical_columns_next_len);
218 | 
219 |                 for(int i=1; i<shift; i++) 
220 |                 {
221 |                     vertical_columns_next_ln1.row(i) = vertical_columns_next_ln1.row(i) + first_row_ln1; // cumsum
222 |                     vertical_columns_next_ln2.row(i) = vertical_columns_next_ln2.row(i) + first_row_ln2; // cumsum
223 |                     vertical_columns_next_len.row(i) = vertical_columns_next_len.row(i) + first_row_len; // cumsum
224 |                 }   
225 |             }
226 |             
227 |             ////////////////////////////////////// compute the L and ancestors
228 |             // if(shift >= 4)
229 |             if(shift >= 2*min_n_bins) //            
230 |             {
231 |                 ps = rad_mat_next_len_all - rad_mat_next_len; // number of positive values
232 |                 loglik_tmp = loglik_lnorm_cpp_mat( rad_mat_next_ln1, rad_mat_next_ln2, ps, rad_mat_next_len );
233 |                 // loglik = loglik_tmp.submat(0, 1, n_row-shift, shift-2-1); // remove first and last col because of the min_n_bins. SHOULD BE MODIFIED. submat: X.submat( first_row, first_col, last_row, last_col ), http://arma.sourceforge.net/docs.html#submat
234 |                 loglik = loglik_tmp.submat(0, min_n_bins-1, n_row-shift, shift-2-(min_n_bins-1)); // 2018-11-14, remove first and last min_n_bins-1 cols because of the min_n_bins. SHOULD BE MODIFIED. submat: X.submat( first_row, first_col, last_row, last_col ), http://arma.sourceforge.net/docs.html#submat
235 | 
236 |                 
237 |                 arma::mat cases(1, shift-2*min_n_bins+1); // shift=5: 1:2, i.e., two cases
238 |                 // arma::mat loglik(n_row-shift, shift-min_n_bins);
239 |                 for( int row=1; row<=(n_row-shift+1); row++ )
240 |                 {
241 |                     p = row; //7
242 |                     q = row + shift; // 7 + 4 = 11
243 |                     cases = loglik( p-1, arma::span(0, shift-2*min_n_bins) ) + L(p-1, arma::span(p-1-1 + min_n_bins, q - min_n_bins-1-1) ) + (L(arma::span(p-1 + min_n_bins, q - min_n_bins-1), q-1-1)).t();
244 |                     L(p-1, q-1-1) = cases.max();
245 | 
246 |                     max_mid_index = cases.index_max() + 1; //The c++ offset 1
247 |                     max_mid = (p-1 + min_n_bins) + max_mid_index -1; // should minus one
248 |                     // ancestor = paste(i, max_mid, max_mid+1, j, sep='-')
249 |                     ancestors(p-1, q-1-1) = to_string(p) + "-" + to_string(max_mid) + "-" + to_string(max_mid+1) + "-" + to_string(q-1);
250 |                     // cout << "cases:" << L(p-1, q-1-1) << "\n";
251 |                 }
252 |             }
253 |             
254 |         }
255 |         
256 |         res["ancestors"] = ancestors;
257 |         res["L"] = L;
258 |         return res;
259 |     }
260 |     
261 |     // compute the ancestors
262 |     // [[Rcpp::export]]
263 |     List compute_L( arma::mat A, arma::mat L, int k ) // A seems not needed here, and can be removed, Y.L, 2018-11-14 (Indeed this part is not used)
264 |     {
265 |         int n_row = A.n_rows;
266 |         int min_n_bins = 2;
267 |         StringMatrix ancestors(n_row, n_row);
268 |     
269 |         // should rewrite this part
270 |         // for( int i= min_n_bins-1; i<=(2*min_n_bins - 2); i++ )
271 |             // for( int j=i; j<n_row; j++ ) L(j-i, j) = 1;
272 |         
273 |         for( int shift=(2*min_n_bins - 1); shift<=(n_row-1); shift++ )  
274 |         {
275 |             int i, j, max_mid_index, max_mid;
276 |             arma::mat cases(1, shift-2*min_n_bins+1);
277 |             arma::mat prob_vec(n_row-shift, shift-min_n_bins);
278 |             for( int row=1; row<=(n_row-shift); row++ )
279 |             {
280 |                     i = row;
281 |                     j = row + shift;
282 |                     cases = prob_vec( i-1, arma::span(0, j-2*min_n_bins-i+1) ) + L(i-1, arma::span(i-1-1 + min_n_bins, j - min_n_bins-1) ) + (L(arma::span(i + min_n_bins-1, j - min_n_bins), j-1)).t();
283 |                     L(i-1, j-1) = cases.max();
284 |                     max_mid_index = cases.index_max() + 1; //The c++ offset 1
285 |                     max_mid = (i-1 + min_n_bins) + max_mid_index -1; // should minus one
286 |                     // ancestor = paste(i, max_mid, max_mid+1, j, sep='-')
287 |                     ancestors(i-1, j-1) = to_string(i) + "-" + to_string(max_mid) + "-" + to_string(max_mid+1) + "-" + to_string(j); // This 'to_string' may be optimized 
288 |             }
289 |         }
290 |     return List::create(Named("L") = L, Named("ancestors") = ancestors);    
291 |     }
292 |     
293 | 


--------------------------------------------------------------------------------
/tests/create_test_data.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | create_test_cool(){
 5 | 	url=$1
 6 | 	outpath=$2
 7 | 	binsize=$3
 8 | 
 9 | 	if [[ -f ${outpath}/test.cool ]]; then return; fi
10 | 	if [[ ! -f ${outpath}/source.pairs.gz ]] 
11 | 	then
12 | 		echo "[Error] Please manually download the file ${url} to ${outpath}"
13 | 		echo "        and give it source.pairs.gz as name"
14 | 		exit -1
15 | 	fi
16 | 
17 | 	zcat ${outpath}/source.pairs.gz | head -26 | tail -2 | cut -d' ' --output-delimiter=$'\t' -f 2,3 > ${outpath}/test.chrom.sizes
18 | 	cooler cload pairs --chrom1 2 --pos1 3 --chrom2 4 --pos2 5 \
19 | 						${outpath}/test.chrom.sizes:${binsize} \
20 | 						${outpath}/source.pairs.gz \
21 | 						${outpath}/test.cool
22 | 	rm ${outpath}/source.pairs.gz
23 | 
24 | 	cooler balance --force --max-iters 1000 ${outpath}/test.cool
25 | }
26 | 
27 | 
28 | create_test_mcool(){
29 | 	cool_path=$1
30 | 	outpath=$2
31 | 	binsize=$3
32 | 
33 | 	if [[ ! -f ${outpath} ]]
34 | 	then
35 | 		cooler zoomify --resolutions ${binsize}N \
36 | 					   --balance \
37 | 					   --balance-args '--force --max-iters 1000' \
38 | 					   --out ${outpath} ${cool_path}
39 | 	fi
40 | }
41 | 
42 | 
43 | test_data_path="tests/testthat/data"
44 | 
45 | mkdir -p ${test_data_path}
46 | 
47 | # Test cool file
48 | source_cool_file="https://data.4dnucleome.org/files-processed/4DNFI2EK1IOQ/@@download/4DNFI2EK1IOQ.pairs.gz"
49 | test_cool_binsize=50000
50 | 
51 | create_test_cool ${source_cool_file} ${test_data_path} ${test_cool_binsize}
52 | 
53 | # Creating MCOOL from COOL
54 | create_test_mcool ${test_data_path}/test.cool ${test_data_path}/test.mcool ${test_cool_binsize}


--------------------------------------------------------------------------------
/tests/test_cmd_cool.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | mkdir -p "tests/output"
 4 | 
 5 | scripts/calder --input tests/data/test.cool \
 6 | 			   --type cool \
 7 | 			   --bin_size 50000 \
 8 | 			   --genome hg38 \
 9 | 			   --nproc 10 \
10 | 			   --outpath "tests/output/test_cmd_cool_out"


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(CALDER)
3 | 
4 | test_check("CALDER")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/data/test.chrom.sizes:
--------------------------------------------------------------------------------
1 | chr21	46709983
2 | chr22	50818468
3 | 


--------------------------------------------------------------------------------
/tests/testthat/data/test.cool:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/tests/testthat/data/test.cool


--------------------------------------------------------------------------------
/tests/testthat/data/test.mcool:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/tests/testthat/data/test.mcool


--------------------------------------------------------------------------------
/tests/testthat/test-main.R:
--------------------------------------------------------------------------------
 1 | 
 2 | CALDER_CLI=file.path(system.file("scripts", package="CALDER"), "calder")
 3 | 
 4 | 
 5 | test_that("CALDER works with cool files", {
 6 |     input_cool_path <- file.path(testthat::test_path("data"), "test.cool")
 7 |     output_path <- testthat::test_path("test-main-cool-out")
 8 | 
 9 |     CMD = paste0(CALDER_CLI, " -i ", input_cool_path, " -t cool -g hg38 -o ", output_path)
10 |     system(CMD)
11 | 
12 |     expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.bed"), name = "TestCool_all_sub_compartments.bed")
13 |     expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.tsv"), name = "TestCool_all_sub_compartments.tsv")
14 |     expect_snapshot_file(file.path(output_path, "sub_domains", "all_nested_boundaries.bed"), name = "TestCool_all_nested_boundaries.bed")
15 |     unlink(output_path, recursive=TRUE)
16 | })
17 | 
18 | 
19 | test_that("CALDER works with dumps", {
20 | 
21 |     chrs = c(21:22)
22 | 
23 |     ## demo contact matrices in dump format
24 |     contact_file_dump = as.list(system.file("extdata", sprintf("mat_chr%s_10kb_ob.txt.gz", chrs),
25 |                 package='CALDER'))
26 |     names(contact_file_dump) = chrs
27 |     output_path <- testthat::test_path("test-main-dump-out")
28 |     ## Run CALDER to compute compartments but not nested sub-domains
29 |     CALDER(contact_file_dump=contact_file_dump, 
30 |             chrs=chrs, 
31 |             bin_size=10E3,
32 |             genome='hg19',
33 |             save_dir=output_path,
34 |             save_intermediate_data=FALSE,
35 |             n_cores=2,
36 |             sub_domains=FALSE)
37 |     expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.bed"), name = "TestDump_all_sub_compartments.bed")
38 |     expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.tsv"), name = "TestDump_all_sub_compartments.tsv")
39 |     unlink(output_path, recursive=TRUE)
40 | })
41 | 
42 | test_that("CALDER works with custom feature track", {
43 |     input_cool_path <- file.path(testthat::test_path("data"), "test.cool")
44 |     feature_track_path <- file.path(testthat::test_path("data"), "test_gene_coverage.bed")
45 |     output_path <- testthat::test_path("test-main-featuretrack-out")
46 | 
47 |     CMD = paste0(CALDER_CLI, " -i ", input_cool_path, " -t cool -g hg38 -o ", output_path, " -f ", feature_track_path)
48 |     system(CMD)
49 | 
50 |     expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.bed"), name = "TestFeatureTrack_all_sub_compartments.bed")
51 |     expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.tsv"), name = "TestFeatureTrack_all_sub_compartments.tsv")
52 |     expect_snapshot_file(file.path(output_path, "sub_domains", "all_nested_boundaries.bed"), name = "TestFeatureTrack_all_nested_boundaries.bed")
53 |     unlink(output_path, recursive=TRUE)
54 | })


--------------------------------------------------------------------------------