├── .DS_Store
├── .gitattributes
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── R
    ├── CALDER_hierarchy.R
    ├── CALDER_main.R
    ├── HRG_MLE_each_compartment_fun.R
    ├── HighResolution2Low.R
    ├── LikelihoodRatioTest_fun.R
    ├── RcppExports.R
    ├── bisecting_kmeans.R
    ├── call_domains.R
    ├── compartment_PCA.R
    ├── compartment_data_generation_fun.R
    ├── general_functions.R
    ├── generate_compartments_bed_fun.R
    ├── post_processing_nested_domains.R
    ├── prunning.R
    └── zigzag_nested_domain.R
├── README.md
├── Read-and-delete-me
├── img
    ├── CALDER_methods.png
    ├── Hela_chr11_and_RWPE1_chr9_pq.png
    └── IGV_results.png
├── inst
    ├── .DS_Store
    └── extdata
    │   ├── TxDb.Hsapiens.UCSC.hg19.knownGene.rds
    │   ├── TxDb.Mmusculus.UCSC.mm9.knownGene.rds
    │   ├── mat_chr22_10kb_ob.txt.gz
    │   └── mat_mm9_chr1_100kb_ob.txt.gz
├── man
    ├── .Rapp.history
    └── CALDER-package.Rd
└── src
    ├── .DS_Store
    ├── Makevars
    ├── Makevars.win
    ├── RcppExports.cpp
    ├── matrix_multiplication_fun.cpp
    └── zigzag_loglik_ancestors_v4.5.cpp


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/.DS_Store


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | total_execution.time
 3 | 
 4 | # Created by https://www.toptal.com/developers/gitignore/api/r,sublimetext
 5 | # Edit at https://www.toptal.com/developers/gitignore?templates=r,sublimetext
 6 | 
 7 | ### R ###
 8 | # History files
 9 | .Rhistory
10 | .Rapp.history
11 | 
12 | # Session Data files
13 | .RData
14 | .RDataTmp
15 | 
16 | # User-specific files
17 | .Ruserdata
18 | 
19 | # Example code in package build process
20 | *-Ex.R
21 | 
22 | # Output files from R CMD build
23 | /*.tar.gz
24 | 
25 | # Output files from R CMD check
26 | /*.Rcheck/
27 | 
28 | # RStudio files
29 | .Rproj.user/
30 | 
31 | # produced vignettes
32 | vignettes/*.html
33 | vignettes/*.pdf
34 | 
35 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
36 | .httr-oauth
37 | 
38 | # knitr and R markdown default cache directories
39 | *_cache/
40 | /cache/
41 | 
42 | # Temporary files created by R markdown
43 | *.utf8.md
44 | *.knit.md
45 | 
46 | # R Environment Variables
47 | .Renviron
48 | 
49 | # pkgdown site
50 | docs/
51 | 
52 | # translation temp files
53 | po/*~
54 | 
55 | # RStudio Connect folder
56 | rsconnect/
57 | 
58 | ### R.Bookdown Stack ###
59 | # R package: bookdown caching files
60 | /*_files/
61 | 
62 | ### SublimeText ###
63 | # Cache files for Sublime Text
64 | *.tmlanguage.cache
65 | *.tmPreferences.cache
66 | *.stTheme.cache
67 | 
68 | # Workspace files are user-specific
69 | *.sublime-workspace
70 | 
71 | # Project files should be checked into the repository, unless a significant
72 | # proportion of contributors will probably not be using Sublime Text
73 | # *.sublime-project
74 | 
75 | # SFTP configuration file
76 | sftp-config.json
77 | sftp-config-alt*.json
78 | 
79 | # Package control specific files
80 | Package Control.last-run
81 | Package Control.ca-list
82 | Package Control.ca-bundle
83 | Package Control.system-ca-bundle
84 | Package Control.cache/
85 | Package Control.ca-certs/
86 | Package Control.merged-ca-bundle
87 | Package Control.user-ca-bundle
88 | oscrypto-ca-bundle.crt
89 | bh_unicode_properties.cache
90 | 
91 | # Sublime-github package stores a github token in this file
92 | # https://packagecontrol.io/packages/sublime-github
93 | GitHub.sublime-settings
94 | 
95 | # End of https://www.toptal.com/developers/gitignore/api/r,sublimetext
96 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: CALDER
 2 | Type: Package
 3 | Title: CALDER: a Hi-C non-linear hierarchical organization analysis tool
 4 | Version: 1.0
 5 | Date: 2020-09-01
 6 | Author: Yuanlong LIU
 7 | Maintainer: Who to complain to <yliueagle@googlemail.com>
 8 | Description: CALDER is a Hi-C analysis tool that allows: (1) compute chromatin domains from whole chromosome contacts; (2) derive their non-linear hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each chromatin domain from short-range contacts
 9 | LinkingTo: Rcpp, RcppArmadillo
10 | Authors@R: c(
11 |     person(given = "Yuanlong",
12 |            family = "LIU",
13 |            role = c("aut", "cre"),
14 |            email = "yliueagle@googlemail.com"))
15 | URL: https://github.com/YuanlongLiu/CALDER/
16 | BugReports: https://github.com/YuanlongLiu/CALDER/issues
17 | License: MIT
18 | Encoding: UTF-8
19 | Depends:
20 |     R (>= 3.5.2)
21 | Imports: 
22 | 	R.utils (>= 2.9.0),
23 | 	doParallel (>= 1.0.15),
24 | 	ape (>= 5.3),
25 | 	dendextend (>= 1.12.0),
26 | 	fitdistrplus (>= 1.0.14),
27 | 	igraph (>= 1.2.4.1),
28 | 	Matrix (>= 1.2.17),
29 | 	rARPACK (>= 0.11.0),
30 | 	factoextra (>= 1.0.5),
31 | 	maptools (>= 0.9.5),
32 | 	data.table (>= 1.12.2),
33 | 	fields (>= 9.8.3),
34 | 	GenomicRanges (>= 1.36.0),
35 | 	strawr (>= 0.0.9)
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Yuanlong LIU
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | useDynLib(CALDER, .registration=TRUE)
2 | importFrom(Rcpp, evalCpp)
3 | exportPattern("^[[:alpha:]]+")
4 | 


--------------------------------------------------------------------------------
/R/CALDER_hierarchy.R:
--------------------------------------------------------------------------------
  1 | ############################################################
  2 | 
  3 | 
  4 | get_gene_info <- function(genome) {
  5 | 
  6 | 	if(genome == 'hg19') {
  7 | 		gene_info_file = system.file("extdata", "TxDb.Hsapiens.UCSC.hg19.knownGene.rds", package = 'CALDER')
  8 | 	} else if(genome == 'mm9') {
  9 | 		gene_info_file = system.file("extdata", "TxDb.Mmusculus.UCSC.mm9.knownGene.rds", package = 'CALDER')
 10 | 	} else {
 11 | 		stop(paste0("Unknown genome (", genome, ")"))
 12 | 	}
 13 | 
 14 | 	
 15 | 	gene_info = readRDS(gene_info_file)
 16 | }
 17 | 
 18 | 
 19 | CALDER_CD_hierarchy = function(contact_mat_file, 
 20 | 							   chr, 
 21 | 							   bin_size, 
 22 | 							   out_dir, 
 23 | 							   save_intermediate_data=FALSE,
 24 | 							   genome = 'hg19')
 25 | {
 26 |     time0 = Sys.time()
 27 |     log_file = paste0(out_dir, '/chr', chr, '_log.txt')
 28 | 
 29 |     cat('\n')
 30 | 
 31 |     cat('>>>> Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE)
 32 |     cat('>>>> Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n')
 33 |     processed_data = contact_mat_processing(contact_mat_file, bin_size=bin_size)
 34 |    
 35 |     A_oe = processed_data$A_oe
 36 |     ccA_oe_compressed_log_atanh = processed_data$atanh_score
 37 | 
 38 |     cat('\r', '>>>> Finish process contact matrix and compute correlation score at:', as.character(Sys.time()))
 39 |     cat('>>>> Finish process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
 40 | 
 41 |     p_thresh = ifelse(bin_size < 40000, 0.05, 1)
 42 |     window.sizes = 3
 43 |     compartments = vector("list", 2)
 44 |     chr_name = paste0("chr", chr)
 45 | 
 46 |     cat('>>>> Begin compute compartment domains and their hierachy at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
 47 |     cat('\r', '>>>> Begin compute compartment domains and their hierachy at:', as.character(Sys.time()))
 48 | 
 49 |     compartments[[2]] = generate_compartments_bed(chr = chr, bin_size = bin_size, window.sizes = window.sizes, ccA_oe_compressed_log_atanh, p_thresh, out_file_name = NULL, stat_window_size = NULL)
 50 |     topDom_output = compartments[[2]]
 51 |     bin_names = rownames(A_oe)
 52 |     A_oe = as.matrix(A_oe)
 53 |     initial_clusters = apply(topDom_output$domain[, c("from.id", "to.id")], 1, function(v) v[1]:v[2])
 54 | 
 55 |     if (sum(sapply(initial_clusters, length)) != max(unlist(initial_clusters))) {
 56 |         stop(CELL_LINE, " initial_clusters error in topDom")
 57 |     }
 58 | 
 59 |     n_clusters = length(initial_clusters)
 60 | 		A_oe_cluster_mean = HighResolution2Low_k_rectangle(A_oe, initial_clusters, initial_clusters, sum_or_mean = "mean")
 61 | 
 62 | 	trend_mean_list = lapply( 1:4, function(v) 1*(A_oe_cluster_mean[, -(1:v)] > A_oe_cluster_mean[, - n_clusters - 1 + (v:1)]) )
 63 | 	trend_mean = do.call(cbind, trend_mean_list)
 64 | 	c_trend_mean = cor(t(trend_mean))
 65 | 	atanh_c_trend_mean= atanh(c_trend_mean / (1+1E-7))
 66 | 
 67 | 
 68 | 	# if(to_scale)
 69 | 	{
 70 | 		trend_mean = scale(trend_mean)
 71 | 		c_trend_mean = scale(c_trend_mean)
 72 | 		atanh_c_trend_mean= scale(atanh_c_trend_mean)
 73 | 	}
 74 | 
 75 | 
 76 | 	PC_12_atanh = get_PCs(atanh_c_trend_mean, which=1:10)
 77 | 	PC_12_atanh[, 2:10] = PC_12_atanh[, 2:10]/5 ## xx-xx-xxxx: compress PC2
 78 | 	rownames(PC_12_atanh) = 1:nrow(PC_12_atanh)
 79 | 
 80 | 	############################################################
 81 | 	PC_direction = 1
 82 | 
 83 | 	gene_info <- get_gene_info(genome)
 84 | 
 85 | 	## switch PC direction based on gene density
 86 | 	{
 87 | 		initial_clusters_ori_bins = lapply(initial_clusters, function(v) as.numeric(bin_names[v]))
 88 | 		chr_bin_pc = data.table::data.table(chr=chr_name, bin=unlist(initial_clusters_ori_bins), PC1_val=rep(PC_12_atanh[,1], sapply(initial_clusters_ori_bins, length)))
 89 | 		chr_bin_pc$start = (chr_bin_pc$bin - 1)*bin_size + 1
 90 | 		chr_bin_pc$end = chr_bin_pc$bin*bin_size
 91 | 		chr_bin_pc_range = makeGRangesFromDataFrame(chr_bin_pc, keep.extra.columns=TRUE)
 92 | 		gene_info_chr = subset(gene_info, seqnames==chr_name)
 93 | 
 94 | 		refGR = chr_bin_pc_range
 95 | 		testGR = gene_info_chr
 96 | 		hits <- findOverlaps(refGR, testGR)
 97 | 	    overlaps <- pintersect(refGR[queryHits(hits)], testGR[subjectHits(hits)])
 98 | 	    overlaps_bins = unique(data.table::data.table(overlap_ratio=width(overlaps)/bin_size, bin=overlaps$bin))
 99 | 	    bin_pc_gene_coverage = merge(chr_bin_pc, overlaps_bins, all.x=TRUE)
100 | 	    bin_pc_gene_coverage$overlap_ratio[is.na(bin_pc_gene_coverage$overlap_ratio)] = 0
101 | 		
102 | 	    gene_density_cor = cor(method='spearman', subset(bin_pc_gene_coverage, (PC1_val < quantile(PC1_val, 0.25)) | (PC1_val > quantile(PC1_val, 0.75)) , c('PC1_val', 'overlap_ratio')))[1,2]
103 | 	    if(abs(gene_density_cor) < 0.2) warning('correlation between gene density and PC1 is too weak')
104 | 	    PC_direction = PC_direction*sign(gene_density_cor)
105 | 
106 | 	    PC_12_atanh = PC_12_atanh*PC_direction
107 | 	}
108 | 
109 | 
110 | 	project_info = project_to_major_axis(PC_12_atanh)
111 | 	x_pro = project_info$x_pro
112 | 	
113 | 	############################################################
114 | 	hc_disect_kmeans_PC12 = bisecting_kmeans(PC_12_atanh[, 1:10, drop=FALSE])
115 | 
116 | 	hc_hybrid_PC12 = hc_disect_kmeans_PC12
117 | 
118 | 	{
119 | 		reordered_names = reorder_dendro(hc_hybrid_PC12, x_pro, aggregateFun=mean)
120 | 		hc_hybrid_PC12_reordered = dendextend::rotate(hc_hybrid_PC12, reordered_names)
121 | 		hc_hybrid_x_pro = hc_disect_kmeans_PC12
122 | 		reordered_names_x_pro = get_best_reorder(hc_hybrid_x_pro, x_pro)
123 | 		CALDER_hc = dendextend::rotate(hc_hybrid_x_pro, reordered_names_x_pro)	
124 | 	}
125 | 
126 | 	############################################################
127 | 	parameters = list(bin_size = bin_size, p_thresh = p_thresh)
128 | 	res = list( CALDER_hc=CALDER_hc, initial_clusters=initial_clusters, bin_names=bin_names, x_pro=x_pro, parameters=parameters)
129 | 	intermediate_data_file = paste0(out_dir, '/chr', chr, '_intermediate_data.Rds')
130 | 	
131 | 	hc = res$CALDER_hc
132 | 	hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels)
133 | 	bin_comp = data.table::data.table(chr=chr, bin_index=res$bin_names, comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length)))
134 | 
135 | 	rownames(bin_comp) = NULL
136 | 	res$comp = bin_comp
137 | 	res$CDs = lapply(res$initial_clusters, function(v) res$bin_names[v])
138 | 	res$mat = A_oe
139 | 	res$chr = chr
140 | 	generate_hierachy_bed(chr=chr, res=res, out_dir=out_dir, bin_size=bin_size)
141 | 
142 | 
143 |     cat('>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
144 |     cat('\r', '>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()))
145 | 
146 |    	if(abs(gene_density_cor) < 0.2) cat('WARNING: correlation between gene density and PC1 on this chr is: ', substr(gene_density_cor, 1, 4), ', which is a bit low', '\n', file=log_file, append=TRUE)
147 | 
148 |     time1 = Sys.time()
149 |     # delta_time  = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0))
150 | 
151 |     delta_time <- time1 - time0
152 | 	timediff <- format(round(delta_time, 2), nsmall = 2)
153 | 
154 |     cat('\n\n', 'Total time used for computing compartment domains and their hierachy:', timediff, '\n', file=log_file, append=TRUE)
155 |    	# if(abs(gene_density_cor) > 0.2) cat('The gene density and PC1 correlation on this chr is: ', substr(gene_density_cor, 1, 4), '\n', file=log_file, append=TRUE)
156 | 
157 | 	############################################################
158 | 	intermediate_data = res
159 | 	if(save_intermediate_data==TRUE) saveRDS(intermediate_data, file=intermediate_data_file)
160 | 	# cat(intermediate_data_file)
161 | 	return(intermediate_data)
162 | }
163 | 
164 | CALDER_sub_domains = function(intermediate_data_file=NULL, intermediate_data=NULL, chr, out_dir, bin_size)
165 | {	
166 |     time0 = Sys.time()
167 |     log_file = paste0(out_dir, '/chr', chr, '_sub_domains_log.txt')
168 | 
169 |    	cat('\r', '>>>> Begin compute sub-domains at:', as.character(Sys.time()))
170 |    	cat('>>>> Begin compute sub-domains at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE)
171 | 
172 | 	if(is.null(intermediate_data)) intermediate_data = readRDS(intermediate_data_file)
173 | 	{
174 | 	    if(intermediate_data$chr!=chr) stop('intermediate_data$chr!=chr; check your input parameters\n') 
175 | 	    if( !setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) ) stop('!setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) \n')     
176 | 	    compartment_segs = generate_compartment_segs( intermediate_data$initial_clusters )
177 | 
178 | 			cat('\r', '>>>> Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()))   			
179 | 		cat('>>>> Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
180 | 		sub_domains_raw = HRG_zigzag_compartment_domain_main_fun(intermediate_data$mat, './', compartment_segs, min_n_bins=2)   
181 | 	    no_output = post_process_sub_domains(chr, sub_domains_raw, ncores=1, out_dir=out_dir, bin_size=bin_size)
182 | 	    cat('>>>> Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE)
183 | 	    cat('\r', '>>>> Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()))
184 | 
185 | 	   	time1 = Sys.time()
186 |         # delta_time  = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0))
187 |         delta_time <- time1 - time0
188 | 		timediff <- format(round(delta_time, 2), nsmall = 2)
189 | 
190 |         cat('\n\n', 'Total time used for computing sub-domains:', timediff, '\n', file=log_file, append=TRUE)
191 | 	}
192 | 	# return(NULL)
193 | }
194 | 
195 | 
196 | 
197 | ############################################################
198 | create_compartment_bed_v4 = function(chr_bin_domain, bin_size)
199 | {
200 | 	# for( chr in chrs )
201 | 	{
202 | 		v = chr_bin_domain
203 | 		# v$intra_domain = as.character(6 - (as.numeric(v$intra_domain))) ## invert the labeling
204 | 		# v$intra_domain = names(cols)[(as.numeric(v$intra_domain))]
205 | 		v = v[order(v$bin_index), ]
206 | 
207 | 
208 | 		borders_non_consecutive = which(diff(v$bin_index)!=1)
209 | 		borders_domain = cumsum(rle(v$comp)$lengths)
210 | 		borders = sort(union(borders_non_consecutive, borders_domain))
211 | 		bins = v$bin_index
212 | 		to_id = as.numeric(bins[borders])
213 | 		from_id = as.numeric(bins[c(1, head(borders, length(borders)-1)+1)])
214 | 
215 | 		pos_start = (from_id-1)*bin_size + 1
216 | 		pos_end = to_id*bin_size
217 | 		# chr = as.numeric( gsub('chr', '', v$chr) )
218 | 		chr = gsub('chr', '', v$chr) ## no need for as.numeric, also makes it compatible with chrX
219 | 
220 | 		compartment_info_tab = data.frame(chr=rep(unique(chr), length(pos_start)), pos_start=pos_start, pos_end=pos_end, domain=v$comp[borders])
221 | 	}
222 | 	return(compartment_info_tab)
223 | }
224 | 
225 | ############################################################
226 | generate_hierachy_bed = function(chr, res, out_dir, bin_size)
227 | {
228 | 	chr_name = paste0('chr', chr)
229 | 	# res = reses[[chr_name]][[CELL_LINE]]
230 | 	hc = res$CALDER_hc
231 | 
232 | 	hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels)
233 | 	bin_comp = data.table::data.table(chr=chr, bin_index=as.numeric(res$bin_names), comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length)))
234 | 	chr_bin_domain = bin_comp
235 | 	chr_bin_domain$chr = paste0('chr', chr_bin_domain$chr)
236 | 
237 | 	# chr_bin_domain = chr_bin_domain[order(bin_index)]
238 | 
239 | 	compartment_info_tab = create_compartment_bed_v4(chr_bin_domain, bin_size=bin_size)
240 | 
241 | 	boundaries = unname(sapply(res$initial_clusters, max))
242 | 	boundaries_ori = as.numeric(res$bin_names[boundaries])*bin_size
243 | 
244 | 	compartment_info_tab$is_boundary = 'gap'
245 | 	compartment_info_tab[compartment_info_tab$pos_end %in% boundaries_ori, 'is_boundary'] = 'boundary'
246 | 	
247 | 	colnames(compartment_info_tab)[4] = 'compartment_label'
248 | 	compartments_tsv_file = paste0(out_dir, '/chr', chr, '_domain_hierachy.tsv')
249 | 	compartments_bed_file = paste0(out_dir, '/chr', chr, '_sub_compartments.bed')
250 | 	boundary_bed_file = paste0(out_dir, '/chr', chr, '_domain_boundaries.bed')
251 | 
252 | 	options(scipen=999)
253 | 	write.table( compartment_info_tab, file=compartments_tsv_file, quote=FALSE, sep='\t', row.names=FALSE )
254 | 
255 | 
256 | 	comp_cols = c("#FF0000", "#FF4848", "#FF9191", "#FFDADA", "#DADAFF", "#9191FF", "#4848FF", "#0000FF") 	
257 | 	names(comp_cols) = c('A.1.1', 'A.1.2', 'A.2.1', 'A.2.2', 'B.1.1', 'B.1.2', 'B.2.1', 'B.2.2')
258 | 	comp_val = (8:1)/8
259 | 	names(comp_val) = names(comp_cols)
260 | 
261 | 	comp_8 = substr(compartment_info_tab$compartment_label, 1, 5)
262 | 
263 | 	compartment_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[, 2:4], comp_val[comp_8], '.', compartment_info_tab[, 2:3], comp_cols[comp_8])
264 | 	write.table( compartment_bed, file=compartments_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
265 | 
266 | 	bounday_bed_raw = subset(compartment_info_tab, is_boundary=='boundary')
267 | 	bounday_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[,3], compartment_info_tab[,3], '', '.', compartment_info_tab[,3], compartment_info_tab[,3], '#000000')
268 | 	write.table( bounday_bed, file=boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
269 | }
270 | 
271 | 
272 | 
273 | project_to_major_axis <- function(PC_12_atanh)
274 | {
275 | 	Data = data.frame(x=PC_12_atanh[,1], y=PC_12_atanh[,2])
276 | 	Data = Data[order(Data$x),]
277 | 	loess_fit <- loess(y ~ x, Data)
278 | 
279 | 	more_x = seq(min(PC_12_atanh[,1]), max(PC_12_atanh[,1]), len=10*length(PC_12_atanh[,1]))
280 | 	major_axis = cbind(x=more_x, y=predict(loess_fit, newdata=more_x))
281 | 	new_x_axis = cumsum(c(0, sqrt(diff(major_axis[,1])^2 + diff(major_axis[,2])^2))) ## the new xaxis on the curved major_axis
282 | 
283 | 	dis = fields::rdist(PC_12_atanh[, 1:2], major_axis)
284 | 	projected_x = new_x_axis[apply(dis, 1, which.min)]
285 | 	names(projected_x) = rownames(PC_12_atanh)
286 | 	# projected_x = major_axis[apply(dis, 1, which.min)]
287 | 	project_info = list(x_pro=projected_x, major_axis=major_axis)
288 | 	return(project_info)
289 | }
290 | 
291 | 
292 | get_best_reorder <- function(hc_hybrid_x_pro, x_pro)
293 | {
294 | 	n = length(x_pro)
295 | 	reordered_names_x_pro_list = list()
296 | 
297 | 	reordered_names_x_pro_list[[1]] = reorder_dendro(hc_hybrid_x_pro, (x_pro), aggregateFun=mean) ## here the clusters are assigned into A.1 A.2 B.1 B.2
298 | 
299 | 	best_index = which.max(sapply(reordered_names_x_pro_list, function(v) cor(1:n, unname(rank(x_pro, ties.method='first')[v]))))
300 | 	return(reordered_names_x_pro_list[[1]])
301 | }
302 | 
303 | 


--------------------------------------------------------------------------------
/R/CALDER_main.R:
--------------------------------------------------------------------------------
 1 | ############################################################
 2 | CALDER_main = function(contact_mat_file, 
 3 | 					   chr, 
 4 | 					   bin_size, 
 5 | 					   out_dir, 
 6 | 					   sub_domains = TRUE, 
 7 | 					   save_intermediate_data = TRUE,
 8 | 					   genome='hg19') {
 9 | 	required_packages = c('doParallel', 'GenomicRanges', 'R.utils', 'factoextra', 'maptools')
10 | 	sapply(required_packages, require, character.only = TRUE, quietly = TRUE)
11 | 
12 | 	dir.create(out_dir, showWarnings = FALSE)
13 | 	intermediate_data = CALDER_CD_hierarchy(contact_mat_file, 
14 | 											chr, 
15 | 											bin_size, 
16 | 											out_dir, 
17 | 											save_intermediate_data,
18 | 											genome)
19 | 
20 | 	if(sub_domains==TRUE) {
21 | 		CALDER_sub_domains(intermediate_data=intermediate_data, 
22 | 						   chr=chr, 
23 | 						   out_dir=out_dir, 
24 | 						   bin_size=bin_size)
25 | 	}
26 | }
27 | 
28 | 


--------------------------------------------------------------------------------
/R/HRG_MLE_each_compartment_fun.R:
--------------------------------------------------------------------------------
  1 | 	## Yuanlong LIU
  2 | 	## 12/04/2018
  3 | 	## 23/05/2018
  4 | 	## 29/05/2018
  5 | 	
  6 | 	## A should be already named
  7 | 	## A should be symmetric
  8 | 	## Does not remove 0-rows/columns of A
  9 | 	HRG_MLE_each_compartment_fun <- function(pA_sym, seg, allowed_max_nbins_seq=500:1000, max_nbins_fine=max(allowed_max_nbins_seq), ncores, res_dir, fast, adjust_segmentss_topdom=TRUE)
 10 | 	{
 11 | 		min_n_bins_outer=2
 12 | 		min_n_bins_inner=2
 13 | 		distr = 'lnorm'
 14 | 		A = pA_sym[seg, seg]
 15 | 		if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices')
 16 | 		arg_list = as.list(environment())
 17 | 		res_folder = file.path(res_dir)
 18 | 		dir.create(res_folder, recursive=TRUE, showWarnings = TRUE)
 19 | 		
 20 | 		total_execution_time_file = file.path(res_dir, 'total_execution.time')
 21 | 		cat('n_core used:', ncores, '\n\n', file=total_execution_time_file, append=FALSE)
 22 | 		time_begin = Sys.time()
 23 | 		cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE)
 24 | 	
 25 | 		## max_nbins is defined as the one from allowed_max_nbins_seq has the smallest residue, from which, the smallest is taken
 26 | 		n_A = nrow(A)
 27 | 		
 28 | 		## IF THE SEGMENT IS ALREADY SMALL ENOUGH
 29 | 		if( n_A <= max(allowed_max_nbins_seq) ) 
 30 | 		{
 31 | 			## This will be modified
 32 | 			{
 33 | 				dists = (2*min_n_bins_outer - 1):(nrow(A)-1)
 34 | 				counts = sapply(dists, function(v) n_cells2compute( A, v, min_n_bins_outer ))
 35 | 				split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
 36 | 				chunks = c(dists[1]-1, dists[split_info$break_points])
 37 | 				res_outer = HRG_core( A=A, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast )
 38 | 			}
 39 | 			
 40 | 			## this part gets the hi_tree
 41 | 			{
 42 | 				hi_tree = get_tree_v0( res_outer )
 43 | 				igraph::V(hi_tree)$left_rescaled = igraph::V(hi_tree)$left
 44 | 				igraph::V(hi_tree)$right_rescaled = igraph::V(hi_tree)$right
 45 | 				igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right - igraph::V(hi_tree)$left + 1
 46 | 				igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
 47 | 			}
 48 | 			
 49 | 			full_tree = hi_tree
 50 | 			res_info = list( arg_list=arg_list, trunk=NULL, segmentss_nadj=NULL, segmentss=NULL, res_outer=NULL, res_inner=list(res_outer) )
 51 | 			res_info$full_tree = full_tree
 52 | 			res_folder_final = file.path(res_dir, 'final')
 53 | 			dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
 54 | 			save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
 55 | 	
 56 | 			time_finish = Sys.time()
 57 | 			cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
 58 | 			cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)
 59 | 			return(res_info)
 60 | 		}
 61 | 		
 62 | 		## which one leads to the least residue
 63 | 		max_nbins = allowed_max_nbins_seq[ min(which(n_A%%allowed_max_nbins_seq==min(n_A%%allowed_max_nbins_seq))) ]
 64 | 		residue = n_A %% max_nbins
 65 | 	
 66 | 		residue_mat_info = get_least_residue_matrix(A, max_nbins=max_nbins, allowed_shifts=0)
 67 | 		A_final = residue_mat_info$A_final
 68 | 		n2one = residue_mat_info$n2one
 69 | 
 70 | 		## A_low has no row/col name
 71 | 		A_low = HighResolution2Low_k( A_final, k=max_nbins ) ## k is the dimension of A_final_low
 72 | 		
 73 | 		## Starting from here, the original row/col names are no longer used
 74 | 		
 75 | 		## this part run HRG_core on the outer part
 76 | 		{
 77 | 			dists = (2*min_n_bins_outer - 1):(nrow(A_low)-1)
 78 | 			counts = sapply(dists, function(v) n_cells2compute( A_low, v, min_n_bins_outer ))
 79 | 			split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
 80 | 			chunks = c(dists[1]-1, dists[split_info$break_points])
 81 | 			res_outer = HRG_core( A=A_low, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast )
 82 | 		}
 83 | 
 84 | 		## this part gets the hi_tree
 85 | 		{
 86 | 			hi_tree = get_tree_v0( res_outer )
 87 | 			igraph::V(hi_tree)$left_rescaled = n2one*(igraph::V(hi_tree)$left - 1) + 1
 88 | 			igraph::V(hi_tree)$right_rescaled = n2one*igraph::V(hi_tree)$right
 89 | 			node2adjust = which(igraph::V(hi_tree)$right_rescaled==max(igraph::V(hi_tree)$right_rescaled)) ## append the residue bins here
 90 | 			igraph::V(hi_tree)[node2adjust]$right_rescaled = n_A
 91 | 			
 92 | 			igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right_rescaled - igraph::V(hi_tree)$left_rescaled + 1
 93 | 			igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
 94 | 		}
 95 | 
 96 | 		# segmentss = get_segments(hi_tree, binsize_thresh=max_nbins_bp)	
 97 | 
 98 | 		segmentss_nadj = get_segments(hi_tree, binsize_thresh=max_nbins_fine)
 99 | 		
100 | 		## THIS PARAT ADJUST THE segmentss BASED ON TOPDOM BIN_SIGNAL
101 | 		## 14-05-2018
102 | 		## Also adjust the tree
103 | 		if(adjust_segmentss_topdom==TRUE)
104 | 		{
105 | 			shift = seg[1]-1
106 | 			segmentss = segmentss_adjust_topdom(pA_sym, segmentss_nadj+shift, n2one, ws=5:20) - shift
107 | 			for( i in 1:nrow(segmentss) )
108 | 			{
109 | 				index_left2adj = which(igraph::V(hi_tree)$left_rescaled == segmentss_nadj[i,1])
110 | 				index_right2adj = which(igraph::V(hi_tree)$right_rescaled == segmentss_nadj[i,2])
111 | 				igraph::V(hi_tree)[index_left2adj]$left_rescaled = segmentss[i,1]
112 | 				igraph::V(hi_tree)[index_right2adj]$right_rescaled = segmentss[i,2]
113 | 				igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right_rescaled - igraph::V(hi_tree)$left_rescaled + 1
114 | 				igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
115 | 			}
116 | 		}
117 | 		
118 | 		trunk = hi_tree
119 | 		# save(segmentss, file=file.path(res_dir, 'outer', 'segmentss.Rdata'))
120 | 		
121 | 		res_inner = rep(list(list()), nrow(segmentss))
122 | 		for( i in 1:nrow(segmentss) )
123 | 		{
124 | 			cat(i,'\n')
125 | 			# if(i==2) while(as.numeric(substr(as.character(Sys.time()),12,13))!=20) {}
126 | 			index = segmentss[i,1]:segmentss[i,2]
127 | 			A_part = A[index, index]
128 | 			dists = (2*min_n_bins_inner - 1):(nrow(A_part)-1)
129 | 			counts = sapply(dists, function(v) n_cells2compute( A_part, v, min_n_bins_inner ))
130 | 			split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
131 | 			chunks = c(dists[1]-1, dists[split_info$break_points])
132 | 			
133 | 			## A_part is named
134 | 			res_info_inner = HRG_core( A=A_part, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_inner, distr=distr, fast=fast )
135 | 			res_inner[[i]] = res_info_inner      #minimum size filtering
136 | 			cat('finished', '\n')
137 | 		}
138 | 		
139 | 		res_info = list( arg_list=arg_list, trunk=trunk, segmentss_nadj=segmentss_nadj, segmentss=segmentss, res_outer=res_outer, res_inner=res_inner )
140 | 		res_folder_final = file.path(res_dir, 'final')
141 | 		dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
142 | 		save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
143 | 	
144 | 		time_finish = Sys.time()
145 | 		cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
146 | 		cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)
147 | 
148 | 
149 | 		## xenocfraf:
150 | 		# res_inner = res_inner[sapply(res_inner, length) > 0]
151 | 		branches = lapply( res_inner, get_tree_v0 )
152 | 		for( i in 1:length(branches) ) branches[[i]] = update_branch_name(branches[[i]], root_start=segmentss[i,1])
153 | 		
154 | 		full_tree = xenocraft( trunk, branches )
155 | 		names_tmp = do.call(rbind, strsplit(igraph::V(full_tree)$name, ','))
156 | 		igraph::V(full_tree)$left = substring(names_tmp[,1], 2)
157 | 		igraph::V(full_tree)$right = substring(names_tmp[,2], 1, nchar(names_tmp[,2])-1)
158 | 		if(!is_binary_tree(full_tree)) stop("Trunk + branches do not produce a binary tree")
159 | 		
160 | 		res_info$full_tree = full_tree
161 | 		save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
162 | 	
163 | 		return( res_info )
164 | 	}
165 | 	
166 | 


--------------------------------------------------------------------------------
/R/HighResolution2Low.R:
--------------------------------------------------------------------------------
  1 | 	## These functions try to aggregate bin-wise contact matrix into a domain-wise contact matrix
  2 | 
  3 | 	HighResolution2Low_k_complete <-function( A, max_nbins )
  4 | 	{
  5 | 		nbins = nrow( A )
  6 | 		
  7 | 		max_nbins_bp = max_nbins
  8 | 		max_nbinss = max_nbins + (-5:20)
  9 | 		expand_fold_info = t(sapply(max_nbinss, fold_expand_rate, K=nbins))
 10 | 		index = max(which(expand_fold_info[,'rescale_rate']==min(expand_fold_info[,'rescale_rate'])))
 11 | 		max_nbins = max_nbinss[ index ]
 12 | 		n2one_head = expand_fold_info[index, 'n2one_head']
 13 | 		n2one_last = expand_fold_info[index, 'n2one_last']
 14 | 		
 15 | 		if(n2one_head==1) return(list())
 16 | 		rescale_rate = n2one_last / n2one_head
 17 | 
 18 | 		if(max_nbins!=max_nbins_bp) warning('Value of max_nbins has been adjusted from ', max_nbins_bp, ' to ', max_nbins, '. This results in rescale_rate as: ', rescale_rate, '\n')
 19 | 		
 20 | 		if(rescale_rate!=1) ## only do when rescale is not 1
 21 | 		{
 22 | 			aa_end = n2one_head*(max_nbins-1)
 23 | 			bb_end = nrow(A)
 24 | 			A_aa = A[1:aa_end, 1:aa_end]
 25 | 			A_bb = A[(aa_end+1):bb_end, (aa_end+1):bb_end]
 26 | 			A_ab = A[1:aa_end, (aa_end+1):bb_end, drop=FALSE]
 27 | 			A_aa_low = HighResolution2Low_k(mat=A_aa, k=max_nbins-1)
 28 | 			A_bb_low = sum( A_bb )
 29 | 			
 30 | 			## https://stackoverflow.com/questions/15265512/summing-every-n-points-in-r
 31 | 			v = apply(A_ab, 1, sum)
 32 | 			A_ab_low = unname(tapply(v, (seq_along(v)-1) %/% n2one_head, sum))
 33 | 			
 34 | 			A_ab_low_new = A_ab_low / n2one_last*n2one_head
 35 | 			A_bb_low_new = A_bb_low / n2one_last^2*n2one_head^2
 36 | 			A_low = rbind(cbind(A_aa_low, A_ab_low_new), c( A_ab_low_new, A_bb_low_new ))
 37 | 			colnames(A_low) = NULL
 38 | 		}
 39 | 		
 40 | 		if(rescale_rate==1) A_low = HighResolution2Low_k(mat=A, k=max_nbins) ## k is the dimension of A_final_low
 41 | 		## A_low has no row/col name
 42 | 		res = list(A_low=A_low, n2one_head=n2one_head, n2one_last=n2one_last)
 43 | 		return(res)
 44 | 	}	
 45 | 	
 46 | 	split_info <-function( K, max_nbins )
 47 | 	{
 48 | 		x = floor( K/max_nbins ) + 1
 49 | 		n2 = max_nbins*x - K
 50 | 		n1 = max_nbins - n2
 51 | 		vec = c(rep(x, n1), rep(x-1, n2))
 52 | 		split_vec = split(1:K, rep(1:max_nbins, vec))
 53 | 
 54 | 		expand_fold_ratio = min(n1/n2, n2/n1)
 55 | 		res = list(vec=vec, expand_fold_ratio=expand_fold_ratio, split_vec=split_vec)
 56 | 		return(res)
 57 | 	}
 58 | 	
 59 | 	
 60 | 	## diagonal values are summed twice
 61 | 	HighResolution2Low <-function( A, rescale, which=2 )
 62 | 	{
 63 | 		nbins = nrow(A)
 64 | 		nbins_low = floor(nbins/rescale)
 65 | 		A_low = matrix( , nbins_low, nbins_low)
 66 | 		keep_rows = nbins - nbins%%nbins_low
 67 | 		A_truncated = A[ 1:keep_rows, 1:keep_rows ]
 68 | 		
 69 | 		if(which==1) A_low = matsplitter_sum(A_truncated, keep_rows/nbins_low, keep_rows/nbins_low)
 70 | 		if(which==2) A_low = mat_split_sum(A_truncated, keep_rows/nbins_low, keep_rows/nbins_low)
 71 | 
 72 | 		return( A_low )
 73 | 	}
 74 | 	
 75 | 	## https://stackoverflow.com/questions/24299171/function-to-split-a-matrix-into-sub-matrices-in-r
 76 | 	## Function to split a matrix into sub-matrices in R
 77 | 	matsplitter_sum <- function(M, r, c) 
 78 | 	{
 79 | 		rg <- (row(M)-1)%/%r + 1
 80 | 		cg <- (col(M)-1)%/%c + 1
 81 | 		rci <- (rg-1)*max(cg) + cg
 82 | 		N <- prod(dim(M))/r/c
 83 | 		cv <- unlist(lapply(1:N, function(x) M[rci==x]))
 84 | 		dim(cv)<-c(r, c, N)
 85 | 		res = matrix(apply(cv, 3, sum), nrow(M)/r, byrow=TRUE)
 86 | 		return(res)
 87 | 	}
 88 | 
 89 | 
 90 | 	mat_split_sum <- function(M, r, c)
 91 | 	{
 92 | 	  nr <- ceiling(nrow(M)/r)
 93 | 	  nc <- ceiling(ncol(M)/c)
 94 | 	  newM <- matrix(NA, nr*r, nc*c)
 95 | 	  newM[1:nrow(M), 1:ncol(M)] <- M
 96 | 
 97 | 	  div_k <- kronecker(matrix(seq_len(nr*nc), nr, byrow = TRUE), matrix(1, r, c))
 98 | 	  matlist <- split(newM, div_k)
 99 | 	  res = matrix(sapply(matlist, sum), nrow(M)/r, byrow=TRUE)
100 | 	  return(res)
101 | 	}
102 | 	
103 | 	## this is much faster
104 | 	HighResolution2Low_k <- function(mat, k)
105 | 	{
106 | 		chunk2 <- function(x, n) split(x, cut(seq_along(x), n, labels = FALSE)) 
107 | 		n = nrow(mat)
108 | 		A_low = matrix( , k, k )
109 | 		indices = chunk2( 1:n, k )
110 | 		for(row_index in 1:k)
111 | 		{
112 | 			A_sub = mat[indices[[row_index]], ]
113 | 			for(col_index in 1:k) A_low[row_index, col_index] = sum( A_sub[ , indices[[col_index]]] )
114 | 		}
115 | 		return( A_low )
116 | 	}
117 | 	
118 | 	HighResolution2Low_k_rectangle <- function(mat, row_split, col_split, sum_or_mean=c('sum', 'mean', 'median', 'p_above_one'))
119 | 	{
120 | 		# if(remove_zero==TRUE) mat[mat==0] = NA
121 | 		k_row = length(row_split)
122 | 		k_col = length(col_split)
123 | 		
124 | 		A_low = matrix( , k_row, k_col )
125 | 		
126 | 		if(sum_or_mean=='sum')
127 | 		{
128 | 			for(row_index in 1:k_row)
129 | 			{
130 | 				A_sub = mat[row_split[[row_index]], , drop=FALSE ]
131 | 				for(col_index in 1:k_col) A_low[row_index, col_index] = sum( A_sub[ , col_split[[col_index]]] )
132 | 			}
133 | 		}
134 | 
135 | 		if(sum_or_mean=='mean')
136 | 		{
137 | 			for(row_index in 1:k_row)
138 | 			{
139 | 				A_sub = mat[row_split[[row_index]], , drop=FALSE ]
140 | 				for(col_index in 1:k_col) A_low[row_index, col_index] = mean( A_sub[ , col_split[[col_index]]], na.rm=TRUE )
141 | 			}
142 | 		}
143 | 
144 | 		if(sum_or_mean=='median')
145 | 		{
146 | 			for(row_index in 1:k_row)
147 | 			{
148 | 				A_sub = mat[row_split[[row_index]], , drop=FALSE ]
149 | 				for(col_index in 1:k_col) A_low[row_index, col_index] = median( A_sub[ , col_split[[col_index]]], na.rm=TRUE )
150 | 			}
151 | 		}
152 | 
153 | 		if(sum_or_mean=='p_above_one')
154 | 		{
155 | 			for(row_index in 1:k_row)
156 | 			{
157 | 				A_sub = mat[row_split[[row_index]], , drop=FALSE ]
158 | 				for(col_index in 1:k_col) A_low[row_index, col_index] = mean( A_sub[ , col_split[[col_index]]] > 1 )
159 | 			}
160 | 		}
161 | 		# if(remove_zero==TRUE) A_low[is.na(A_low)] = 0
162 | 		return( A_low )
163 | 	}	
164 | 	
165 | 	## for a n x n matrix, this function generate a nxm matrix, with m*compress_size = n
166 | 	## this function is used for generating the nxm matrix, where the i,jth value is the contact value 
167 | 	## 08-08-2018  
168 | 	compress_mat_fast = function(input_mat, compress_size)
169 | 	{
170 | 		mat_block <- function(n, r) suppressWarnings( matrix(c(rep(1, r), rep(0, n)), n, n/r) )
171 | 		n = ncol(input_mat)
172 | 		mat2prod = mat_block(n, compress_size)
173 | 		# return(t(input_mat%*%mat2prod / compress_size))
174 | 		return(t(matrix_multiplication_cpp(input_mat, mat2prod)) / compress_size)
175 | 	}
176 | 	
177 | 	


--------------------------------------------------------------------------------
/R/LikelihoodRatioTest_fun.R:
--------------------------------------------------------------------------------
  1 | 	## The following code tries to evaluate the performance of different likelihood ratio-based tests
  2 | 	## Yuanlong LIU
  3 | 	## 01-03-2018
  4 | 	
  5 | 	############## model 1: keep the same structure ##############
  6 | 	## In this case, a tree structure should be provided
  7 | 	## Constraint: the top three levels have the same contact intensity parameter
  8 | 
  9 | 	## 	01-03-2018
 10 | 	p_likelihood_ratio <- function( A, head, mid, tail, num )
 11 | 	{
 12 | 		A_whole = A[head:tail, head:tail]
 13 | 		A_a = A[head:mid, head:mid]
 14 | 		A_b = A[(mid+1):tail, (mid+1):tail]
 15 | 		A_ab = A[head:mid, (mid+1):tail]
 16 | 		
 17 | 		tri_Awhole = A_whole[upper.tri(A_whole, diag=FALSE)] 
 18 | 		tri_Aa = A_a[upper.tri(A_a, diag=FALSE)] 
 19 | 		tri_Ab = A_b[upper.tri(A_b, diag=FALSE)] 
 20 | 		# gamma_fit(tri_Awhole, num)
 21 | 		
 22 | 		inner_a = get_prob( tri_Aa )
 23 | 		inner_b = get_prob( tri_Ab )
 24 | 		inter_ab = get_prob( A_ab )
 25 | 		
 26 | 		## negative binomial
 27 | 		# inner_a = get_prob_ng( tri_Aa )
 28 | 		# inner_b = get_prob_ng( tri_Ab )
 29 | 		# inter_ab = get_prob_ng( A_ab )
 30 | 
 31 | 		
 32 | 		## log likelihood of H1
 33 | 		LH1 = inner_a + inner_b + inter_ab
 34 | 		LH0 = get_prob( tri_Awhole )
 35 | 		
 36 | 		Lambda = -2*( LH0 - LH1 )
 37 | 		# cat(Lambda, '\n')
 38 | 		df_h0 = 1*1
 39 | 		df_h1 = 3*1
 40 | 		df = df_h1 - df_h0
 41 | 		
 42 | 		p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
 43 | 		info = list( Lambda=Lambda, p=p )
 44 | 		return(info)
 45 | 	}
 46 | 	
 47 | 	p_likelihood_ratio_nb <- function( A, head, mid, tail )
 48 | 	{
 49 | 		A_whole = A[head:tail, head:tail]
 50 | 		A_a = A[head:mid, head:mid]
 51 | 		A_b = A[(mid+1):tail, (mid+1):tail]
 52 | 		A_ab = A[head:mid, (mid+1):tail]
 53 | 		
 54 | 		tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 
 55 | 		tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 
 56 | 		tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 
 57 | 		
 58 | 		## negative binomial
 59 | 		inner_a = get_prob_nb( tri_Aa )
 60 | 		inner_b = get_prob_nb( tri_Ab )
 61 | 		inter_ab = get_prob_nb( A_ab )
 62 | 
 63 | 		
 64 | 		## log likelihood of H1
 65 | 		LH1 = inner_a + inner_b + inter_ab
 66 | 		LH0 = get_prob_nb( tri_Awhole )
 67 | 		
 68 | 		Lambda = -2*( LH0 - LH1 )
 69 | 		# cat(Lambda, '\n')
 70 | 		n_parameters = 2
 71 | 		df_h0 = 1*n_parameters
 72 | 		df_h1 = 3*n_parameters
 73 | 		df = df_h1 - df_h0
 74 | 		
 75 | 		p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
 76 | 		info = list( Lambda=Lambda, p=p )
 77 | 		return(info)
 78 | 	}
 79 | 
 80 | 	p_likelihood_ratio_norm <- function( A, head, mid, tail )
 81 | 	{
 82 | 		A_whole = A[head:tail, head:tail]
 83 | 		A_a = A[head:mid, head:mid]
 84 | 		A_b = A[(mid+1):tail, (mid+1):tail]
 85 | 		A_ab = A[head:mid, (mid+1):tail]
 86 | 		
 87 | 		tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 
 88 | 		tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 
 89 | 		tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 
 90 | 		
 91 | 		## norm
 92 | 		inner_a = likelihood_norm( tri_Aa )
 93 | 		inner_b = likelihood_norm( tri_Ab )
 94 | 		inter_ab = likelihood_norm( A_ab )
 95 | 
 96 | 		
 97 | 		## log likelihood of H1
 98 | 		LH1 = inner_a + inner_b + inter_ab
 99 | 		LH0 = likelihood_norm( tri_Awhole )
100 | 		
101 | 		Lambda = -2*( LH0 - LH1 )
102 | 		# cat(Lambda, '\n')
103 | 		n_parameters = 2
104 | 		df_h0 = 1*n_parameters
105 | 		df_h1 = 3*n_parameters
106 | 		df = df_h1 - df_h0
107 | 		
108 | 		p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
109 | 		info = list( Lambda=Lambda, p=p )
110 | 		return(info)
111 | 	}
112 | 
113 | 	
114 | 	
115 | 	p_likelihood_ratio_gamma <- function( A, head, mid, tail, n_parameters, imputation )
116 | 	{
117 | 		A_whole = A[head:tail, head:tail]
118 | 		A_a = A[head:mid, head:mid]
119 | 		A_b = A[(mid+1):tail, (mid+1):tail]
120 | 		A_ab = A[head:mid, (mid+1):tail]
121 | 		
122 | 		tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 
123 | 		## added 25-03-2018. If no zero values, no imputation
124 | 		if( length(tri_Awhole==0) == 0 ) imputation = FALSE
125 | 		
126 | 		tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 
127 | 		tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 
128 | 		
129 | 		## norm
130 | 		inner_a = likelihood_gamma_mme( tri_Aa )
131 | 		inner_b = likelihood_gamma_mme( tri_Ab )
132 | 		inter_ab = likelihood_gamma_mme( A_ab )
133 | 		whole = likelihood_gamma_mme( tri_Awhole )
134 | 		
135 | 		if( imputation ) ## zero values are imputed by the estimated distribution based on non random values
136 | 		{
137 | 			inner_a = likelihood_gamma_mme( tri_Aa[tri_Aa!=0] )/length( tri_Aa[tri_Aa!=0] )*length( tri_Aa )
138 | 			inner_b = likelihood_gamma_mme( tri_Ab[tri_Ab!=0] )/length( tri_Ab[tri_Ab!=0] )*length( tri_Ab )
139 | 			inter_ab = likelihood_gamma_mme( A_ab )/length( A_ab[A_ab!=0] )*length( A_ab )
140 | 			whole = likelihood_gamma_mme( tri_Awhole[tri_Awhole!=0] )/length( tri_Awhole[tri_Awhole!=0] )*length( tri_Awhole )
141 | 			n_parameters = n_parameters - 1 ## the mixture parameter of 0 is not taken into account
142 | 		}
143 | 
144 | 		
145 | 		## log likelihood of H1
146 | 		LH1 = inner_a + inner_b + inter_ab
147 | 		LH0 = whole
148 | 		
149 | 		Lambda = -2*( LH0 - LH1 )
150 | 		# cat(Lambda, '\n')
151 | 		df_h0 = 1*n_parameters
152 | 		df_h1 = 3*n_parameters
153 | 		df = df_h1 - df_h0
154 | 		
155 | 		p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
156 | 		info = list( Lambda=Lambda, p=p )
157 | 		return(info)
158 | 	}
159 | 
160 | 	lognormal_mean_test <- function( cA, head, mid, tail )
161 | 	{
162 | 		A_whole = cA[head:tail, head:tail]
163 | 		A_a = cA[head:mid, head:mid]
164 | 		A_b = cA[(mid+1):tail, (mid+1):tail]
165 | 		A_ab = cA[head:mid, (mid+1):tail]
166 | 		
167 | 		tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 
168 | 		tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 
169 | 		
170 | 		tri_Aa_vec = as.vector( tri_Aa )
171 | 		tri_Ab_vec = as.vector( tri_Ab )
172 | 		A_ab_vec = as.vector( A_ab )
173 | 		
174 | 		tri_Aa_vec_p = tri_Aa_vec[tri_Aa_vec!=0]
175 | 		tri_Ab_vec_p = tri_Ab_vec[tri_Ab_vec!=0]
176 | 		A_ab_vec_p = A_ab_vec[A_ab_vec!=0]
177 | 		
178 | 		p_Aa = p_lognormal_mean( tri_Aa_vec_p, A_ab_vec_p )
179 | 		p_Ab = p_lognormal_mean( tri_Ab_vec_p, A_ab_vec_p )
180 | 		return( list(p_Aa=p_Aa, p_Ab=p_Ab) )
181 | 	}
182 | 	
183 | 	## https://www.jstor.org/stable/2533570
184 | 	## google: Methods for Comparing the Means of Two Independent Log-Normal Samples
185 | 	## 03-07-2018
186 | 	p_lognormal_mean <- function( vec_aa, vec_ab )
187 | 	{
188 | 		if( all(vec_aa==0) | all(vec_ab==0) ) return(0)
189 | 		
190 | 		n_aa = length(vec_aa)
191 | 		n_ab = length(vec_ab)
192 | 		fited_info_aa = MASS::fitdistr(vec_aa, 'lognormal') ## intra
193 | 		mu_aa = fited_info_aa$estimate[1]
194 | 		sd_aa = fited_info_aa$estimate[2]
195 | 		s2_aa = sd_aa^2
196 | 
197 | 		fited_info_ab = MASS::fitdistr(vec_ab, 'lognormal') ## inter
198 | 		mu_ab = fited_info_ab$estimate[1]
199 | 		sd_ab = fited_info_ab$estimate[2]
200 | 		s2_ab = sd_ab^2
201 | 		
202 | 		z_score = ( (mu_aa - mu_ab) + (1/2)*(s2_aa - s2_ab) ) / sqrt( s2_aa/n_aa + s2_ab/n_ab + (1/2)*( s2_aa^2/(n_aa-1) + s2_ab^2/(n_ab-1) ) )
203 | 		p = pnorm( z_score, lower.tail=FALSE )
204 | 		return(p)
205 | 	}
206 | 
207 | 	get_corner_xy <- function(A_whole)
208 | 	{
209 | 		n = nrow(A_whole)
210 | 		corner_size = floor(n/2)
211 | 		A_corner = A_whole[1:corner_size, (n - corner_size + 1 ):n]
212 | 		expected_high = A_corner[upper.tri(A_corner, diag=FALSE)]
213 | 		expected_low = A_corner[lower.tri(A_corner, diag=TRUE)]
214 | 		return(list(x=expected_low, y=expected_high))
215 | 	}
216 | 
217 | 	get_half_mat_values <- function(mat)
218 | 	{
219 | 		n1 = nrow(mat)
220 | 		n2 = ncol(mat)
221 | 		delta = n1/n2
222 | 		rows = lapply( 1:n2, function(x) (1+ceiling(x*delta)):n1 )
223 | 		rows = rows[ sapply(rows, function(v) max(v) <= n1) ]
224 | 		flag = which(diff(sapply(rows, length)) > 0)
225 | 		if(length(flag)>0) rows = rows[ 1:min(flag) ]
226 | 		
227 | 		row_col_indices = cbind( unlist(rows), rep(1:length(rows), sapply(rows, length)))
228 | 		x = mat[row_col_indices]
229 | 		mat[row_col_indices] = NA
230 | 		y = na.omit(as.vector(mat))
231 | 		return(list(x=x, y=y))
232 | 	}
233 | 
234 | 
235 | 	get_half_mat_values_v2 <- function(mat)
236 | 	{
237 | 		## https://stackoverflow.com/questions/52990525/get-upper-triangular-matrix-from-nonsymmetric-matrix/52991508#52991508
238 | 		y = mat[nrow(mat) * (2 * col(mat) - 1) / (2 * ncol(mat)) - row(mat) > -1/2]
239 | 		x = mat[nrow(mat) * (2 * col(mat) - 1) / (2 * ncol(mat)) - row(mat) < -1/2]
240 | 		return(list(x=x, y=y))
241 | 	}
242 | 
243 | 	p_wilcox_test_nested <- function( A, head, mid, tail, alternative, is_CD )
244 | 	{
245 | 		test_1 = p_wilcox_test( A, head, mid, tail, alternative, is_CD, only_corner=FALSE ) #: coner + inter
246 | 		if( (tail - head <= 4) | (test_1$p > 0.05) ) ## when > 0.05 or it is already small, do not cosider it as nested
247 | 		{
248 | 			info = list( Lambda=NULL, p=0.555555, mean_diff=0 )
249 | 			return(info)
250 | 		}
251 | 
252 | 		## try_error happens when tad to test is too small. THEREFORE, ASSIGN P=0 TO THE TAD
253 | 		test_left_tad = try(p_wilcox_test( A, head, mid=ceiling((head+mid)/2), mid, alternative, is_CD=FALSE, only_corner=TRUE )) #: coner + inter
254 | 		if( class(test_left_tad)=="try-error" ) test_left_tad = list(p=0)
255 | 		test_right_tad = try(p_wilcox_test( A, mid+1, mid=ceiling((mid+1+tail)/2), tail, alternative, is_CD=FALSE, only_corner=TRUE )) #: coner + inter
256 | 		if( class(test_right_tad)=="try-error" ) test_right_tad = list(p=0)
257 | 
258 | 		info = list( Lambda=NULL, p=max( test_1$p, min(test_left_tad$p, test_right_tad$p) ), mean_diff=0 )
259 | 		return(info)
260 | 	}
261 | 
262 | 
263 | 
264 | 	p_wilcox_test = function( A, head, mid, tail, alternative, is_CD=FALSE, only_corner=FALSE ) ## only_corner tests if a domain is a TAD (no nesting)
265 |         {
266 |                 A_whole = A[head:tail, head:tail]
267 |                 A_a = A[head:mid, head:mid]
268 |                 A_b = A[(mid+1):tail, (mid+1):tail]
269 |                 A_ab = A[head:mid, (mid+1):tail]
270 | 
271 |                 tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] ## need to check whether diag should be false or true
272 | 
273 |                 tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] ## need to check whether diag should be false or true
274 |                 tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] ## need to check whether diag should be false or true
275 | 
276 |                 tri_Aa_vec = as.vector( tri_Aa )
277 |                 tri_Ab_vec = as.vector( tri_Ab )
278 |                 A_ab_vec = as.vector( A_ab )
279 | 
280 |                 corner_mat_info = get_half_mat_values_v2(A_ab)
281 |                 A_ab_corner = as.vector(corner_mat_info$y)
282 |                 A_ab_ncorner = corner_mat_info$x
283 |                 p_corner = wilcox.test(x=A_ab_corner, y=A_ab_ncorner, alternative='greater', exact=F)
284 |                 p_inter = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=A_ab_ncorner, alternative='greater', exact=F)
285 |                 # p_inter = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=c(A_ab_ncorner, A_ab_corner), alternative='greater', exact=F)
286 | 
287 |                 if(is_CD==FALSE) p = max(p_inter$p.value, p_corner$p.value) ## if the tested part is the CD
288 |                 if(is_CD==TRUE) p = p_inter$p.value ## if the tested part is the CD
289 |                 # if(only_corner==TRUE) p = p_corner$p.value
290 |                 mean_diff_inter = mean(A_ab_ncorner) - mean(c(tri_Ab_vec, tri_Aa_vec)) ## negative is good
291 |                 mean_diff_corner = mean(A_ab_ncorner) - mean(A_ab_corner) ## negative is good
292 | 
293 | 
294 |                 if(is_CD==FALSE) mean_diff = min(mean_diff_corner, mean_diff_inter) ## if the tested part is the CD
295 |                 if(is_CD==TRUE) mean_diff = mean_diff_inter
296 | 
297 |                 if(min(length(tri_Ab_vec), length(tri_Ab_vec)) < 10) mean_diff = 100 ## when one of the two twins is too small. 10: dim(4,4)
298 | 
299 |                 # mean_diff = mean(A_ab_corner) - mean(A_ab_ncorner)
300 | 
301 |                 # p_test_Aa =  wilcox.test(x=A_ab_vec, y=tri_Aa_vec, alternative="less", exact=F)
302 |                 # p_test_Ab =  wilcox.test(x=A_ab_vec, y=tri_Ab_vec, alternative="less", exact=F)
303 |                 # p = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=A_ab_vec, alternative=alternative, exact=F)
304 | 
305 |                 # xy = get_corner_xy(A_whole)
306 |                 # p = wilcox.test(x=xy$x, y=xy$y, alternative=alternative, exact=F)
307 | 
308 |                 # p = max(p_test_Aab$p.value, p_test_Ab$p.value)
309 |                 info = list( Lambda=NULL, p=p, mean_diff=mean_diff)
310 |                 return(info)
311 |         }
312 | 
313 | 
314 | 
315 | 	
316 | 	p_likelihood_ratio_lnorm <- function( A, head, mid, tail, n_parameters, imputation, imputation_num=1E2 )
317 | 	{
318 | 		likelihood_fun = likelihood_lnorm_mle
319 | 		A_whole = A[head:tail, head:tail]
320 | 		A_a = A[head:mid, head:mid]
321 | 		A_b = A[(mid+1):tail, (mid+1):tail]
322 | 		A_ab = A[head:mid, (mid+1):tail]
323 | 		
324 | 		tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 
325 | 		## added 25-03-2018. If no zero values, no imputation
326 | 		no_zero_flag = 0
327 | 		if( sum(tri_Awhole==0) == 0 ) { no_zero_flag = 1; imputation = FALSE }
328 | 		
329 | 		tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 
330 | 		tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 
331 | 		
332 | 		tri_Aa_vec = as.vector( tri_Aa )
333 | 		tri_Ab_vec = as.vector( tri_Ab )
334 | 		A_ab_vec = as.vector( A_ab )
335 | 
336 | 		mean_a = mean( tri_Aa_vec[tri_Aa_vec!=0] )
337 | 		mean_b = mean( tri_Ab_vec[tri_Ab_vec!=0] )
338 | 		mean_ab = mean( A_ab_vec[A_ab_vec!=0] )
339 | 		mean_diff = mean_ab - min(mean_a, mean_b)
340 | 		
341 | 		if( (all(tri_Aa_vec==0)) | (all(tri_Ab_vec==0)) | (all(A_ab_vec==0))  )
342 | 		{
343 | 			info = list( Lambda=NA, p=0, mean_diff=mean_diff )
344 | 			return(info)
345 | 		}
346 | 
347 | 		## lnorm
348 | 		if(!imputation)
349 | 		{
350 | 			inner_a = likelihood_fun( tri_Aa )
351 | 			inner_b = likelihood_fun( tri_Ab )
352 | 			inter_ab = likelihood_fun( A_ab )
353 | 			whole = likelihood_fun( tri_Awhole )
354 | 
355 | 			## log likelihood of H1
356 | 			LH1 = inner_a + inner_b + inter_ab
357 | 			LH0 = whole
358 | 			Lambda = -2*( LH0 - LH1 )
359 | 			
360 | 			if( no_zero_flag ) n_parameters = n_parameters - 1 ## no alpha parameter
361 | 			df_h0 = 1*n_parameters ##(theta_1 = theta_2 = theta_3)
362 | 			df_h1 = 3*n_parameters ##(theta_1, theta_2, theta_3)
363 | 			df = df_h1 - df_h0
364 | 		
365 | 			p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
366 | 			info = list( Lambda=Lambda, p=p, mean_diff=mean_diff )
367 | 			return(info)
368 | 		}
369 | 		
370 | 		if( imputation ) ## zero values are imputed by the estimated distribution based on non random values
371 | 		{
372 | 			vec_list = list( tri_Aa=tri_Aa, tri_Ab=tri_Ab, A_ab=A_ab )
373 | 			imputations = imputation_list( vec_list, imputation_num )
374 | 			
375 | 			inner_as = apply(imputations$tri_Aa, 1, likelihood_fun)
376 | 			inner_bs = apply(imputations$tri_Ab, 1, likelihood_fun)
377 | 			inter_abs = apply(imputations$A_ab, 1, likelihood_fun)
378 | 			
379 | 			wholes = apply(do.call( cbind, imputations ), 1, likelihood_fun)
380 | 			LH1s = inner_as + inner_bs + inter_abs
381 | 			LH0s = wholes
382 | 			Lambdas = -2*( LH0s - LH1s )
383 | 			Lambda = mean( Lambdas )
384 | 			n_parameters = n_parameters - 1 ## the mixture parameter is not taken into account
385 | 			
386 | 			# cat(Lambda, '\n')
387 | 			df_h0 = 1*n_parameters
388 | 			df_h1 = 3*n_parameters
389 | 			df = df_h1 - df_h0
390 | 			
391 | 			p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
392 | 			info = list( Lambda=Lambda, p=p, mean_diff=mean_diff )
393 | 			return(info)
394 | 			
395 | 		# if( imputation ) ## zero values are imputed by the estimated distribution based on non random values
396 | 		# {
397 | 			# inner_a = likelihood_lnorm( tri_Aa[tri_Aa!=0] )/length( tri_Aa[tri_Aa!=0] )*length( tri_Aa )
398 | 			# inner_b = likelihood_lnorm( tri_Ab[tri_Ab!=0] )/length( tri_Ab[tri_Ab!=0] )*length( tri_Ab )
399 | 			# inter_ab = likelihood_lnorm( A_ab )/length( A_ab[A_ab!=0] )*length( A_ab )
400 | 			# whole = likelihood_lnorm( tri_Awhole[tri_Awhole!=0] )/length( tri_Awhole[tri_Awhole!=0] )*length( tri_Awhole )
401 | 			# n_parameters = n_parameters - 1 ## the mixture parameter of 0 is not taken into account
402 | 		# }
403 | 		
404 | 		}
405 | 	}
406 | 
407 | 	imputation_list <- function( vec_list, imputation_num )
408 | 	{
409 | 		imputations = lapply( vec_list, function(v) 
410 | 		{
411 | 			if(sum(v!=0)==1) {final_vec = matrix( v[v!=0], imputation_num, length(v) ); return(final_vec)}
412 | 			
413 | 			fit = fit_lnorm(v)
414 | 			set.seed(1)
415 | 			
416 | 			## THERE WILL BE ERROR IF IS.NA SDLOG
417 | 			if(!is.na(fit['sdlog'])) imputation_vec = matrix(rlnorm( sum(v==0)*imputation_num, fit['meanlog'], fit['sdlog'] ), nrow=imputation_num)
418 | 			if(is.na(fit['sdlog'])) stop("In function imputation_list, sdlog=NA encountered")
419 | 
420 | 			ori_vec =  t(replicate(imputation_num, v[v!=0]))
421 | 			final_vec = cbind( ori_vec, imputation_vec )
422 | 		} )
423 | 		return( imputations )
424 | 	}
425 | 
426 | 
427 | 	fit_lnorm <- function(vec)
428 | 	{
429 | 		vec = vec[vec!=0]
430 | 		fit = MASS::fitdistr(vec, 'lognormal')$estimate
431 | 		return(fit)
432 | 	}
433 | 	
434 | 	
435 | 	# LikelihoodRatioTest <- function(res_info, ncores, remove_zero=TRUE, A_already_corrected=FALSE, distr='lnorm', n_parameters=3, imputation_num=1E2)
436 | 	# {
437 | 	# 	require( doParallel )
438 | 	# 	pA_sym = res_info$pA_sym
439 | 	# 	if(A_already_corrected==TRUE) cpA_sym = pA_sym
440 | 	# 	if(A_already_corrected==FALSE) cpA_sym = correct_A_fast_divide_by_mean(pA_sym, remove_zero=remove_zero) ## corrected pA_sym
441 | 	# 	# registerDoParallel(cores=ncores)
442 | 	# 	trees = foreach( j =1:length( res_info$res_inner ) ) %do% 
443 | 	# 	{
444 | 	# 		name_index = rownames(res_info$res_inner[[j]]$A)
445 | 	# 		res_info$res_inner[[j]]$cA = cpA_sym[name_index, name_index] ## the corrected A. Use this matrix is essential for reducing the distance-based false positves
446 | 	# 		tmp = get_tree_decoration( res_info$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num )
447 | 	# 		tmp
448 | 	# 	}
449 | 	# 	return(trees)	
450 | 	# }
451 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | matrix_multiplication_cpp <- function(A, B) {
 5 |     .Call(`_CALDER_matrix_multiplication_cpp`, A, B)
 6 | }
 7 | 
 8 | matrix_multiplication_sym_cpp <- function(A) {
 9 |     .Call(`_CALDER_matrix_multiplication_sym_cpp`, A)
10 | }
11 | 
12 | loglik_lnorm_cpp <- function(sum_ln1, sum_ln2, p, q) {
13 |     .Call(`_CALDER_loglik_lnorm_cpp`, sum_ln1, sum_ln2, p, q)
14 | }
15 | 
16 | loglik_lnorm_cpp_vec <- function(vec_values) {
17 |     .Call(`_CALDER_loglik_lnorm_cpp_vec`, vec_values)
18 | }
19 | 
20 | get_A_len <- function(A) {
21 |     .Call(`_CALDER_get_A_len`, A)
22 | }
23 | 
24 | get_A_ln1 <- function(A) {
25 |     .Call(`_CALDER_get_A_ln1`, A)
26 | }
27 | 
28 | get_A_ln2 <- function(A) {
29 |     .Call(`_CALDER_get_A_ln2`, A)
30 | }
31 | 
32 | loglik_lnorm_cpp_mat <- function(sum_ln1, sum_ln2, ps, qs) {
33 |     .Call(`_CALDER_loglik_lnorm_cpp_mat`, sum_ln1, sum_ln2, ps, qs)
34 | }
35 | 
36 | zigzag_loglik_ancestors_v4_5 <- function(A, k, min_n_bins = 2L) {
37 |     .Call(`_CALDER_zigzag_loglik_ancestors_v4_5`, A, k, min_n_bins)
38 | }
39 | 
40 | compute_L <- function(A, L, k) {
41 |     .Call(`_CALDER_compute_L`, A, L, k)
42 | }
43 | 
44 | 


--------------------------------------------------------------------------------
/R/bisecting_kmeans.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 	
  3 |  	## k-means with replicatable seeds
  4 |  	my_kmeans = function(iter.max=1E3, nstart=50, ...)
  5 |  	{
  6 |  		set.seed(1)
  7 |  		res = kmeans(iter.max=iter.max, nstart=nstart, ...)
  8 |  		return(res)
  9 |  	}
 10 | 
 11 |  	## This function tries to adjust the height of each split, in order to generate a valid hclust object and with balanced compartments A.1 A.2 B.1 B.2
 12 |  	## Clusters with more nodes will get bigger height in case of same height
 13 |  	adjust_hs <- function(l_r_h)
 14 |  	{
 15 |  		hs = sapply(l_r_h, function(v) v$h)
 16 |  		all_names = sapply(l_r_h, function(v) paste0(collapse='_', sort(c(v$l, v$r))))
 17 |   		r_names = sapply(l_r_h, function(v) paste0(collapse='_', sort(c(v$r))))
 18 | 		
 19 | 	 	sizes = sapply(l_r_h, function(v) length(v$l) + length(v$r)) ##
 20 | 
 21 | 	 	################ This part deals with duplicated heights
 22 | 	 	hs = hs + sizes*1E-7
 23 | 	 	################ This part tries to make the top-level left and right branch to have similar height, such that to make balanced A.1, A.2, B.1, B.2 compartments
 24 | 	 	## Find the index of second branch, whose number of nodes is n_total - n_left: sizes[1] - sizes[2]
 25 | 	 	l_b = 2 ## left sub-branch
 26 | 	 	# r_b = which(sizes==(sizes[1] - sizes[2]))[1] ## right sub-branch
 27 | 	 	r_b = which(r_names[1]==all_names) ## right sub-branch
 28 | 
 29 | 	 	l_h = hs[l_b]
 30 | 	 	r_h = hs[r_b]
 31 | 	 	max_h = max(l_h, r_h) ## the maximum height of the two branches
 32 | 	 	hs_new = mean(sort(hs, decreasing=TRUE)[2:3]) ## hs_new is the 3rd largest height
 33 | 		hs[l_b] = ifelse(l_h > r_h, max_h, hs_new)
 34 | 		hs[r_b] = ifelse(r_h > l_h, max_h, hs_new)
 35 | 
 36 | 	 	if(any(duplicated(hs))) stop('ERROR: DUPLICATED HEIGHTS exist in bisecting_kmeans')
 37 | 	 	return( hs )
 38 |  	}
 39 | 
 40 |  	bisecting_kmeans <- function(data)
 41 |  	{
 42 |  		dist_mat = as.matrix(stats::dist(data))
 43 |  		indices = 1:nrow(data)
 44 |  		l_r_h <<- list()
 45 | 
 46 | 	 	get_h <- function(l_indices, r_indices)
 47 | 	 	{
 48 | 			combined_indices = c(l_indices, r_indices)
 49 | 			idx <- as.matrix(expand.grid(combined_indices, combined_indices))
 50 | 			max(dist_mat[idx]) ## diameter		
 51 | 	 	}
 52 | 
 53 | 	 	get_sub_tree <- function( indices )
 54 | 	 	{
 55 | 		 	n_nodes = length(indices)
 56 | 
 57 | 		 	if(n_nodes==1) ## if only two nodes
 58 | 		 	{
 59 | 		 		h = NULL
 60 | 		 		# tree = list(h=h, leaf=indices)
 61 | 		 		return()
 62 | 		 	}
 63 | 
 64 | 		 	############# if more than two nodes
 65 | 		 	if(n_nodes==2) cluster=c(1,2) else cluster = my_kmeans(x=data[indices, ], centers=2)$cluster
 66 | 		 	l_indices = indices[cluster==1]
 67 | 		 	r_indices = indices[cluster==2]
 68 | 		 	h = get_h(l_indices, r_indices)
 69 | 		 	l_r_h <<- c(l_r_h, list(list(l=l_indices, r=r_indices, h=h)))
 70 | 
 71 | 		 	# cat(h, '\n')
 72 | 			l_branch = get_sub_tree( l_indices )
 73 | 			r_branch = get_sub_tree( r_indices )
 74 | 		 	# tree = list(h=h, l_branch=l_branch, r_branch=r_branch, l_indices=l_indices, r_indices=r_indices)
 75 | 		 	# return(tree)
 76 | 	 	}
 77 | 
 78 | 	 	get_sub_tree(indices)
 79 | 
 80 | 	 	hs = adjust_hs(l_r_h)
 81 | 
 82 | 	 	r_hs = rank(hs)
 83 | 	 	for( i in 1:length(l_r_h) ) {name=r_hs[i]; names(name)=paste0(collapse='_', sort(c(l_r_h[[i]]$l, l_r_h[[i]]$r))); l_r_h[[i]]$name=name}
 84 | 	 	pos_names = sapply(l_r_h, function(v) v$name)
 85 | 	 	neg_names = -(1:length(indices)); names(neg_names) = 1:length(indices); all_names = c(pos_names, neg_names)
 86 | 	 	for( i in 1:length(l_r_h) ) {l_r_h[[i]]$l_name=unname(all_names[paste0(l_r_h[[i]]$l, collapse='_')]); l_r_h[[i]]$r_name=unname(all_names[paste0(l_r_h[[i]]$r, collapse='_')]) }
 87 | 		
 88 | 		merge_height = data.frame(l=sapply(l_r_h, function(v) v$l_name), r=sapply(l_r_h, function(v) v$r_name), h=hs)
 89 | 		merge_height = merge_height[order(merge_height$h), ]
 90 | 		rownames(merge_height) = NULL
 91 | 		
 92 | 		data_tmp = cbind(c(0,0,1,1), c(0,1,1,0))
 93 | 		hc = hclust(stats::dist(data_tmp), "com")
 94 | 		hc$merge = as.matrix(unname(merge_height[,1:2]))
 95 | 		hc$height = merge_height$h
 96 | 		# hc$order = unname(unlist(res, recursive=TRUE)[grepl('leaf', names(unlist(res, recursive=TRUE)))])
 97 | 		# hc$order = 1:length(indices)
 98 | 		hc$labels = 1:length(indices)
 99 | 		den <- as.dendrogram(hc)
100 | 		hc_r <- as.hclust(reorder(den, 1:length(indices)))
101 | 		hc_r$method = "complete"
102 | 		hc_r$dist.method = "euclidean"
103 | 		l_r_h <<- list()
104 | 		rm(l_r_h)
105 | 		return(hc_r)
106 |  	}
107 |  
108 | 


--------------------------------------------------------------------------------
/R/call_domains.R:
--------------------------------------------------------------------------------
  1 | 	## Yuanlong LIU
  2 | 	## Scripts used to call domains having high inter-domain score while low intra-domain score. Adapted from TopDom with the following authorship
  3 | 
  4 | 	# @author : Hanjun Shin(shanjun@usc.edu)
  5 | 	# @credit : Harris Lazaris(Ph.D Stduent, NYU), Dr. Gangqing Hu(Staff Scientist, NIH)
  6 | 	# @brief : TopDom.R is a software package to identify topological domains for given Hi-C contact matrix. 
  7 | 	# @version 0.0.2
  8 | 
  9 | 	# @fn TopDom
 10 | 	# @param matrixFile : string, matrixFile Address,
 11 | 	# - Format = {chromosome, bin start, bin end, N numbers normalized value }
 12 | 	# - N * (N + 3), where N is the number of bins
 13 | 	# @param window.size :integer, number of bins to extend.
 14 | 	# @param out_binSignal : string, binSignal file address to write
 15 | 	# @param out_ext : string, ext file address to write
 16 | 
 17 | 
 18 | 
 19 | 	remove_small_domains = function(domain_size_min, binSignal, domains) ## remove all domains smaller than 40E3, ADDED by Yuanlong, 2019-July-09
 20 | 	{
 21 | 		# binSignal = res_input_mat$binSignal
 22 | 		# domains = domains_bp
 23 | 		# domains = domains[, c('from.id', 'to.id', 'size')]
 24 | 		while(1)
 25 | 		{
 26 | 			i = min(setdiff(which(domains$size < domain_size_min/10E3), 1)) ## do not include the first one
 27 | 			if(i==Inf) break
 28 | 			ids = domains[(i-1):i, 'to.id']
 29 | 			index2rm = ((i-1):i)[which.max(binSignal[ids, 'mean.cf'])]
 30 | 			domains = domains[-index2rm, ]
 31 | 			domains[-1, 'from.id'] = domains[-nrow(domains), 'to.id'] + 1
 32 | 			domains$size = domains[, 'to.id'] - domains[, 'from.id'] + 1
 33 | 			cat(nrow(domains), '\n')
 34 | 			if(nrow(domains) < 10) break ## avoid exceptions
 35 | 		}
 36 | 		# domains[, 'chr'] = domains_bp[1, 'chr']
 37 | 		domains[, 'from.coord'] = domains[, 'from.id'] - 1
 38 | 		domains[, 'to.coord'] = domains[, 'to.id']
 39 | 		rownames(domains) = 1:nrow(domains)
 40 | 		# print(head(domains))
 41 | 		return(domains)
 42 | 	}
 43 | 
 44 | 	## modified input file format and added several potentially useful parameter options, Yuanlong LIU
 45 | 	TopDom_v2 <- function( A_extended, window.size, outFile=NULL, statFilter=T, p_thresh=0.05, return_peak_and_binsignal=FALSE, window.sizes=NULL, stat_window_size=NULL, domain_size_min=NULL)
 46 | 	{
 47 | 	  if(!is.null(window.sizes)) window.size = window.sizes[1]
 48 | 
 49 | 	  # print("#########################################################################")
 50 | 	  # print("Step 0 : File Read ")
 51 | 	  # print("#########################################################################")
 52 | 	  window.size = as.numeric(window.size)
 53 | 	  matdf = A_extended
 54 | 	  
 55 | 	  if( ncol(matdf) - nrow(matdf) == 3) {
 56 | 		colnames(matdf) <- c("chr", "from.coord", "to.coord")
 57 | 	  } else if( ncol(matdf) - nrow(matdf) ==4 ) {
 58 | 		colnames(matdf) <- c("id", "chr", "from.coord", "to.coord")
 59 | 	  } else {
 60 | 		print("Unknwon Type of matrix file")
 61 | 		return(0)
 62 | 	  }
 63 | 	  n_bins = 1*nrow(matdf)
 64 | 	  mean.cf <- rep(0, n_bins)
 65 | 	  pvalue <- rep(1, n_bins)
 66 | 	  
 67 | 	  local.ext = rep(-0.5, n_bins)
 68 | 	  
 69 | 	  bins <- data.frame(id=1:n_bins, 
 70 | 						 chr=matdf[, "chr"], 
 71 | 						 from.coord=matdf[, "from.coord"], 
 72 | 						 to.coord=matdf[, "to.coord"] )
 73 | 	  
 74 | 	  matrix.data <- as.matrix( matdf[, (ncol(matdf) - nrow(matdf)+1 ):ncol(matdf)] )
 75 | 	  
 76 | 
 77 | 	  ptm <- proc.time()
 78 | 	  
 79 | 	  ## Only compute for one track of signal
 80 | 	  if(is.null(window.sizes)){
 81 | 	  	  for(i in 1:n_bins)
 82 | 		  {
 83 | 			diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size)
 84 | 			mean.cf[i] = mean(diamond)
 85 | 		  }
 86 | 	  }
 87 | 
 88 | 
 89 | 	  if(!is.null(window.sizes)) ## compute multiple tracks of signal
 90 | 	  {
 91 | 	  	 mean.cfs = matrix(, n_bins, length(window.sizes))
 92 | 	  	 for(k in 1:length(window.sizes) )
 93 | 	  	 {
 94 | 	  	 	window.size.iter = window.sizes[k]
 95 | 	  	    for(i in 1:n_bins)
 96 | 			  {
 97 | 				diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size.iter)
 98 | 				mean.cfs[i, k] = mean(diamond)
 99 | 			  }  
100 | 	  	 }
101 | 	  	 mean.cf = apply(mean.cfs, 1, mean)
102 | 	  }
103 | 
104 | 	  
105 | 	  eltm = proc.time() - ptm
106 | 	  # print(paste("Step 1 Running Time : ", eltm[3]))
107 | 	  # print("Step 1 : Done !!")
108 | 	  
109 | 	  # print("#########################################################################")
110 | 	  # print("Step 2 : Detect TD boundaries based on binSignals")
111 | 	  # print("#########################################################################")
112 | 	  
113 | 	  ptm = proc.time()
114 | 	  #gap.idx = Which.Gap.Region(matrix.data=matrix.data)
115 | 	  #gap.idx = Which.Gap.Region2(mean.cf)
116 | 	  gap.idx = Which.Gap.Region2(matrix.data=matrix.data, window.size)
117 | 	  
118 | 	  proc.regions = Which.process.region(rmv.idx=gap.idx, n_bins=n_bins, min.size=3)
119 | 	  
120 | 	  #print(proc.regions)
121 | 	  
122 | 	  for( i in 1:nrow(proc.regions))
123 | 	  {
124 | 		start = proc.regions[i, "start"]
125 | 		end = proc.regions[i, "end"]
126 | 		
127 | 		# print(paste("Process Regions from ", start, "to", end))
128 | 		  
129 | 		local.ext[start:end] = Detect.Local.Extreme(x=mean.cf[start:end])
130 | 	  }
131 | 	  
132 | 	  local.ext_bp = local.ext
133 | 	  
134 | 	  eltm = proc.time() - ptm
135 | 	  # print(paste("Step 2 Running Time : ", eltm[3]))
136 | 	  # print("Step 2 : Done !!")
137 | 	  
138 | 	  if(statFilter)
139 | 	  {
140 | 		# print("#########################################################################")
141 | 		# print("Step 3 : Statistical Filtering of false positive TD boundaries")
142 | 		# print("#########################################################################")
143 | 	  	
144 | 	  	if(is.null(stat_window_size)) stat_window_size=window.size
145 | 		ptm = proc.time()  
146 | 		# print("-- Matrix Scaling....")
147 | 		scale.matrix.data = matrix.data
148 | 
149 | 		## This is to normalize each off diag values -- 2018-09-20
150 | 		# for( i in 1:(2*window.size) )
151 | 		for( i in 1:(2*stat_window_size) )
152 | 		{
153 | 		  #diag(scale.matrix.data[, i:n_bins] ) = scale( diag( matrix.data[, i:n_bins] ) )
154 | 		  scale.matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] = scale( matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] )
155 | 		}
156 | 		
157 | 		# print("-- Compute p-values by Wilcox Ranksum Test")
158 | 		for( i in 1:nrow(proc.regions))
159 | 		{
160 | 		  start = proc.regions[i, "start"]
161 | 		  end = proc.regions[i, "end"]
162 | 		  
163 | 		  # print(paste("Process Regions from ", start, "to", end))
164 | 		  # pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=window.size, scale=1)
165 | 		  
166 | 		  pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=stat_window_size, scale=1)
167 | 		}
168 | 		# print("-- Done!")
169 | 		
170 | 		# print("-- Filtering False Positives")
171 | 		local.ext[intersect( union(which( local.ext==-1), which(local.ext==-1)), which(pvalue<p_thresh))] = -2
172 | 		local.ext[which(local.ext==-1)] = 0
173 | 		local.ext[which(local.ext==-2)] = -1
174 | 		# print("-- Done!")
175 | 		
176 | 		eltm = proc.time() - ptm
177 | 		# print(paste("Step 3 Running Time : ", eltm[3]))
178 | 		# print("Step 3 : Done!")
179 | 	  } else pvalue = 0
180 | 
181 | 	  domains = Convert.Bin.To.Domain.TMP(bins=bins, 
182 | 									  signal.idx=which(local.ext==-1), 
183 | 									  gap.idx=which(local.ext==-0.5), 
184 | 									  pvalues=pvalue, 
185 | 									  pvalue.cut=p_thresh)
186 | 
187 | 	  domains = subset(domains, size!=0) ## ADDED 06-April-2019, Yuanlong
188 | 	  # print(head(domains))
189 | 	  # stop('HELLO')
190 | 	  if( sum(domains$size)!=nrow(A_extended) ) stop('In TopDom, some rows are missing')
191 | 
192 | 
193 | 	  binSignal = cbind(bins, 
194 | 				   local.ext = local.ext,
195 | 				   mean.cf = mean.cf, 
196 | 				   pvalue = pvalue)
197 | 	  bins = binSignal
198 | 
199 | 	  if(!is.null(domain_size_min))
200 | 	  {
201 | 	  	domains = remove_small_domains(domain_size_min=domain_size_min, binSignal=binSignal, domains=domains)
202 | 	  }
203 | 	  
204 | 	  
205 | 	  bedform = domains[, c("chr", "from.coord", "to.coord", "tag")]
206 | 	  colnames(bedform) = c("chrom", "chromStart", "chromEnd", "name")
207 | 	  
208 | 	  if( !is.null(outFile) ) {
209 | 		# print("#########################################################################")
210 | 		# print("Writing Files")
211 | 		# print("#########################################################################")
212 | 		
213 | 		outBinSignal =  paste(outFile, ".binSignal", sep="")
214 | 		# print(paste("binSignal File :", outBinSignal) )
215 | 		write.table(bins, file=outBinSignal, quote=F, row.names=F, col.names=T, sep="\t")  
216 | 	  
217 | 	  
218 | 		outDomain = paste(outFile, ".domain", sep="")
219 | 		# print(paste("Domain File :", outDomain) )
220 | 		write.table( domains, file=outDomain, quote=F, row.names=F, col.names=T, sep="\t")
221 | 		
222 | 		outBed = paste(outFile, ".bed", sep="")
223 | 		# print(paste("Bed File : ", outBed))
224 | 		write.table( bedform, file=outBed, quote=F, row.names=F, col.names=F, sep="\t")
225 | 	  }
226 | 	  
227 | 	  # print("Done!!")
228 | 	  
229 | 	  # print("Chromatin domains generated !")
230 | 	  if(return_peak_and_binsignal==TRUE) return( list( local_ext=local.ext_bp, mean_cf=mean.cf, domains=domains ) )
231 | 	  return(list(binSignal=bins, domain=domains, bed=bedform))
232 | 	}
233 | 
234 | 	# @fn Get.Diamond.Matrix
235 | 	# @param mat.data : N by N matrix, where each element indicate contact frequency
236 | 	# @param i :integer, bin index
237 | 	# @param size : integer, window size to expand from bin
238 | 	# @retrun : matrix.
239 | 	Get.Diamond.Matrix <- function(mat.data, i, size)
240 | 	{
241 | 	  n_bins = nrow( mat.data )
242 | 	  if(i==n_bins) return(NA)
243 | 	  
244 | 	  lowerbound = max( 1, i-size+1 )
245 | 	  upperbound = min( i+size, n_bins)
246 | 	  
247 | 	  return( mat.data[lowerbound:i, (i+1):upperbound] )
248 | 	}
249 | 
250 | 	# @fn Which.process.region
251 | 	# @param rmv.idx : vector of idx, remove index vector
252 | 	# @param n_bins : total number of bins
253 | 	# @param min.size : minimum size of bins
254 | 	# @retrun : data.frame of proc.regions
255 | 	Which.process.region <- function(rmv.idx, n_bins, min.size=3)
256 | 	{
257 | 	  gap.idx = rmv.idx
258 | 	  
259 | 	  proc.regions = data.frame(start=numeric(0), end=numeric(0))
260 | 	  proc.set = setdiff(1:n_bins, gap.idx)
261 | 	  n_proc.set = length(proc.set)
262 | 	  
263 | 	  i=1
264 | 	  while(i < n_proc.set )
265 | 	  {
266 | 		start = proc.set[i]
267 | 		j = i+1
268 | 		
269 | 		while(j <= n_proc.set)
270 | 		{
271 | 		  if( proc.set[j] - proc.set[j-1] <= 1) j = j + 1
272 | 		  else {
273 | 			proc.regions = rbind(proc.regions, c(start=start, end=proc.set[j-1]) )
274 | 			i = j
275 | 			break
276 | 		  }
277 | 		}
278 | 		
279 | 		if(j >= n_proc.set ) {
280 | 		  proc.regions = rbind(proc.regions, c(start=start, end=proc.set[j-1]) )
281 | 		  break
282 | 		}
283 | 	  }
284 | 	  
285 | 	  colnames(proc.regions) = c("start", "end")
286 | 	  proc.regions <- proc.regions[ which( abs(proc.regions[,"end"] - proc.regions[, "start"]) >= min.size ), ]
287 | 	  
288 | 	  return(proc.regions)
289 | 	}
290 | 
291 | 	# @fn Which.Gap.Region
292 | 	# @breif version 0.0.1 used
293 | 	# @param matrix.data : n by n matrix
294 | 	# @return gap index
295 | 	Which.Gap.Region <- function(matrix.data)
296 | 	{
297 | 	  n_bins = nrow(matrix.data)
298 | 	  gap = rep(0, n_bins)
299 | 	  
300 | 	  i=1
301 | 	  while(i < n_bins)
302 | 	  {
303 | 		j = i + 1
304 | 		while( j <= n_bins)
305 | 		{
306 | 		  if( sum( matrix.data[i:j, i:j]) == 0 ) {
307 | 			gap[i:j] = -0.5
308 | 			j = j+1  
309 | 			#if(j-i > 1) gap[i:j]=-0.5
310 | 			#j=j+1
311 | 		  } else break
312 | 		}
313 | 		
314 | 		i = j  
315 | 	  }
316 | 	  
317 | 	  idx = which(gap == -0.5)
318 | 	  return(idx)
319 | 	}
320 | 
321 | 	# @fn Which.Gap.Region3
322 | 	# @param matrix.data : n by n matrix
323 | 	# @return gap index
324 | 	Which.Gap.Region3 <- function(mean.cf)
325 | 	{
326 | 	  n_bins = length(mean.cf)
327 | 	  gapidx = which(mean.cf==0)
328 | 	  
329 | 	  return(gapidx)
330 | 	}
331 | 
332 | 	# @fn Which.Gap.Region2
333 | 	# @breif version 0.0.2 used
334 | 	# @param matrix.data : n by n matrix
335 | 	# @return gap index
336 | 	Which.Gap.Region2 <- function(matrix.data, w)
337 | 	{
338 | 	  n_bins = nrow(matrix.data)
339 | 	  gap = rep(0, n_bins)
340 | 	  
341 | 	  for(i in 1:n_bins)
342 | 	  {
343 | 		if( sum( matrix.data[i, max(1, i-w):min(i+w, n_bins)] ) == 0 ) gap[i]=-0.5
344 | 	  }
345 | 	  
346 | 	  idx = which(gap == -0.5)
347 | 	  return(idx)
348 | 	}
349 | 
350 | 	# @fn Detect.Local.Extreme
351 | 	# @param x : original signal to find local minima
352 | 	# @return vector of local extrme, -1 if the index is local minimum, 1 if the index is local maxima, 0 otherwise.
353 | 	Detect.Local.Extreme <- function(x)
354 | 	{
355 | 	  n_bins = length(x)
356 | 	  ret = rep(0, n_bins)
357 | 	  x[is.na(x)]=0
358 | 	  
359 | 	  if(n_bins <= 3)
360 | 	  {
361 | 		ret[which.min(x)]=-1
362 | 		ret[which.max(x)]=1
363 | 		
364 | 		return(ret)
365 | 	  }
366 | 	  # Norm##################################################3
367 | 	  new.point = Data.Norm(x=1:n_bins, y=x)
368 | 	  x=new.point$y
369 | 	  ##################################################
370 | 	  cp = Change.Point(x=1:n_bins, y=x)
371 | 	  
372 | 	  if( length(cp$cp) <= 2 ) return(ret)
373 | 	  if( length(cp$cp) == n_bins) return(ret)
374 | 	  for(i in 2:(length(cp$cp)-1))
375 | 	  {
376 | 		if( x[cp$cp[i]] >= x[cp$cp[i]-1] && x[cp$cp[i]] >= x[cp$cp[i]+1] ) ret[cp$cp[i]] = 1
377 | 		else if(x[cp$cp[i]] < x[cp$cp[i]-1] && x[cp$cp[i]] < x[cp$cp[i]+1]) ret[cp$cp[i]] = -1
378 | 		
379 | 		min.val = min( x[ cp$cp[i-1] ], x[ cp$cp[i] ] )
380 | 		max.val = max( x[ cp$cp[i-1] ], x[ cp$cp[i] ] )
381 | 		
382 | 		if( min( x[cp$cp[i-1]:cp$cp[i]] ) < min.val ) ret[ cp$cp[i-1] - 1 + which.min( x[cp$cp[i-1]:cp$cp[i]] ) ] = -1
383 | 		if( max( x[cp$cp[i-1]:cp$cp[i]] ) > max.val ) ret[ cp$cp[i-1] - 1 + which.max( x[cp$cp[i-1]:cp$cp[i]] ) ] = 1
384 | 	  }
385 | 
386 | 	  return(ret)
387 | 	}
388 | 
389 | 	# @fn Data.Norm
390 | 	# @param x : x axis vector
391 | 	# @param x : y axis vector
392 | 	# @return list of normalized x and y
393 | 	Data.Norm <- function(x, y)
394 | 	{
395 | 	  ret.x = rep(0, length(x))
396 | 	  ret.y = rep(0, length(y))
397 | 	  
398 | 	  ret.x[1] = x[1]
399 | 	  ret.y[1] = y[1]
400 | 	  
401 | 	  diff.x = diff(x)
402 | 	  diff.y = diff(y)
403 | 	  
404 | 	  scale.x = 1 / mean( abs(diff(x) ) )
405 | 	  scale.y = 1 / mean( abs( diff(y) ) )
406 | 	  
407 | 	  #print(scale.x)
408 | 	  #print(scale.y)
409 | 	  
410 | 	  for(i in 2:length(x))
411 | 	  {
412 | 		ret.x[i] = ret.x[i-1] + (diff.x[i-1]*scale.x)
413 | 		ret.y[i] = ret.y[i-1] + (diff.y[i-1]*scale.y)
414 | 	  }
415 | 	  
416 | 	  return(list(x=ret.x, y=ret.y))
417 | 	}
418 | 
419 | 	# @fn Change.Point
420 | 	# @param x : x axis vector
421 | 	# @param x : y axis vector
422 | 	# @return change point index in x vector,
423 | 	# Note that the first and the last point will be always change point
424 | 	Change.Point <- function( x, y )
425 | 	{
426 | 	  if( length(x) != length(y)) 
427 | 	  {
428 | 		print("ERROR : The length of x and y should be the same")
429 | 		return(0)
430 | 	  }
431 | 	  
432 | 	  n_bins <- length(x)
433 | 	  Fv <- rep(NA, n_bins)
434 | 	  Ev <- rep(NA, n_bins)
435 | 	  cp <- 1
436 | 	  
437 | 	  i=1
438 | 	  Fv[1]=0
439 | 	  while( i < n_bins )
440 | 	  {
441 | 		j=i+1
442 | 		Fv[j] = sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 )
443 | 		
444 | 		while(j<n_bins)
445 | 		{
446 | 		  j=j+1
447 | 		  k=(i+1):(j-1)
448 | 		  Ev[j] = ( sum( abs( (y[j]-y[i] )*x[k] - (x[j] -x[i])*y[k] - (x[i]*y[j]) + (x[j]*y[i]) ) ) / sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 ) )
449 | 		  Fv[j] = sqrt( (x[j]-x[i])^2 + (y[j] - y[i])^2 ) - ( sum( abs( (y[j]-y[i] )*x[k] - (x[j] -x[i])*y[k] - (x[i]*y[j]) + (x[j]*y[i]) ) ) / sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 ) )
450 | 		  
451 | 		  #################################################
452 | 		  #Not Original Code
453 | 		  if( is.na(Fv[j]) || is.na(Fv[j-1]) ) {
454 | 			j = j-1
455 | 			cp <- c(cp, j)
456 | 			break
457 | 		  }
458 | 		  ####################################################3
459 | 		  if(Fv[j] < Fv[j-1] ) {
460 | 			j = j - 1
461 | 			cp <- c(cp, j )
462 | 			break
463 | 		  }
464 | 		}
465 | 		i=j
466 | 	  }
467 | 	  
468 | 	  cp <- c(cp, n_bins)
469 | 	  
470 | 	  return(list(cp=cp, objF=Fv, errF=Ev))
471 | 	}
472 | 
473 | 	# @fn Get.Pvalue
474 | 	# @param matrix.data : matrix
475 | 	# @param size : size to extend
476 | 	# @param scale : scale parameter if necessary. deprecated parameter
477 | 	# @return computed p-value vector
478 | 	Get.Pvalue <- function( matrix.data, size, scale=1 )
479 | 	{
480 | 	  n_bins = nrow(matrix.data)
481 | 	  pvalue <- rep(1, n_bins)
482 | 	  
483 | 	  for( i in 1:(n_bins-1) )
484 | 	  {
485 | 		dia = as.vector( Get.Diamond.Matrix2(matrix.data, i, size=size) )
486 | 		ups = as.vector( Get.Upstream.Triangle(matrix.data, i, size=size) )
487 | 		downs = as.vector( Get.Downstream.Triangle(matrix.data, i, size=size) )
488 | 		
489 | 		wil.test =  wilcox.test(x=dia*scale, y=c(ups, downs), alternative="less", exact=F)
490 | 		pvalue[i] = wil.test$p.value  
491 | 		
492 | 		#print(paste(i, "=", wil.test$p.value) )  
493 | 	  }
494 | 	  
495 | 	  pvalue[ is.na(pvalue) ] = 1
496 | 	  return(pvalue)
497 | 	}
498 | 
499 | 	# @fn Get.Upstream.Triangle
500 | 	# @param mat.data : matrix data
501 | 	# @param i : bin index
502 | 	# @param size : size of window to extend
503 | 	# @return upstream triangle matrix
504 | 	Get.Upstream.Triangle <- function(mat.data, i, size)
505 | 	{
506 | 	  n_bins = nrow(mat.data)
507 | 	  
508 | 	  lower = max(1, i-size)
509 | 	  tmp.mat = mat.data[lower:i, lower:i]
510 | 	  return( tmp.mat[ upper.tri( tmp.mat, diag=F ) ] )
511 | 	}
512 | 
513 | 	# @fn Get.Downstream.Triangle
514 | 	# @param mat.data : matrix data
515 | 	# @param i : bin index
516 | 	# @param size : size of window to extend
517 | 	# @return downstream triangle matrix
518 | 	Get.Downstream.Triangle <- function(mat.data, i, size)
519 | 	{
520 | 	  n_bins = nrow(mat.data)
521 | 	  if(i==n_bins) return(NA)
522 | 	  
523 | 	  upperbound = min(i+size, n_bins)
524 | 	  tmp.mat = mat.data[(i+1):upperbound, (i+1):upperbound]
525 | 	  return( tmp.mat[ upper.tri( tmp.mat, diag=F ) ] )
526 | 	}
527 | 
528 | 	# @fn Get.Diamond.Matrix2
529 | 	# @param mat.data : matrix data
530 | 	# @param i : bin index
531 | 	# @param size : size of window to extend
532 | 	# @return diamond matrix
533 | 	Get.Diamond.Matrix2 <- function(mat.data, i, size)
534 | 	{
535 | 	  n_bins = nrow(mat.data)
536 | 	  new.mat = matrix(rep(NA, size*size), nrow=size, ncol=size)
537 | 	  
538 | 	  for(k in 1:size)
539 | 	  {
540 | 		if(i-(k-1) >= 1 && i < n_bins)
541 | 		{
542 | 		  lower = min(i+1, n_bins)
543 | 		  upper = min(i+size, n_bins)
544 | 		  
545 | 		  new.mat[size-(k-1), 1:(upper-lower+1)] = mat.data[i-(k-1), lower:upper]
546 | 		}
547 | 	  }
548 | 	  
549 | 	  return(new.mat)
550 | 	}
551 | 	  
552 | 	# @fn Convert.Bin.To.Domain
553 | 	# @param bins : bin information
554 | 	# @param signal.idx : signal index
555 | 	# @param signal.idx : gap index
556 | 	# @param pvalues : pvalue vector
557 | 	# @param pvalue.cut : pvalue threshold
558 | 	# @return dataframe storing domain information
559 | 	Convert.Bin.To.Domain <- function(bins, signal.idx, gap.idx, pvalues=NULL, pvalue.cut=NULL)
560 | 	{
561 | 	  n_bins = nrow(bins)
562 | 	  ret = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
563 | 	  levels( x=ret[, "tag"] ) = c("domain", "gap", "boundary")
564 | 	  
565 | 	  rmv.idx = setdiff(1:n_bins, gap.idx)
566 | 	  proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
567 | 	  from.coord = bins[proc.region[, "start"], "from.coord"]
568 | 	  n_procs = nrow(proc.region)
569 | 	  gap = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("gap", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
570 | 	  
571 | 	  rmv.idx = union(signal.idx, gap.idx)
572 | 	  proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
573 | 	  n_procs = nrow(proc.region)
574 | 	  from.coord = bins[proc.region[, "start"], "from.coord"]
575 | 	  domain = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("domain", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
576 | 	  
577 | 	  rmv.idx = setdiff(1:n_bins, signal.idx)
578 | 	  proc.region = as.data.frame( Which.process.region(rmv.idx, n_bins, min.size=1) )
579 | 	  n_procs = nrow(proc.region)
580 | 	  if(n_procs>0)
581 | 	  {
582 | 		from.coord = bins[proc.region[, "start"]+1, "from.coord"]  
583 | 		boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
584 | 		ret = rbind(ret, boundary)
585 | 	  }
586 | 	  
587 | 	  ret = rbind(gap, domain)
588 | 	  ret = ret[order(ret[,3]), ]
589 | 	  
590 | 	  ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"])
591 | 	  ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] )
592 | 	  ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"])
593 | 	  ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"] ## HERE THE SIZE IS COMPUTED / Yuanlong LIU
594 | 	  
595 | 	  if(!is.null(pvalues) && !is.null(pvalue.cut))
596 | 	  {
597 | 		for(i in 1:nrow(ret))
598 | 		{
599 | 		  if(ret[i, "tag"]=="domain")
600 | 		  {
601 | 			domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"]
602 | 			p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut )
603 | 			
604 | 			if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary"
605 | 		  }
606 | 		}
607 | 	  }
608 | 	  
609 | 	  new.bdr.set = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
610 | 	  stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
611 | 	  
612 | 	  i=1
613 | 	  while(i <= nrow(ret))
614 | 	  {
615 | 		if( ret[i, "tag"] == "boundary" )
616 | 		{
617 | 		  stack.bdr = rbind(stack.bdr, ret[i, ])
618 | 		} else if(nrow(stack.bdr)>0) {
619 | 		  new.bdr = data.frame(chr=bins[1, "chr"], 
620 | 							   from.id = min( stack.bdr[, "from.id"]), 
621 | 							   from.coord=min(stack.bdr[, "from.coord"]),
622 | 							   to.id = max( stack.bdr[, "to.id"]),
623 | 							   to.coord=max(stack.bdr[, "to.coord"]),
624 | 							   tag="boundary",
625 | 							   size=max(stack.bdr[, "to.coord"]) - min(stack.bdr[, "from.coord"]))
626 | 		  new.bdr.set = rbind(new.bdr.set, new.bdr)
627 | 		  stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
628 | 		}
629 | 		
630 | 		i = i + 1
631 | 	  }
632 | 	  
633 | 	  ret = rbind( ret[ ret[, "tag"]!="boundary", ], new.bdr.set )
634 | 	  ret = ret[order(ret[, "to.coord"]), ]
635 | 	  
636 | 	  return(ret)
637 | 	}
638 | 
639 | 
640 | 	# @fn Convert.Bin.To.Domain
641 | 	# @param bins : bin information
642 | 	# @param signal.idx : signal index
643 | 	# @param signal.idx : gap index
644 | 	# @param pvalues : pvalue vector
645 | 	# @param pvalue.cut : pvalue threshold
646 | 	# @return dataframe storing domain information
647 | 	Convert.Bin.To.Domain.TMP <- function(bins, signal.idx, gap.idx, pvalues=NULL, pvalue.cut=NULL)
648 | 	{
649 | 	  n_bins = nrow(bins)
650 | 	  ret = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
651 | 	  levels( x=ret[, "tag"] ) = c("domain", "gap", "boundary")
652 | 	  
653 | 	  rmv.idx = setdiff(1:n_bins, gap.idx)
654 | 	  proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
655 | 	  from.coord = bins[proc.region[, "start"], "from.coord"]
656 | 	  n_procs = nrow(proc.region)
657 | 	  gap = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("gap", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
658 | 	  
659 | 	  rmv.idx = union(signal.idx, gap.idx)
660 | 	  proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
661 | 	  n_procs = nrow(proc.region)
662 | 	  from.coord = bins[proc.region[, "start"], "from.coord"]
663 | 	  domain = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("domain", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
664 | 	  
665 | 	  rmv.idx = setdiff(1:n_bins, signal.idx)
666 | 	  proc.region = as.data.frame( Which.process.region(rmv.idx, n_bins, min.size=1) )
667 | 	  n_procs = nrow(proc.region)
668 | 	  if(n_procs>0)
669 | 	  {
670 | 		from.coord = bins[proc.region[, "start"]+1, "from.coord"]  
671 | 		boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
672 | 		ret = rbind(ret, boundary)
673 | 	  }
674 | 	  
675 | 	  ret = rbind(gap, domain)
676 | 	  ret = ret[order(ret[,3]), ]
677 | 	  
678 | 	  ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"])
679 | 	  ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] )
680 | 	  ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"])
681 | 	  ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"]
682 | 	  
683 | 	  if(!is.null(pvalues) && !is.null(pvalue.cut))
684 | 	  {
685 | 		for(i in 1:nrow(ret))
686 | 		{
687 | 		  if(ret[i, "tag"]=="domain")
688 | 		  {
689 | 			domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"]
690 | 			p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut )
691 | 			
692 | 			if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary"
693 | 		  }
694 | 		}
695 | 	  }
696 | 	  
697 | 	  return(ret)
698 | 	}


--------------------------------------------------------------------------------
/R/compartment_PCA.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 	get_PCs = function( mat, which ) ## fast way to compute PCs
 3 | 	{
 4 | 		PC_mat = crossprod(mat)
 5 | 		res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=max(which), which = "LM" )
 6 | 		if(any(res_eigs_sym$values <0)) stop('Non-positive eigenvalues for A^2')
 7 |  		PCs = mat%*%(res_eigs_sym$vectors)
 8 | 		return( PCs[, which] )
 9 | 	}
10 | 
11 | 	
12 | 	PC_compartment <- function(A_oe) ## compute PC and define A/B compartment. Not used currently
13 | 	{
14 | 		cA_oe = fast_cor(A_oe)
15 | 		PC_mat = crossprod(cA_oe)
16 | 		res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=2, which = "LM" )
17 | 		PC1 = cA_oe%*%res_eigs_sym$vectors[,1]
18 | 		PC2 = cA_oe%*%res_eigs_sym$vectors[,2]
19 | 
20 | 		borders = which(diff(1*(PC1 > 0))!=0)
21 | 		to_id = as.numeric(rownames(A_oe)[borders])
22 | 		from_id = as.numeric(rownames(A_oe)[c(1, head(borders, length(borders)-1)+1)])
23 | 			
24 | 		start_poses = (from_id-1)*bin_size + 1
25 | 		end_poses = to_id*bin_size
26 | 
27 | 		compartment_AB = data.frame(chr=paste0('chr', chr), start_poses=start_poses, end_poses=end_poses, A_or_B=NA, zero=0, dot='.', start_poses_2=start_poses, end_poses_2=end_poses, col=NA)
28 | 		compartment_AB[which((1:nrow(compartment_AB))%%2==0), 'A_or_B'] = 'A'
29 | 		compartment_AB[which((1:nrow(compartment_AB))%%2==1), 'A_or_B'] = 'B'
30 | 			
31 | 		compartment_AB[which((1:nrow(compartment_AB))%%2==0), 'col'] = '112,128,144'
32 | 		compartment_AB[which((1:nrow(compartment_AB))%%2==1), 'col'] = '255,255,0'
33 | 
34 | 		compartments_bed_files = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB_2', '.bed')
35 | 		write.table( compartment_AB, file=compartments_bed_files, quote=FALSE, row.names=FALSE, col.names=FALSE, sep=' ' )
36 | 	}
37 | 
38 | 	PC_compartment_slim <- function(A_oe, downsratio=NULL) ## compute and save the PC values only. Not used currently
39 | 	{
40 | 		cA_oe = fast_cor(A_oe)
41 | 		PC_mat = crossprod(cA_oe)
42 | 		res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=2, which = "LM" )
43 | 		PC1 = cA_oe%*%res_eigs_sym$vectors[,1]
44 | 		PC2 = cA_oe%*%res_eigs_sym$vectors[,2]
45 | 
46 | 		compartment_AB = data.frame(PC1=PC1, bin_names=rownames(A_oe))
47 | 
48 | 		if( is.null(downsratio) ) compartments_AB_file = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB.Rdata')
49 | 		if( !is.null(downsratio) ) compartments_AB_file = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB_downsratio_', downsratio, '.Rdata')
50 | 		
51 | 		save( compartment_AB, file=compartments_AB_file )
52 | 	}
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/R/compartment_data_generation_fun.R:
--------------------------------------------------------------------------------
 1 | 	
 2 | 	compress_mat_fast_tmp = function(input_mat, compress_size)
 3 | 	  {
 4 | 	         mat_block <- function(n, r) suppressWarnings( matrix(c(rep(1, r), rep(0, n)), n, n/r) )
 5 | 	                n = ncol(input_mat)
 6 | 	                mat2prod = mat_block(n, compress_size)
 7 | 	                # return(t(input_mat%*%mat2prod / compress_size))
 8 | 	                return(t(matrix_multiplication_cpp(input_mat, mat2prod)))
 9 | 	        }
10 | 
11 | 	mat_10to40kb = function(mat2compress, bin_size, bin_size_initial) ## 10kb to 40 kb
12 | 	{
13 | 		compress_size = bin_size / bin_size_initial
14 | 		len = nrow(mat2compress) - nrow(mat2compress)%%compress_size
15 | 		mat2compress = mat2compress[1:len, 1:len]
16 | 		c_mat_compressed = compress_mat_fast_tmp( as.matrix(mat2compress), compress_size=compress_size )
17 | 		mat_compressed = compress_mat_fast_tmp( c_mat_compressed, compress_size=compress_size )
18 | 		rownames(mat_compressed) = colnames(mat_compressed) = as.character( 1:nrow(mat_compressed) )
19 | 		return(mat_compressed)
20 | 	}
21 | 
22 | 	# if(!file.exists(contact_mat_file)) contact_mat_file = paste0('/mnt/ndata/Yuanlong/2.Results/1.Juicer/', CELL_LINE, '/contact_mat/mat_chr', chr, '_', bin_size_initial_kb, 'kb_ob.txt.gz')
23 | 
24 | 	## bin_size_initial is the binsize of your input matrix, can be different from the bin_size of your planned analysis
25 | 	contact_mat_processing = function(contact_mat_file, bin_size, bin_size_initial=bin_size)
26 | 	{	
27 | 		
28 | 		compress_size = ifelse(bin_size < 40E3, 1, 1)
29 | 		zero_ratio = 0.01
30 | 
31 | 		combined_xk_oe_raw = data.table::fread(contact_mat_file)
32 | 
33 | 		## this code generates the compartment domains
34 | 
35 | 		colnames(combined_xk_oe_raw) = c('pos_1', 'pos_2', 'val')
36 | 		combined_xk_oe_raw = subset(combined_xk_oe_raw, !is.na(val))
37 | 		combined_xk_oe_raw[,1] = combined_xk_oe_raw[,1]/bin_size_initial
38 | 		combined_xk_oe_raw[,2] = combined_xk_oe_raw[,2]/bin_size_initial
39 | 		combined_xk_oe = combined_xk_oe_raw
40 | 
41 | 		if(!all(combined_xk_oe[[2]] >= combined_xk_oe[[1]])) stop('\nYou provided matrix does not represent an upper triangular matrix!\n\n')
42 | 
43 | 		oe_size = max(max(combined_xk_oe[[1]]), max(combined_xk_oe[[2]])) + 1 ## should +1 because combined_xk_oe index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19)
44 | 		mat_oe_sparse = Matrix::Matrix(0, nrow=oe_size, ncol=oe_size)
45 | 		mat_oe_sparse[cbind(combined_xk_oe[[1]]+1, combined_xk_oe[[2]]+1)] <- combined_xk_oe[[3]]
46 | 
47 | 		rownames(mat_oe_sparse) = colnames(mat_oe_sparse) = as.character( 1:nrow(mat_oe_sparse) )
48 | 		
49 | 		mat_oe_sparse = Matrix::forceSymmetric(mat_oe_sparse, uplo='U')
50 | 		if(bin_size!=bin_size_initial) mat_oe_sparse = mat_10to40kb( mat_oe_sparse, bin_size, bin_size_initial )
51 | 		A_oe = remove_blank_cols(mat_oe_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A
52 | 		if(nrow(A_oe) < 100) A_oe = remove_blank_cols(mat_oe_sparse, sparse=TRUE, ratio=0) ## when all are dense
53 | 		while(min(apply(A_oe, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0
54 | 		{
55 | 			A_oe = remove_blank_cols(A_oe, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A
56 | 			if(nrow(A_oe) < 1) stop('ERROR IN GENERATING MEANINGFUL A_oe at the data generating step')
57 | 		}
58 | 
59 | 		##########################################################
60 | 
61 | 		len = nrow(A_oe) - nrow(A_oe)%%compress_size
62 | 		A_oe_2_compress = A_oe[, 1:len]
63 | 
64 | 		bin_names = rownames(A_oe)
65 | 
66 | 		A_oe_compressed = compress_mat_fast( as.matrix(A_oe_2_compress), compress_size=compress_size )
67 | 		colnames(A_oe_compressed) = bin_names
68 | 		rm(A_oe_2_compress); gc()
69 | 		
70 | 		range(A_oe_compressed)
71 | 		# # sum(A_oe_compressed > 1000)
72 | 		# # A_oe_compressed[A_oe_compressed > 1000] = 1000
73 | 		A_oe_compressed_sparse = A_oe_compressed
74 | 		A_oe_compressed = as.matrix(A_oe_compressed)
75 | 		A_oe_compressed_log = log2(A_oe_compressed + 1)
76 | 		
77 | 		# #########################################################
78 | 		# cat('compute correlation matrix ... ')
79 | 
80 | 		cA_oe_compressed_log = fast_cor(A_oe_compressed_log)
81 | 		ccA_oe_compressed_log = fast_cor(cA_oe_compressed_log)
82 | 
83 | 		# cat('compute correlation matrix done ... ')
84 | 
85 | 		# #########################################################
86 | 		# # ccA_oe_compressed_atanh = atanh(ccA_oe_compressed - 1E-7)
87 | 		ccA_oe_compressed_log_atanh = atanh(ccA_oe_compressed_log / (1+1E-7))
88 | 
89 | 		# rm(A_oe_compressed, A_oe_compressed_sparse, cA_oe_compressed_log)
90 | 		gc()
91 | 		# #########################################################
92 | 		# cat('ready to compute compartment domains\n')
93 | 
94 | 		out = list(A_oe=A_oe, atanh_score=ccA_oe_compressed_log_atanh)
95 | 
96 | 		return(out)
97 | 	}
98 | 
99 | 


--------------------------------------------------------------------------------
/R/general_functions.R:
--------------------------------------------------------------------------------
   1 |     get_ccA_atanh = function(contact_mat_file, compress_size, bin_size)
   2 |     {
   3 |         contact_mat_raw = data.table::fread(contact_mat_file)
   4 |         contact_mat_raw = subset(contact_mat_raw, !is.na(V3))
   5 |         contact_mat_raw[,1] = contact_mat_raw[,1]/bin_size
   6 |         contact_mat_raw[,2] = contact_mat_raw[,2]/bin_size
   7 |         contact_mat = contact_mat_raw
   8 | 
   9 |         colnames(contact_mat) = c('pos_1', 'pos_2', 'val')  
  10 |         if(!all(contact_mat[[2]] >= contact_mat[[1]])) stop('\nYour provided matrix does not represent an upper triangular matrix!\n\n')
  11 | 
  12 |         n_bins_whole = max(max(contact_mat[[1]]), max(contact_mat[[2]])) + 1 ## should +1 because contact_mat index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19)
  13 |         contact_mat_sparse = Matrix(0, nrow=n_bins_whole, ncol=n_bins_whole)
  14 |         contact_mat_sparse[cbind(contact_mat[[1]]+1, contact_mat[[2]]+1)] <- contact_mat[[3]]
  15 | 
  16 |         rownames(contact_mat_sparse) = colnames(contact_mat_sparse) = as.character( 1:nrow(contact_mat_sparse) )
  17 |         
  18 |         contact_mat_sparse = Matrix::forceSymmetric(contact_mat_sparse, uplo='U')
  19 |         # if(bin_size!=bin_size_initial) contact_mat_sparse = mat_10to40kb( contact_mat_sparse, bin_size, bin_size_initial )
  20 |         A = remove_blank_cols(contact_mat_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A
  21 |         if(nrow(A) < 100) A = remove_blank_cols(contact_mat_sparse, sparse=TRUE, ratio=0) ## when all are dense
  22 |         while(min(apply(A, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0
  23 |         {
  24 |             A = remove_blank_cols(A, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A
  25 |             if(nrow(A) < 1) stop('ERROR IN GENERATING MEANINGFUL A at the data generating step')
  26 |         }
  27 | 
  28 |         # #########################################################
  29 | 
  30 |         n_bins_A = nrow(A) - nrow(A)%%compress_size
  31 |         A_2_compress = A[, 1:n_bins_A]
  32 | 
  33 |         bin_names = rownames(A)
  34 | 
  35 |         A_compressed = compress_mat_fast( as.matrix(A_2_compress), compress_size=compress_size )
  36 |         colnames(A_compressed) = bin_names
  37 |         rm(A_2_compress); gc()
  38 |         
  39 |         A_compressed_sparse = A_compressed
  40 |         A_compressed = as.matrix(A_compressed)
  41 |         A_compressed_log = log2(A_compressed + 1)
  42 |         
  43 |         # #########################################################
  44 |         cat('Compute correlation matrix\n')
  45 | 
  46 |         cA_compressed_log = fast_cor(A_compressed_log)
  47 |         ccA_compressed_log = fast_cor(cA_compressed_log)
  48 | 
  49 |         cat('Finished computing correlation matrix\n')
  50 | 
  51 |         # #########################################################
  52 |         # # ccA_compressed_atanh = atanh(ccA_compressed - 1E-7)
  53 |         ccA_atanh = atanh(ccA_compressed_log / (1+1E-7))
  54 |         A = as.matrix(A)
  55 |         rm(A_compressed, A_compressed_sparse, cA_compressed_log, A_compressed_log); gc()
  56 |         # #########################################################
  57 |         cat('Finished data generation\n')
  58 |         return(list(A=A, ccA_atanh=ccA_atanh))
  59 |     }
  60 | 
  61 | 
  62 |     
  63 |     get_detailed_full_tree <- function(res_info)
  64 |     {
  65 |         # attach(res_info)
  66 |         for( i in 1:nrow(res_info$segmentss) )
  67 |         {
  68 |             index_left2adj = which(igraph::V(res_info$hi_tree)$left_rescaled == res_info$segmentss_nadj[i,1])
  69 |             index_right2adj = which(igraph::V(res_info$hi_tree)$right_rescaled == res_info$segmentss_nadj[i,2])
  70 |             igraph::V(res_info$hi_tree)[index_left2adj]$left_rescaled = res_info$segmentss[i,1]
  71 |             igraph::V(res_info$hi_tree)[index_right2adj]$right_rescaled = res_info$segmentss[i,2]
  72 |             igraph::V(res_info$hi_tree)$width_rescaled = igraph::V(res_info$hi_tree)$right_rescaled - igraph::V(res_info$hi_tree)$left_rescaled + 1
  73 |             igraph::V(res_info$hi_tree)$name = paste('(',igraph::V(res_info$hi_tree)$left_rescaled, ',', igraph::V(res_info$hi_tree)$right_rescaled, ')', sep='')
  74 |         }
  75 |         
  76 |         ## xenocfraf:
  77 |         # res_inner = res_inner[sapply(res_inner, length) > 0]
  78 |         ## xenocfraf:
  79 |         # res_inner = res_inner[sapply(res_inner, length) > 0]
  80 |         branches = lapply( res_info$res_inner, get_tree_v0 )
  81 |         for( i in 1:length(branches) ) branches[[i]] = update_branch_name(branches[[i]], root_start=res_info$segmentss[i,1])
  82 |         trunk = res_info$hi_tree
  83 |         
  84 |         full_tree = xenocraft( trunk, branches )
  85 |         names_tmp = do.call(rbind, strsplit(igraph::V(full_tree)$name, ','))
  86 |         igraph::V(full_tree)$left = substring(names_tmp[,1], 2)
  87 |         igraph::V(full_tree)$right = substring(names_tmp[,2], 1, nchar(names_tmp[,2])-1)
  88 |         if(!is_binary_tree(full_tree)) stop("Trunk + branches do not produce a binary tree")
  89 |         
  90 |         detailed_full_tree = tree_germination(full_tree)
  91 |         # detach(res_info)
  92 |         return(detailed_full_tree)
  93 |     }
  94 |     
  95 |     
  96 | 
  97 |     insulation_score_fun = function(A, size=3)
  98 |     {
  99 |         insulation_score_fun_helper = function(bin_start)
 100 |         {
 101 |             bin_mid = bin_start + size -1
 102 |             bin_end = bin_start + 2*size - 1
 103 |             up = sum(A[bin_start:bin_mid, bin_start:bin_mid])
 104 |             down = sum(A[(bin_mid+1):bin_end, (bin_mid+1):bin_end])
 105 |             inter = 2*sum(A[bin_start:bin_mid, (bin_mid+1):bin_end])
 106 |             insulation = 1 - inter / (up + down)
 107 |             return(insulation)
 108 |         }
 109 | 
 110 |         bin_starts = 1:(nrow(A) - 2*size + 1)
 111 |         insulations = sapply(bin_starts, function(bin_start) insulation_score_fun_helper(bin_start))
 112 |         names(insulations) = rownames(A)[size:(nrow(A) - size)]
 113 |         return(insulations)
 114 |     }
 115 | 
 116 |     
 117 | 
 118 |     ## generating the compartment_segs based on A and initial_clusters
 119 | 
 120 |     generate_compartment_segs <- function( initial_clusters )
 121 |     {
 122 |         bins_seq = unname(unlist(initial_clusters))
 123 |         if(!all(diff(bins_seq)==1)) stop('check generate_compartment_segs in generate_compartment_segs.R')
 124 |         if(bins_seq[1]!=1) stop('check generate_compartment_segs in generate_compartment_segs.R')
 125 |         compartment_segs = do.call(rbind, lapply(initial_clusters, function(v) v[c(1, length(v))]))
 126 |         compartment_segs = data.frame( start_pos=compartment_segs[,1], end_pos=compartment_segs[,2] )
 127 |         return( compartment_segs )
 128 |     }
 129 | 
 130 |     ## This function gets the nodes at level k
 131 |     ## Yuanlong LIU
 132 |     ## 02_07_2018
 133 |     
 134 |     get_level_k_nodes <- function(tree, k)
 135 |     {
 136 |         nodes = igraph::ego(tree, order=k, nodes=1, mode = "out", mindist = 1)[[1]]
 137 |         return(nodes)
 138 |     }
 139 |     
 140 |     
 141 | 
 142 |     ## This function gets the TADs at level k
 143 |     ## Yuanlong LIU
 144 |     ## 18_05_2018
 145 |     
 146 |     get_level_k_TADs <- function(branches, k)
 147 |     {
 148 |         tad_sizes_ind = lapply( branches, function(branch) 
 149 |         {
 150 |             branch_level_k = igraph::induced.subgraph( branch, igraph::V(branch)$depth <= k )
 151 |             tad_size_ind = get_leaves(branch_level_k, 'igraph')$width
 152 |         })
 153 |         
 154 |         tad_sizes = unlist(tad_sizes_ind)
 155 |         end_pos = cumsum(tad_sizes)
 156 |         start_pos = c(1, 1 + end_pos[-length(end_pos)])
 157 |         tads = data.frame(start_pos=start_pos, end_pos=end_pos)
 158 |         return( tads )      
 159 |     }
 160 |     
 161 |     
 162 | 
 163 | 
 164 |     get_least_residue_matrix <- function(pA_sym, max_nbins, allowed_shifts)
 165 |     {
 166 |         nbins = nrow( pA_sym )
 167 |         max_nbins_bp = max_nbins
 168 |         # remainders = nbins %% (max_nbins + (-10:10))
 169 |         remainders = nbins %% (max_nbins + allowed_shifts)
 170 |             
 171 |         max_nbins = (max_nbins + allowed_shifts)[ which.min(remainders) ]
 172 |         n2one = floor(nbins/max_nbins)
 173 |         remainder = nbins %% max_nbins
 174 |         if(max_nbins!=max_nbins_bp) warning('Value of max_nbins has been adjusted from ', max_nbins_bp, ' to ', max_nbins, '. This results in ', remainder, ' rows/columns excluded for the analysis.')
 175 | 
 176 |         if(remainder!=0) A_final = pA_sym[-(tail(1:nbins, remainder)), -(tail(1:nbins, remainder))]
 177 |         if(remainder==0) A_final = pA_sym
 178 |         res = list(A_final=A_final, n2one=n2one, max_nbins_new=max_nbins)
 179 |     }
 180 | 
 181 | 
 182 | 
 183 |     ## merge clusters based on their corr_mat
 184 |     ## Yuanlong LIU
 185 |     ## 01-07-2018
 186 |     
 187 |     sim.fun <- function( corr_mat, mod1, mod2, method='mean' )
 188 |     {
 189 |         if(method=='mean') return(mean(corr_mat[mod1, mod2]))
 190 |         if(method=='median') return(median(corr_mat[mod1, mod2]))
 191 |         if(method=='max') return(max(corr_mat[mod1, mod2]))
 192 |         
 193 |     }
 194 |     
 195 | 
 196 |     ############################################################    
 197 |         
 198 |     ## this function removes all-zero columns / rows
 199 |     rm_zeros <- function(A) ## mat is a symmetric matrix
 200 |     {
 201 |         zero_indices = apply( A, 2, function(v) all(v==0) )
 202 |         if( all(zero_indices==0) ) return(A)
 203 |         A = A[!zero_indices, !zero_indices]
 204 |         return(A)
 205 |     }
 206 |     
 207 |     my_dist <- function(m) {mtm <- Matrix::tcrossprod(m); sq <- rowSums(m*m);  res = suppressWarnings(sqrt(outer(sq,sq,"+") - 2*mtm)); res[is.nan(res)]=0; diag(res)=0; return(res)}
 208 |     
 209 |     
 210 |     MoC = function(P, Q)
 211 |     {
 212 |         len_p = length(P)
 213 |         len_q = length(Q)
 214 |         if((len_p==len_q) & (len_p==1)) return(1)
 215 |         grids = expand.grid(1:len_p, 1:len_q)
 216 |         vecs = apply( grids, 1, function(x) {u=x[1]; v=x[2]; length(intersect( P[[u]], Q[[v]] ))^2/length(P[[u]])/length(Q[[v]])} )
 217 |         res = 1/( sqrt(len_p*len_q) - 1)*(sum(vecs) - 1)
 218 |         return(res)
 219 |     }
 220 | 
 221 |     get_stair_vecs <- function(mat, from, to)
 222 |     {
 223 |         n = nrow(mat)
 224 |         solve <- function(mid) c(mat[1:mid,(mid+1):n])
 225 |         lapply(from:to, solve)
 226 |     }
 227 |     
 228 |     correct_A_fast_divide_by_mean <- function(A, remove_zero=TRUE, divide_or_substract='divide', mean_or_median='mean')
 229 |     {
 230 |         f = function(i, d) d*(d-1)/2+1+(2*d+i)*(i-1)/2
 231 |         
 232 |         n = nrow(A)
 233 |         upper_tri_values = A[upper.tri(A, diag=TRUE)]
 234 |         means_mat = indices = A*0
 235 |         indices[upper.tri(indices, diag=TRUE)] = 1:((n+1)*n/2)
 236 |         
 237 |         diag_values = orders = rep(list(list()), n)
 238 | 
 239 |         for(d in seq_along(orders)) orders[[d]] = f(i=seq(1, n-d+1), d=d)
 240 |         for(d in seq_along(orders)) diag_values[[d]] = upper_tri_values[f(i=seq(1, n-d+1), d=d)]
 241 | 
 242 |         if(mean_or_median=='mean'){
 243 |             if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } ))
 244 |             if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v); v=v*0+m; return(v) } ))            
 245 |         }
 246 | 
 247 |         if(mean_or_median=='median'){
 248 |             if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } ))
 249 |             if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v); v=v*0+m; return(v) } ))          
 250 |         }
 251 | 
 252 |         means[means==0] = 1
 253 |         means_mat[upper.tri(means_mat, diag=TRUE)] = means[order(unlist(orders))]
 254 |         if(divide_or_substract=='divide') A_corrected = A / means_mat
 255 |         if(divide_or_substract=='substract') A_corrected = A - means_mat
 256 |         A_corrected = as.matrix( Matrix::forceSymmetric(A_corrected) )
 257 |         rownames( A_corrected ) = colnames( A_corrected ) = rownames( A )
 258 |         return( A_corrected )
 259 |     }
 260 | 
 261 | 
 262 |     correct_A_fast_equal_mean <- function(A, remove_zero=TRUE, divide_or_substract='divide', mean_or_median='mean')
 263 |     {
 264 |         ## may also try log normal, since the off-diagnal values follow well a log-normal distribution
 265 |         ## commented by Yuanlong LIU, 2018-07-26
 266 |         f = function(i, d) d*(d-1)/2+1+(2*d+i)*(i-1)/2
 267 |         
 268 |         n = nrow(A)
 269 |         upper_tri_values = A[upper.tri(A, diag=TRUE)]
 270 |         means_mat = indices = A*0
 271 |         indices[upper.tri(indices, diag=TRUE)] = 1:((n+1)*n/2)
 272 | 
 273 |         diag_values = orders = rep(list(list()), n)
 274 | 
 275 |         for(d in seq_along(orders)) orders[[d]] = f(i=seq(1, n-d+1), d=d)
 276 |         for(d in seq_along(orders)) diag_values[[d]] = upper_tri_values[f(i=seq(1, n-d+1), d=d)]
 277 | 
 278 |         if(mean_or_median=='mean'){
 279 |             if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } ))
 280 |             if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v); v=v*0+m; return(v) } ))            
 281 |         }
 282 | 
 283 |         if(mean_or_median=='median'){
 284 |             if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } ))
 285 |             if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v); v=v*0+m; return(v) } ))          
 286 |         }
 287 | 
 288 |         means[means==0] = 1
 289 |         means_mat[upper.tri(means_mat, diag=TRUE)] = means[order(unlist(orders))]
 290 |         if(divide_or_substract=='divide') A_corrected = A / means_mat
 291 |         if(divide_or_substract=='substract') A_corrected = A - means_mat
 292 |         A_corrected = as.matrix( Matrix::forceSymmetric(A_corrected) )
 293 |         rownames( A_corrected ) = colnames( A_corrected ) = rownames( A )
 294 |         return( A_corrected )
 295 |     }
 296 | 
 297 |     creat_phylo_object <- function(tree)
 298 |     {
 299 |         creat_phylo_object_inner <- function(tree)
 300 |         {
 301 |             twins = igraph::ego(tree, order=1, node=1, mode='out', mindist=1)[[1]]
 302 |             branches = igraph::decompose(tree - 1)
 303 |             left_branch = branches[[1]]
 304 |             right_branch = branches[[2]]
 305 |             left_branch_flag = igraph::vcount( left_branch ) == 1
 306 |             right_branch_flag = igraph::vcount( right_branch ) == 1
 307 |             short_walk_dist = 1
 308 |             igraph::diameters = sapply(branches, igraph::diameter)
 309 |             long_walk_dist = abs(diff(igraph::diameters)) + 1
 310 |             
 311 |             if( left_branch_flag ) left_branch_newick = igraph::V(left_branch)$name
 312 |             if( !left_branch_flag ) left_branch_newick = creat_phylo_object_inner(left_branch)
 313 |             
 314 |             if( right_branch_flag ) right_branch_newick = igraph::V(right_branch)$name
 315 |             if( !right_branch_flag ) right_branch_newick = creat_phylo_object_inner(right_branch)   
 316 | 
 317 |             if( igraph::diameters[1] > igraph::diameters[2] ) tree_newick = paste( '(', left_branch_newick, ':', short_walk_dist, ',', right_branch_newick, ':', long_walk_dist, ')', sep='' )
 318 |             if( igraph::diameters[1] <= igraph::diameters[2] ) tree_newick = paste( '(', left_branch_newick, ':', long_walk_dist, ',', right_branch_newick, ':', short_walk_dist, ')', sep='' )
 319 |             return( tree_newick )
 320 |         }
 321 |         tree_newick = creat_phylo_object_inner(tree)
 322 |         final_newick = paste(tree_newick, ';', sep='')
 323 |         tree = read.tree(text=final_newick)
 324 |         return( tree )
 325 |     }
 326 |     
 327 | 
 328 |     dist_max_min <- function( L_diff )
 329 |     {
 330 |         N = dim(L_diff)[1]
 331 |         L_diff_dist = list()
 332 |         for( dist in 1:(N-1) )
 333 |         {
 334 |             maxLs = numeric( N-dist )
 335 |             for( row in 1:(N-dist) )
 336 |             {
 337 |                 i = row
 338 |                 j = row + dist
 339 |                 maxLs[i] = L_diff[i, j]
 340 |             }
 341 |             
 342 |             L_diff_dist[[dist]] = maxLs
 343 |         }
 344 |         return(L_diff_dist)
 345 |     }
 346 |     
 347 | 
 348 |     divide_into_groups <- function(A, n_group)
 349 |     {
 350 |         N = nrow(A)
 351 |         dists = 1:(N-1)
 352 |         counts = sapply(dists, function(v) n_cells2compute( A, v ))
 353 |         average = (ceiling(sum(counts) / n_group)) ## the average computational complexity
 354 |         groups <- cumsum(counts) %/% average + 1
 355 |         borders = c(0, which(diff(groups)!=0))
 356 |         binsizes = c()
 357 |         for( i in 1:(length(borders)-1) )
 358 |         {
 359 |             binsizes = c(binsizes, sum(counts[ (borders[i]+1):borders[i+1] ]))
 360 |         }
 361 |         info = list(borders=borders, binsizes=binsizes)
 362 |         return(info)
 363 |     }
 364 |     
 365 | 
 366 |     get_leaves <- function(tree, type='name')
 367 |     {
 368 |         leaf_indices = which(igraph::degree( tree, mode='out' ) == 0)
 369 |         if( type=='name' ) return(igraph::V(tree)[leaf_indices]$name)
 370 |         if( type=='igraph_node' ) return(igraph::V(tree)[leaf_indices])
 371 |         if( type=='igraph' ) return(igraph::V(tree)[leaf_indices])
 372 |         if( type=='index' ) return(leaf_indices)
 373 |     }
 374 | 
 375 |     get_segments <- function(hi_tree, binsize_thresh, return_segmentss_tree=FALSE)
 376 |     {
 377 |         leaf_widths =  get_leaves(hi_tree, type='igraph')$width_rescaled
 378 |         if( binsize_thresh <= min(leaf_widths) )
 379 |         {
 380 |             warning( 'Your input binsize_thresh has a value: ', binsize_thresh, ', which is smaller than the minimum of the leaf width of hi_tree: ', min(leaf_widths) )
 381 |         }
 382 |         
 383 |         nodes2rm = numeric()
 384 |         for( i in 2:igraph::vcount(hi_tree) ) ## root node not take into account
 385 |         {
 386 |             node = igraph::V(hi_tree)[i]
 387 |             width = node$right_rescaled - node$left_rescaled + 1
 388 |             if(width <= binsize_thresh)
 389 |             {
 390 |                 parent = igraph::ego(hi_tree, order=1, nodes=node, mode = "in", mindist = 1)[[1]]
 391 |                 width_parent = parent$right_rescaled - parent$left_rescaled + 1
 392 |                 if(width_parent <= binsize_thresh) nodes2rm = c(nodes2rm, i)    
 393 |             }
 394 |         }
 395 |         
 396 |         trimmed_tree = hi_tree - nodes2rm
 397 |         leaves = get_leaves(trimmed_tree, type='igraph_node')
 398 |         segmentss = cbind(leaves$left_rescaled, leaves$right_rescaled)
 399 |         if(return_segmentss_tree==TRUE) return( trimmed_tree )
 400 | 
 401 |         return( segmentss )
 402 |     }
 403 |     
 404 | 
 405 | get_tree_decoration = function( single_res_info, decoration=TRUE, distr, n_parameters, imputation_num=1E2 )
 406 |     {
 407 |         cA = single_res_info$cA
 408 |         # cat(dim(cA), '\n')
 409 |         if( !is.null(single_res_info$full_tree) ) tree=single_res_info$full_tree
 410 |         if( is.null(single_res_info$full_tree) ) tree = get_tree_v0(single_res_info)
 411 |         
 412 |         if(length(tree)==1) if( class(tree)=="character" & tree=='bad_tree') return( tree )
 413 | 
 414 |         ## more decoration
 415 |         if( decoration==FALSE ) return( tree )
 416 | 
 417 |         leaf_indices = get_leaves( tree, type='index' )
 418 |         igraph::V(tree)$mean_diff = 0
 419 |         zero_ratios = numeric()
 420 |         for(i in setdiff(1:igraph::vcount(tree), leaf_indices))
 421 |         {
 422 |             node = igraph::V(tree)[i]
 423 |             # igraph::V(tree)[i]$L = L[node$left, node$right]
 424 | 
 425 |             # A_union = cA[node$left:node$right, node$left:node$right]
 426 |             # igraph::V(tree)[i]$L_union = get_prob_nb( A_union[upper.tri(A_union, diag=TRUE)] )
 427 |             
 428 |             # the p-value of: H0: unioned model; H1: hierarchical model
 429 |             twins = igraph::ego(tree, 1, node, mode='out', mindist=1)[[1]]          
 430 |             mid = sort(c(as.numeric(twins[1]$left), as.numeric(twins[1]$right), as.numeric(twins[2]$left), as.numeric(twins[2]$right)))[2]
 431 |             
 432 |             # if(distr=='nb') test_info = p_likelihood_ratio_nb( cA, head=node$left, mid=mid, tail=node$right )
 433 |             # if(distr=='norm') test_info = p_likelihood_ratio_norm( cA, head=node$left, mid=mid, tail=node$right )
 434 |             # if(distr=='gamma') 
 435 |             # {
 436 |                 # test_info = p_likelihood_ratio_gamma( cA, head=node$left, mid=mid, tail=node$right, n_parameters, imputation=FALSE )
 437 |                 # test_info_imputation = p_likelihood_ratio_gamma( cA, head=node$left, mid=mid, tail=node$right, n_parameters, imputation=TRUE )
 438 |                 # igraph::V(tree)[i]$imp_p = test_info_imputation$p
 439 |                 # igraph::V(tree)[i]$imp_Lambda = test_info_imputation$Lambda
 440 |             # }
 441 |             
 442 |             if(distr=='lnorm')
 443 |             {
 444 |                 if(!is.null(imputation_num))
 445 |                 {
 446 |                     test_info_imp  = p_likelihood_ratio_lnorm( cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), n_parameters=n_parameters, imputation= TRUE, imputation_num=imputation_num )       
 447 |                     igraph::V(tree)[i]$imp_p = test_info_imp$p
 448 |                     igraph::V(tree)[i]$mean_diff = test_info_imp$mean_diff                  
 449 |                 }
 450 | 
 451 |                 test_info_nimp = p_likelihood_ratio_lnorm( cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), n_parameters=n_parameters, imputation=FALSE, imputation_num=imputation_num )       
 452 | 
 453 |                 igraph::V(tree)[i]$nimp_p = test_info_nimp$p
 454 |                 igraph::V(tree)[i]$mean_diff = test_info_imp$mean_diff
 455 |                 # zero_ratios = c(zero_ratios, sum(cA[node$left:node$right, node$left:node$right]==0) / (length(node$left:node$right))^2)
 456 |             }
 457 | 
 458 |             if(distr=='wilcox')
 459 |             {
 460 |                 if(i==1) test_info = p_wilcox_test( is_CD=TRUE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' )      
 461 |                 if(i!=1) test_info = p_wilcox_test( is_CD=FALSE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' )     
 462 | 
 463 |                 # if(i==1) test_info = p_wilcox_test_nested( is_CD=TRUE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' )     
 464 |                 # if(i!=1) test_info = p_wilcox_test_nested( is_CD=FALSE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' )        
 465 | 
 466 | 
 467 |                 igraph::V(tree)[i]$nimp_p = igraph::V(tree)[i]$imp_p = igraph::V(tree)[i]$wilcox_p = test_info$p
 468 |                 igraph::V(tree)[i]$mean_diff = test_info$mean_diff
 469 |                 # zero_ratios = c(zero_ratios, sum(cA[node$left:node$right, node$left:node$right]==0) / (length(node$left:node$right))^2)
 470 |             }
 471 |             
 472 |             test_info_mean_diff = lognormal_mean_test( cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right) )       
 473 | 
 474 |             igraph::V(tree)[i]$aa_p = test_info_mean_diff$p_Aa
 475 |             igraph::V(tree)[i]$bb_p = test_info_mean_diff$p_Ab
 476 |         }
 477 |         
 478 |         igraph::V(tree)$L_diff = 0
 479 |         # igraph::V(tree)$L_diff = igraph::V(tree)$L - igraph::V(tree)$L_union
 480 |         if(!is.null(imputation_num)) igraph::V(tree)[leaf_indices]$imp_p = 1
 481 |         igraph::V(tree)[leaf_indices]$nimp_p = igraph::V(tree)[leaf_indices]$wilcox_p = igraph::V(tree)[leaf_indices]$imp_p = 1     
 482 |         igraph::V(tree)[leaf_indices]$aa_p = 1
 483 |         igraph::V(tree)[leaf_indices]$bb_p = 1
 484 |         return(tree)
 485 |     }
 486 | 
 487 |   
 488 |     add_boundary_binsignal_to_decrated_branches <- function( decorated_branches, bin_signals_5_10 )
 489 |     {
 490 |         tree_widths = sapply(decorated_branches, function(v) igraph::V(v)[1]$width)
 491 |         if(sum(tree_widths) != length( bin_signals_5_10 )) stop('Check add_boundary_binsignal_to_decrated_branches')
 492 |         for(k in 1:length(decorated_branches))
 493 |         {
 494 |             tree = decorated_branches[[k]]
 495 |             leaf_indices = get_leaves( tree, type='index' )
 496 | 
 497 |             for(i in setdiff(1:igraph::vcount(tree), leaf_indices))
 498 |             {
 499 |                 node = igraph::V(tree)[i]
 500 |                 # igraph::V(tree)[i]$L = L[node$left, node$right]
 501 | 
 502 |                 # A_union = cA[node$left:node$right, node$left:node$right]
 503 |                 # igraph::V(tree)[i]$L_union = get_prob_nb( A_union[upper.tri(A_union, diag=TRUE)] )
 504 |                 
 505 |                 # the p-value of: H0: unioned model; H1: hierarchical model
 506 |                 twins = igraph::ego(tree, 1, node, mode='out', mindist=1)[[1]]          
 507 |                 mid = sort(c(as.numeric(twins[1]$left), as.numeric(twins[1]$right), as.numeric(twins[2]$left), as.numeric(twins[2]$right)))[2]
 508 |                 
 509 |                 absolute_mid = mid + sum(tree_widths[(1:k)-1])
 510 |                 igraph::V(tree)[i]$binsignal = bin_signals_5_10[absolute_mid]
 511 |             }
 512 |             igraph::V(tree)[leaf_indices]$binsignal = 0
 513 |             igraph::V(tree)[which(is.na(igraph::V(tree)$binsignal))]$binsignal = 0
 514 |             decorated_branches[[k]] = tree
 515 |         }
 516 |         return(decorated_branches)
 517 |     }
 518 |     
 519 |     get_tree_v0 <- function( single_res_info )
 520 |     {
 521 |         ancestors = single_res_info$ancestors
 522 |         L = single_res_info$L
 523 |         N = dim(ancestors)[1]
 524 |         current_node = ancestors[1, N]
 525 | 
 526 |         recursive <- function(current_node)
 527 |         {
 528 |             # cat(current_node, '\n')
 529 |             seqs = as.numeric(strsplit(current_node,'-')[[1]])
 530 |             # cat( current_node, '\n' )
 531 | 
 532 |             left_node = ancestors[seqs[1],seqs[2]]
 533 |             right_node = ancestors[seqs[3],seqs[4]]
 534 | 
 535 |             flag_left = (seqs[1]!=seqs[2]) & (left_node!="")
 536 |             flag_right = (seqs[3]!=seqs[4]) & (right_node!="")
 537 | 
 538 |             if(is.na(flag_left)) return( 'bad_tree' ) ## added: 30-04-2020
 539 |             
 540 |             left_leaf = paste( seqs[1],seqs[2], sep='-' )
 541 |             right_leaf = paste( seqs[3],seqs[4], sep='-' )
 542 |             
 543 |             if( flag_left ) left_tree = recursive(left_node)
 544 |             if( !flag_left ) left_tree = left_leaf
 545 | 
 546 |             if( flag_right ) right_tree = recursive(right_node)
 547 |             if( !flag_right ) right_tree = right_leaf
 548 | 
 549 |             if( !flag_left ) left_node = left_leaf
 550 |             if( !flag_right ) right_node = right_leaf
 551 |             
 552 |             tree_raw = igraph::graph.empty() + current_node + left_tree + right_tree
 553 |             tree = igraph::add_edges(tree_raw, c(current_node, left_node, current_node, right_node))
 554 | 
 555 |             return( tree )
 556 |         }
 557 |         
 558 |         tree = recursive(current_node)
 559 |         # stop('I stop here')
 560 |         
 561 |         # if(tree=='bad_tree') return('bad_tree') ## added: 30-04-2020
 562 |         if(class(tree)!='igraph') return('bad_tree')
 563 |  
 564 |         ## this part tries to decorate the tree by adding various node attributes
 565 |         
 566 |         igraph::V(tree)$left = sapply( igraph::V(tree)$name, function(v) {tmp=strsplit(v,'-')[[1]]; return(as.numeric(head(tmp,1)))} )
 567 |         igraph::V(tree)$right = sapply( igraph::V(tree)$name, function(v) {tmp=strsplit(v,'-')[[1]]; return(as.numeric(tail(tmp,1)))} )
 568 |         igraph::V(tree)$name = sapply( igraph::V(tree)$name, function(v) {tmp=strsplit(v,'-')[[1]]; paste( '(', head(tmp,1), ',', tail(tmp,1), ')', sep='' )} )
 569 |         igraph::V(tree)$width = igraph::V(tree)$right - igraph::V(tree)$left + 1
 570 |         
 571 |         return(tree)
 572 |     }
 573 | 
 574 | 
 575 |     is_binary_tree <-function(tree)
 576 |     {
 577 |         leaves = get_leaves( tree, 'index' )
 578 |         not_leaves = setdiff(1:igraph::vcount(tree), leaves)
 579 |         degrees = igraph::degree(tree, v = not_leaves, mode = 'out')
 580 |         if(!igraph::is.connected( tree )) return(FALSE)
 581 |         if(all(degrees==2)) return(TRUE)
 582 |         return(FALSE)
 583 |     }
 584 |     
 585 |     join_left_or_right <- function(tree)
 586 |     {
 587 |         leaves = get_leaves( tree, 'igraph' )
 588 | 
 589 |         left_or_right = numeric()
 590 |         left_or_right[1] = 0
 591 |         left_or_right[length(leaves) - 1] = 0
 592 | 
 593 |         for( i in 2:(length(leaves) - 1) )
 594 |         {
 595 |             leave = leaves[i]
 596 |             parent = igraph::ego(tree, nodes=leave, order=1, mindist=1, mode='in')[[1]]
 597 |             if( parent$left < leave$left ) left_or_right[i] = -1 ## left
 598 |             if( parent$right > leave$right ) left_or_right[i] = 1 ## right      
 599 |         }
 600 |         return( left_or_right )
 601 |     }
 602 | 
 603 |     
 604 |     long_slices <- function(left_or_right, thresh=3)
 605 |     {
 606 |         dup_lens = rle( left_or_right )$lengths
 607 |         dup_values = rle( left_or_right )$values
 608 |         long_dups = which( dup_lens >= thresh )
 609 |         long_dup_lens = dup_lens[long_dups]
 610 |         long_dup_start_pos = cumsum( dup_lens )[long_dups-1] + 1 # 1: shift to the right by one
 611 |         long_dup_direction = left_or_right[ long_dup_start_pos ]
 612 |         indices_of_slice = as.vector(unlist(mapply(function(u,v) {seq(from=u, length=v, by=1)}, long_dup_start_pos, long_dup_lens)))
 613 |         dup_info = list( long_slices_lens=long_dup_lens, long_slices_start_pos=long_dup_start_pos, long_dup_direction=long_dup_direction, indices_of_slice=indices_of_slice )
 614 |         return(dup_info)
 615 |     }
 616 |     
 617 |     
 618 |     n_cells2compute <- function( A, dist, min_n_bins=1 )
 619 |     {
 620 |         N = nrow(A)
 621 |         if(min_n_bins==1)
 622 |         {
 623 |             count = dist*(1+dist)*(2+dist)*(N-dist)/6
 624 |             return(count)
 625 |         }
 626 |         
 627 |         count = (dist - 2*min_n_bins + 2)*(dist^2 + 2*dist*min_n_bins + dist - 2*(min_n_bins - 2)*min_n_bins)*(N - dist) / 6
 628 |         return(count)
 629 |     }
 630 |     
 631 |     
 632 |     
 633 |     plot_tree <- function(tree, seg_col='blue', which_part='upper', indices_of_slice=NULL, ...)
 634 |     {
 635 |         root_node = igraph::V(tree)[1]
 636 |         leaves = igraph::V(tree)[which(igraph::degree( tree, mode='out' ) == 0)]
 637 |         plot_inner <- function( node ){
 638 |             left = node$left
 639 |             right = node$right
 640 |             if(right - left == 1)
 641 |             {
 642 |                 return()
 643 |             }
 644 |             
 645 |             x0 = left - 0.5
 646 |             x1 = right + 0.5
 647 |             if(which_part=='upper')
 648 |             {
 649 |                 segments(x0, x0, x0, x1, col=seg_col, ...)
 650 |                 segments(x0, x1, x1, x1, col=seg_col, ...)      
 651 |             }
 652 | 
 653 |             if(which_part=='lower')
 654 |             {
 655 |                 segments(x0, x0, x1, x0, col='black', ...)
 656 |                 segments(x1, x1, x1, x0, col='black', ...)          
 657 |             }
 658 | 
 659 |             if(which_part=='both')
 660 |             {
 661 |                 segments(x0, x0, x0, x1, col=seg_col, ...)
 662 |                 segments(x0, x1, x1, x1, col=seg_col, ...)      
 663 |                 segments(x0, x0, x1, x0, col='black', ...)
 664 |                 segments(x1, x1, x1, x0, col='black', ...)  
 665 |             }
 666 |             
 667 |             if( !is.null(indices_of_slice) )
 668 |             {
 669 |                 if(left %in% indices_of_slice)
 670 |                 segments(x0, x0, x0, x1, col='yellow', ...)
 671 | 
 672 |                 if(right %in% indices_of_slice)
 673 |                 segments(x0, x1, x1, x1, col='black', ...)  
 674 |             }
 675 |             
 676 |             if( node %in% leaves ) return()
 677 |             twins = igraph::ego(tree, node=node, order=1, mindist=1, mode='out')[[1]]
 678 |             left_node = twins[1]
 679 |             plot_inner(left_node)
 680 |             if( length(twins) > 1 )
 681 |             {
 682 |                 # cat('hello', '\n')
 683 |                 right_node = twins[2]
 684 |                 plot_inner(right_node)
 685 |             }
 686 |         }
 687 |         plot_inner(root_node)
 688 |         return()
 689 |     }
 690 | 
 691 |     remove_blank_cols <- function( mat, sparse=FALSE, row_or_col='both', ratio=0.05 )
 692 |     {
 693 |         if(sparse==TRUE)
 694 |         {
 695 |             if(row_or_col=='row')
 696 |             {
 697 |                 non_zero_indices = (Matrix::rowSums(mat != 0) / ncol(mat) >  ratio)
 698 |                 new_mat = mat[non_zero_indices, ]
 699 |                 return(new_mat)
 700 |             }
 701 | 
 702 |             if(row_or_col=='col')
 703 |             {
 704 |                 non_zero_indices = (Matrix::colSums(mat != 0) / nrow(mat) >  ratio)
 705 |                 new_mat = mat[, non_zero_indices]
 706 |                 return(new_mat)
 707 |             }
 708 | 
 709 |             if( ratio > 0 ) ## remove col/rows with non-zero number smaller than 5% percentile
 710 |             {
 711 |                 positive_num = unname(Matrix::rowSums(mat != 0))
 712 |                 positive_num_thresh = quantile(positive_num[positive_num > 0], ratio)
 713 |                 non_zero_indices = (Matrix::rowSums(mat != 0) >  positive_num_thresh)
 714 |                 new_mat = mat[non_zero_indices, non_zero_indices]
 715 |                 return(new_mat)
 716 |             }
 717 | 
 718 | 
 719 |             non_zero_indices = (Matrix::rowSums(mat != 0) / ncol(mat) >  ratio)
 720 |             new_mat = mat[non_zero_indices, non_zero_indices]
 721 |             return(new_mat)
 722 |         }
 723 | 
 724 |         col_sum = apply( mat, 2, sum )
 725 |         blank_col_indices = which( col_sum==0 )
 726 |         new_mat = mat[ -blank_col_indices, -blank_col_indices ]
 727 |         return(new_mat)
 728 |     }
 729 | 
 730 |     
 731 |     tree_germination <- function(tree)
 732 |     {
 733 |         leaves = get_leaves( tree )
 734 |         for( leaf in leaves )
 735 |         {
 736 |             cat(leaf, '\n')
 737 |             if((igraph::vcount(tree) - ecount(tree)) !=1) break
 738 |             bins = as.character(igraph::V(tree)[leaf]$left:igraph::V(tree)[leaf]$right)
 739 |             if(length(bins) == 1) stop('Some leave node contains only one bin, please check')
 740 |             if(length(bins) == 2) ## this is an exception. added on 11/06/2018
 741 |             {
 742 |                 nodes_supp = as.character(bins)
 743 |                 edges2add = c(leaf, bins[1], leaf, bins[2])
 744 |                 tree = tree %>% igraph::add_vertices(length(nodes_supp), name=nodes_supp) %>% igraph::add_edges(edges2add)
 745 |                 next
 746 |             }
 747 |             
 748 |             joints_supp = paste('j', bins[1:(length(bins) - 2)], sep='')
 749 |             joints = c(leaf, joints_supp, bins[length(bins)])
 750 |             edges2add = character()
 751 |             for(i in 1:(length(joints)-1))
 752 |             {
 753 |                 edges2add = c(edges2add, c( joints[i], bins[i], joints[i], joints[i+1] ))
 754 |             }
 755 |             nodes_supp = c(joints_supp, bins)
 756 |             tree = tree %>% igraph::add_vertices(length(nodes_supp), name=nodes_supp) %>% igraph::add_edges(edges2add)
 757 |         }
 758 |         return( tree )
 759 |     }
 760 | 
 761 |     
 762 |     trim_tree_adaptive <- function( tree, L_diff_thresh=-Inf, max_imp_p=Inf, max_nimp_p=Inf, width_thresh=-Inf, boundary_signal_thresh=Inf, peak_thresh=-Inf )
 763 |     {
 764 |         if(igraph::vcount(tree) ==1) return(tree) ## so this can be used!
 765 | 
 766 |         width_thresh = min(width_thresh, igraph::V(tree)[1]$width-1) ## cannot merge if tree is already very small
 767 |         
 768 |         nodes2rm_Ldiff = igraph::V(tree)[which( igraph::V(tree)$L_diff <= L_diff_thresh )]$name
 769 |         
 770 |         
 771 |         nodes2rm_imp_p = igraph::V(tree)[which( igraph::V(tree)$imp_p >= max_imp_p )]$name
 772 |         nodes2rm_nimp_p = igraph::V(tree)[which( igraph::V(tree)$nimp_p >= max_nimp_p )]$name
 773 |         nodes2rm_boundary_signal = igraph::V(tree)[which( igraph::V(tree)$binsignal <= boundary_signal_thresh )]$name
 774 |         nodes2rm_not_peak = igraph::V(tree)[which( igraph::V(tree)$is_peak <= peak_thresh )]$name
 775 |         
 776 |         nodes2rm = unique( c( nodes2rm_Ldiff, nodes2rm_imp_p, nodes2rm_nimp_p, nodes2rm_boundary_signal, nodes2rm_not_peak ) )
 777 |         nodes_not_2rm = setdiff( igraph::V(tree)$name, nodes2rm )
 778 | 
 779 |         flags = sapply( nodes2rm, function(v) { children = names(unlist( igraph::ego( tree, nodes = v, order=igraph::diameter(tree), mindist=1, mode='out' ))); all(children %in% nodes2rm) } )
 780 |         nodes2rm_final = names(unlist( igraph::ego(tree, nodes = nodes2rm[which(flags==TRUE)], order=igraph::diameter(tree), mindist=1, mode='out' ) ) )
 781 |         
 782 |         if(!is.null(nodes2rm_final)) tree = tree - nodes2rm_final
 783 |         if(is.null(nodes2rm_final)) cat('No nodes are removed at the given p_thresh\n')
 784 |     
 785 |         if( width_thresh==-Inf ) return( tree )
 786 |         
 787 |         tree = trim_tree_adaptive_width_thresh(tree, width_thresh)
 788 |         return( tree )
 789 |     }
 790 |     
 791 |     trim_tree_adaptive_width_thresh <- function( tree, width_thresh )
 792 |     {
 793 |         ## first remove all nodes of a parent if both are < width_thresh
 794 |         nodes2rm_width = igraph::V(tree)[which( igraph::V(tree)$width <= width_thresh )]$name
 795 |         parentOfnodes2rm_width = names(unlist(igraph::ego(tree, node=nodes2rm_width, order=1, mindist=1, mode='in')))
 796 |         ## if two nodes have the same parent, remove them
 797 |         index2rm = c(which(duplicated(parentOfnodes2rm_width)), which(duplicated(fromLast=TRUE, parentOfnodes2rm_width)))
 798 |         nodes2rm = nodes2rm_width[index2rm]
 799 |         tree = tree - nodes2rm
 800 |         if(!is_binary_tree(tree)) stop('Error in trim_tree_adaptive_width_thresh')
 801 | 
 802 |         ## merge TADs that are too small    
 803 |         nodes2rm_width = igraph::V(tree)[which( igraph::V(tree)$width <= width_thresh )]$name
 804 | 
 805 |         di = igraph::diameter(tree)
 806 |         while( length(nodes2rm_width) > 0 )
 807 |         {
 808 |             node = nodes2rm_width[1]
 809 |             parent = igraph::ego( tree, nodes = node, order=1, mindist=1, mode='in' )[[1]]
 810 |             nodesOfSameParent = igraph::ego( tree, nodes = parent, order=di, mindist=1, mode='out' )[[1]]
 811 |             left_siblings = nodesOfSameParent[which(nodesOfSameParent$right < igraph::V(tree)[node]$left)]
 812 |             right_siblings = nodesOfSameParent[which(nodesOfSameParent$left > igraph::V(tree)[node]$right)]
 813 |             
 814 |             left_siblings2change_right = left_siblings[left_siblings$right == (igraph::V(tree)[node]$left-1)]
 815 |             right_siblings2change_left = right_siblings[right_siblings$left == (igraph::V(tree)[node]$right+1)]
 816 |             
 817 |             igraph::V(tree)[ left_siblings2change_right ]$right = igraph::V(tree)[node]$right
 818 |             igraph::V(tree)[ right_siblings2change_left ]$left = igraph::V(tree)[node]$left
 819 |             igraph::V(tree)$width = igraph::V(tree)$right - igraph::V(tree)$left + 1
 820 |             tree = tree - node
 821 |             nodes2rm_width = igraph::V(tree)[which( igraph::V(tree)$width <= width_thresh )]$name
 822 |         }
 823 |         igraph::V(tree)$name = paste('(',igraph::V(tree)$left, ',', igraph::V(tree)$right, ')', sep='')
 824 |         return(tree)
 825 |     }
 826 |     
 827 |     ## should be different from trim_tree_adaptive_width_thresh because binsignal is not monotonic
 828 |     trim_tree_adaptive_binsig_thresh <- function( tree, boundary_signal_thresh )
 829 |     {
 830 | 
 831 |         ## merge TADs that are too small    
 832 |         parent_of_nodes2rm = igraph::V(tree)[which( igraph::V(tree)$binsignal <= boundary_signal_thresh )]$name
 833 |         leaves = get_leaves( tree )
 834 |         nodes2rm = intersect(leaves,  names(unlist(igraph::ego( tree, nodes = parent_of_nodes2rm, order=1, mindist=1, mode='out' ))))
 835 | 
 836 |         while( length(nodes2rm) > 0 )
 837 |         {
 838 |             di = igraph::diameter(tree)
 839 |             node = nodes2rm[1]
 840 |             parent = igraph::ego( tree, nodes = node, order=1, mindist=1, mode='in' )[[1]]
 841 |             sibling = setdiff(igraph::ego( tree, nodes = parent, order=1, mindist=1, mode='out' )[[1]]$name, node)
 842 |             twins_of_sibling = igraph::ego( tree, nodes = sibling, order=1, mindist=1, mode='out' )[[1]]$name
 843 |             nodesOfSameParent = igraph::ego( tree, nodes = parent, order=di, mindist=1, mode='out' )[[1]]
 844 |             left_siblings = nodesOfSameParent[which(nodesOfSameParent$right < igraph::V(tree)[node]$left)]
 845 |             right_siblings = nodesOfSameParent[which(nodesOfSameParent$left > igraph::V(tree)[node]$right)]
 846 |             
 847 |             left_siblings2change_right = left_siblings[left_siblings$right == (igraph::V(tree)[node]$left-1)]
 848 |             right_siblings2change_left = right_siblings[right_siblings$left == (igraph::V(tree)[node]$right+1)]
 849 |             
 850 |             igraph::V(tree)[ left_siblings2change_right ]$right = igraph::V(tree)[node]$right
 851 |             igraph::V(tree)[ right_siblings2change_left ]$left = igraph::V(tree)[node]$left
 852 |             
 853 |             
 854 |             if(length(twins_of_sibling) > 0) tree = igraph::add_edges(tree, c( parent$name, twins_of_sibling[1], parent$name, twins_of_sibling[2]) )
 855 |             tree = tree - node - sibling
 856 |             
 857 |             igraph::V(tree)$width = igraph::V(tree)$right - igraph::V(tree)$left + 1
 858 |             
 859 |             parent_of_nodes2rm = igraph::V(tree)[which( igraph::V(tree)$binsignal <= boundary_signal_thresh )]$name
 860 |             leaves = get_leaves( tree )
 861 |             nodes2rm = intersect(leaves,  names(unlist(igraph::ego( tree, nodes = parent_of_nodes2rm, order=1, mindist=1, mode='out' ))))
 862 |         }
 863 |         igraph::V(tree)$name = paste('(',igraph::V(tree)$left, ',', igraph::V(tree)$right, ')', sep='')
 864 |         return(tree)
 865 |     }
 866 | 
 867 |     visualize_left_or_right <- function(left_or_right, ...)
 868 |     {
 869 |         len = length( left_or_right )
 870 |         plot(1, type="n", xlab="", ylab="", xlim=c(0, len), ylim=c(-2, 2))
 871 |         for( i in 1:len )
 872 |         {
 873 |             if( left_or_right[i]==1 ) segments(i-0.1, 1, i+1, 1, col='red', ...)
 874 |             if( left_or_right[i]==-1 ) segments(i-0.1, -1, i+1, -1, col='blue', ...)
 875 |         }
 876 |     }
 877 | 
 878 |     xenocraft <- function( trunk, branches )
 879 |     {
 880 |         
 881 |         xenocraft_nodes = sapply(branches, function(v) igraph::V(v)[1]$name)
 882 |         if(!all(xenocraft_nodes %in% igraph::V(trunk)$name)) stop("Check xenocraft function")
 883 |         
 884 |         children_xenocraft_nodes = unique(unlist(sapply(igraph::ego(trunk, order=igraph::diameter(trunk), node=xenocraft_nodes, mode='out', mindist=1), function(v) v$name)))
 885 |         prunned_trunck = trunk - children_xenocraft_nodes
 886 |         full_tree = Reduce( union, c(list(prunned_trunck), branches) )
 887 |         att2delete = setdiff(vertex_attr_names(full_tree), 'name')
 888 |         for( att in att2delete ) full_tree = delete_vertex_attr(full_tree, att) 
 889 |         return( full_tree )
 890 |     }
 891 | 
 892 |     fast_cor <- function(mat)
 893 |     {
 894 |         res = 1/( NROW( mat ) -1)*crossprod ( scale( mat , TRUE , TRUE ) )
 895 |         # scaled_mat = scale( mat , TRUE , TRUE )
 896 |         # res = 1/( NROW( mat ) -1)*matrix_multiplication_sym_cpp( scaled_mat )
 897 | 
 898 |         return(res)
 899 |     }
 900 | 
 901 |     fast_cor_cor <- function(mat, k)
 902 |     {
 903 |         scaled_mat = scale(mat , TRUE , TRUE )
 904 |         coeff = 1/( NROW( mat ) -1) ## there is no need to multiply this coeff when computing Pearson coeff
 905 | 
 906 |         res_begin = coeff*crossprod( scaled_mat[, 1:k], scaled_mat )
 907 | 
 908 |         for(j in (k+1):nrow(mat))
 909 |         {
 910 |             res_slice = res_begin[-1,]
 911 |             res_next = coeff*crossprod( scaled_mat[, j], scaled_mat )
 912 |             res_slice = rbind(res_slice, res_next)
 913 |             new_values = cor(t(res_slice), t(res_next))
 914 |             cat(j, '\n')        
 915 |         }
 916 |     }
 917 | 
 918 | 
 919 |     get_bin_singals_CHiP <- function(chr, hc_ordered, res, ks, ChiP_NAME, CELL_LINE, ChiP_data_already_loaded=FALSE)
 920 |     {
 921 | 
 922 |         #################
 923 |         n_bins_of_CD = sapply(res$initial_clusters, length)
 924 |         pos_start_end = lapply(res$initial_clusters, function(v)
 925 |         {
 926 |             bins_ori = as.numeric(res$bin_names[v])
 927 |             from_id = min(bins_ori)
 928 |             to_id = max(bins_ori)
 929 |             start_pos = (from_id-1)*bin_size + 1
 930 |             end_pos = to_id*bin_size
 931 |             return(c(start_pos, end_pos))
 932 |         })
 933 | 
 934 |         pos_start_end = do.call(rbind, pos_start_end)
 935 |         #################
 936 | 
 937 |         ordered_bin_signals_ALL_k_list = lapply(ks, function(k){
 938 | 
 939 |             hc_k_labels = get_cluser_levels(hc_ordered, k_clusters=k, balanced_4_clusters=FALSE)$cluster_labels
 940 | 
 941 |             if(is.null(ChiP_NAME))
 942 |             {
 943 |                 CD_index = labels(hc_ordered)
 944 |                 pos_start_end = pos_start_end[match( CD_index, rownames(pos_start_end) ), ]
 945 |                 ordered_bin_signals_ALL = data.frame(CD_index=CD_index, n_bins=n_bins_of_CD[CD_index], pos_start=pos_start_end[,1], pos_end=pos_start_end[,2],  compartment=hc_k_labels[CD_index])
 946 |                 return(ordered_bin_signals_ALL)
 947 |             }
 948 | 
 949 |             cluster_signal = get_names_by_H3k4me1(chr, res, ChiP_NAME, kmeans_cluster_assigment=hc_k_labels, CELL_LINE=CELL_LINE, ChiP_data_already_loaded=ChiP_data_already_loaded)
 950 |             # cluster_signal = rbind(cluster_signal, sorted_coverage)
 951 |             ordered_bin_signals_ALL = apply(cluster_signal, 1, function(v)
 952 |             {
 953 |                 bin_signals = v[hc_k_labels]
 954 |                 names(bin_signals) = names(hc_k_labels)
 955 |                 ordered_bin_signals = bin_signals[labels(hc_ordered)] ## order by dendro topology
 956 |                 return(ordered_bin_signals)
 957 |             })
 958 | 
 959 |             ordered_bin_signals_ALL = as.data.frame(ordered_bin_signals_ALL)
 960 |             # CD_index = rownames(ordered_bin_signals_ALL)
 961 |             CD_index = labels(hc_ordered)
 962 | 
 963 |             pos_start_end = pos_start_end[match( CD_index, rownames(pos_start_end) ), ]
 964 | 
 965 |             ordered_bin_signals_ALL = data.frame(CD_index=CD_index, n_bins=n_bins_of_CD[CD_index], pos_start=pos_start_end[,1], pos_end=pos_start_end[,2],  compartment=hc_k_labels[CD_index], ordered_bin_signals_ALL)
 966 |             return(ordered_bin_signals_ALL)
 967 |             })
 968 |         names(ordered_bin_signals_ALL_k_list) = paste0('clusters_', ks)
 969 |         names(ordered_bin_signals_ALL_k_list)[which(names(ordered_bin_signals_ALL_k_list)=="clusters_Inf")] = 'compartment_domains'
 970 |         return(ordered_bin_signals_ALL_k_list)
 971 |     }
 972 | 
 973 | 
 974 | 
 975 |         densplot = function(x,y,points = FALSE, pch=19, cex=1, xlim=c(min(x),max(x)), ylim=c(min(y),max(y)), ...){
 976 |         df = data.frame(x,y)
 977 |         d = densCols(x,y, colramp=colorRampPalette(c("black", "white")))
 978 |         df$dens = col2rgb(d)[1,] + 1L
 979 |         
 980 |         cols = colorRampPalette(c("#000099", "#00FEFF", "#45FE4F","#FCFF00", "#FF9400", "#FF3100"))(256)
 981 |         
 982 |         df$col = cols[df$dens]
 983 |         df=df[order(df$dens),]
 984 |         if(points)
 985 |                 points(df$x,df$y, pch=pch, col=df$col, cex=cex, ...)
 986 |         else
 987 |                 plot(df$x,df$y, pch=pch, col=df$col, cex=cex, xlim=xlim, ylim=ylim, ...)
 988 |         }
 989 | 
 990 | 
 991 | 
 992 |     my_merge = function(...) merge(..., all=TRUE)
 993 |     
 994 |     
 995 |     build_chr_bin_domain_fun <- function( CELL_LINE, chrs, cluster_level, p_thresh, ob_oe, downsratio=NULL, compress_size=10 )
 996 |     {
 997 |         chrs = as.character(chrs)
 998 |         chr_bin_domain_tmp = lapply(chrs, function(chr) get_clusters_bins_xy(CELL_LINE, chr, cluster_level, p_thresh, ob_oe, downsratio=downsratio, compress_size=compress_size))
 999 |         names( chr_bin_domain_tmp ) = chrs ## can use mapply
1000 |         chr_bin_domain = lapply(chrs, function(v) 
1001 |         {
1002 |             chr_bin_domain_ind = data.frame( chr=paste0('chr', v), bin_index=as.numeric(unlist(chr_bin_domain_tmp[[v]])), intra_domain=rep(names(chr_bin_domain_tmp[[v]]), sapply(chr_bin_domain_tmp[[v]], length)) )
1003 |             chr_bin_domain_ind = chr_bin_domain_ind[order(chr_bin_domain_ind$bin_index), ]
1004 |             return(chr_bin_domain_ind)
1005 |         })
1006 |         names(chr_bin_domain) = chrs
1007 |         res = do.call(rbind, chr_bin_domain)
1008 |         rownames(res) = NULL
1009 |         return( res )
1010 |     }
1011 | 
1012 |     build_chr_bin_domain_fun_direct <- function( chr, initial_clusters, cluster_vec, bin_names ) ## directly from R workingspace instead of loading
1013 |     {
1014 |         chrs = as.character(chr) ## for historic reason, chr -- chrs
1015 |         chr_bin_domain_tmp = lapply(chrs, function(chr) get_cluster_bin_names(initial_clusters, cluster_vec, bin_names))
1016 |         names( chr_bin_domain_tmp ) = chrs ## can use mapply
1017 |         chr_bin_domain = lapply(chrs, function(v) data.frame( chr=paste0('chr', v), bin_index=as.numeric(unlist(chr_bin_domain_tmp[[v]])), intra_domain=rep(names(chr_bin_domain_tmp[[v]]), sapply(chr_bin_domain_tmp[[v]], length)) ))
1018 |         names(chr_bin_domain) = chrs
1019 |         return( do.call(rbind, chr_bin_domain) )
1020 |     }
1021 | 
1022 |     get_clusters_bins_xy <- function(CELL_LINE, chr, cluster_level, p_thresh, ob_oe='oe', downsratio, compress_size, sort=TRUE)
1023 |     {
1024 |         if(ob_oe=='oe') sub_folder = paste0('./', CELL_LINE, '/oe_chr_', chr, '_', bin_size/1E3, 'kb_', compress_size, 'to1_', p_thresh)
1025 |         if(ob_oe=='ob') sub_folder = paste0('./', CELL_LINE, '/ob_chr_', chr, '_', bin_size/1E3, 'kb_', compress_size, 'to1_', p_thresh)
1026 | 
1027 |         if(!is.null(downsratio)) compartments_Rdata_file = paste0(sub_folder, '/chr', chr, '_compartments_atanh_log_AB', ws, 'downsratio_', downsratio, '.Rdata')
1028 | 
1029 |         if(is.null(downsratio)) compartments_Rdata_file = paste0(sub_folder, '/chr', chr, '_compartments_atanh_log_AB3_3.Rdata')
1030 |         
1031 |         load(compartments_Rdata_file)
1032 |         clusters_bins = get_cluster_bin_names(sort=sort, res$initial_clusters, res$clusters[, cluster_level], res$bin_names)
1033 |         rm( res )
1034 |         return(clusters_bins)
1035 |     }
1036 | 
1037 | 
1038 |     get_cluster_bin_names <- function(initial_clusters, cluster_vec, bin_names, sort=TRUE)
1039 |     {
1040 |         if(sort==TRUE) cluster_indices = sort(unique(cluster_vec), decreasing=TRUE)
1041 |         if(sort==FALSE) cluster_indices = unique(cluster_vec)
1042 | 
1043 |         cluster_bins = lapply(cluster_indices, function(v)
1044 |         {
1045 |             indices = which(cluster_vec==v)
1046 |             bin_names[unlist(initial_clusters[indices])]
1047 |         })
1048 |         names(cluster_bins) = cluster_indices
1049 |         return(cluster_bins)
1050 |     }
1051 | 
1052 |     # ave_cor <- function(mat, seg_len) ## seg_len is the length of a segment
1053 |     # {
1054 |     #   d = 1:nrow(mat)
1055 |     #   seq_index = split(d, ceiling(seq_along(d)/10))
1056 |     #   tmp = simplify2array( lapply(seq_index[1:22], function(v) fast_cor( mat[v, ] )/10*length(v)) )
1057 |     #   ave_cor_val = rowMeans(tmp, dims = 2)
1058 |     # }
1059 | 
1060 |     # if( kmeans_cluster_assigment ): compute the cluster_level signals
1061 |    
1062 | 
1063 |     reorder_dendro <- function(hc_object, named_weights, return_g=FALSE, aggregateFun=mean)
1064 |     {
1065 |         get_children <- function(g, root_node)
1066 |         {
1067 |             leaves = igraph::V(g)[igraph::degree(g)==1]$name
1068 |             children = intersect(leaves, igraph::ego(g, mode='out', root_node, order=igraph::diameter(g))[[1]]$name)
1069 |             return(children)
1070 |         }
1071 | 
1072 |         # hc_dendro = as.dendrogram(hc_object)
1073 |         # cat('hello I am here\n')
1074 |         # print(class(hc_object))
1075 |         g = igraph::as.igraph(ape::as.phylo(hc_object))
1076 |         # stop('I am here')
1077 |         leaves = igraph::V(g)[igraph::degree(g)==1]$name
1078 | 
1079 |         igraph::V(g)$weight = sapply(1:igraph::vcount(g), function(v) aggregateFun(named_weights[get_children(g, v)]))
1080 | 
1081 |         if(return_g==TRUE) return(g)
1082 | 
1083 |         swap_branches <- function(g, root_node)
1084 |         {
1085 |             twins = igraph::ego(g, mode='out', root_node, order=1, mindist=1)[[1]]$name
1086 |             twins_weight = igraph::V(g)[twins]$weight
1087 | 
1088 |             if(twins_weight[1] > twins_weight[2])
1089 |             {
1090 |                 children_of_twin_A = get_children(g, twins[1])
1091 |                 children_of_twin_B = get_children(g, twins[2])
1092 |                 leaves = swap_names( leaves, children_of_twin_A, children_of_twin_B )
1093 |             }
1094 |             return(leaves)
1095 |         }
1096 | 
1097 |         swap_names <- function( leaves, names2swap_A, names2swap_B )
1098 |         {
1099 |             names2swap_indices = match(c(names2swap_A, names2swap_B), leaves)
1100 |             leaves[ names2swap_indices ] = c(names2swap_B, names2swap_A)
1101 |             return(leaves)
1102 |         }
1103 | 
1104 |         for( root_node in setdiff(igraph::V(g)$name, leaves) )
1105 |         {
1106 |             leaves = swap_branches(g, root_node)
1107 |             # cat(leaves, '\n')
1108 |         }
1109 |             
1110 |         return(leaves)
1111 |     }
1112 | 
1113 | 
1114 |     get_cluser_assignment = function(hc, k_clusters, leaves_hclust_pc)
1115 |     {
1116 |         clusters_raw = cutree(hc, k_clusters)
1117 |         clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v))
1118 | 
1119 |         od = rank(sapply( clusters, function(v) sort(match(v, as.numeric(leaves_hclust_pc)))[1]))
1120 | 
1121 |         cluster_assignment = numeric(sum(sapply(clusters, length)))
1122 |         for(j in 1:length(clusters)) cluster_assignment[clusters[[j]]] = od[j]
1123 |         return( cluster_assignment )
1124 |     }
1125 | 
1126 | 
1127 | 
1128 |     get_cluster_boudaries <- function(hc, k_clusters, named_weights)
1129 |     {
1130 |         len = length(labels(hc))
1131 |         clusters_raw = cutree(hc, k_clusters)
1132 |         clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v))
1133 |         clusters_pc_rank = lapply(clusters, function(v) unname(sort(rank(named_weights)[v])))
1134 |         bondaries = sapply(clusters_pc_rank, function(v) tail(v,1))
1135 |         return(setdiff(bondaries, c(1, len)))   
1136 |     }
1137 | 
1138 |     # get_cluser_vector <- function(hc, k_clusters, named_weights) ## get cluster assignment of 1,2,3,4,5... (A1, A2, ...)
1139 |     # {
1140 |     #   clusters_raw = cutree(hc, k_clusters)
1141 |     #     clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v))
1142 |     #     clusters_pc_rank = unname(rank(sapply(clusters, function(v) unname(sort(rank(named_weights)[v]))[1])))
1143 |     #     cluster_vector = numeric()
1144 |     #     for( i in 1:length(clusters) ) cluster_vector[ clusters[[i]] ] = clusters_pc_rank[i]
1145 |     #     return( cluster_vector )  
1146 |     # }
1147 | 
1148 |     ## if return binary tree, A1, A2, B1, B2 are forced to be returned
1149 |     get_cluser_levels <- function(hc_ordered, k_clusters, balanced_4_clusters=FALSE) ## get the detailed A1, A2, ..., B1, B2...
1150 |     {
1151 |         assign_twins_name <- function(graph, node)
1152 |         {
1153 |             twins = igraph::ego(graph, node, mode='out', order=1, mindist=1)[[1]]$name
1154 |             igraph::V(graph)[twins]$level_name = paste0(igraph::V(graph)[node]$level_name, '.', c(2,1))
1155 |             if(node==igraph::V(graph)[1]$name) igraph::V(graph)[twins]$level_name = c('B', 'A')
1156 |             return(graph)
1157 |         }
1158 | 
1159 |         if(k_clusters==Inf) k_clusters = length(labels(as.dendrogram(hc_ordered)))
1160 |         #################################################
1161 |         # cat('haha I am here\n')
1162 |         # print(class(hc_ordered))
1163 |         graph = igraph::as.igraph(ape::as.phylo(hc_ordered))
1164 |         leave_names = get_leaves(graph)
1165 | 
1166 |         bfs_names = igraph::bfs(graph, 1)$order$name
1167 |         dfs_names = igraph::dfs(graph, 1)$order$name
1168 | 
1169 |         igraph::V(graph)[1]$level_name = ''
1170 |         for( node in bfs_names )
1171 |         {
1172 |             graph = assign_twins_name(graph, node)
1173 |             # if( !any(is.na(igraph::V(graph)[common_father_name]$level_name)) ) break ## when all common_father_name have level_name
1174 |             if( !any(is.na(igraph::V(graph)$level_name)) ) break ## when all common_father_name have level_name
1175 |         }
1176 |         
1177 |         if( balanced_4_clusters==TRUE ) ##A1, A2, B1, B2
1178 |         {
1179 |             branch_root_name = c('A.1', 'A.2', 'B.1', 'B.2')
1180 |             branch_root = match(branch_root_name, igraph::V(graph)$level_name)
1181 | 
1182 |             children = igraph::ego(graph, order = igraph::diameter(graph), nodes = branch_root, mode = 'out', mindist = 0) ## mindist==0, when itself is a branch
1183 |             tmp = lapply( children, function(v) intersect(leave_names, v$name) )
1184 |             cluster_labels = rep(branch_root_name, sapply(tmp, length))
1185 |             names(cluster_labels) = unlist( tmp )
1186 |             cluster_labels = cluster_labels[ as.character(1:length(cluster_labels)) ]
1187 |             return(cluster_labels)
1188 |         }
1189 | 
1190 |         #################################################
1191 |         clusters_raw = cutree(hc_ordered, k_clusters)[labels(hc_ordered)] ## labels are ordered according to pc 
1192 |         clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v))[unique(clusters_raw)]
1193 |         names(clusters) = 1:k_clusters # named by pc order 
1194 |         
1195 |         ## get the vector. Named from 1 to 5 from left leaves to right leaves
1196 |         cluster_vector = rep( 1:k_clusters, sapply(clusters, length) )
1197 |         names(cluster_vector) = labels(hc_ordered)
1198 |         cluster_vector = cluster_vector[ as.character( sort(as.numeric(names(cluster_vector))) ) ]
1199 | 
1200 |         #################################################
1201 | 
1202 |         ## which node is the common father of all nodes in a cluster
1203 |         common_father_index = sapply( clusters, function(u) 
1204 |             {
1205 |                 if(length(u)==1) return( which(u==igraph::V(graph)$name) )
1206 |                 return(max(which(sapply(igraph::ego(graph, order = igraph::diameter(graph), nodes = igraph::V(graph), mode = 'out', mindist = 1), function(v) all(u %in% v$name) ))))
1207 |             })
1208 | 
1209 |         common_father_name = igraph::V(graph)[common_father_index]$name
1210 | 
1211 | 
1212 | 
1213 | 
1214 | 
1215 |         ## whether common_father_name are ordered
1216 |         if(!all(diff(match( common_father_name, dfs_names )) >= 0)) stop('!!!!!!!!check get_all_children in header_funs.R')
1217 |         
1218 | 
1219 | 
1220 |         level_names = igraph::V(graph)[common_father_name]$level_name ## label names are ordered
1221 |         if(any(sort(level_names, decreasing=TRUE)!=level_names)) warning('!!!!!!!!Need to check get_all_children in header_funs.R')
1222 | 
1223 |         cluster_labels = rep( level_names, sapply(clusters, length) )
1224 |         names(cluster_labels) = labels(hc_ordered)
1225 |         cluster_labels = cluster_labels[ as.character( sort(as.numeric(names(cluster_labels))) ) ]
1226 | 
1227 |         return( list(cluster_vector=cluster_vector, cluster_labels=cluster_labels))
1228 |     }
1229 | 
1230 | 
1231 | 
1232 |      
1233 |     get_hkmeans_cluser_levels <- function(hk_cluster_centers, PC1)
1234 |     {
1235 |         hc = hclust(as.dist(my_dist(hk_cluster_centers)), method='com')
1236 |         reordered_names = as.character(rank(PC1))
1237 |         hc_ordered = dendextend::rotate(hc, reordered_names) 
1238 |         hk_clust_labels = get_cluser_levels(hc_ordered, k_clusters=length(PC1))$cluster_labels
1239 |         return( hk_clust_labels )
1240 |     }   
1241 | 
1242 | 
1243 |     
1244 |     ## names_A_final is the rownames of the A_final
1245 |     ## names_A_final is matched to the names of the starting contact matrix
1246 |     
1247 |     get_original_tad_indices <- function(names_A_final, TADs, bin_size)
1248 |     {
1249 |         start_pos = as.numeric(names_A_final[TADs$start_pos])
1250 |         end_pos = as.numeric(names_A_final[TADs$end_pos])
1251 |         start_pos_ori = (start_pos - 1)*bin_size + 1
1252 |         end_pos_ori = end_pos*bin_size
1253 |         TADs = data.frame( start_pos_ori=start_pos_ori, end_pos_ori=end_pos_ori )
1254 |         return( TADs )
1255 |     }
1256 |     
1257 | 
1258 | 
1259 | 
1260 |     ## This code updates the branch name 
1261 |     ## Branches obtained from branches = lapply( res_inner, get_tree_v0 )
1262 |     ## The original branch name start from 1
1263 |     update_branch_name <- function(branch, root_start)
1264 |     {
1265 |         igraph::V(branch)$left = igraph::V(branch)$left + root_start - 1
1266 |         igraph::V(branch)$right = igraph::V(branch)$right + root_start - 1
1267 |         igraph::V(branch)$name = paste('(',igraph::V(branch)$left, ',', igraph::V(branch)$right, ')', sep='')
1268 |         return(branch)
1269 |     }
1270 | 
1271 | 


--------------------------------------------------------------------------------
/R/generate_compartments_bed_fun.R:
--------------------------------------------------------------------------------
 1 | 	## Yuanlong LIU, 09-08-2018
 2 | 
 3 | 	generate_compartments_bed <- function(input_mat, p_thresh, out_file_name, chr, window.sizes=3, stat_window_size=NULL, bin_size)
 4 | 	{	
 5 | 		input_mat_extended = data.frame(chr=paste0('chr', chr), pos_start=0:(nrow(input_mat)-1), pos_end=1:nrow(input_mat), mat=input_mat)
 6 | 		res_input_mat = TopDom_v2(input_mat_extended, window.size=NULL, NULL, T, p_thresh=p_thresh, window.sizes=window.sizes, stat_window_size=stat_window_size, domain_size_min=NULL)
 7 | 		# return(res_input_mat)
 8 | 		to_id = as.numeric(rownames(input_mat)[res_input_mat$domain$to.id])
 9 | 		from_id = as.numeric(rownames(input_mat)[res_input_mat$domain$from.id])
10 | 		
11 | 		start_poses = (from_id-1)*bin_size + 1
12 | 		# end_poses = start_poses + n2one*bin_size
13 | 		end_poses = start_poses + bin_size
14 | 		
15 | 		input_mat_compartments_bed = data.frame(paste0('chr', chr), as.character(start_poses), as.character(end_poses) )
16 | 		if(!is.null(out_file_name)) write.table( input_mat_compartments_bed, file=out_file_name, quote=FALSE, row.names=FALSE, col.names=FALSE, sep=' ' )
17 | 		return( res_input_mat )
18 | 	}
19 | 	


--------------------------------------------------------------------------------
/R/post_processing_nested_domains.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 	get_cluster_index <- function(pos, initial_clusters, bin_names, bin_size)
  3 | 	{
  4 | 		compartment_segs = generate_compartment_segs( initial_clusters )
  5 | 		CDs = get_original_tad_indices( bin_names, compartment_segs, bin_size=bin_size )
  6 | 		cluster_index = which(apply( CDs, 1, function(v) (v[1] < pos)&(v[2]>pos)  )==1)
  7 | 		return(cluster_index)
  8 | 	}
  9 | 
 10 | 	get_original_tad_indices_extra <- function(names_A_final, TADs, bin_size) ## to get the TADs for the extra edges
 11 |     {
 12 |         # start_pos = as.numeric(names_A_final[TADs$start_pos])
 13 |         end_pos = as.numeric(names_A_final[TADs$end_pos])
 14 |         # start_pos_ori = (start_pos - 1)*bin_size + 1
 15 |         end_pos_ori = end_pos*bin_size
 16 |         TADs = data.frame( start_pos_ori=end_pos_ori+1, end_pos_ori=end_pos_ori )
 17 |         return( TADs )
 18 |     }
 19 | 
 20 | 	
 21 | 	LikelihoodRatioTest <- function(sub_domains_raw, ncores, remove_zero=FALSE, distr, n_parameters=3, imputation_num=1E2, A_already_corrected=FALSE)
 22 | 	{
 23 | 		# require( doParallel )
 24 | 		pA_sym = sub_domains_raw$pA_sym
 25 | 		if(A_already_corrected==TRUE) cpA_sym = pA_sym
 26 | 		## CAN SPEEDUP correct_A_fast_divide_by_mean BECAUSE ONLY SOME OFF-DIAGNAL LINES NEED TO BE CORRECTED
 27 | 		if(A_already_corrected==FALSE) cpA_sym = correct_A_fast_divide_by_mean(pA_sym, remove_zero=remove_zero) ## corrected pA_sym
 28 | 		trees = foreach::foreach( j =1:length( sub_domains_raw$res_inner ) ) %do% 
 29 | 		{
 30 | 			name_index = rownames(sub_domains_raw$pA_sym)[sub_domains_raw$segmentss[j,1]:sub_domains_raw$segmentss[j,2]]
 31 | 			sub_domains_raw$res_inner[[j]]$cA = as.matrix(cpA_sym[name_index, name_index]) ## the corrected A. Use this matrix is essential for reducing the distance-based false positves
 32 | 			
 33 | 			## for example, when the number of invovled bins in sub_domains_raw$res_inner[[j]] is too small
 34 | 			# tmp = try(get_tree_decoration( sub_domains_raw$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num ))
 35 | 			tmp = get_tree_decoration( sub_domains_raw$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num )
 36 | 			# if( class(tmp)=="try-error" )
 37 | 			if(class(tmp)!='igraph') ## if(tmp=='bad_tree')
 38 | 			{
 39 | 				root_tree = igraph::graph.empty() + 'root'
 40 | 				igraph::V(root_tree)$width = nrow(sub_domains_raw$res_inner[[j]]$A)
 41 | 				igraph::V(root_tree)$left = 1
 42 | 				igraph::V(root_tree)$right = igraph::V(root_tree)$width
 43 | 				tmp = root_tree ## represented by the length of the segment
 44 | 			}
 45 | 			tmp
 46 | 		}
 47 | 		return(trees)	
 48 | 	}
 49 | 
 50 | 	
 51 | 	# pipline <- function(chr, TADs_raw, TADs_type, just_save_no_nest=FALSE, bin_size) ## TADs_tmp: start end
 52 | 	create_TADs <- function(sub_domains_raw, chr, TADs_raw, TADs_type, just_save_no_nest=FALSE, bin_size) ## TADs_tmp: start end
 53 | 	{
 54 | 		chr_name = paste0('chr', chr)
 55 | 		bin_size_kb = bin_size / 1E3
 56 | 
 57 | 		if(just_save_no_nest)
 58 | 		{
 59 | 			TADs = get_original_tad_indices( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )
 60 | 			TADs = cbind(chr_name, TADs)
 61 | 			save( TADs, file=Tads_R_File )
 62 | 		}
 63 | 		TADs = get_original_tad_indices( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )
 64 | 		# if(TADs_type=='extra') TADs = get_original_tad_indices_extra( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )
 65 | 
 66 | 		# print(TADs)
 67 | 
 68 | 		TADs = cbind(chr_name, TADs)
 69 | 		return(TADs)
 70 | 	}
 71 | 
 72 | 	#####################################################################
 73 | 
 74 | 	## INDEED, WHEN RESOLUTION == 40KB, DO NOT REMOVE
 75 | 	clean_TADs_all = function(TADs_all_raw, CDs, bin_size)
 76 | 	{
 77 | 		TADs_all_end = TADs_all_raw[,2]
 78 | 		CD_end = CDs[,2]
 79 | 		outer_mat = outer(TADs_all_end, CD_end, "-")
 80 | 		min_dist = apply(outer_mat, 1, function(v) min(abs(v)))
 81 | 		# end2rm = TADs_all_end[(min_dist <= 3) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary, at least 40kb		
 82 | 		end2rm = TADs_all_end[(min_dist < 40E3/bin_size) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary, at least 40kb		
 83 | 		## INDEED, WHEN RESOLUTION == 40KB, DO NOT REMOVE
 84 | 
 85 | 
 86 | 		TADs_all_head = TADs_all_raw[,1]
 87 | 		CD_head = CDs[,1]
 88 | 		outer_mat = outer(TADs_all_head, CD_head, "-")
 89 | 		min_dist = apply(outer_mat, 1, function(v) min(abs(v)))
 90 | 		head2rm = TADs_all_head[(min_dist < 40E3/bin_size) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary	
 91 | 		TADs_all = subset(TADs_all_raw, (!(start_pos %in% head2rm)) & (!(end_pos %in% end2rm)) )	
 92 | 		return(TADs_all)
 93 | 	}
 94 | 
 95 | 	post_process_sub_domains = function(chr, sub_domains_raw, ncores, out_dir, bin_size)
 96 | 	{
 97 | 		
 98 | 		distr=c('lnorm', 'wilcox')[2]
 99 | 		remove_zero = FALSE
100 | 		n_parameters = 3
101 | 		imputation_num = 1E2
102 | 		A_already_corrected = FALSE
103 | 
104 | 		decorated_branches = LikelihoodRatioTest(sub_domains_raw=sub_domains_raw, ncores=ncores, remove_zero=FALSE, distr=distr, n_parameters=n_parameters, imputation_num=imputation_num, A_already_corrected=A_already_corrected)
105 | 		chr_name = paste0('chr', chr)
106 | 
107 | 
108 | 		mean_diff_thresh = -0.1
109 | 		i = 1
110 | 		# for(i in 1:length(p0s))
111 | 		{
112 | 			# cat(i, '\n')
113 | 			# p0 = p0s[i]
114 | 
115 | 			normal_decorated_branches = decorated_branches[sapply(decorated_branches, igraph::vcount) > 1]
116 | 			# ps = sort(unlist(lapply(normal_decorated_branches, function(v) igraph::V(v)$wilcox_p)), decreasing=FALSE) ## fdr correction
117 | 			# p0_adj = ps[min(which(p.adjust(ps, method = 'fdr') > p0))]
118 | 			# p0_adj = ps[min(which(p.adjust(ps, method = 'bonferroni') > p0))]
119 | 			p0_adj = 0.05
120 | 
121 | 			TADs_all_raw = prunning(decorated_branches, to_correct=FALSE, p0=p0_adj, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=TRUE, CD_border_adj=FALSE, top_down=TRUE, mean_diff_thresh=mean_diff_thresh)
122 | 
123 | 			# CDs = prunning(decorated_branches, to_correct=FALSE, p0=p0_adj, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=FALSE, CD_border_adj=FALSE, top_down=TRUE); cat(nrow(CDs), '\n')
124 | 			CDs = prunning(decorated_branches, to_correct=FALSE, p0=-1, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=FALSE, CD_border_adj=FALSE, top_down=TRUE, mean_diff_thresh=mean_diff_thresh)
125 | 			# if(nrow(CDs)!=length(res$initial_clusters)) stop('nrow(CDs)!=length(res$initial_clusters)')
126 | 
127 | 			TADs_all = clean_TADs_all(TADs_all_raw, CDs, bin_size=bin_size)
128 | 
129 | 			# Tad_edges <- sort(unique(c(TADs_tmp[,1], (TADs_tmp[,2]+1))))
130 | 			TADs_all_edges <- sort(unique(TADs_all[,2]))
131 | 			CD_edges <- sort(unique(CDs[,2]))
132 | 			
133 | 			TADs_extra_edges = unique(setdiff(TADs_all_edges, CD_edges))
134 | 			TADs_extra = data.frame(start_pos=TADs_extra_edges, end_pos= TADs_extra_edges)
135 | 
136 | 
137 | 			TADs_pos_all = create_TADs(sub_domains_raw=sub_domains_raw, chr, TADs_all, TADs_type='ALL', bin_size=bin_size)
138 | 			TADs_pos_extra = create_TADs(sub_domains_raw=sub_domains_raw, chr, TADs_extra, TADs_type='extra', bin_size=bin_size)		
139 | 			TADs_pos_CD = create_TADs(sub_domains_raw=sub_domains_raw, chr, CDs, TADs_type='CD', bin_size=bin_size)
140 | 
141 | 			# TADs_info = list(decorated_branches=decorated_branches, TADs_pos_all=TADs_pos_all, TADs_pos_CD=TADs_pos_CD)
142 | 			TADs_info = list(decorated_branches=decorated_branches, TADs_all=TADs_all, CDs=CDs, TADs_pos_all=TADs_pos_all, TADs_pos_CD=TADs_pos_CD, TADs_pos_extra=TADs_pos_extra)
143 | 		}
144 | 
145 | 
146 | 		options(stringsAsFactors=FALSE)
147 | 	    level_5 = TADs_info$TADs_pos_extra[, c('chr_name', 'end_pos_ori')]
148 | 	    colnames(level_5)[1:2] = c('chr', 'nested_boundary')
149 | 	    sub_domain_boundary_bed_file = paste0(out_dir, '/chr', chr, '_nested_boundaries.bed')
150 | 
151 | 	    level_5_bed = data.frame(level_5, level_5[,2], '', '.', level_5, level_5[,2], '#000000')
152 | 	    op <- options(scipen=999)
153 | 	    write.table( level_5_bed, file=sub_domain_boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE )
154 | 
155 | 	    on.exit(options(op))
156 | 
157 | 		return(NULL)
158 | 	}
159 | 
160 | 
161 | 


--------------------------------------------------------------------------------
/R/prunning.R:
--------------------------------------------------------------------------------
  1 | trim_tree_adaptive_top_down_v2 = function( tree, wilcox_p_thresh, mean_diff_thresh )
  2 |         {
  3 |                 # leaves = get_leaves(tree)
  4 |                 if(igraph::vcount(tree)==1) return(tree)
  5 |                 # cat('I am in trim_tree_adaptive_top_down_v2\n')
  6 |                 nsig_nodes = union( igraph::V(tree)[which(igraph::V(tree)$wilcox_p > wilcox_p_thresh)]$name, igraph::V(tree)[which(igraph::V(tree)$mean_diff > mean_diff_thresh)]$name )
  7 | 
  8 | 
  9 |                 children_of_nsig = names(unlist(igraph::ego(tree, order=1, node=nsig_nodes, mode='out', mindist=1)))
 10 |                 if(length(children_of_nsig)!=0) trimmed_tree = tree - children_of_nsig
 11 |                 if(length(children_of_nsig)==0) trimmed_tree = tree
 12 | 
 13 |                 comps = igraph::decompose(trimmed_tree)
 14 |                 root_index = which(sapply( comps, function(comp) igraph::V(tree)[1]$name %in% igraph::V(comp)$name )==1)
 15 |                 trimmed_tree = comps[[root_index]]
 16 |                 if(!is_binary_tree( trimmed_tree )) stop("trim_tree_adaptive_top_down, the resulted tree is not a binary tree")
 17 |                 return( trimmed_tree )
 18 |         }
 19 | 
 20 | 
 21 | prunning = function(branches, p0, to_correct=FALSE, width_thresh=-Inf, width_thresh_CD=5, boundary_signal_thresh=-Inf, return_which='TADs', top_down=FALSE, all_levels=FALSE, CD_border_adj=FALSE, peak_thresh=NULL, mean_diff_thresh)
 22 | 	{
 23 | 
 24 | 		size2correct = sum(sapply(branches, igraph::vcount)) - sum(sapply(branches, function(v) length(get_leaves(v))))
 25 | 		p_thresh = p0/size2correct
 26 | 		if(to_correct==FALSE) p_thresh=p0
 27 | 		# if(!is.null(p0)) p_thresh = p0
 28 | 
 29 | 		
 30 | 		if(top_down==FALSE)
 31 | 		{
 32 | 			trimmed_branches = lapply( branches, trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=boundary_signal_thresh, peak_thresh=peak_thresh )
 33 | 			# size2correct = sum(sapply(trimmed_branches, igraph::vcount)) - sum(sapply(trimmed_branches, function(v) length(get_leaves(v))))
 34 | 			
 35 | 			# for( i in 1:length( trimmed_branches ) )
 36 | 			# {
 37 | 				# trimmed_branch = trimmed_branches[[i]]
 38 | 				# if(igraph::vcount(trimmed_branch) > 1) trimmed_branches[[i]] = lapply( trimmed_branches[i], trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=-1 )[[1]]
 39 | 			# }
 40 | 			
 41 | 
 42 | 		}
 43 | 		
 44 | 		# if(top_down==TRUE) trimmed_branches = lapply( branches, trim_tree_adaptive_top_down, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=boundary_signal_thresh )
 45 | 		if(top_down==TRUE) trimmed_branches = lapply( branches, function(branch) trim_tree_adaptive_top_down_v2(wilcox_p_thresh=p_thresh, mean_diff_thresh=mean_diff_thresh, tree=branch ))
 46 | 
 47 | 		if(CD_border_adj==TRUE)
 48 | 		{
 49 | 			all_tads = get_adjusted_nested_TADs( trimmed_branches, width_thresh_CD, all_levels )
 50 | 			return( all_tads )
 51 | 		}
 52 | 		
 53 | 		## get all nested TADs in trimmed_branches
 54 | 		if(all_levels==TRUE)
 55 | 		{
 56 | 			all_tads = data.frame(start_pos=numeric(), end_pos=numeric())
 57 | 			widths = c(0, sapply(trimmed_branches, function(v) igraph::V(v)[1]$width))
 58 | 			for(i in 1:length(trimmed_branches)) 
 59 | 			{
 60 | 				all_tads_i = get_all_tads_in_a_trimmed_branch(trimmed_branches[[i]], pos_shift=sum(widths[1:i]))
 61 | 				all_tads = rbind(all_tads, all_tads_i)
 62 | 			}
 63 | 			return( all_tads )
 64 | 		}
 65 | 					
 66 | 		if( return_which=='trimmed_branches' ) return( trimmed_branches )
 67 | 		
 68 | 		tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width )
 69 | 		tad_sizes = unlist(tad_sizes_ind)
 70 | 		# tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes))
 71 | 		end_pos = cumsum(tad_sizes)
 72 | 		start_pos = c(1, 1 + end_pos[-length(end_pos)])
 73 | 		tads = data.frame(start_pos=start_pos, end_pos=end_pos)
 74 | 		return( tads )
 75 | 	}
 76 | 
 77 | 
 78 | 
 79 | 
 80 | 	## This function combines prunning with branches of only one node
 81 | 	prunning_hybrid <- function(branches, ...)
 82 | 	{
 83 | 		names(branches) = as.character(1:length(branches))
 84 | 		normal_branches = branches[sapply( branches, function(v) class(v)=='igraph' )]
 85 | 		unnormal_branches = branches[sapply( branches, function(v) class(v)!='igraph' )] ## that is reprsented as bin_start:bin_end
 86 | 
 87 | 		trimmed_branches = prunning(normal_branches, return_which='trimmed_branches', ...)
 88 | 
 89 | 		normal_tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width )
 90 | 		unormal_tad_sizes_ind = unnormal_branches
 91 | 		tad_sizes_ind = c(normal_tad_sizes_ind, unormal_tad_sizes_ind)
 92 | 		tad_sizes_ind = tad_sizes_ind[names(branches)]
 93 | 		tad_sizes = unlist(tad_sizes_ind)
 94 | 
 95 | 		# tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes))
 96 | 		end_pos = cumsum(tad_sizes)
 97 | 		start_pos = c(1, 1 + end_pos[-length(end_pos)])
 98 | 		tads = data.frame(start_pos=start_pos, end_pos=end_pos)
 99 | 		return( tads )
100 | 	}
101 | 
102 | 
103 | 	get_all_tads_in_a_trimmed_branch <- function(trimmed_branch, pos_shift)
104 | 	{
105 | 		res = data.frame( start_pos=igraph::V(trimmed_branch)$left + pos_shift, end_pos=igraph::V(trimmed_branch)$right + pos_shift )
106 | 		res = res[order(res[,1], res[,2]), ]
107 | 		return(res)
108 | 	}
109 | 	
110 | 	prunning_bottom_up <- function(branches, p0=NULL, width_thresh)
111 | 	{
112 | 		size2correct = sum(sapply(branches, igraph::vcount)) - sum(sapply(branches, function(v) length(get_leaves(v))))
113 | 		
114 | 		p_thresh = 0.05/size2correct
115 | 		if(!is.null(p0)) p_thresh = p0
116 | 		
117 | 		trimmed_branches = lapply( branches, trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh )
118 | 
119 | 		tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width )
120 | 		tad_sizes = unlist(tad_sizes_ind)
121 | 		# tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes))
122 | 		end_pos = cumsum(tad_sizes)
123 | 		start_pos = c(1, 1 + end_pos[-length(end_pos)])
124 | 		tads = data.frame(start_pos=start_pos, end_pos=end_pos)
125 | 		return( tads )
126 | 	}
127 | 	
128 | 	
129 | 	trim_tree_adaptive_bottom_up <- function( tree, which_p='imp_p' )
130 | 	{
131 | 		if(which_p=='imp_p') ps = sort(unique(igraph::V(tree)$imp_p), decreasing=TRUE)
132 | 		# if(which_p=='nimp_p') ps = sort(unique(igraph::V(tree)$nimp_p), decreasing=TRUE)
133 | 		# if(which_p=='both') ps = sort(unique(pmin(igraph::V(tree)$nimp_p, igraph::V(tree)$imp_p)), decreasing=TRUE)
134 | 		
135 | 		trimed_tree_current = tree
136 | 		trimmed_branch_bottom_up = vector('list', length(ps))
137 | 		for(i in 1:length(ps))
138 | 		{
139 | 			trimed_tree_current = trim_tree_adaptive( tree, L_diff_thresh=-Inf, max_imp_p=ps[i], max_nimp_p=Inf, width_thresh=-Inf )
140 | 			trimmed_branch_bottom_up[[i]] = trimed_tree_current
141 | 		}
142 | 		igraph::vcounts = sapply(trimmed_branch_bottom_up, igraph::vcount)
143 | 		ps = ps[!duplicated(igraph::vcounts)]
144 | 		trimmed_branch_bottom_up = trimmed_branch_bottom_up[!duplicated(igraph::vcounts)]
145 | 		res = list(ps=ps, trimmed_branch_bottom_up=trimmed_branch_bottom_up)
146 | 		return( res )
147 | 	}
148 | 	
149 | 	
150 | 	## get adjusted nested TADs
151 | 	get_adjusted_nested_TADs <- function( trimmed_branches, width_thresh_CD, all_levels )
152 | 	{
153 | 			widths = c(0, sapply(trimmed_branches, function(v) igraph::V(v)[1]$width))
154 | 			all_tads_i_list = lapply( 1:length(trimmed_branches), function(i) get_all_tads_in_a_trimmed_branch(trimmed_branches[[i]], pos_shift=sum(widths[1:i])))
155 | 
156 | 			for(i in 1:length(trimmed_branches)) 
157 | 			{
158 | 				all_tads_i = all_tads_i_list[[i]]
159 | 				if( nrow(all_tads_i) <= 1 ) next
160 | 				## move the left-most border a little bit right if needed
161 | 				left_borders = unique(all_tads_i[,1])
162 | 				min_diff_left = left_borders[2] - left_borders[1]
163 | 				if( min_diff_left <= width_thresh_CD  )
164 | 				{
165 | 					all_tads_i[ all_tads_i==left_borders[1] ] = left_borders[2]
166 | 					all_tads_i = all_tads_i[ all_tads_i[,2] > all_tads_i[,1],  ] ## remove "negative" TADs
167 | 					all_tads_i = unique(all_tads_i[order(all_tads_i[,1], all_tads_i[,2]), ]) ## reorder the TADs
168 | 
169 | 					all_tads_i_list[[i]] = all_tads_i
170 | 
171 | 					## need to modify the right border of nested TADs in previous CD if the left border of this CD is modified
172 | 					if(i > 1)
173 | 					{
174 | 						## replace the max value of [i-1], i.e., the right most border, as the min of [i]-1, i.e., the left most border of [i]	
175 | 						all_tads_i_list[[i-1]][ all_tads_i_list[[i-1]]==max(all_tads_i_list[[i-1]]) ] = min(all_tads_i_list[[i]]) - 1
176 | 					}
177 | 				}
178 | 
179 | 				if( nrow(all_tads_i) <= 1 ) next
180 | 				## move the right-most border a little bit left if needed
181 | 				right_borders = unique(rev(all_tads_i[,2]))
182 | 				min_diff_right = right_borders[1] - right_borders[2]
183 | 				if( min_diff_right <= width_thresh_CD  )
184 | 				{
185 | 					all_tads_i[ all_tads_i==right_borders[1] ] = right_borders[2]
186 | 					all_tads_i = all_tads_i[ all_tads_i[,2] > all_tads_i[,1],  ]
187 | 					all_tads_i = unique(all_tads_i[order(all_tads_i[,1], all_tads_i[,2]), ]) ## reorder the TADs
188 | 					
189 | 					all_tads_i_list[[i]] = all_tads_i
190 | 
191 | 					if(i < length(trimmed_branches))
192 | 					{
193 | 						## replace the max value of [i-1], i.e., the right most border, as the min of [i]-1, i.e., the left most border of [i]	
194 | 						all_tads_i_list[[i+1]][ all_tads_i_list[[i+1]]==min(all_tads_i_list[[i+1]]) ] = max(all_tads_i_list[[i]]) + 1
195 | 					}
196 | 				}
197 | 			}
198 | 
199 | 			if(!all_levels) all_tads_i_list = lapply( all_tads_i_list, function(v) data.frame(start_pos=head(v[,1],1), end_pos=tail(v[,2],1)) )
200 | 
201 | 			all_tads = do.call(rbind, all_tads_i_list)
202 | 			colnames(all_tads) = c('start_pos', 'end_pos')
203 | 			return( all_tads )
204 | 	}
205 | 	
206 | 	
207 | 	
208 | 	


--------------------------------------------------------------------------------
/R/zigzag_nested_domain.R:
--------------------------------------------------------------------------------
 1 | 	## Yuanlong LIU
 2 | 	## 12/04/2018
 3 | 	## 06/06/2018
 4 | 	## 16/06/2018
 5 | 	## This code runs zigzag search on each compartment domain as a whole without dividing into 400
 6 | 
 7 | 	## 09-10-2018
 8 | 	
 9 | 	## A should be already named
10 | 	## "zigzag" resembles computational path of finding the optimum  
11 | 	# HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq, max_nbins_fine, chr, min_n_bins=2)
12 | 	HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq, max_nbins_fine, min_n_bins=2)
13 | 	{
14 | 		arg_list = as.list(environment())
15 | 		
16 | 		res_folder = file.path(res_dir)
17 | 		dir.create(res_folder, recursive=TRUE, showWarnings = FALSE)
18 | 		
19 | 		total_execution_time_file = file.path(res_dir, 'total_execution.time')
20 | 		time_begin = Sys.time()
21 | 		cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE)
22 | 	
23 | 		## check whether your input matrix is symmetric
24 | 		# A_sym = as.matrix( Matrix::forceSymmetric(data.matrix(A), uplo='U') )
25 | 		# A_sym = Matrix::forceSymmetric(data.matrix(A), uplo='U') ## keep sparse, 2018-11-11
26 | 		A_sym = Matrix::forceSymmetric(A, uplo='U') ## keep sparse, 2018-11-11
27 | 
28 | 		tol = 100 * .Machine$double.eps
29 | 		max_diff = max(abs(A_sym - A))
30 | 		notSym_flag = max_diff > tol
31 | 		if( notSym_flag ) warning('Your input contact matrix is not symmetric. The maximum difference between symmetric values is: ', max_diff, '\nBy default, the contact profile used for downstream analysis is taken from the upper triangular part. To use the lower triangular part you can transpose the input contact matrix first')
32 | 		
33 | 		if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices')
34 | 		
35 | 		# pA_sym = rm_zeros(A_sym) ## pA_sym: positive A
36 | 		pA_sym = remove_blank_cols(A_sym, sparse=TRUE, ratio=0)		
37 | 		n_zero_rows = nrow(A_sym) - nrow(pA_sym)
38 | 		zero_rows_flag = n_zero_rows > 0
39 | 		if( zero_rows_flag )
40 | 		{
41 | 			warning('There are ', n_zero_rows, ' rows/columns in your input contact matrix that have all their values being 0. These rows/columns are removed for downstream analysis')
42 | 			original_row_names = rownames(A_sym)
43 | 			kept_row_names = rownames(pA_sym)
44 | 		}
45 | 
46 | 		# res_inner = rep(list(), nrow(compartment_segs)) ## for each compartment domain
47 | 		# for( i in 1:nrow(compartment_segs) )
48 | 		# {
49 | 		# 	seg = compartment_segs[i,1]:compartment_segs[i,2]
50 | 		# 	cat('Compute seg:', i, 'of length:', length(seg), '\n')
51 | 
52 | 		# 	A_seg = as.matrix(pA_sym[seg, seg])
53 | 		# 	res_zigzag = zigzag_loglik_ancestors_v4(A_seg, nrow(A_seg))
54 | 		# 	res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors)
55 | 		# 	res_inner[[i]] = res_outer
56 | 		# 	cat('finished', '\n')
57 | 		# }
58 | 
59 | 		## changed to paralell, 2018-11-11
60 | 		cat('\n')
61 | 
62 | 		res_inner = foreach::foreach(i=1:nrow(compartment_segs)) %do%
63 | 		{
64 | 			seg = compartment_segs[i,1]:compartment_segs[i,2]
65 | 			cat('\r', sprintf('Find sub-domains in %d of %d CDs | length of current CD: %d bins', i, nrow(compartment_segs), length(seg)))
66 | 
67 | 			A_seg = as.matrix(pA_sym[seg, seg])
68 | 			res_zigzag = zigzag_loglik_ancestors_v4_5(A_seg, nrow(A_seg), min_n_bins=min_n_bins)
69 | 			res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors, min_n_bins=min_n_bins)
70 | 			res_outer
71 | 			# res_inner[[i]] = res_outer
72 | 		}
73 | 
74 | 		cat('\n')
75 | 		
76 | 		segmentss = compartment_segs
77 | 		res_info = list( arg_list=arg_list, pA_sym=pA_sym, A_final=pA_sym, segmentss=segmentss, res_inner=res_inner )
78 | 		# res_folder_final = file.path(res_dir, 'final')
79 | 		# dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
80 | 		# save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
81 | 	
82 | 		time_finish = Sys.time()
83 | 		cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
84 | 		cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)
85 | 	
86 | 		return( res_info )
87 | 	}
88 | 	
89 | 	
90 | 	
91 | 	
92 | 	
93 | 	
94 | 	
95 | 	


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## (Check CALDER2 with mutiple updates here: https://github.com/CSOgroup/CALDER2)
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | # CALDER user manuel
  8 | 
  9 | CALDER is a Hi-C analysis tool that allows: (1) compute chromatin domains from whole chromosome contacts; (2) derive their non-linear hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each chromatin domain from short-range contacts. CALDER is currently implemented in R.
 10 | 
 11 | 
 12 | ![Alt text](./img/CALDER_methods.png "CALDER methods")
 13 | 
 14 | ## (A note on the performance of Calder vs PC-based approach)
 15 | 
 16 | * PC1 of the correlation matrix was typically used to define A/B compartment. We found Calder demonstrates superior robustness over PC-based approach in identifying meaningful compartments, particularly when faced with complex chromosomal structural variations (figure on the left) and loose interaction between the p and q arms (figure on the right)
 17 | 
 18 | ![Alt text](./img/Hela_chr11_and_RWPE1_chr9_pq.png "Hela_chr11_and_RWPE1_chr9_pq")
 19 | 
 20 | # Installation
 21 | 
 22 | 
 23 | ## Make sure all dependencies have been installed:
 24 | 
 25 | * R.utils (>= 2.9.0),
 26 | * doParallel (>= 1.0.15),
 27 | * ape (>= 5.3),
 28 | * dendextend (>= 1.12.0),
 29 | * fitdistrplus (>= 1.0.14),
 30 | * igraph (>= 1.2.4.1),
 31 | * Matrix (>= 1.2.17),
 32 | * rARPACK (>= 0.11.0),
 33 | * factoextra (>= 1.0.5),
 34 | * maptools (>= 0.9.5),
 35 | * data.table (>= 1.12.2),
 36 | * fields (>= 9.8.3),
 37 | * GenomicRanges (>= 1.36.0)
 38 | 
 39 | ## Clone its repository and install it from source:
 40 | 
 41 | `git clone https://github.com/CSOgroup/CALDER.git`
 42 | 
 43 | `install.packages(path_to_CALDER, repos = NULL, type="source")` ## install from the cloned source file
 44 | 
 45 | 
 46 | Please contact yliueagle@googlemail.com for any questions about installation.
 47 | 
 48 | ## install CALDER and dependencies automaticly:
 49 | 
 50 | ```
 51 | if (!requireNamespace("BiocManager", quietly = TRUE))
 52 |     install.packages("BiocManager")
 53 | 
 54 | BiocManager::install("GenomicRanges")
 55 | install.packages("remotes")
 56 | remotes::install_github("CSOgroup/CALDER")
 57 | ```
 58 | 
 59 | # Usage
 60 | 
 61 | The input data of CALDER is a three-column text file storing the contact table of a full chromosome (zipped format is acceptable, as long as it can be read by `data.table::fread`). Each row represents a contact record `pos_x, pos_y, contact_value`, which is the same format as that generated by the `dump` command of juicer (https://github.com/aidenlab/juicer/wiki/Data-Extraction):	
 62 | 
 63 | 	16050000	16050000	10106.306
 64 | 	16050000	16060000	2259.247
 65 | 	16060000	16060000	7748.551
 66 | 	16050000	16070000	1251.3663
 67 | 	16060000	16070000	4456.1245
 68 | 	16070000	16070000	4211.7393
 69 | 	16050000	16080000	522.0705
 70 | 	16060000	16080000	983.1761
 71 | 	16070000	16080000	1996.749
 72 | 	...
 73 | 
 74 | A demo dataset is included in the repository `CALDER/inst/extdata/mat_chr22_10kb_ob.txt.gz` and can be accessed by `system.file("extdata", "mat_chr22_10kb_ob.txt.gz", package='CALDER')` once CALDER is installed. This data contains contact values of GM12878 on chr22 binned at 10kb (Rao et al. 2014) 
 75 | 
 76 | CALDER contains three modules: (1) compute chromatin domains; (2) derive their hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each compartment domain.
 77 | 
 78 | ### To run three modules in a single step:
 79 | ```
 80 | CALDER_main(contact_mat_file, 
 81 | 			chr, 
 82 | 			bin_size, 
 83 | 			out_dir, 
 84 | 			sub_domains=TRUE, 
 85 | 			save_intermediate_data=FALSE,
 86 | 			genome='hg19')
 87 | ```
 88 | 
 89 | ### To run three modules in seperated steps:
 90 | ```
 91 | # This will not compute sub-domains, but save the intermediate_data that can be used to compute sub-domains latter on
 92 | CALDER_main(contact_mat_file, 
 93 | 			chr, 
 94 | 			bin_size, 
 95 | 			out_dir, 
 96 | 			sub_domains=FALSE, 
 97 | 			save_intermediate_data=TRUE,
 98 | 			genome='hg19') 
 99 | 
100 | # (optional depends on needs) Compute sub-domains using intermediate_data_file that was previous saved in the out_dir (named as chrxx_intermediate_data.Rds)
101 | CALDER_sub_domains(intermediate_data_file, 
102 | 				   chr, 
103 | 				   out_dir, 
104 | 				   bin_size) 
105 | ```
106 | 
107 | ### Paramters:
108 | 
109 | * `contact_mat_file`: path to the contact table of a chromosome
110 | * `chr`: chromosome number. Either numeric or character, will be pasted to the output name
111 | * `bin_size`: numeric, the size of a bin in consistent with the contact table
112 | * `out_dir`: the output directory
113 | * `sub_domains`: logical, whether to compute nested sub-domains
114 | * `save_intermediate_data`: logical. If TRUE, an intermediate_data will be saved. This file can be used for computing nested sub-domains later on
115 | * `genome`: string. Specifies the genome assembly (Default="hg19").
116 | 
117 | ### Output:
118 | 
119 | #### chrxx_domain_hierachy.tsv
120 | * information of compartment domain and their hierarchical organization. The hierarchical structure is fully represented by `compartment_label`, for example, `B.2.2.2` and `B.2.2.1` are two sub-branches of `B.2.2`. The `pos_end` column specifies all compartment domain borders, except when it is marked as `gap`, which indicates it is the border of a gap chromsome region that has too few contacts and was excluded from the analysis (e.g., due to low mappability, deletion, technique flaw) 
121 | 
122 | #### chrxx_sub_compartments.bed
123 | * a .bed file containing the sub-compartment information, that can be visualized in IGV. Different colors were used to distinguish compartments (at the resolution of 8 sub-compartments)  
124 | 
125 | #### chrxx_domain_boundaries.bed
126 | * a .bed file containing the chromatin domains boundaries, that can be visualized in IGV
127 | 
128 | #### chrxx_nested_boundaries.bed
129 | * a .bed file containing the nested sub-domain boundaries, that can be visualized in IGV
130 | 
131 | #### chrxx_intermediate_data.Rds
132 | * an Rds file storing the intermediate_data that can be used to compute nested sub-domains (if CALDER is run in two seperated steps)
133 | 
134 | #### chrxx_log.txt, chrxx_sub_domains_log.txt
135 | * log file storing the status and running time of each step
136 | 
137 | 
138 | 
139 | ### Running time:
140 | For the computational requirement, running CALDER on the GM12878 Hi-C dataset at bin size of 40kb took **36 minutes** to derive the chromatin domains and their hierarchy for all chromosomes (i.e., CALDER Step1 and Step2); **13 minutes** to derive the nested sub-domains (i.e., CALDER Step3). At the bin size of 10kb, it took **1 h 44 minutes and 55 minutes** correspondingly (server information: 40 cores, 64GB Ram, Intel(R) Xeon(R) Silver 4210 CPU @ 2.20GHz). The evaluation was done using a single core although CALDER can be run in a parallel manner.
141 | 
142 | ### Demo run:
143 | 
144 | ```
145 | library(CALDER)
146 | 
147 | contact_mat_file = system.file("extdata", "mat_chr22_10kb_ob.txt.gz", package = 'CALDER')
148 | 
149 | CALDER_main(contact_mat_file, chr=22, bin_size=10E3, out_dir='./GM12878', sub_domains=TRUE, save_intermediate_data=FALSE)
150 | ```
151 | 
152 | The saved .bed files can be view directly through IGV:
153 | 
154 | ![Alt text](./img/IGV_results.png "IGV")
155 | 
156 | # Citation
157 | 
158 | If you use CALDER in your work, please cite: https://www.nature.com/articles/s41467-021-22666-3
159 | 
160 | 
161 | # Contact information
162 | 
163 | * Author: Yuanlong LIU
164 | * Affiliation: Computational Systems Oncology group, Department of Computational Biology, University of Lausanne, Switzerland
165 | * Email: yliueagle@googlemail.com
166 | 
167 | 
168 | 
169 | 


--------------------------------------------------------------------------------
/Read-and-delete-me:
--------------------------------------------------------------------------------
 1 | * Edit the help file skeletons in 'man', possibly combining help files
 2 |   for multiple functions.
 3 | * Edit the exports in 'NAMESPACE', and add necessary imports.
 4 | * Put any C/C++/Fortran code in 'src'.
 5 | * If you have compiled code, add a useDynLib() directive to
 6 |   'NAMESPACE'.
 7 | * Run R CMD build to build the package tarball.
 8 | * Run R CMD check to check the package tarball.
 9 | 
10 | Read "Writing R Extensions" for more information.
11 | 


--------------------------------------------------------------------------------
/img/CALDER_methods.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/img/CALDER_methods.png


--------------------------------------------------------------------------------
/img/Hela_chr11_and_RWPE1_chr9_pq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/img/Hela_chr11_and_RWPE1_chr9_pq.png


--------------------------------------------------------------------------------
/img/IGV_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/img/IGV_results.png


--------------------------------------------------------------------------------
/inst/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/inst/.DS_Store


--------------------------------------------------------------------------------
/inst/extdata/TxDb.Hsapiens.UCSC.hg19.knownGene.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/inst/extdata/TxDb.Hsapiens.UCSC.hg19.knownGene.rds


--------------------------------------------------------------------------------
/inst/extdata/TxDb.Mmusculus.UCSC.mm9.knownGene.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/inst/extdata/TxDb.Mmusculus.UCSC.mm9.knownGene.rds


--------------------------------------------------------------------------------
/inst/extdata/mat_chr22_10kb_ob.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/inst/extdata/mat_chr22_10kb_ob.txt.gz


--------------------------------------------------------------------------------
/inst/extdata/mat_mm9_chr1_100kb_ob.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/inst/extdata/mat_mm9_chr1_100kb_ob.txt.gz


--------------------------------------------------------------------------------
/man/.Rapp.history:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/man/.Rapp.history


--------------------------------------------------------------------------------
/man/CALDER-package.Rd:
--------------------------------------------------------------------------------
 1 | \name{CALDER-package}
 2 | \alias{CALDER-package}
 3 | \alias{CALDER}
 4 | \docType{package}
 5 | \title{
 6 | \packageTitle{CALDER}
 7 | }
 8 | \description{
 9 | \packageDescription{CALDER}
10 | }
11 | \details{
12 | 
13 | The DESCRIPTION file:
14 | \packageDESCRIPTION{CALDER}
15 | \packageIndices{CALDER}
16 | ~~ An overview of how to use the package, including the most important ~~
17 | ~~ functions ~~
18 | }
19 | \author{
20 | \packageAuthor{CALDER}
21 | 
22 | Maintainer: \packageMaintainer{CALDER}
23 | }
24 | \references{
25 | ~~ Literature or other references for background information ~~
26 | }
27 | ~~ Optionally other standard keywords, one per line, from file KEYWORDS in ~~
28 | ~~ the R documentation directory ~~
29 | \keyword{ package }
30 | \seealso{
31 | ~~ Optional links to other man pages, e.g. ~~
32 | ~~ \code{\link[<pkg>:<pkg>-package]{<pkg>}} ~~
33 | }
34 | \examples{
35 | ~~ simple examples of the most important functions ~~
36 | }
37 | 


--------------------------------------------------------------------------------
/src/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/src/.DS_Store


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
 1 | 
 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 
 3 | ## enable compilation with C++11 (where available)
 4 | ##
 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider
 6 | ## availability of the package we do not yet enforce this here.  It is however
 7 | ## recommended for client packages to set it.
 8 | ##
 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP
10 | ## support within Armadillo prefers / requires it
11 | CXX_STD = CXX11
12 | 
13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 
14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
15 | 


--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
 1 | 
 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 
 3 | ## enable compilation with C++11 (where available)
 4 | ##
 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider
 6 | ## availability of the package we do not yet enforce this here.  It is however
 7 | ## recommended for client packages to set it.
 8 | ##
 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP
10 | ## support within Armadillo prefers / requires it
11 | CXX_STD = CXX11
12 | 
13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 
14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
15 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
  1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
  2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | #include <RcppArmadillo.h>
  5 | #include <Rcpp.h>
  6 | 
  7 | using namespace Rcpp;
  8 | 
  9 | // matrix_multiplication_cpp
 10 | arma::mat matrix_multiplication_cpp(arma::mat A, arma::mat B);
 11 | RcppExport SEXP _CALDER_matrix_multiplication_cpp(SEXP ASEXP, SEXP BSEXP) {
 12 | BEGIN_RCPP
 13 |     Rcpp::RObject rcpp_result_gen;
 14 |     Rcpp::RNGScope rcpp_rngScope_gen;
 15 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
 16 |     Rcpp::traits::input_parameter< arma::mat >::type B(BSEXP);
 17 |     rcpp_result_gen = Rcpp::wrap(matrix_multiplication_cpp(A, B));
 18 |     return rcpp_result_gen;
 19 | END_RCPP
 20 | }
 21 | // matrix_multiplication_sym_cpp
 22 | arma::mat matrix_multiplication_sym_cpp(arma::mat A);
 23 | RcppExport SEXP _CALDER_matrix_multiplication_sym_cpp(SEXP ASEXP) {
 24 | BEGIN_RCPP
 25 |     Rcpp::RObject rcpp_result_gen;
 26 |     Rcpp::RNGScope rcpp_rngScope_gen;
 27 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
 28 |     rcpp_result_gen = Rcpp::wrap(matrix_multiplication_sym_cpp(A));
 29 |     return rcpp_result_gen;
 30 | END_RCPP
 31 | }
 32 | // loglik_lnorm_cpp
 33 | double loglik_lnorm_cpp(double sum_ln1, double sum_ln2, double p, double q);
 34 | RcppExport SEXP _CALDER_loglik_lnorm_cpp(SEXP sum_ln1SEXP, SEXP sum_ln2SEXP, SEXP pSEXP, SEXP qSEXP) {
 35 | BEGIN_RCPP
 36 |     Rcpp::RObject rcpp_result_gen;
 37 |     Rcpp::RNGScope rcpp_rngScope_gen;
 38 |     Rcpp::traits::input_parameter< double >::type sum_ln1(sum_ln1SEXP);
 39 |     Rcpp::traits::input_parameter< double >::type sum_ln2(sum_ln2SEXP);
 40 |     Rcpp::traits::input_parameter< double >::type p(pSEXP);
 41 |     Rcpp::traits::input_parameter< double >::type q(qSEXP);
 42 |     rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp(sum_ln1, sum_ln2, p, q));
 43 |     return rcpp_result_gen;
 44 | END_RCPP
 45 | }
 46 | // loglik_lnorm_cpp_vec
 47 | double loglik_lnorm_cpp_vec(arma::vec vec_values);
 48 | RcppExport SEXP _CALDER_loglik_lnorm_cpp_vec(SEXP vec_valuesSEXP) {
 49 | BEGIN_RCPP
 50 |     Rcpp::RObject rcpp_result_gen;
 51 |     Rcpp::RNGScope rcpp_rngScope_gen;
 52 |     Rcpp::traits::input_parameter< arma::vec >::type vec_values(vec_valuesSEXP);
 53 |     rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp_vec(vec_values));
 54 |     return rcpp_result_gen;
 55 | END_RCPP
 56 | }
 57 | // get_A_len
 58 | arma::mat get_A_len(arma::mat A);
 59 | RcppExport SEXP _CALDER_get_A_len(SEXP ASEXP) {
 60 | BEGIN_RCPP
 61 |     Rcpp::RObject rcpp_result_gen;
 62 |     Rcpp::RNGScope rcpp_rngScope_gen;
 63 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
 64 |     rcpp_result_gen = Rcpp::wrap(get_A_len(A));
 65 |     return rcpp_result_gen;
 66 | END_RCPP
 67 | }
 68 | // get_A_ln1
 69 | arma::mat get_A_ln1(arma::mat A);
 70 | RcppExport SEXP _CALDER_get_A_ln1(SEXP ASEXP) {
 71 | BEGIN_RCPP
 72 |     Rcpp::RObject rcpp_result_gen;
 73 |     Rcpp::RNGScope rcpp_rngScope_gen;
 74 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
 75 |     rcpp_result_gen = Rcpp::wrap(get_A_ln1(A));
 76 |     return rcpp_result_gen;
 77 | END_RCPP
 78 | }
 79 | // get_A_ln2
 80 | arma::mat get_A_ln2(arma::mat A);
 81 | RcppExport SEXP _CALDER_get_A_ln2(SEXP ASEXP) {
 82 | BEGIN_RCPP
 83 |     Rcpp::RObject rcpp_result_gen;
 84 |     Rcpp::RNGScope rcpp_rngScope_gen;
 85 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
 86 |     rcpp_result_gen = Rcpp::wrap(get_A_ln2(A));
 87 |     return rcpp_result_gen;
 88 | END_RCPP
 89 | }
 90 | // loglik_lnorm_cpp_mat
 91 | arma::mat loglik_lnorm_cpp_mat(arma::mat sum_ln1, arma::mat sum_ln2, arma::mat ps, arma::mat qs);
 92 | RcppExport SEXP _CALDER_loglik_lnorm_cpp_mat(SEXP sum_ln1SEXP, SEXP sum_ln2SEXP, SEXP psSEXP, SEXP qsSEXP) {
 93 | BEGIN_RCPP
 94 |     Rcpp::RObject rcpp_result_gen;
 95 |     Rcpp::RNGScope rcpp_rngScope_gen;
 96 |     Rcpp::traits::input_parameter< arma::mat >::type sum_ln1(sum_ln1SEXP);
 97 |     Rcpp::traits::input_parameter< arma::mat >::type sum_ln2(sum_ln2SEXP);
 98 |     Rcpp::traits::input_parameter< arma::mat >::type ps(psSEXP);
 99 |     Rcpp::traits::input_parameter< arma::mat >::type qs(qsSEXP);
100 |     rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp_mat(sum_ln1, sum_ln2, ps, qs));
101 |     return rcpp_result_gen;
102 | END_RCPP
103 | }
104 | // zigzag_loglik_ancestors_v4_5
105 | List zigzag_loglik_ancestors_v4_5(arma::mat A, int k, int min_n_bins);
106 | RcppExport SEXP _CALDER_zigzag_loglik_ancestors_v4_5(SEXP ASEXP, SEXP kSEXP, SEXP min_n_binsSEXP) {
107 | BEGIN_RCPP
108 |     Rcpp::RObject rcpp_result_gen;
109 |     Rcpp::RNGScope rcpp_rngScope_gen;
110 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
111 |     Rcpp::traits::input_parameter< int >::type k(kSEXP);
112 |     Rcpp::traits::input_parameter< int >::type min_n_bins(min_n_binsSEXP);
113 |     rcpp_result_gen = Rcpp::wrap(zigzag_loglik_ancestors_v4_5(A, k, min_n_bins));
114 |     return rcpp_result_gen;
115 | END_RCPP
116 | }
117 | // compute_L
118 | List compute_L(arma::mat A, arma::mat L, int k);
119 | RcppExport SEXP _CALDER_compute_L(SEXP ASEXP, SEXP LSEXP, SEXP kSEXP) {
120 | BEGIN_RCPP
121 |     Rcpp::RObject rcpp_result_gen;
122 |     Rcpp::RNGScope rcpp_rngScope_gen;
123 |     Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP);
124 |     Rcpp::traits::input_parameter< arma::mat >::type L(LSEXP);
125 |     Rcpp::traits::input_parameter< int >::type k(kSEXP);
126 |     rcpp_result_gen = Rcpp::wrap(compute_L(A, L, k));
127 |     return rcpp_result_gen;
128 | END_RCPP
129 | }
130 | 
131 | static const R_CallMethodDef CallEntries[] = {
132 |     {"_CALDER_matrix_multiplication_cpp", (DL_FUNC) &_CALDER_matrix_multiplication_cpp, 2},
133 |     {"_CALDER_matrix_multiplication_sym_cpp", (DL_FUNC) &_CALDER_matrix_multiplication_sym_cpp, 1},
134 |     {"_CALDER_loglik_lnorm_cpp", (DL_FUNC) &_CALDER_loglik_lnorm_cpp, 4},
135 |     {"_CALDER_loglik_lnorm_cpp_vec", (DL_FUNC) &_CALDER_loglik_lnorm_cpp_vec, 1},
136 |     {"_CALDER_get_A_len", (DL_FUNC) &_CALDER_get_A_len, 1},
137 |     {"_CALDER_get_A_ln1", (DL_FUNC) &_CALDER_get_A_ln1, 1},
138 |     {"_CALDER_get_A_ln2", (DL_FUNC) &_CALDER_get_A_ln2, 1},
139 |     {"_CALDER_loglik_lnorm_cpp_mat", (DL_FUNC) &_CALDER_loglik_lnorm_cpp_mat, 4},
140 |     {"_CALDER_zigzag_loglik_ancestors_v4_5", (DL_FUNC) &_CALDER_zigzag_loglik_ancestors_v4_5, 3},
141 |     {"_CALDER_compute_L", (DL_FUNC) &_CALDER_compute_L, 3},
142 |     {NULL, NULL, 0}
143 | };
144 | 
145 | RcppExport void R_init_CALDER(DllInfo *dll) {
146 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
147 |     R_useDynamicSymbols(dll, FALSE);
148 | }
149 | 


--------------------------------------------------------------------------------
/src/matrix_multiplication_fun.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | 	//http://arma.sourceforge.net/docs.html#stats_fns
 3 | 	#include "RcppArmadillo.h"
 4 | 	using namespace Rcpp;
 5 | 	
 6 | 	// [[Rcpp::export]]
 7 | 	arma::mat matrix_multiplication_cpp(arma::mat A, arma::mat B)
 8 | 	{
 9 | 		return A*B;
10 | 	}
11 | 
12 | 	// [[Rcpp::export]]
13 | 	arma::mat matrix_multiplication_sym_cpp(arma::mat A)
14 | 	{
15 | 		return A*A;
16 | 	}
17 | 
18 | 	
19 | 	


--------------------------------------------------------------------------------
/src/zigzag_loglik_ancestors_v4.5.cpp:
--------------------------------------------------------------------------------
  1 |     // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; indent-tabs-mode: nil; -*-
  2 | 
  3 |     #include "RcppArmadillo.h"
  4 |     using namespace Rcpp;
  5 |     using namespace std;
  6 |     // #define M_PI  3.141592653589793238462643383280    /* pi */                    
  7 | 
  8 |     // [[Rcpp::export]]
  9 |     double loglik_lnorm_cpp( double sum_ln1, double sum_ln2, double p, double q ) // several times faster than fitdistr
 10 |     {
 11 |         if( sum_ln2 < 0 ) cout << "sum_ln2 not valid in loglik_lnorm_cpp\n";
 12 |         if( p < 0 ) cout << "p not valid in loglik_lnorm_cpp\n";
 13 |         if( q < 0 ) cout << "q not valid in loglik_lnorm_cpp\n";
 14 |             
 15 |         if( q <=1 ) return 0; // p = sum(x==0): the number of zero-values
 16 |         double alpha = 1.0*p/(p+q);
 17 |         double mu = sum_ln1/q;
 18 | 
 19 |         double sigma2 = sum_ln2/q + pow(mu, 2) - 2*mu*sum_ln1/q; // %: element-wise product 
 20 |         if( abs(sigma2) <=1E-10 ) 
 21 |         {
 22 |             return 0;
 23 |         }
 24 |         double loglik = -sum_ln1 - 0.5*q*(log(2*M_PI) + log(sigma2)) - (sum_ln2 + q*pow(mu, 2) - 2*mu*sum_ln1)/2/sigma2; 
 25 |         if( p==0 ) {return loglik;} else {return p*log(alpha) + q*log(1-alpha) + loglik;}
 26 |     }
 27 | 
 28 |     // [[Rcpp::export]]
 29 |     double loglik_lnorm_cpp_vec( arma::vec vec_values ) //log-likelihood of vec_values
 30 |     {
 31 |         int p, q;
 32 |         int n = vec_values.n_elem;
 33 |         if( n < 2 ) return 0; // added on 19-07-2018
 34 | 
 35 |         double sum_ln1, sum_ln2;
 36 |         arma::vec positive_vec = vec_values.elem(find(vec_values > 0));
 37 |         q = positive_vec.n_elem; // the number of positive values
 38 |         p = n - q; // the number of zeros
 39 |         if( q <= 1 ) return 0;
 40 |         
 41 |         sum_ln1 = sum(log(positive_vec));
 42 |         sum_ln2 = sum(pow(log(positive_vec), 2));
 43 |         
 44 |         return loglik_lnorm_cpp( sum_ln1, sum_ln2, p, q );
 45 |     }
 46 |     
 47 |     // [[Rcpp::export]]
 48 |     arma::mat get_A_len(arma::mat A) // get matrix A_len: A_len := A*(A>0). This is used for computing the number of positive elements in a rectangle region
 49 |     {
 50 |         int n_row = A.n_rows;
 51 |         arma::mat A_len=arma::zeros<arma::mat>(n_row, n_row); // for test
 52 |         arma::uvec ids = find(A > 0);
 53 |         arma::vec new_values = arma::ones<arma::vec>(ids.n_elem);
 54 |         A_len.elem(ids) = new_values;
 55 |         return A_len;
 56 |     }
 57 |     
 58 |     // [[Rcpp::export]]
 59 |     arma::mat get_A_ln1(arma::mat A) // log(A_ij)
 60 |     {
 61 |         int n_row = A.n_rows;
 62 |         arma::mat A_ln1=arma::zeros<arma::mat>(n_row, n_row); // for test
 63 |         arma::uvec ids = find(A > 0);
 64 |         arma::vec new_values = log(A.elem(ids));
 65 |         A_ln1.elem(ids) = new_values;
 66 |         return A_ln1;
 67 |     }
 68 |     
 69 |     // [[Rcpp::export]]
 70 |     arma::mat get_A_ln2(arma::mat A) // log(A_ij)^2
 71 |     {
 72 |         int n_row = A.n_rows;
 73 |         arma::mat A_ln2=arma::zeros<arma::mat>(n_row, n_row); // for test
 74 |         arma::uvec ids = find(A > 0);
 75 |         arma::vec new_values = pow(log(A.elem(ids)), 2);
 76 |         A_ln2.elem(ids) = new_values;
 77 |         return A_ln2;
 78 |     }
 79 |  
 80 |      // compute the loglik matrix   
 81 |     // [[Rcpp::export]]
 82 |     arma::mat loglik_lnorm_cpp_mat( arma::mat sum_ln1, arma::mat sum_ln2, arma::mat ps, arma::mat qs ) // several times faster than fitdistr
 83 |     {
 84 |         int n_row = sum_ln1.n_rows;
 85 |         int n_col = sum_ln1.n_cols;
 86 |         arma::mat loglik(n_row, n_col);
 87 |         for(int i=0; i<n_row; i++)
 88 |             for(int j=0; j<n_col; j++)
 89 |                 loglik(i,j) = loglik_lnorm_cpp( sum_ln1(i,j), sum_ln2(i,j), ps(i,j), qs(i,j) );
 90 |         return loglik;
 91 |     }
 92 |     
 93 |     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 94 |     // compute the blockwise sum
 95 |     // [[Rcpp::export]] // use the same version name v4 instead of v4_5
 96 |     List zigzag_loglik_ancestors_v4_5( arma::mat A, int k, int min_n_bins=2 ){
 97 |         int n_row = A.n_rows;
 98 |         // int min_n_bins = 2;
 99 |         int p, q, max_mid_index, max_mid;
100 |         arma::mat ps, loglik_tmp, loglik;
101 |         arma::vec n_cells;
102 |         arma::mat A_len = get_A_len(A); // A_len = A*(A > 0)
103 |         arma::mat A_ln1 = get_A_ln1(A); // log(A_ij)
104 |         arma::mat A_ln2 = get_A_ln2(A); // log(A_ij)^2
105 |         StringMatrix ancestors(n_row, n_row);
106 |         arma::mat L=arma::zeros<arma::mat>(n_row, n_row); // for test
107 |         Rcpp::List res;
108 | 
109 |         for( int k= min_n_bins-1; k<=(2*min_n_bins - 2); k++ ) // other values of L are 0
110 |         {
111 |             for( int v=1; v<= (n_row - k); v++ )
112 |             {
113 |                 arma::mat tmp_mat=A.submat(arma::span(v-1, v+k-1), arma::span(v-1, v+k-1)); // span(0,1) := 1:2 in R
114 |                 arma::vec upper_tri_vec = tmp_mat.elem(find(trimatu(tmp_mat)));
115 |                 L(v-1, v-1+k) = loglik_lnorm_cpp_vec( upper_tri_vec );
116 |             }
117 |         } // Checked to be the same in R
118 |         // cout << "Finished initialize L\n";
119 |         
120 |         // initialize the rad_mat as the first off-diagonal values
121 |         arma::mat rad_mat_current_ln1(n_row-1, 1);
122 |         arma::mat rad_mat_current_ln2(n_row-1, 1);
123 |         arma::mat rad_mat_current_len(n_row-1, 1);
124 |         
125 |         for(int i=0; i<(n_row-1); i++) // The first off-diagonal values
126 |         {
127 |             rad_mat_current_ln1(i, 0) = A_ln1(i, i+1);
128 |             rad_mat_current_ln2(i, 0) = A_ln2(i, i+1);
129 |             rad_mat_current_len(i, 0) = A_len(i, i+1);
130 |         }
131 |         
132 |         // initialized to be two vertical cells (2 rows)
133 |         arma::mat vertical_columns_next_ln1(2, n_row-2);
134 |         arma::mat vertical_columns_next_ln2(2, n_row-2);
135 |         arma::mat vertical_columns_next_len(2, n_row-2);
136 | 
137 |         for(int i=0; i<(n_row-2); i++) 
138 |         {
139 |             vertical_columns_next_ln1.col(i) = A_ln1( arma::span(i, i+1), i+2 );
140 |             vertical_columns_next_ln2.col(i) = A_ln2( arma::span(i, i+1), i+2 );
141 |             vertical_columns_next_len.col(i) = A_len( arma::span(i, i+1), i+2 );
142 |         }
143 |         
144 |         for(int i=1; i<2; i++) // cumsum of the two vertical cells (i=1:1)
145 |         {
146 |             vertical_columns_next_ln1.row(i) = vertical_columns_next_ln1.row(i) + vertical_columns_next_ln1.row(i-1); //cumsum
147 |             vertical_columns_next_ln2.row(i) = vertical_columns_next_ln2.row(i) + vertical_columns_next_ln2.row(i-1); //cumsum
148 |             vertical_columns_next_len.row(i) = vertical_columns_next_len.row(i) + vertical_columns_next_len.row(i-1); //cumsum
149 |         }
150 |         
151 |         arma::mat rad_mat_next_ln1 = rad_mat_current_ln1;
152 |         arma::mat rad_mat_next_ln2 = rad_mat_current_ln2;
153 |         arma::mat rad_mat_next_len = rad_mat_current_len;
154 | 
155 |         arma::mat vertical_columns_current_ln1 = vertical_columns_next_ln1; // this line just create the vertical_columns_current_ln
156 |         arma::mat vertical_columns_current_ln2 = vertical_columns_next_ln2; // this line just create the vertical_columns_current_ln
157 |         arma::mat vertical_columns_current_len = vertical_columns_next_len; // this line just create the vertical_columns_current_ln
158 | 
159 |         // Rcout << L << "\n";
160 |         // cout << "Begin iteration:\n";
161 |         
162 |         // time complexity of this part: n^3
163 |         // for(int shift=3; shift<=n_row; shift++)
164 |         // each row of rad_mat represent one off-diagonal point
165 |         for(int shift=3; shift<=k; shift++)
166 |         {
167 |             rad_mat_current_ln1 = rad_mat_next_ln1;
168 |             rad_mat_current_ln2 = rad_mat_next_ln2;
169 |             rad_mat_current_len = rad_mat_next_len;
170 | 
171 |             rad_mat_next_ln1 = arma::mat( n_row-shift+1, shift-1);
172 |             rad_mat_next_ln2 = arma::mat( n_row-shift+1, shift-1);
173 |             rad_mat_next_len = arma::mat( n_row-shift+1, shift-1);
174 |             
175 |             n_cells = arma::zeros<arma::vec>(shift-1); // size of each rectangle
176 |             for( int i=0; i< (shift-1); i++ ) n_cells(i) = (i+1)*(shift-i-1); 
177 |             arma::mat rad_mat_next_len_all = (arma::ones<arma::mat>(n_row-shift+1, shift-1))*(arma::diagmat(n_cells)); // In R: rep(vec, n_row times) // Schur product: element-wise multiplication of two objects
178 |             
179 |             for(int i=1; i<=(n_row-shift+1); i++)
180 |             {
181 |                 // next = current + vertical_columns_next_ln values
182 |                 rad_mat_next_ln1.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_ln1( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_ln1( arma::span(0, shift-2-1), i-1 ).t();
183 |                 rad_mat_next_ln2.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_ln2( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_ln2( arma::span(0, shift-2-1), i-1 ).t();
184 |                 rad_mat_next_len.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_len( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_len( arma::span(0, shift-2-1), i-1 ).t();
185 | 
186 |                 rad_mat_next_ln1(i-1, shift-1-1) = vertical_columns_next_ln1(shift-1-1, i-1); // the last new element
187 |                 rad_mat_next_ln2(i-1, shift-1-1) = vertical_columns_next_ln2(shift-1-1, i-1); // the last new element
188 |                 rad_mat_next_len(i-1, shift-1-1) = vertical_columns_next_len(shift-1-1, i-1); // the last new element
189 |             }
190 |             
191 | 
192 |             
193 |             ////////////////////////////////////// compute the vertical_columns_next values
194 |             if(shift < n_row) //stop when shift=n
195 |             {
196 |                 vertical_columns_current_ln1 = vertical_columns_next_ln1;
197 |                 vertical_columns_current_ln2 = vertical_columns_next_ln2;
198 |                 vertical_columns_current_len = vertical_columns_next_len;
199 | 
200 |                 arma::mat first_row_ln1(1, n_row-shift);
201 |                 arma::mat first_row_ln2(1, n_row-shift);
202 |                 arma::mat first_row_len(1, n_row-shift);
203 | 
204 |                 for(int i=0; i<(n_row-shift); i++) 
205 |                 {
206 |                     first_row_ln1(0, i) = A_ln1(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln
207 |                     first_row_ln2(0, i) = A_ln2(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln
208 |                     first_row_len(0, i) = A_len(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln
209 |                 }
210 |                 
211 |                 vertical_columns_next_ln1 = vertical_columns_current_ln1.submat(0, 1, shift-2, n_row-shift); // drop the first column
212 |                 vertical_columns_next_ln2 = vertical_columns_current_ln2.submat(0, 1, shift-2, n_row-shift); // drop the first column
213 |                 vertical_columns_next_len = vertical_columns_current_len.submat(0, 1, shift-2, n_row-shift); // drop the first column
214 | 
215 |                 vertical_columns_next_ln1 = arma::join_cols(first_row_ln1, vertical_columns_next_ln1);
216 |                 vertical_columns_next_ln2 = arma::join_cols(first_row_ln2, vertical_columns_next_ln2);
217 |                 vertical_columns_next_len = arma::join_cols(first_row_len, vertical_columns_next_len);
218 | 
219 |                 for(int i=1; i<shift; i++) 
220 |                 {
221 |                     vertical_columns_next_ln1.row(i) = vertical_columns_next_ln1.row(i) + first_row_ln1; // cumsum
222 |                     vertical_columns_next_ln2.row(i) = vertical_columns_next_ln2.row(i) + first_row_ln2; // cumsum
223 |                     vertical_columns_next_len.row(i) = vertical_columns_next_len.row(i) + first_row_len; // cumsum
224 |                 }   
225 |             }
226 |             
227 |             ////////////////////////////////////// compute the L and ancestors
228 |             // if(shift >= 4)
229 |             if(shift >= 2*min_n_bins) //            
230 |             {
231 |                 ps = rad_mat_next_len_all - rad_mat_next_len; // number of positive values
232 |                 loglik_tmp = loglik_lnorm_cpp_mat( rad_mat_next_ln1, rad_mat_next_ln2, ps, rad_mat_next_len );
233 |                 // loglik = loglik_tmp.submat(0, 1, n_row-shift, shift-2-1); // remove first and last col because of the min_n_bins. SHOULD BE MODIFIED. submat: X.submat( first_row, first_col, last_row, last_col ), http://arma.sourceforge.net/docs.html#submat
234 |                 loglik = loglik_tmp.submat(0, min_n_bins-1, n_row-shift, shift-2-(min_n_bins-1)); // 2018-11-14, remove first and last min_n_bins-1 cols because of the min_n_bins. SHOULD BE MODIFIED. submat: X.submat( first_row, first_col, last_row, last_col ), http://arma.sourceforge.net/docs.html#submat
235 | 
236 |                 
237 |                 arma::mat cases(1, shift-2*min_n_bins+1); // shift=5: 1:2, i.e., two cases
238 |                 // arma::mat loglik(n_row-shift, shift-min_n_bins);
239 |                 for( int row=1; row<=(n_row-shift+1); row++ )
240 |                 {
241 |                     p = row; //7
242 |                     q = row + shift; // 7 + 4 = 11
243 |                     cases = loglik( p-1, arma::span(0, shift-2*min_n_bins) ) + L(p-1, arma::span(p-1-1 + min_n_bins, q - min_n_bins-1-1) ) + (L(arma::span(p-1 + min_n_bins, q - min_n_bins-1), q-1-1)).t();
244 |                     L(p-1, q-1-1) = cases.max();
245 | 
246 |                     max_mid_index = cases.index_max() + 1; //The c++ offset 1
247 |                     max_mid = (p-1 + min_n_bins) + max_mid_index -1; // should minus one
248 |                     // ancestor = paste(i, max_mid, max_mid+1, j, sep='-')
249 |                     ancestors(p-1, q-1-1) = to_string(p) + "-" + to_string(max_mid) + "-" + to_string(max_mid+1) + "-" + to_string(q-1);
250 |                     // cout << "cases:" << L(p-1, q-1-1) << "\n";
251 |                 }
252 |             }
253 |             
254 |         }
255 |         
256 |         res["ancestors"] = ancestors;
257 |         res["L"] = L;
258 |         return res;
259 |     }
260 |     
261 |     // compute the ancestors
262 |     // [[Rcpp::export]]
263 |     List compute_L( arma::mat A, arma::mat L, int k ) // A seems not needed here, and can be removed, Y.L, 2018-11-14 (Indeed this part is not used)
264 |     {
265 |         int n_row = A.n_rows;
266 |         int min_n_bins = 2;
267 |         StringMatrix ancestors(n_row, n_row);
268 |     
269 |         // should rewrite this part
270 |         // for( int i= min_n_bins-1; i<=(2*min_n_bins - 2); i++ )
271 |             // for( int j=i; j<n_row; j++ ) L(j-i, j) = 1;
272 |         
273 |         for( int shift=(2*min_n_bins - 1); shift<=(n_row-1); shift++ )  
274 |         {
275 |             int i, j, max_mid_index, max_mid;
276 |             arma::mat cases(1, shift-2*min_n_bins+1);
277 |             arma::mat prob_vec(n_row-shift, shift-min_n_bins);
278 |             for( int row=1; row<=(n_row-shift); row++ )
279 |             {
280 |                     i = row;
281 |                     j = row + shift;
282 |                     cases = prob_vec( i-1, arma::span(0, j-2*min_n_bins-i+1) ) + L(i-1, arma::span(i-1-1 + min_n_bins, j - min_n_bins-1) ) + (L(arma::span(i + min_n_bins-1, j - min_n_bins), j-1)).t();
283 |                     L(i-1, j-1) = cases.max();
284 |                     max_mid_index = cases.index_max() + 1; //The c++ offset 1
285 |                     max_mid = (i-1 + min_n_bins) + max_mid_index -1; // should minus one
286 |                     // ancestor = paste(i, max_mid, max_mid+1, j, sep='-')
287 |                     ancestors(i-1, j-1) = to_string(i) + "-" + to_string(max_mid) + "-" + to_string(max_mid+1) + "-" + to_string(j); // This 'to_string' may be optimized 
288 |             }
289 |         }
290 |     return List::create(Named("L") = L, Named("ancestors") = ancestors);    
291 |     }
292 |     


--------------------------------------------------------------------------------