├── .DS_Store ├── .gitattributes ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R ├── CALDER_hierarchy.R ├── CALDER_main.R ├── HRG_MLE_each_compartment_fun.R ├── HighResolution2Low.R ├── LikelihoodRatioTest_fun.R ├── RcppExports.R ├── bisecting_kmeans.R ├── call_domains.R ├── compartment_PCA.R ├── compartment_data_generation_fun.R ├── general_functions.R ├── generate_compartments_bed_fun.R ├── post_processing_nested_domains.R ├── prunning.R └── zigzag_nested_domain.R ├── README.md ├── Read-and-delete-me ├── img ├── CALDER_methods.png ├── Hela_chr11_and_RWPE1_chr9_pq.png └── IGV_results.png ├── inst ├── .DS_Store └── extdata │ ├── TxDb.Hsapiens.UCSC.hg19.knownGene.rds │ ├── TxDb.Mmusculus.UCSC.mm9.knownGene.rds │ ├── mat_chr22_10kb_ob.txt.gz │ └── mat_mm9_chr1_100kb_ob.txt.gz ├── man ├── .Rapp.history └── CALDER-package.Rd └── src ├── .DS_Store ├── Makevars ├── Makevars.win ├── RcppExports.cpp ├── matrix_multiplication_fun.cpp └── zigzag_loglik_ancestors_v4.5.cpp /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | total_execution.time 3 | 4 | # Created by https://www.toptal.com/developers/gitignore/api/r,sublimetext 5 | # Edit at https://www.toptal.com/developers/gitignore?templates=r,sublimetext 6 | 7 | ### R ### 8 | # History files 9 | .Rhistory 10 | .Rapp.history 11 | 12 | # Session Data files 13 | .RData 14 | .RDataTmp 15 | 16 | # User-specific files 17 | .Ruserdata 18 | 19 | # Example code in package build process 20 | *-Ex.R 21 | 22 | # Output files from R CMD build 23 | /*.tar.gz 24 | 25 | # Output files from R CMD check 26 | /*.Rcheck/ 27 | 28 | # RStudio files 29 | .Rproj.user/ 30 | 31 | # produced vignettes 32 | vignettes/*.html 33 | vignettes/*.pdf 34 | 35 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 36 | .httr-oauth 37 | 38 | # knitr and R markdown default cache directories 39 | *_cache/ 40 | /cache/ 41 | 42 | # Temporary files created by R markdown 43 | *.utf8.md 44 | *.knit.md 45 | 46 | # R Environment Variables 47 | .Renviron 48 | 49 | # pkgdown site 50 | docs/ 51 | 52 | # translation temp files 53 | po/*~ 54 | 55 | # RStudio Connect folder 56 | rsconnect/ 57 | 58 | ### R.Bookdown Stack ### 59 | # R package: bookdown caching files 60 | /*_files/ 61 | 62 | ### SublimeText ### 63 | # Cache files for Sublime Text 64 | *.tmlanguage.cache 65 | *.tmPreferences.cache 66 | *.stTheme.cache 67 | 68 | # Workspace files are user-specific 69 | *.sublime-workspace 70 | 71 | # Project files should be checked into the repository, unless a significant 72 | # proportion of contributors will probably not be using Sublime Text 73 | # *.sublime-project 74 | 75 | # SFTP configuration file 76 | sftp-config.json 77 | sftp-config-alt*.json 78 | 79 | # Package control specific files 80 | Package Control.last-run 81 | Package Control.ca-list 82 | Package Control.ca-bundle 83 | Package Control.system-ca-bundle 84 | Package Control.cache/ 85 | Package Control.ca-certs/ 86 | Package Control.merged-ca-bundle 87 | Package Control.user-ca-bundle 88 | oscrypto-ca-bundle.crt 89 | bh_unicode_properties.cache 90 | 91 | # Sublime-github package stores a github token in this file 92 | # https://packagecontrol.io/packages/sublime-github 93 | GitHub.sublime-settings 94 | 95 | # End of https://www.toptal.com/developers/gitignore/api/r,sublimetext 96 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: CALDER 2 | Type: Package 3 | Title: CALDER: a Hi-C non-linear hierarchical organization analysis tool 4 | Version: 1.0 5 | Date: 2020-09-01 6 | Author: Yuanlong LIU 7 | Maintainer: Who to complain to 8 | Description: CALDER is a Hi-C analysis tool that allows: (1) compute chromatin domains from whole chromosome contacts; (2) derive their non-linear hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each chromatin domain from short-range contacts 9 | LinkingTo: Rcpp, RcppArmadillo 10 | Authors@R: c( 11 | person(given = "Yuanlong", 12 | family = "LIU", 13 | role = c("aut", "cre"), 14 | email = "yliueagle@googlemail.com")) 15 | URL: https://github.com/YuanlongLiu/CALDER/ 16 | BugReports: https://github.com/YuanlongLiu/CALDER/issues 17 | License: MIT 18 | Encoding: UTF-8 19 | Depends: 20 | R (>= 3.5.2) 21 | Imports: 22 | R.utils (>= 2.9.0), 23 | doParallel (>= 1.0.15), 24 | ape (>= 5.3), 25 | dendextend (>= 1.12.0), 26 | fitdistrplus (>= 1.0.14), 27 | igraph (>= 1.2.4.1), 28 | Matrix (>= 1.2.17), 29 | rARPACK (>= 0.11.0), 30 | factoextra (>= 1.0.5), 31 | maptools (>= 0.9.5), 32 | data.table (>= 1.12.2), 33 | fields (>= 9.8.3), 34 | GenomicRanges (>= 1.36.0), 35 | strawr (>= 0.0.9) 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Yuanlong LIU 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | useDynLib(CALDER, .registration=TRUE) 2 | importFrom(Rcpp, evalCpp) 3 | exportPattern("^[[:alpha:]]+") 4 | -------------------------------------------------------------------------------- /R/CALDER_hierarchy.R: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | 3 | 4 | get_gene_info <- function(genome) { 5 | 6 | if(genome == 'hg19') { 7 | gene_info_file = system.file("extdata", "TxDb.Hsapiens.UCSC.hg19.knownGene.rds", package = 'CALDER') 8 | } else if(genome == 'mm9') { 9 | gene_info_file = system.file("extdata", "TxDb.Mmusculus.UCSC.mm9.knownGene.rds", package = 'CALDER') 10 | } else { 11 | stop(paste0("Unknown genome (", genome, ")")) 12 | } 13 | 14 | 15 | gene_info = readRDS(gene_info_file) 16 | } 17 | 18 | 19 | CALDER_CD_hierarchy = function(contact_mat_file, 20 | chr, 21 | bin_size, 22 | out_dir, 23 | save_intermediate_data=FALSE, 24 | genome = 'hg19') 25 | { 26 | time0 = Sys.time() 27 | log_file = paste0(out_dir, '/chr', chr, '_log.txt') 28 | 29 | cat('\n') 30 | 31 | cat('>>>> Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE) 32 | cat('>>>> Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n') 33 | processed_data = contact_mat_processing(contact_mat_file, bin_size=bin_size) 34 | 35 | A_oe = processed_data$A_oe 36 | ccA_oe_compressed_log_atanh = processed_data$atanh_score 37 | 38 | cat('\r', '>>>> Finish process contact matrix and compute correlation score at:', as.character(Sys.time())) 39 | cat('>>>> Finish process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE) 40 | 41 | p_thresh = ifelse(bin_size < 40000, 0.05, 1) 42 | window.sizes = 3 43 | compartments = vector("list", 2) 44 | chr_name = paste0("chr", chr) 45 | 46 | cat('>>>> Begin compute compartment domains and their hierachy at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE) 47 | cat('\r', '>>>> Begin compute compartment domains and their hierachy at:', as.character(Sys.time())) 48 | 49 | compartments[[2]] = generate_compartments_bed(chr = chr, bin_size = bin_size, window.sizes = window.sizes, ccA_oe_compressed_log_atanh, p_thresh, out_file_name = NULL, stat_window_size = NULL) 50 | topDom_output = compartments[[2]] 51 | bin_names = rownames(A_oe) 52 | A_oe = as.matrix(A_oe) 53 | initial_clusters = apply(topDom_output$domain[, c("from.id", "to.id")], 1, function(v) v[1]:v[2]) 54 | 55 | if (sum(sapply(initial_clusters, length)) != max(unlist(initial_clusters))) { 56 | stop(CELL_LINE, " initial_clusters error in topDom") 57 | } 58 | 59 | n_clusters = length(initial_clusters) 60 | A_oe_cluster_mean = HighResolution2Low_k_rectangle(A_oe, initial_clusters, initial_clusters, sum_or_mean = "mean") 61 | 62 | trend_mean_list = lapply( 1:4, function(v) 1*(A_oe_cluster_mean[, -(1:v)] > A_oe_cluster_mean[, - n_clusters - 1 + (v:1)]) ) 63 | trend_mean = do.call(cbind, trend_mean_list) 64 | c_trend_mean = cor(t(trend_mean)) 65 | atanh_c_trend_mean= atanh(c_trend_mean / (1+1E-7)) 66 | 67 | 68 | # if(to_scale) 69 | { 70 | trend_mean = scale(trend_mean) 71 | c_trend_mean = scale(c_trend_mean) 72 | atanh_c_trend_mean= scale(atanh_c_trend_mean) 73 | } 74 | 75 | 76 | PC_12_atanh = get_PCs(atanh_c_trend_mean, which=1:10) 77 | PC_12_atanh[, 2:10] = PC_12_atanh[, 2:10]/5 ## xx-xx-xxxx: compress PC2 78 | rownames(PC_12_atanh) = 1:nrow(PC_12_atanh) 79 | 80 | ############################################################ 81 | PC_direction = 1 82 | 83 | gene_info <- get_gene_info(genome) 84 | 85 | ## switch PC direction based on gene density 86 | { 87 | initial_clusters_ori_bins = lapply(initial_clusters, function(v) as.numeric(bin_names[v])) 88 | chr_bin_pc = data.table::data.table(chr=chr_name, bin=unlist(initial_clusters_ori_bins), PC1_val=rep(PC_12_atanh[,1], sapply(initial_clusters_ori_bins, length))) 89 | chr_bin_pc$start = (chr_bin_pc$bin - 1)*bin_size + 1 90 | chr_bin_pc$end = chr_bin_pc$bin*bin_size 91 | chr_bin_pc_range = makeGRangesFromDataFrame(chr_bin_pc, keep.extra.columns=TRUE) 92 | gene_info_chr = subset(gene_info, seqnames==chr_name) 93 | 94 | refGR = chr_bin_pc_range 95 | testGR = gene_info_chr 96 | hits <- findOverlaps(refGR, testGR) 97 | overlaps <- pintersect(refGR[queryHits(hits)], testGR[subjectHits(hits)]) 98 | overlaps_bins = unique(data.table::data.table(overlap_ratio=width(overlaps)/bin_size, bin=overlaps$bin)) 99 | bin_pc_gene_coverage = merge(chr_bin_pc, overlaps_bins, all.x=TRUE) 100 | bin_pc_gene_coverage$overlap_ratio[is.na(bin_pc_gene_coverage$overlap_ratio)] = 0 101 | 102 | gene_density_cor = cor(method='spearman', subset(bin_pc_gene_coverage, (PC1_val < quantile(PC1_val, 0.25)) | (PC1_val > quantile(PC1_val, 0.75)) , c('PC1_val', 'overlap_ratio')))[1,2] 103 | if(abs(gene_density_cor) < 0.2) warning('correlation between gene density and PC1 is too weak') 104 | PC_direction = PC_direction*sign(gene_density_cor) 105 | 106 | PC_12_atanh = PC_12_atanh*PC_direction 107 | } 108 | 109 | 110 | project_info = project_to_major_axis(PC_12_atanh) 111 | x_pro = project_info$x_pro 112 | 113 | ############################################################ 114 | hc_disect_kmeans_PC12 = bisecting_kmeans(PC_12_atanh[, 1:10, drop=FALSE]) 115 | 116 | hc_hybrid_PC12 = hc_disect_kmeans_PC12 117 | 118 | { 119 | reordered_names = reorder_dendro(hc_hybrid_PC12, x_pro, aggregateFun=mean) 120 | hc_hybrid_PC12_reordered = dendextend::rotate(hc_hybrid_PC12, reordered_names) 121 | hc_hybrid_x_pro = hc_disect_kmeans_PC12 122 | reordered_names_x_pro = get_best_reorder(hc_hybrid_x_pro, x_pro) 123 | CALDER_hc = dendextend::rotate(hc_hybrid_x_pro, reordered_names_x_pro) 124 | } 125 | 126 | ############################################################ 127 | parameters = list(bin_size = bin_size, p_thresh = p_thresh) 128 | res = list( CALDER_hc=CALDER_hc, initial_clusters=initial_clusters, bin_names=bin_names, x_pro=x_pro, parameters=parameters) 129 | intermediate_data_file = paste0(out_dir, '/chr', chr, '_intermediate_data.Rds') 130 | 131 | hc = res$CALDER_hc 132 | hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels) 133 | bin_comp = data.table::data.table(chr=chr, bin_index=res$bin_names, comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length))) 134 | 135 | rownames(bin_comp) = NULL 136 | res$comp = bin_comp 137 | res$CDs = lapply(res$initial_clusters, function(v) res$bin_names[v]) 138 | res$mat = A_oe 139 | res$chr = chr 140 | generate_hierachy_bed(chr=chr, res=res, out_dir=out_dir, bin_size=bin_size) 141 | 142 | 143 | cat('>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()), '\n', file=log_file, append=TRUE) 144 | cat('\r', '>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time())) 145 | 146 | if(abs(gene_density_cor) < 0.2) cat('WARNING: correlation between gene density and PC1 on this chr is: ', substr(gene_density_cor, 1, 4), ', which is a bit low', '\n', file=log_file, append=TRUE) 147 | 148 | time1 = Sys.time() 149 | # delta_time = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0)) 150 | 151 | delta_time <- time1 - time0 152 | timediff <- format(round(delta_time, 2), nsmall = 2) 153 | 154 | cat('\n\n', 'Total time used for computing compartment domains and their hierachy:', timediff, '\n', file=log_file, append=TRUE) 155 | # if(abs(gene_density_cor) > 0.2) cat('The gene density and PC1 correlation on this chr is: ', substr(gene_density_cor, 1, 4), '\n', file=log_file, append=TRUE) 156 | 157 | ############################################################ 158 | intermediate_data = res 159 | if(save_intermediate_data==TRUE) saveRDS(intermediate_data, file=intermediate_data_file) 160 | # cat(intermediate_data_file) 161 | return(intermediate_data) 162 | } 163 | 164 | CALDER_sub_domains = function(intermediate_data_file=NULL, intermediate_data=NULL, chr, out_dir, bin_size) 165 | { 166 | time0 = Sys.time() 167 | log_file = paste0(out_dir, '/chr', chr, '_sub_domains_log.txt') 168 | 169 | cat('\r', '>>>> Begin compute sub-domains at:', as.character(Sys.time())) 170 | cat('>>>> Begin compute sub-domains at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE) 171 | 172 | if(is.null(intermediate_data)) intermediate_data = readRDS(intermediate_data_file) 173 | { 174 | if(intermediate_data$chr!=chr) stop('intermediate_data$chr!=chr; check your input parameters\n') 175 | if( !setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) ) stop('!setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) \n') 176 | compartment_segs = generate_compartment_segs( intermediate_data$initial_clusters ) 177 | 178 | cat('\r', '>>>> Begin compute sub-domains within each compartment domain at:', as.character(Sys.time())) 179 | cat('>>>> Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE) 180 | sub_domains_raw = HRG_zigzag_compartment_domain_main_fun(intermediate_data$mat, './', compartment_segs, min_n_bins=2) 181 | no_output = post_process_sub_domains(chr, sub_domains_raw, ncores=1, out_dir=out_dir, bin_size=bin_size) 182 | cat('>>>> Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE) 183 | cat('\r', '>>>> Finish compute sub-domains within each compartment domain at:', as.character(Sys.time())) 184 | 185 | time1 = Sys.time() 186 | # delta_time = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0)) 187 | delta_time <- time1 - time0 188 | timediff <- format(round(delta_time, 2), nsmall = 2) 189 | 190 | cat('\n\n', 'Total time used for computing sub-domains:', timediff, '\n', file=log_file, append=TRUE) 191 | } 192 | # return(NULL) 193 | } 194 | 195 | 196 | 197 | ############################################################ 198 | create_compartment_bed_v4 = function(chr_bin_domain, bin_size) 199 | { 200 | # for( chr in chrs ) 201 | { 202 | v = chr_bin_domain 203 | # v$intra_domain = as.character(6 - (as.numeric(v$intra_domain))) ## invert the labeling 204 | # v$intra_domain = names(cols)[(as.numeric(v$intra_domain))] 205 | v = v[order(v$bin_index), ] 206 | 207 | 208 | borders_non_consecutive = which(diff(v$bin_index)!=1) 209 | borders_domain = cumsum(rle(v$comp)$lengths) 210 | borders = sort(union(borders_non_consecutive, borders_domain)) 211 | bins = v$bin_index 212 | to_id = as.numeric(bins[borders]) 213 | from_id = as.numeric(bins[c(1, head(borders, length(borders)-1)+1)]) 214 | 215 | pos_start = (from_id-1)*bin_size + 1 216 | pos_end = to_id*bin_size 217 | # chr = as.numeric( gsub('chr', '', v$chr) ) 218 | chr = gsub('chr', '', v$chr) ## no need for as.numeric, also makes it compatible with chrX 219 | 220 | compartment_info_tab = data.frame(chr=rep(unique(chr), length(pos_start)), pos_start=pos_start, pos_end=pos_end, domain=v$comp[borders]) 221 | } 222 | return(compartment_info_tab) 223 | } 224 | 225 | ############################################################ 226 | generate_hierachy_bed = function(chr, res, out_dir, bin_size) 227 | { 228 | chr_name = paste0('chr', chr) 229 | # res = reses[[chr_name]][[CELL_LINE]] 230 | hc = res$CALDER_hc 231 | 232 | hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels) 233 | bin_comp = data.table::data.table(chr=chr, bin_index=as.numeric(res$bin_names), comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length))) 234 | chr_bin_domain = bin_comp 235 | chr_bin_domain$chr = paste0('chr', chr_bin_domain$chr) 236 | 237 | # chr_bin_domain = chr_bin_domain[order(bin_index)] 238 | 239 | compartment_info_tab = create_compartment_bed_v4(chr_bin_domain, bin_size=bin_size) 240 | 241 | boundaries = unname(sapply(res$initial_clusters, max)) 242 | boundaries_ori = as.numeric(res$bin_names[boundaries])*bin_size 243 | 244 | compartment_info_tab$is_boundary = 'gap' 245 | compartment_info_tab[compartment_info_tab$pos_end %in% boundaries_ori, 'is_boundary'] = 'boundary' 246 | 247 | colnames(compartment_info_tab)[4] = 'compartment_label' 248 | compartments_tsv_file = paste0(out_dir, '/chr', chr, '_domain_hierachy.tsv') 249 | compartments_bed_file = paste0(out_dir, '/chr', chr, '_sub_compartments.bed') 250 | boundary_bed_file = paste0(out_dir, '/chr', chr, '_domain_boundaries.bed') 251 | 252 | options(scipen=999) 253 | write.table( compartment_info_tab, file=compartments_tsv_file, quote=FALSE, sep='\t', row.names=FALSE ) 254 | 255 | 256 | comp_cols = c("#FF0000", "#FF4848", "#FF9191", "#FFDADA", "#DADAFF", "#9191FF", "#4848FF", "#0000FF") 257 | names(comp_cols) = c('A.1.1', 'A.1.2', 'A.2.1', 'A.2.2', 'B.1.1', 'B.1.2', 'B.2.1', 'B.2.2') 258 | comp_val = (8:1)/8 259 | names(comp_val) = names(comp_cols) 260 | 261 | comp_8 = substr(compartment_info_tab$compartment_label, 1, 5) 262 | 263 | compartment_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[, 2:4], comp_val[comp_8], '.', compartment_info_tab[, 2:3], comp_cols[comp_8]) 264 | write.table( compartment_bed, file=compartments_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE ) 265 | 266 | bounday_bed_raw = subset(compartment_info_tab, is_boundary=='boundary') 267 | bounday_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[,3], compartment_info_tab[,3], '', '.', compartment_info_tab[,3], compartment_info_tab[,3], '#000000') 268 | write.table( bounday_bed, file=boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE ) 269 | } 270 | 271 | 272 | 273 | project_to_major_axis <- function(PC_12_atanh) 274 | { 275 | Data = data.frame(x=PC_12_atanh[,1], y=PC_12_atanh[,2]) 276 | Data = Data[order(Data$x),] 277 | loess_fit <- loess(y ~ x, Data) 278 | 279 | more_x = seq(min(PC_12_atanh[,1]), max(PC_12_atanh[,1]), len=10*length(PC_12_atanh[,1])) 280 | major_axis = cbind(x=more_x, y=predict(loess_fit, newdata=more_x)) 281 | new_x_axis = cumsum(c(0, sqrt(diff(major_axis[,1])^2 + diff(major_axis[,2])^2))) ## the new xaxis on the curved major_axis 282 | 283 | dis = fields::rdist(PC_12_atanh[, 1:2], major_axis) 284 | projected_x = new_x_axis[apply(dis, 1, which.min)] 285 | names(projected_x) = rownames(PC_12_atanh) 286 | # projected_x = major_axis[apply(dis, 1, which.min)] 287 | project_info = list(x_pro=projected_x, major_axis=major_axis) 288 | return(project_info) 289 | } 290 | 291 | 292 | get_best_reorder <- function(hc_hybrid_x_pro, x_pro) 293 | { 294 | n = length(x_pro) 295 | reordered_names_x_pro_list = list() 296 | 297 | reordered_names_x_pro_list[[1]] = reorder_dendro(hc_hybrid_x_pro, (x_pro), aggregateFun=mean) ## here the clusters are assigned into A.1 A.2 B.1 B.2 298 | 299 | best_index = which.max(sapply(reordered_names_x_pro_list, function(v) cor(1:n, unname(rank(x_pro, ties.method='first')[v])))) 300 | return(reordered_names_x_pro_list[[1]]) 301 | } 302 | 303 | -------------------------------------------------------------------------------- /R/CALDER_main.R: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | CALDER_main = function(contact_mat_file, 3 | chr, 4 | bin_size, 5 | out_dir, 6 | sub_domains = TRUE, 7 | save_intermediate_data = TRUE, 8 | genome='hg19') { 9 | required_packages = c('doParallel', 'GenomicRanges', 'R.utils', 'factoextra', 'maptools') 10 | sapply(required_packages, require, character.only = TRUE, quietly = TRUE) 11 | 12 | dir.create(out_dir, showWarnings = FALSE) 13 | intermediate_data = CALDER_CD_hierarchy(contact_mat_file, 14 | chr, 15 | bin_size, 16 | out_dir, 17 | save_intermediate_data, 18 | genome) 19 | 20 | if(sub_domains==TRUE) { 21 | CALDER_sub_domains(intermediate_data=intermediate_data, 22 | chr=chr, 23 | out_dir=out_dir, 24 | bin_size=bin_size) 25 | } 26 | } 27 | 28 | -------------------------------------------------------------------------------- /R/HRG_MLE_each_compartment_fun.R: -------------------------------------------------------------------------------- 1 | ## Yuanlong LIU 2 | ## 12/04/2018 3 | ## 23/05/2018 4 | ## 29/05/2018 5 | 6 | ## A should be already named 7 | ## A should be symmetric 8 | ## Does not remove 0-rows/columns of A 9 | HRG_MLE_each_compartment_fun <- function(pA_sym, seg, allowed_max_nbins_seq=500:1000, max_nbins_fine=max(allowed_max_nbins_seq), ncores, res_dir, fast, adjust_segmentss_topdom=TRUE) 10 | { 11 | min_n_bins_outer=2 12 | min_n_bins_inner=2 13 | distr = 'lnorm' 14 | A = pA_sym[seg, seg] 15 | if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices') 16 | arg_list = as.list(environment()) 17 | res_folder = file.path(res_dir) 18 | dir.create(res_folder, recursive=TRUE, showWarnings = TRUE) 19 | 20 | total_execution_time_file = file.path(res_dir, 'total_execution.time') 21 | cat('n_core used:', ncores, '\n\n', file=total_execution_time_file, append=FALSE) 22 | time_begin = Sys.time() 23 | cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE) 24 | 25 | ## max_nbins is defined as the one from allowed_max_nbins_seq has the smallest residue, from which, the smallest is taken 26 | n_A = nrow(A) 27 | 28 | ## IF THE SEGMENT IS ALREADY SMALL ENOUGH 29 | if( n_A <= max(allowed_max_nbins_seq) ) 30 | { 31 | ## This will be modified 32 | { 33 | dists = (2*min_n_bins_outer - 1):(nrow(A)-1) 34 | counts = sapply(dists, function(v) n_cells2compute( A, v, min_n_bins_outer )) 35 | split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores)) 36 | chunks = c(dists[1]-1, dists[split_info$break_points]) 37 | res_outer = HRG_core( A=A, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast ) 38 | } 39 | 40 | ## this part gets the hi_tree 41 | { 42 | hi_tree = get_tree_v0( res_outer ) 43 | igraph::V(hi_tree)$left_rescaled = igraph::V(hi_tree)$left 44 | igraph::V(hi_tree)$right_rescaled = igraph::V(hi_tree)$right 45 | igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right - igraph::V(hi_tree)$left + 1 46 | igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='') 47 | } 48 | 49 | full_tree = hi_tree 50 | res_info = list( arg_list=arg_list, trunk=NULL, segmentss_nadj=NULL, segmentss=NULL, res_outer=NULL, res_inner=list(res_outer) ) 51 | res_info$full_tree = full_tree 52 | res_folder_final = file.path(res_dir, 'final') 53 | dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE) 54 | save(res_info, file=file.path(res_folder_final, 'res_info.Rdata')) 55 | 56 | time_finish = Sys.time() 57 | cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE) 58 | cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE) 59 | return(res_info) 60 | } 61 | 62 | ## which one leads to the least residue 63 | max_nbins = allowed_max_nbins_seq[ min(which(n_A%%allowed_max_nbins_seq==min(n_A%%allowed_max_nbins_seq))) ] 64 | residue = n_A %% max_nbins 65 | 66 | residue_mat_info = get_least_residue_matrix(A, max_nbins=max_nbins, allowed_shifts=0) 67 | A_final = residue_mat_info$A_final 68 | n2one = residue_mat_info$n2one 69 | 70 | ## A_low has no row/col name 71 | A_low = HighResolution2Low_k( A_final, k=max_nbins ) ## k is the dimension of A_final_low 72 | 73 | ## Starting from here, the original row/col names are no longer used 74 | 75 | ## this part run HRG_core on the outer part 76 | { 77 | dists = (2*min_n_bins_outer - 1):(nrow(A_low)-1) 78 | counts = sapply(dists, function(v) n_cells2compute( A_low, v, min_n_bins_outer )) 79 | split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores)) 80 | chunks = c(dists[1]-1, dists[split_info$break_points]) 81 | res_outer = HRG_core( A=A_low, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast ) 82 | } 83 | 84 | ## this part gets the hi_tree 85 | { 86 | hi_tree = get_tree_v0( res_outer ) 87 | igraph::V(hi_tree)$left_rescaled = n2one*(igraph::V(hi_tree)$left - 1) + 1 88 | igraph::V(hi_tree)$right_rescaled = n2one*igraph::V(hi_tree)$right 89 | node2adjust = which(igraph::V(hi_tree)$right_rescaled==max(igraph::V(hi_tree)$right_rescaled)) ## append the residue bins here 90 | igraph::V(hi_tree)[node2adjust]$right_rescaled = n_A 91 | 92 | igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right_rescaled - igraph::V(hi_tree)$left_rescaled + 1 93 | igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='') 94 | } 95 | 96 | # segmentss = get_segments(hi_tree, binsize_thresh=max_nbins_bp) 97 | 98 | segmentss_nadj = get_segments(hi_tree, binsize_thresh=max_nbins_fine) 99 | 100 | ## THIS PARAT ADJUST THE segmentss BASED ON TOPDOM BIN_SIGNAL 101 | ## 14-05-2018 102 | ## Also adjust the tree 103 | if(adjust_segmentss_topdom==TRUE) 104 | { 105 | shift = seg[1]-1 106 | segmentss = segmentss_adjust_topdom(pA_sym, segmentss_nadj+shift, n2one, ws=5:20) - shift 107 | for( i in 1:nrow(segmentss) ) 108 | { 109 | index_left2adj = which(igraph::V(hi_tree)$left_rescaled == segmentss_nadj[i,1]) 110 | index_right2adj = which(igraph::V(hi_tree)$right_rescaled == segmentss_nadj[i,2]) 111 | igraph::V(hi_tree)[index_left2adj]$left_rescaled = segmentss[i,1] 112 | igraph::V(hi_tree)[index_right2adj]$right_rescaled = segmentss[i,2] 113 | igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right_rescaled - igraph::V(hi_tree)$left_rescaled + 1 114 | igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='') 115 | } 116 | } 117 | 118 | trunk = hi_tree 119 | # save(segmentss, file=file.path(res_dir, 'outer', 'segmentss.Rdata')) 120 | 121 | res_inner = rep(list(list()), nrow(segmentss)) 122 | for( i in 1:nrow(segmentss) ) 123 | { 124 | cat(i,'\n') 125 | # if(i==2) while(as.numeric(substr(as.character(Sys.time()),12,13))!=20) {} 126 | index = segmentss[i,1]:segmentss[i,2] 127 | A_part = A[index, index] 128 | dists = (2*min_n_bins_inner - 1):(nrow(A_part)-1) 129 | counts = sapply(dists, function(v) n_cells2compute( A_part, v, min_n_bins_inner )) 130 | split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores)) 131 | chunks = c(dists[1]-1, dists[split_info$break_points]) 132 | 133 | ## A_part is named 134 | res_info_inner = HRG_core( A=A_part, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_inner, distr=distr, fast=fast ) 135 | res_inner[[i]] = res_info_inner #minimum size filtering 136 | cat('finished', '\n') 137 | } 138 | 139 | res_info = list( arg_list=arg_list, trunk=trunk, segmentss_nadj=segmentss_nadj, segmentss=segmentss, res_outer=res_outer, res_inner=res_inner ) 140 | res_folder_final = file.path(res_dir, 'final') 141 | dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE) 142 | save(res_info, file=file.path(res_folder_final, 'res_info.Rdata')) 143 | 144 | time_finish = Sys.time() 145 | cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE) 146 | cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE) 147 | 148 | 149 | ## xenocfraf: 150 | # res_inner = res_inner[sapply(res_inner, length) > 0] 151 | branches = lapply( res_inner, get_tree_v0 ) 152 | for( i in 1:length(branches) ) branches[[i]] = update_branch_name(branches[[i]], root_start=segmentss[i,1]) 153 | 154 | full_tree = xenocraft( trunk, branches ) 155 | names_tmp = do.call(rbind, strsplit(igraph::V(full_tree)$name, ',')) 156 | igraph::V(full_tree)$left = substring(names_tmp[,1], 2) 157 | igraph::V(full_tree)$right = substring(names_tmp[,2], 1, nchar(names_tmp[,2])-1) 158 | if(!is_binary_tree(full_tree)) stop("Trunk + branches do not produce a binary tree") 159 | 160 | res_info$full_tree = full_tree 161 | save(res_info, file=file.path(res_folder_final, 'res_info.Rdata')) 162 | 163 | return( res_info ) 164 | } 165 | 166 | -------------------------------------------------------------------------------- /R/HighResolution2Low.R: -------------------------------------------------------------------------------- 1 | ## These functions try to aggregate bin-wise contact matrix into a domain-wise contact matrix 2 | 3 | HighResolution2Low_k_complete <-function( A, max_nbins ) 4 | { 5 | nbins = nrow( A ) 6 | 7 | max_nbins_bp = max_nbins 8 | max_nbinss = max_nbins + (-5:20) 9 | expand_fold_info = t(sapply(max_nbinss, fold_expand_rate, K=nbins)) 10 | index = max(which(expand_fold_info[,'rescale_rate']==min(expand_fold_info[,'rescale_rate']))) 11 | max_nbins = max_nbinss[ index ] 12 | n2one_head = expand_fold_info[index, 'n2one_head'] 13 | n2one_last = expand_fold_info[index, 'n2one_last'] 14 | 15 | if(n2one_head==1) return(list()) 16 | rescale_rate = n2one_last / n2one_head 17 | 18 | if(max_nbins!=max_nbins_bp) warning('Value of max_nbins has been adjusted from ', max_nbins_bp, ' to ', max_nbins, '. This results in rescale_rate as: ', rescale_rate, '\n') 19 | 20 | if(rescale_rate!=1) ## only do when rescale is not 1 21 | { 22 | aa_end = n2one_head*(max_nbins-1) 23 | bb_end = nrow(A) 24 | A_aa = A[1:aa_end, 1:aa_end] 25 | A_bb = A[(aa_end+1):bb_end, (aa_end+1):bb_end] 26 | A_ab = A[1:aa_end, (aa_end+1):bb_end, drop=FALSE] 27 | A_aa_low = HighResolution2Low_k(mat=A_aa, k=max_nbins-1) 28 | A_bb_low = sum( A_bb ) 29 | 30 | ## https://stackoverflow.com/questions/15265512/summing-every-n-points-in-r 31 | v = apply(A_ab, 1, sum) 32 | A_ab_low = unname(tapply(v, (seq_along(v)-1) %/% n2one_head, sum)) 33 | 34 | A_ab_low_new = A_ab_low / n2one_last*n2one_head 35 | A_bb_low_new = A_bb_low / n2one_last^2*n2one_head^2 36 | A_low = rbind(cbind(A_aa_low, A_ab_low_new), c( A_ab_low_new, A_bb_low_new )) 37 | colnames(A_low) = NULL 38 | } 39 | 40 | if(rescale_rate==1) A_low = HighResolution2Low_k(mat=A, k=max_nbins) ## k is the dimension of A_final_low 41 | ## A_low has no row/col name 42 | res = list(A_low=A_low, n2one_head=n2one_head, n2one_last=n2one_last) 43 | return(res) 44 | } 45 | 46 | split_info <-function( K, max_nbins ) 47 | { 48 | x = floor( K/max_nbins ) + 1 49 | n2 = max_nbins*x - K 50 | n1 = max_nbins - n2 51 | vec = c(rep(x, n1), rep(x-1, n2)) 52 | split_vec = split(1:K, rep(1:max_nbins, vec)) 53 | 54 | expand_fold_ratio = min(n1/n2, n2/n1) 55 | res = list(vec=vec, expand_fold_ratio=expand_fold_ratio, split_vec=split_vec) 56 | return(res) 57 | } 58 | 59 | 60 | ## diagonal values are summed twice 61 | HighResolution2Low <-function( A, rescale, which=2 ) 62 | { 63 | nbins = nrow(A) 64 | nbins_low = floor(nbins/rescale) 65 | A_low = matrix( , nbins_low, nbins_low) 66 | keep_rows = nbins - nbins%%nbins_low 67 | A_truncated = A[ 1:keep_rows, 1:keep_rows ] 68 | 69 | if(which==1) A_low = matsplitter_sum(A_truncated, keep_rows/nbins_low, keep_rows/nbins_low) 70 | if(which==2) A_low = mat_split_sum(A_truncated, keep_rows/nbins_low, keep_rows/nbins_low) 71 | 72 | return( A_low ) 73 | } 74 | 75 | ## https://stackoverflow.com/questions/24299171/function-to-split-a-matrix-into-sub-matrices-in-r 76 | ## Function to split a matrix into sub-matrices in R 77 | matsplitter_sum <- function(M, r, c) 78 | { 79 | rg <- (row(M)-1)%/%r + 1 80 | cg <- (col(M)-1)%/%c + 1 81 | rci <- (rg-1)*max(cg) + cg 82 | N <- prod(dim(M))/r/c 83 | cv <- unlist(lapply(1:N, function(x) M[rci==x])) 84 | dim(cv)<-c(r, c, N) 85 | res = matrix(apply(cv, 3, sum), nrow(M)/r, byrow=TRUE) 86 | return(res) 87 | } 88 | 89 | 90 | mat_split_sum <- function(M, r, c) 91 | { 92 | nr <- ceiling(nrow(M)/r) 93 | nc <- ceiling(ncol(M)/c) 94 | newM <- matrix(NA, nr*r, nc*c) 95 | newM[1:nrow(M), 1:ncol(M)] <- M 96 | 97 | div_k <- kronecker(matrix(seq_len(nr*nc), nr, byrow = TRUE), matrix(1, r, c)) 98 | matlist <- split(newM, div_k) 99 | res = matrix(sapply(matlist, sum), nrow(M)/r, byrow=TRUE) 100 | return(res) 101 | } 102 | 103 | ## this is much faster 104 | HighResolution2Low_k <- function(mat, k) 105 | { 106 | chunk2 <- function(x, n) split(x, cut(seq_along(x), n, labels = FALSE)) 107 | n = nrow(mat) 108 | A_low = matrix( , k, k ) 109 | indices = chunk2( 1:n, k ) 110 | for(row_index in 1:k) 111 | { 112 | A_sub = mat[indices[[row_index]], ] 113 | for(col_index in 1:k) A_low[row_index, col_index] = sum( A_sub[ , indices[[col_index]]] ) 114 | } 115 | return( A_low ) 116 | } 117 | 118 | HighResolution2Low_k_rectangle <- function(mat, row_split, col_split, sum_or_mean=c('sum', 'mean', 'median', 'p_above_one')) 119 | { 120 | # if(remove_zero==TRUE) mat[mat==0] = NA 121 | k_row = length(row_split) 122 | k_col = length(col_split) 123 | 124 | A_low = matrix( , k_row, k_col ) 125 | 126 | if(sum_or_mean=='sum') 127 | { 128 | for(row_index in 1:k_row) 129 | { 130 | A_sub = mat[row_split[[row_index]], , drop=FALSE ] 131 | for(col_index in 1:k_col) A_low[row_index, col_index] = sum( A_sub[ , col_split[[col_index]]] ) 132 | } 133 | } 134 | 135 | if(sum_or_mean=='mean') 136 | { 137 | for(row_index in 1:k_row) 138 | { 139 | A_sub = mat[row_split[[row_index]], , drop=FALSE ] 140 | for(col_index in 1:k_col) A_low[row_index, col_index] = mean( A_sub[ , col_split[[col_index]]], na.rm=TRUE ) 141 | } 142 | } 143 | 144 | if(sum_or_mean=='median') 145 | { 146 | for(row_index in 1:k_row) 147 | { 148 | A_sub = mat[row_split[[row_index]], , drop=FALSE ] 149 | for(col_index in 1:k_col) A_low[row_index, col_index] = median( A_sub[ , col_split[[col_index]]], na.rm=TRUE ) 150 | } 151 | } 152 | 153 | if(sum_or_mean=='p_above_one') 154 | { 155 | for(row_index in 1:k_row) 156 | { 157 | A_sub = mat[row_split[[row_index]], , drop=FALSE ] 158 | for(col_index in 1:k_col) A_low[row_index, col_index] = mean( A_sub[ , col_split[[col_index]]] > 1 ) 159 | } 160 | } 161 | # if(remove_zero==TRUE) A_low[is.na(A_low)] = 0 162 | return( A_low ) 163 | } 164 | 165 | ## for a n x n matrix, this function generate a nxm matrix, with m*compress_size = n 166 | ## this function is used for generating the nxm matrix, where the i,jth value is the contact value 167 | ## 08-08-2018 168 | compress_mat_fast = function(input_mat, compress_size) 169 | { 170 | mat_block <- function(n, r) suppressWarnings( matrix(c(rep(1, r), rep(0, n)), n, n/r) ) 171 | n = ncol(input_mat) 172 | mat2prod = mat_block(n, compress_size) 173 | # return(t(input_mat%*%mat2prod / compress_size)) 174 | return(t(matrix_multiplication_cpp(input_mat, mat2prod)) / compress_size) 175 | } 176 | 177 | -------------------------------------------------------------------------------- /R/LikelihoodRatioTest_fun.R: -------------------------------------------------------------------------------- 1 | ## The following code tries to evaluate the performance of different likelihood ratio-based tests 2 | ## Yuanlong LIU 3 | ## 01-03-2018 4 | 5 | ############## model 1: keep the same structure ############## 6 | ## In this case, a tree structure should be provided 7 | ## Constraint: the top three levels have the same contact intensity parameter 8 | 9 | ## 01-03-2018 10 | p_likelihood_ratio <- function( A, head, mid, tail, num ) 11 | { 12 | A_whole = A[head:tail, head:tail] 13 | A_a = A[head:mid, head:mid] 14 | A_b = A[(mid+1):tail, (mid+1):tail] 15 | A_ab = A[head:mid, (mid+1):tail] 16 | 17 | tri_Awhole = A_whole[upper.tri(A_whole, diag=FALSE)] 18 | tri_Aa = A_a[upper.tri(A_a, diag=FALSE)] 19 | tri_Ab = A_b[upper.tri(A_b, diag=FALSE)] 20 | # gamma_fit(tri_Awhole, num) 21 | 22 | inner_a = get_prob( tri_Aa ) 23 | inner_b = get_prob( tri_Ab ) 24 | inter_ab = get_prob( A_ab ) 25 | 26 | ## negative binomial 27 | # inner_a = get_prob_ng( tri_Aa ) 28 | # inner_b = get_prob_ng( tri_Ab ) 29 | # inter_ab = get_prob_ng( A_ab ) 30 | 31 | 32 | ## log likelihood of H1 33 | LH1 = inner_a + inner_b + inter_ab 34 | LH0 = get_prob( tri_Awhole ) 35 | 36 | Lambda = -2*( LH0 - LH1 ) 37 | # cat(Lambda, '\n') 38 | df_h0 = 1*1 39 | df_h1 = 3*1 40 | df = df_h1 - df_h0 41 | 42 | p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE) 43 | info = list( Lambda=Lambda, p=p ) 44 | return(info) 45 | } 46 | 47 | p_likelihood_ratio_nb <- function( A, head, mid, tail ) 48 | { 49 | A_whole = A[head:tail, head:tail] 50 | A_a = A[head:mid, head:mid] 51 | A_b = A[(mid+1):tail, (mid+1):tail] 52 | A_ab = A[head:mid, (mid+1):tail] 53 | 54 | tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 55 | tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 56 | tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 57 | 58 | ## negative binomial 59 | inner_a = get_prob_nb( tri_Aa ) 60 | inner_b = get_prob_nb( tri_Ab ) 61 | inter_ab = get_prob_nb( A_ab ) 62 | 63 | 64 | ## log likelihood of H1 65 | LH1 = inner_a + inner_b + inter_ab 66 | LH0 = get_prob_nb( tri_Awhole ) 67 | 68 | Lambda = -2*( LH0 - LH1 ) 69 | # cat(Lambda, '\n') 70 | n_parameters = 2 71 | df_h0 = 1*n_parameters 72 | df_h1 = 3*n_parameters 73 | df = df_h1 - df_h0 74 | 75 | p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE) 76 | info = list( Lambda=Lambda, p=p ) 77 | return(info) 78 | } 79 | 80 | p_likelihood_ratio_norm <- function( A, head, mid, tail ) 81 | { 82 | A_whole = A[head:tail, head:tail] 83 | A_a = A[head:mid, head:mid] 84 | A_b = A[(mid+1):tail, (mid+1):tail] 85 | A_ab = A[head:mid, (mid+1):tail] 86 | 87 | tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 88 | tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 89 | tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 90 | 91 | ## norm 92 | inner_a = likelihood_norm( tri_Aa ) 93 | inner_b = likelihood_norm( tri_Ab ) 94 | inter_ab = likelihood_norm( A_ab ) 95 | 96 | 97 | ## log likelihood of H1 98 | LH1 = inner_a + inner_b + inter_ab 99 | LH0 = likelihood_norm( tri_Awhole ) 100 | 101 | Lambda = -2*( LH0 - LH1 ) 102 | # cat(Lambda, '\n') 103 | n_parameters = 2 104 | df_h0 = 1*n_parameters 105 | df_h1 = 3*n_parameters 106 | df = df_h1 - df_h0 107 | 108 | p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE) 109 | info = list( Lambda=Lambda, p=p ) 110 | return(info) 111 | } 112 | 113 | 114 | 115 | p_likelihood_ratio_gamma <- function( A, head, mid, tail, n_parameters, imputation ) 116 | { 117 | A_whole = A[head:tail, head:tail] 118 | A_a = A[head:mid, head:mid] 119 | A_b = A[(mid+1):tail, (mid+1):tail] 120 | A_ab = A[head:mid, (mid+1):tail] 121 | 122 | tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 123 | ## added 25-03-2018. If no zero values, no imputation 124 | if( length(tri_Awhole==0) == 0 ) imputation = FALSE 125 | 126 | tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 127 | tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 128 | 129 | ## norm 130 | inner_a = likelihood_gamma_mme( tri_Aa ) 131 | inner_b = likelihood_gamma_mme( tri_Ab ) 132 | inter_ab = likelihood_gamma_mme( A_ab ) 133 | whole = likelihood_gamma_mme( tri_Awhole ) 134 | 135 | if( imputation ) ## zero values are imputed by the estimated distribution based on non random values 136 | { 137 | inner_a = likelihood_gamma_mme( tri_Aa[tri_Aa!=0] )/length( tri_Aa[tri_Aa!=0] )*length( tri_Aa ) 138 | inner_b = likelihood_gamma_mme( tri_Ab[tri_Ab!=0] )/length( tri_Ab[tri_Ab!=0] )*length( tri_Ab ) 139 | inter_ab = likelihood_gamma_mme( A_ab )/length( A_ab[A_ab!=0] )*length( A_ab ) 140 | whole = likelihood_gamma_mme( tri_Awhole[tri_Awhole!=0] )/length( tri_Awhole[tri_Awhole!=0] )*length( tri_Awhole ) 141 | n_parameters = n_parameters - 1 ## the mixture parameter of 0 is not taken into account 142 | } 143 | 144 | 145 | ## log likelihood of H1 146 | LH1 = inner_a + inner_b + inter_ab 147 | LH0 = whole 148 | 149 | Lambda = -2*( LH0 - LH1 ) 150 | # cat(Lambda, '\n') 151 | df_h0 = 1*n_parameters 152 | df_h1 = 3*n_parameters 153 | df = df_h1 - df_h0 154 | 155 | p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE) 156 | info = list( Lambda=Lambda, p=p ) 157 | return(info) 158 | } 159 | 160 | lognormal_mean_test <- function( cA, head, mid, tail ) 161 | { 162 | A_whole = cA[head:tail, head:tail] 163 | A_a = cA[head:mid, head:mid] 164 | A_b = cA[(mid+1):tail, (mid+1):tail] 165 | A_ab = cA[head:mid, (mid+1):tail] 166 | 167 | tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 168 | tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 169 | 170 | tri_Aa_vec = as.vector( tri_Aa ) 171 | tri_Ab_vec = as.vector( tri_Ab ) 172 | A_ab_vec = as.vector( A_ab ) 173 | 174 | tri_Aa_vec_p = tri_Aa_vec[tri_Aa_vec!=0] 175 | tri_Ab_vec_p = tri_Ab_vec[tri_Ab_vec!=0] 176 | A_ab_vec_p = A_ab_vec[A_ab_vec!=0] 177 | 178 | p_Aa = p_lognormal_mean( tri_Aa_vec_p, A_ab_vec_p ) 179 | p_Ab = p_lognormal_mean( tri_Ab_vec_p, A_ab_vec_p ) 180 | return( list(p_Aa=p_Aa, p_Ab=p_Ab) ) 181 | } 182 | 183 | ## https://www.jstor.org/stable/2533570 184 | ## google: Methods for Comparing the Means of Two Independent Log-Normal Samples 185 | ## 03-07-2018 186 | p_lognormal_mean <- function( vec_aa, vec_ab ) 187 | { 188 | if( all(vec_aa==0) | all(vec_ab==0) ) return(0) 189 | 190 | n_aa = length(vec_aa) 191 | n_ab = length(vec_ab) 192 | fited_info_aa = MASS::fitdistr(vec_aa, 'lognormal') ## intra 193 | mu_aa = fited_info_aa$estimate[1] 194 | sd_aa = fited_info_aa$estimate[2] 195 | s2_aa = sd_aa^2 196 | 197 | fited_info_ab = MASS::fitdistr(vec_ab, 'lognormal') ## inter 198 | mu_ab = fited_info_ab$estimate[1] 199 | sd_ab = fited_info_ab$estimate[2] 200 | s2_ab = sd_ab^2 201 | 202 | z_score = ( (mu_aa - mu_ab) + (1/2)*(s2_aa - s2_ab) ) / sqrt( s2_aa/n_aa + s2_ab/n_ab + (1/2)*( s2_aa^2/(n_aa-1) + s2_ab^2/(n_ab-1) ) ) 203 | p = pnorm( z_score, lower.tail=FALSE ) 204 | return(p) 205 | } 206 | 207 | get_corner_xy <- function(A_whole) 208 | { 209 | n = nrow(A_whole) 210 | corner_size = floor(n/2) 211 | A_corner = A_whole[1:corner_size, (n - corner_size + 1 ):n] 212 | expected_high = A_corner[upper.tri(A_corner, diag=FALSE)] 213 | expected_low = A_corner[lower.tri(A_corner, diag=TRUE)] 214 | return(list(x=expected_low, y=expected_high)) 215 | } 216 | 217 | get_half_mat_values <- function(mat) 218 | { 219 | n1 = nrow(mat) 220 | n2 = ncol(mat) 221 | delta = n1/n2 222 | rows = lapply( 1:n2, function(x) (1+ceiling(x*delta)):n1 ) 223 | rows = rows[ sapply(rows, function(v) max(v) <= n1) ] 224 | flag = which(diff(sapply(rows, length)) > 0) 225 | if(length(flag)>0) rows = rows[ 1:min(flag) ] 226 | 227 | row_col_indices = cbind( unlist(rows), rep(1:length(rows), sapply(rows, length))) 228 | x = mat[row_col_indices] 229 | mat[row_col_indices] = NA 230 | y = na.omit(as.vector(mat)) 231 | return(list(x=x, y=y)) 232 | } 233 | 234 | 235 | get_half_mat_values_v2 <- function(mat) 236 | { 237 | ## https://stackoverflow.com/questions/52990525/get-upper-triangular-matrix-from-nonsymmetric-matrix/52991508#52991508 238 | y = mat[nrow(mat) * (2 * col(mat) - 1) / (2 * ncol(mat)) - row(mat) > -1/2] 239 | x = mat[nrow(mat) * (2 * col(mat) - 1) / (2 * ncol(mat)) - row(mat) < -1/2] 240 | return(list(x=x, y=y)) 241 | } 242 | 243 | p_wilcox_test_nested <- function( A, head, mid, tail, alternative, is_CD ) 244 | { 245 | test_1 = p_wilcox_test( A, head, mid, tail, alternative, is_CD, only_corner=FALSE ) #: coner + inter 246 | if( (tail - head <= 4) | (test_1$p > 0.05) ) ## when > 0.05 or it is already small, do not cosider it as nested 247 | { 248 | info = list( Lambda=NULL, p=0.555555, mean_diff=0 ) 249 | return(info) 250 | } 251 | 252 | ## try_error happens when tad to test is too small. THEREFORE, ASSIGN P=0 TO THE TAD 253 | test_left_tad = try(p_wilcox_test( A, head, mid=ceiling((head+mid)/2), mid, alternative, is_CD=FALSE, only_corner=TRUE )) #: coner + inter 254 | if( class(test_left_tad)=="try-error" ) test_left_tad = list(p=0) 255 | test_right_tad = try(p_wilcox_test( A, mid+1, mid=ceiling((mid+1+tail)/2), tail, alternative, is_CD=FALSE, only_corner=TRUE )) #: coner + inter 256 | if( class(test_right_tad)=="try-error" ) test_right_tad = list(p=0) 257 | 258 | info = list( Lambda=NULL, p=max( test_1$p, min(test_left_tad$p, test_right_tad$p) ), mean_diff=0 ) 259 | return(info) 260 | } 261 | 262 | 263 | 264 | p_wilcox_test = function( A, head, mid, tail, alternative, is_CD=FALSE, only_corner=FALSE ) ## only_corner tests if a domain is a TAD (no nesting) 265 | { 266 | A_whole = A[head:tail, head:tail] 267 | A_a = A[head:mid, head:mid] 268 | A_b = A[(mid+1):tail, (mid+1):tail] 269 | A_ab = A[head:mid, (mid+1):tail] 270 | 271 | tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] ## need to check whether diag should be false or true 272 | 273 | tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] ## need to check whether diag should be false or true 274 | tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] ## need to check whether diag should be false or true 275 | 276 | tri_Aa_vec = as.vector( tri_Aa ) 277 | tri_Ab_vec = as.vector( tri_Ab ) 278 | A_ab_vec = as.vector( A_ab ) 279 | 280 | corner_mat_info = get_half_mat_values_v2(A_ab) 281 | A_ab_corner = as.vector(corner_mat_info$y) 282 | A_ab_ncorner = corner_mat_info$x 283 | p_corner = wilcox.test(x=A_ab_corner, y=A_ab_ncorner, alternative='greater', exact=F) 284 | p_inter = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=A_ab_ncorner, alternative='greater', exact=F) 285 | # p_inter = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=c(A_ab_ncorner, A_ab_corner), alternative='greater', exact=F) 286 | 287 | if(is_CD==FALSE) p = max(p_inter$p.value, p_corner$p.value) ## if the tested part is the CD 288 | if(is_CD==TRUE) p = p_inter$p.value ## if the tested part is the CD 289 | # if(only_corner==TRUE) p = p_corner$p.value 290 | mean_diff_inter = mean(A_ab_ncorner) - mean(c(tri_Ab_vec, tri_Aa_vec)) ## negative is good 291 | mean_diff_corner = mean(A_ab_ncorner) - mean(A_ab_corner) ## negative is good 292 | 293 | 294 | if(is_CD==FALSE) mean_diff = min(mean_diff_corner, mean_diff_inter) ## if the tested part is the CD 295 | if(is_CD==TRUE) mean_diff = mean_diff_inter 296 | 297 | if(min(length(tri_Ab_vec), length(tri_Ab_vec)) < 10) mean_diff = 100 ## when one of the two twins is too small. 10: dim(4,4) 298 | 299 | # mean_diff = mean(A_ab_corner) - mean(A_ab_ncorner) 300 | 301 | # p_test_Aa = wilcox.test(x=A_ab_vec, y=tri_Aa_vec, alternative="less", exact=F) 302 | # p_test_Ab = wilcox.test(x=A_ab_vec, y=tri_Ab_vec, alternative="less", exact=F) 303 | # p = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=A_ab_vec, alternative=alternative, exact=F) 304 | 305 | # xy = get_corner_xy(A_whole) 306 | # p = wilcox.test(x=xy$x, y=xy$y, alternative=alternative, exact=F) 307 | 308 | # p = max(p_test_Aab$p.value, p_test_Ab$p.value) 309 | info = list( Lambda=NULL, p=p, mean_diff=mean_diff) 310 | return(info) 311 | } 312 | 313 | 314 | 315 | 316 | p_likelihood_ratio_lnorm <- function( A, head, mid, tail, n_parameters, imputation, imputation_num=1E2 ) 317 | { 318 | likelihood_fun = likelihood_lnorm_mle 319 | A_whole = A[head:tail, head:tail] 320 | A_a = A[head:mid, head:mid] 321 | A_b = A[(mid+1):tail, (mid+1):tail] 322 | A_ab = A[head:mid, (mid+1):tail] 323 | 324 | tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] 325 | ## added 25-03-2018. If no zero values, no imputation 326 | no_zero_flag = 0 327 | if( sum(tri_Awhole==0) == 0 ) { no_zero_flag = 1; imputation = FALSE } 328 | 329 | tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 330 | tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 331 | 332 | tri_Aa_vec = as.vector( tri_Aa ) 333 | tri_Ab_vec = as.vector( tri_Ab ) 334 | A_ab_vec = as.vector( A_ab ) 335 | 336 | mean_a = mean( tri_Aa_vec[tri_Aa_vec!=0] ) 337 | mean_b = mean( tri_Ab_vec[tri_Ab_vec!=0] ) 338 | mean_ab = mean( A_ab_vec[A_ab_vec!=0] ) 339 | mean_diff = mean_ab - min(mean_a, mean_b) 340 | 341 | if( (all(tri_Aa_vec==0)) | (all(tri_Ab_vec==0)) | (all(A_ab_vec==0)) ) 342 | { 343 | info = list( Lambda=NA, p=0, mean_diff=mean_diff ) 344 | return(info) 345 | } 346 | 347 | ## lnorm 348 | if(!imputation) 349 | { 350 | inner_a = likelihood_fun( tri_Aa ) 351 | inner_b = likelihood_fun( tri_Ab ) 352 | inter_ab = likelihood_fun( A_ab ) 353 | whole = likelihood_fun( tri_Awhole ) 354 | 355 | ## log likelihood of H1 356 | LH1 = inner_a + inner_b + inter_ab 357 | LH0 = whole 358 | Lambda = -2*( LH0 - LH1 ) 359 | 360 | if( no_zero_flag ) n_parameters = n_parameters - 1 ## no alpha parameter 361 | df_h0 = 1*n_parameters ##(theta_1 = theta_2 = theta_3) 362 | df_h1 = 3*n_parameters ##(theta_1, theta_2, theta_3) 363 | df = df_h1 - df_h0 364 | 365 | p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE) 366 | info = list( Lambda=Lambda, p=p, mean_diff=mean_diff ) 367 | return(info) 368 | } 369 | 370 | if( imputation ) ## zero values are imputed by the estimated distribution based on non random values 371 | { 372 | vec_list = list( tri_Aa=tri_Aa, tri_Ab=tri_Ab, A_ab=A_ab ) 373 | imputations = imputation_list( vec_list, imputation_num ) 374 | 375 | inner_as = apply(imputations$tri_Aa, 1, likelihood_fun) 376 | inner_bs = apply(imputations$tri_Ab, 1, likelihood_fun) 377 | inter_abs = apply(imputations$A_ab, 1, likelihood_fun) 378 | 379 | wholes = apply(do.call( cbind, imputations ), 1, likelihood_fun) 380 | LH1s = inner_as + inner_bs + inter_abs 381 | LH0s = wholes 382 | Lambdas = -2*( LH0s - LH1s ) 383 | Lambda = mean( Lambdas ) 384 | n_parameters = n_parameters - 1 ## the mixture parameter is not taken into account 385 | 386 | # cat(Lambda, '\n') 387 | df_h0 = 1*n_parameters 388 | df_h1 = 3*n_parameters 389 | df = df_h1 - df_h0 390 | 391 | p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE) 392 | info = list( Lambda=Lambda, p=p, mean_diff=mean_diff ) 393 | return(info) 394 | 395 | # if( imputation ) ## zero values are imputed by the estimated distribution based on non random values 396 | # { 397 | # inner_a = likelihood_lnorm( tri_Aa[tri_Aa!=0] )/length( tri_Aa[tri_Aa!=0] )*length( tri_Aa ) 398 | # inner_b = likelihood_lnorm( tri_Ab[tri_Ab!=0] )/length( tri_Ab[tri_Ab!=0] )*length( tri_Ab ) 399 | # inter_ab = likelihood_lnorm( A_ab )/length( A_ab[A_ab!=0] )*length( A_ab ) 400 | # whole = likelihood_lnorm( tri_Awhole[tri_Awhole!=0] )/length( tri_Awhole[tri_Awhole!=0] )*length( tri_Awhole ) 401 | # n_parameters = n_parameters - 1 ## the mixture parameter of 0 is not taken into account 402 | # } 403 | 404 | } 405 | } 406 | 407 | imputation_list <- function( vec_list, imputation_num ) 408 | { 409 | imputations = lapply( vec_list, function(v) 410 | { 411 | if(sum(v!=0)==1) {final_vec = matrix( v[v!=0], imputation_num, length(v) ); return(final_vec)} 412 | 413 | fit = fit_lnorm(v) 414 | set.seed(1) 415 | 416 | ## THERE WILL BE ERROR IF IS.NA SDLOG 417 | if(!is.na(fit['sdlog'])) imputation_vec = matrix(rlnorm( sum(v==0)*imputation_num, fit['meanlog'], fit['sdlog'] ), nrow=imputation_num) 418 | if(is.na(fit['sdlog'])) stop("In function imputation_list, sdlog=NA encountered") 419 | 420 | ori_vec = t(replicate(imputation_num, v[v!=0])) 421 | final_vec = cbind( ori_vec, imputation_vec ) 422 | } ) 423 | return( imputations ) 424 | } 425 | 426 | 427 | fit_lnorm <- function(vec) 428 | { 429 | vec = vec[vec!=0] 430 | fit = MASS::fitdistr(vec, 'lognormal')$estimate 431 | return(fit) 432 | } 433 | 434 | 435 | # LikelihoodRatioTest <- function(res_info, ncores, remove_zero=TRUE, A_already_corrected=FALSE, distr='lnorm', n_parameters=3, imputation_num=1E2) 436 | # { 437 | # require( doParallel ) 438 | # pA_sym = res_info$pA_sym 439 | # if(A_already_corrected==TRUE) cpA_sym = pA_sym 440 | # if(A_already_corrected==FALSE) cpA_sym = correct_A_fast_divide_by_mean(pA_sym, remove_zero=remove_zero) ## corrected pA_sym 441 | # # registerDoParallel(cores=ncores) 442 | # trees = foreach( j =1:length( res_info$res_inner ) ) %do% 443 | # { 444 | # name_index = rownames(res_info$res_inner[[j]]$A) 445 | # res_info$res_inner[[j]]$cA = cpA_sym[name_index, name_index] ## the corrected A. Use this matrix is essential for reducing the distance-based false positves 446 | # tmp = get_tree_decoration( res_info$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num ) 447 | # tmp 448 | # } 449 | # return(trees) 450 | # } 451 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | matrix_multiplication_cpp <- function(A, B) { 5 | .Call(`_CALDER_matrix_multiplication_cpp`, A, B) 6 | } 7 | 8 | matrix_multiplication_sym_cpp <- function(A) { 9 | .Call(`_CALDER_matrix_multiplication_sym_cpp`, A) 10 | } 11 | 12 | loglik_lnorm_cpp <- function(sum_ln1, sum_ln2, p, q) { 13 | .Call(`_CALDER_loglik_lnorm_cpp`, sum_ln1, sum_ln2, p, q) 14 | } 15 | 16 | loglik_lnorm_cpp_vec <- function(vec_values) { 17 | .Call(`_CALDER_loglik_lnorm_cpp_vec`, vec_values) 18 | } 19 | 20 | get_A_len <- function(A) { 21 | .Call(`_CALDER_get_A_len`, A) 22 | } 23 | 24 | get_A_ln1 <- function(A) { 25 | .Call(`_CALDER_get_A_ln1`, A) 26 | } 27 | 28 | get_A_ln2 <- function(A) { 29 | .Call(`_CALDER_get_A_ln2`, A) 30 | } 31 | 32 | loglik_lnorm_cpp_mat <- function(sum_ln1, sum_ln2, ps, qs) { 33 | .Call(`_CALDER_loglik_lnorm_cpp_mat`, sum_ln1, sum_ln2, ps, qs) 34 | } 35 | 36 | zigzag_loglik_ancestors_v4_5 <- function(A, k, min_n_bins = 2L) { 37 | .Call(`_CALDER_zigzag_loglik_ancestors_v4_5`, A, k, min_n_bins) 38 | } 39 | 40 | compute_L <- function(A, L, k) { 41 | .Call(`_CALDER_compute_L`, A, L, k) 42 | } 43 | 44 | -------------------------------------------------------------------------------- /R/bisecting_kmeans.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## k-means with replicatable seeds 4 | my_kmeans = function(iter.max=1E3, nstart=50, ...) 5 | { 6 | set.seed(1) 7 | res = kmeans(iter.max=iter.max, nstart=nstart, ...) 8 | return(res) 9 | } 10 | 11 | ## This function tries to adjust the height of each split, in order to generate a valid hclust object and with balanced compartments A.1 A.2 B.1 B.2 12 | ## Clusters with more nodes will get bigger height in case of same height 13 | adjust_hs <- function(l_r_h) 14 | { 15 | hs = sapply(l_r_h, function(v) v$h) 16 | all_names = sapply(l_r_h, function(v) paste0(collapse='_', sort(c(v$l, v$r)))) 17 | r_names = sapply(l_r_h, function(v) paste0(collapse='_', sort(c(v$r)))) 18 | 19 | sizes = sapply(l_r_h, function(v) length(v$l) + length(v$r)) ## 20 | 21 | ################ This part deals with duplicated heights 22 | hs = hs + sizes*1E-7 23 | ################ This part tries to make the top-level left and right branch to have similar height, such that to make balanced A.1, A.2, B.1, B.2 compartments 24 | ## Find the index of second branch, whose number of nodes is n_total - n_left: sizes[1] - sizes[2] 25 | l_b = 2 ## left sub-branch 26 | # r_b = which(sizes==(sizes[1] - sizes[2]))[1] ## right sub-branch 27 | r_b = which(r_names[1]==all_names) ## right sub-branch 28 | 29 | l_h = hs[l_b] 30 | r_h = hs[r_b] 31 | max_h = max(l_h, r_h) ## the maximum height of the two branches 32 | hs_new = mean(sort(hs, decreasing=TRUE)[2:3]) ## hs_new is the 3rd largest height 33 | hs[l_b] = ifelse(l_h > r_h, max_h, hs_new) 34 | hs[r_b] = ifelse(r_h > l_h, max_h, hs_new) 35 | 36 | if(any(duplicated(hs))) stop('ERROR: DUPLICATED HEIGHTS exist in bisecting_kmeans') 37 | return( hs ) 38 | } 39 | 40 | bisecting_kmeans <- function(data) 41 | { 42 | dist_mat = as.matrix(stats::dist(data)) 43 | indices = 1:nrow(data) 44 | l_r_h <<- list() 45 | 46 | get_h <- function(l_indices, r_indices) 47 | { 48 | combined_indices = c(l_indices, r_indices) 49 | idx <- as.matrix(expand.grid(combined_indices, combined_indices)) 50 | max(dist_mat[idx]) ## diameter 51 | } 52 | 53 | get_sub_tree <- function( indices ) 54 | { 55 | n_nodes = length(indices) 56 | 57 | if(n_nodes==1) ## if only two nodes 58 | { 59 | h = NULL 60 | # tree = list(h=h, leaf=indices) 61 | return() 62 | } 63 | 64 | ############# if more than two nodes 65 | if(n_nodes==2) cluster=c(1,2) else cluster = my_kmeans(x=data[indices, ], centers=2)$cluster 66 | l_indices = indices[cluster==1] 67 | r_indices = indices[cluster==2] 68 | h = get_h(l_indices, r_indices) 69 | l_r_h <<- c(l_r_h, list(list(l=l_indices, r=r_indices, h=h))) 70 | 71 | # cat(h, '\n') 72 | l_branch = get_sub_tree( l_indices ) 73 | r_branch = get_sub_tree( r_indices ) 74 | # tree = list(h=h, l_branch=l_branch, r_branch=r_branch, l_indices=l_indices, r_indices=r_indices) 75 | # return(tree) 76 | } 77 | 78 | get_sub_tree(indices) 79 | 80 | hs = adjust_hs(l_r_h) 81 | 82 | r_hs = rank(hs) 83 | for( i in 1:length(l_r_h) ) {name=r_hs[i]; names(name)=paste0(collapse='_', sort(c(l_r_h[[i]]$l, l_r_h[[i]]$r))); l_r_h[[i]]$name=name} 84 | pos_names = sapply(l_r_h, function(v) v$name) 85 | neg_names = -(1:length(indices)); names(neg_names) = 1:length(indices); all_names = c(pos_names, neg_names) 86 | for( i in 1:length(l_r_h) ) {l_r_h[[i]]$l_name=unname(all_names[paste0(l_r_h[[i]]$l, collapse='_')]); l_r_h[[i]]$r_name=unname(all_names[paste0(l_r_h[[i]]$r, collapse='_')]) } 87 | 88 | merge_height = data.frame(l=sapply(l_r_h, function(v) v$l_name), r=sapply(l_r_h, function(v) v$r_name), h=hs) 89 | merge_height = merge_height[order(merge_height$h), ] 90 | rownames(merge_height) = NULL 91 | 92 | data_tmp = cbind(c(0,0,1,1), c(0,1,1,0)) 93 | hc = hclust(stats::dist(data_tmp), "com") 94 | hc$merge = as.matrix(unname(merge_height[,1:2])) 95 | hc$height = merge_height$h 96 | # hc$order = unname(unlist(res, recursive=TRUE)[grepl('leaf', names(unlist(res, recursive=TRUE)))]) 97 | # hc$order = 1:length(indices) 98 | hc$labels = 1:length(indices) 99 | den <- as.dendrogram(hc) 100 | hc_r <- as.hclust(reorder(den, 1:length(indices))) 101 | hc_r$method = "complete" 102 | hc_r$dist.method = "euclidean" 103 | l_r_h <<- list() 104 | rm(l_r_h) 105 | return(hc_r) 106 | } 107 | 108 | -------------------------------------------------------------------------------- /R/call_domains.R: -------------------------------------------------------------------------------- 1 | ## Yuanlong LIU 2 | ## Scripts used to call domains having high inter-domain score while low intra-domain score. Adapted from TopDom with the following authorship 3 | 4 | # @author : Hanjun Shin(shanjun@usc.edu) 5 | # @credit : Harris Lazaris(Ph.D Stduent, NYU), Dr. Gangqing Hu(Staff Scientist, NIH) 6 | # @brief : TopDom.R is a software package to identify topological domains for given Hi-C contact matrix. 7 | # @version 0.0.2 8 | 9 | # @fn TopDom 10 | # @param matrixFile : string, matrixFile Address, 11 | # - Format = {chromosome, bin start, bin end, N numbers normalized value } 12 | # - N * (N + 3), where N is the number of bins 13 | # @param window.size :integer, number of bins to extend. 14 | # @param out_binSignal : string, binSignal file address to write 15 | # @param out_ext : string, ext file address to write 16 | 17 | 18 | 19 | remove_small_domains = function(domain_size_min, binSignal, domains) ## remove all domains smaller than 40E3, ADDED by Yuanlong, 2019-July-09 20 | { 21 | # binSignal = res_input_mat$binSignal 22 | # domains = domains_bp 23 | # domains = domains[, c('from.id', 'to.id', 'size')] 24 | while(1) 25 | { 26 | i = min(setdiff(which(domains$size < domain_size_min/10E3), 1)) ## do not include the first one 27 | if(i==Inf) break 28 | ids = domains[(i-1):i, 'to.id'] 29 | index2rm = ((i-1):i)[which.max(binSignal[ids, 'mean.cf'])] 30 | domains = domains[-index2rm, ] 31 | domains[-1, 'from.id'] = domains[-nrow(domains), 'to.id'] + 1 32 | domains$size = domains[, 'to.id'] - domains[, 'from.id'] + 1 33 | cat(nrow(domains), '\n') 34 | if(nrow(domains) < 10) break ## avoid exceptions 35 | } 36 | # domains[, 'chr'] = domains_bp[1, 'chr'] 37 | domains[, 'from.coord'] = domains[, 'from.id'] - 1 38 | domains[, 'to.coord'] = domains[, 'to.id'] 39 | rownames(domains) = 1:nrow(domains) 40 | # print(head(domains)) 41 | return(domains) 42 | } 43 | 44 | ## modified input file format and added several potentially useful parameter options, Yuanlong LIU 45 | TopDom_v2 <- function( A_extended, window.size, outFile=NULL, statFilter=T, p_thresh=0.05, return_peak_and_binsignal=FALSE, window.sizes=NULL, stat_window_size=NULL, domain_size_min=NULL) 46 | { 47 | if(!is.null(window.sizes)) window.size = window.sizes[1] 48 | 49 | # print("#########################################################################") 50 | # print("Step 0 : File Read ") 51 | # print("#########################################################################") 52 | window.size = as.numeric(window.size) 53 | matdf = A_extended 54 | 55 | if( ncol(matdf) - nrow(matdf) == 3) { 56 | colnames(matdf) <- c("chr", "from.coord", "to.coord") 57 | } else if( ncol(matdf) - nrow(matdf) ==4 ) { 58 | colnames(matdf) <- c("id", "chr", "from.coord", "to.coord") 59 | } else { 60 | print("Unknwon Type of matrix file") 61 | return(0) 62 | } 63 | n_bins = 1*nrow(matdf) 64 | mean.cf <- rep(0, n_bins) 65 | pvalue <- rep(1, n_bins) 66 | 67 | local.ext = rep(-0.5, n_bins) 68 | 69 | bins <- data.frame(id=1:n_bins, 70 | chr=matdf[, "chr"], 71 | from.coord=matdf[, "from.coord"], 72 | to.coord=matdf[, "to.coord"] ) 73 | 74 | matrix.data <- as.matrix( matdf[, (ncol(matdf) - nrow(matdf)+1 ):ncol(matdf)] ) 75 | 76 | 77 | ptm <- proc.time() 78 | 79 | ## Only compute for one track of signal 80 | if(is.null(window.sizes)){ 81 | for(i in 1:n_bins) 82 | { 83 | diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size) 84 | mean.cf[i] = mean(diamond) 85 | } 86 | } 87 | 88 | 89 | if(!is.null(window.sizes)) ## compute multiple tracks of signal 90 | { 91 | mean.cfs = matrix(, n_bins, length(window.sizes)) 92 | for(k in 1:length(window.sizes) ) 93 | { 94 | window.size.iter = window.sizes[k] 95 | for(i in 1:n_bins) 96 | { 97 | diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size.iter) 98 | mean.cfs[i, k] = mean(diamond) 99 | } 100 | } 101 | mean.cf = apply(mean.cfs, 1, mean) 102 | } 103 | 104 | 105 | eltm = proc.time() - ptm 106 | # print(paste("Step 1 Running Time : ", eltm[3])) 107 | # print("Step 1 : Done !!") 108 | 109 | # print("#########################################################################") 110 | # print("Step 2 : Detect TD boundaries based on binSignals") 111 | # print("#########################################################################") 112 | 113 | ptm = proc.time() 114 | #gap.idx = Which.Gap.Region(matrix.data=matrix.data) 115 | #gap.idx = Which.Gap.Region2(mean.cf) 116 | gap.idx = Which.Gap.Region2(matrix.data=matrix.data, window.size) 117 | 118 | proc.regions = Which.process.region(rmv.idx=gap.idx, n_bins=n_bins, min.size=3) 119 | 120 | #print(proc.regions) 121 | 122 | for( i in 1:nrow(proc.regions)) 123 | { 124 | start = proc.regions[i, "start"] 125 | end = proc.regions[i, "end"] 126 | 127 | # print(paste("Process Regions from ", start, "to", end)) 128 | 129 | local.ext[start:end] = Detect.Local.Extreme(x=mean.cf[start:end]) 130 | } 131 | 132 | local.ext_bp = local.ext 133 | 134 | eltm = proc.time() - ptm 135 | # print(paste("Step 2 Running Time : ", eltm[3])) 136 | # print("Step 2 : Done !!") 137 | 138 | if(statFilter) 139 | { 140 | # print("#########################################################################") 141 | # print("Step 3 : Statistical Filtering of false positive TD boundaries") 142 | # print("#########################################################################") 143 | 144 | if(is.null(stat_window_size)) stat_window_size=window.size 145 | ptm = proc.time() 146 | # print("-- Matrix Scaling....") 147 | scale.matrix.data = matrix.data 148 | 149 | ## This is to normalize each off diag values -- 2018-09-20 150 | # for( i in 1:(2*window.size) ) 151 | for( i in 1:(2*stat_window_size) ) 152 | { 153 | #diag(scale.matrix.data[, i:n_bins] ) = scale( diag( matrix.data[, i:n_bins] ) ) 154 | scale.matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] = scale( matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] ) 155 | } 156 | 157 | # print("-- Compute p-values by Wilcox Ranksum Test") 158 | for( i in 1:nrow(proc.regions)) 159 | { 160 | start = proc.regions[i, "start"] 161 | end = proc.regions[i, "end"] 162 | 163 | # print(paste("Process Regions from ", start, "to", end)) 164 | # pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=window.size, scale=1) 165 | 166 | pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=stat_window_size, scale=1) 167 | } 168 | # print("-- Done!") 169 | 170 | # print("-- Filtering False Positives") 171 | local.ext[intersect( union(which( local.ext==-1), which(local.ext==-1)), which(pvalue= n_proc.set ) { 280 | proc.regions = rbind(proc.regions, c(start=start, end=proc.set[j-1]) ) 281 | break 282 | } 283 | } 284 | 285 | colnames(proc.regions) = c("start", "end") 286 | proc.regions <- proc.regions[ which( abs(proc.regions[,"end"] - proc.regions[, "start"]) >= min.size ), ] 287 | 288 | return(proc.regions) 289 | } 290 | 291 | # @fn Which.Gap.Region 292 | # @breif version 0.0.1 used 293 | # @param matrix.data : n by n matrix 294 | # @return gap index 295 | Which.Gap.Region <- function(matrix.data) 296 | { 297 | n_bins = nrow(matrix.data) 298 | gap = rep(0, n_bins) 299 | 300 | i=1 301 | while(i < n_bins) 302 | { 303 | j = i + 1 304 | while( j <= n_bins) 305 | { 306 | if( sum( matrix.data[i:j, i:j]) == 0 ) { 307 | gap[i:j] = -0.5 308 | j = j+1 309 | #if(j-i > 1) gap[i:j]=-0.5 310 | #j=j+1 311 | } else break 312 | } 313 | 314 | i = j 315 | } 316 | 317 | idx = which(gap == -0.5) 318 | return(idx) 319 | } 320 | 321 | # @fn Which.Gap.Region3 322 | # @param matrix.data : n by n matrix 323 | # @return gap index 324 | Which.Gap.Region3 <- function(mean.cf) 325 | { 326 | n_bins = length(mean.cf) 327 | gapidx = which(mean.cf==0) 328 | 329 | return(gapidx) 330 | } 331 | 332 | # @fn Which.Gap.Region2 333 | # @breif version 0.0.2 used 334 | # @param matrix.data : n by n matrix 335 | # @return gap index 336 | Which.Gap.Region2 <- function(matrix.data, w) 337 | { 338 | n_bins = nrow(matrix.data) 339 | gap = rep(0, n_bins) 340 | 341 | for(i in 1:n_bins) 342 | { 343 | if( sum( matrix.data[i, max(1, i-w):min(i+w, n_bins)] ) == 0 ) gap[i]=-0.5 344 | } 345 | 346 | idx = which(gap == -0.5) 347 | return(idx) 348 | } 349 | 350 | # @fn Detect.Local.Extreme 351 | # @param x : original signal to find local minima 352 | # @return vector of local extrme, -1 if the index is local minimum, 1 if the index is local maxima, 0 otherwise. 353 | Detect.Local.Extreme <- function(x) 354 | { 355 | n_bins = length(x) 356 | ret = rep(0, n_bins) 357 | x[is.na(x)]=0 358 | 359 | if(n_bins <= 3) 360 | { 361 | ret[which.min(x)]=-1 362 | ret[which.max(x)]=1 363 | 364 | return(ret) 365 | } 366 | # Norm##################################################3 367 | new.point = Data.Norm(x=1:n_bins, y=x) 368 | x=new.point$y 369 | ################################################## 370 | cp = Change.Point(x=1:n_bins, y=x) 371 | 372 | if( length(cp$cp) <= 2 ) return(ret) 373 | if( length(cp$cp) == n_bins) return(ret) 374 | for(i in 2:(length(cp$cp)-1)) 375 | { 376 | if( x[cp$cp[i]] >= x[cp$cp[i]-1] && x[cp$cp[i]] >= x[cp$cp[i]+1] ) ret[cp$cp[i]] = 1 377 | else if(x[cp$cp[i]] < x[cp$cp[i]-1] && x[cp$cp[i]] < x[cp$cp[i]+1]) ret[cp$cp[i]] = -1 378 | 379 | min.val = min( x[ cp$cp[i-1] ], x[ cp$cp[i] ] ) 380 | max.val = max( x[ cp$cp[i-1] ], x[ cp$cp[i] ] ) 381 | 382 | if( min( x[cp$cp[i-1]:cp$cp[i]] ) < min.val ) ret[ cp$cp[i-1] - 1 + which.min( x[cp$cp[i-1]:cp$cp[i]] ) ] = -1 383 | if( max( x[cp$cp[i-1]:cp$cp[i]] ) > max.val ) ret[ cp$cp[i-1] - 1 + which.max( x[cp$cp[i-1]:cp$cp[i]] ) ] = 1 384 | } 385 | 386 | return(ret) 387 | } 388 | 389 | # @fn Data.Norm 390 | # @param x : x axis vector 391 | # @param x : y axis vector 392 | # @return list of normalized x and y 393 | Data.Norm <- function(x, y) 394 | { 395 | ret.x = rep(0, length(x)) 396 | ret.y = rep(0, length(y)) 397 | 398 | ret.x[1] = x[1] 399 | ret.y[1] = y[1] 400 | 401 | diff.x = diff(x) 402 | diff.y = diff(y) 403 | 404 | scale.x = 1 / mean( abs(diff(x) ) ) 405 | scale.y = 1 / mean( abs( diff(y) ) ) 406 | 407 | #print(scale.x) 408 | #print(scale.y) 409 | 410 | for(i in 2:length(x)) 411 | { 412 | ret.x[i] = ret.x[i-1] + (diff.x[i-1]*scale.x) 413 | ret.y[i] = ret.y[i-1] + (diff.y[i-1]*scale.y) 414 | } 415 | 416 | return(list(x=ret.x, y=ret.y)) 417 | } 418 | 419 | # @fn Change.Point 420 | # @param x : x axis vector 421 | # @param x : y axis vector 422 | # @return change point index in x vector, 423 | # Note that the first and the last point will be always change point 424 | Change.Point <- function( x, y ) 425 | { 426 | if( length(x) != length(y)) 427 | { 428 | print("ERROR : The length of x and y should be the same") 429 | return(0) 430 | } 431 | 432 | n_bins <- length(x) 433 | Fv <- rep(NA, n_bins) 434 | Ev <- rep(NA, n_bins) 435 | cp <- 1 436 | 437 | i=1 438 | Fv[1]=0 439 | while( i < n_bins ) 440 | { 441 | j=i+1 442 | Fv[j] = sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 ) 443 | 444 | while(j0) 581 | { 582 | from.coord = bins[proc.region[, "start"]+1, "from.coord"] 583 | boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F) 584 | ret = rbind(ret, boundary) 585 | } 586 | 587 | ret = rbind(gap, domain) 588 | ret = ret[order(ret[,3]), ] 589 | 590 | ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"]) 591 | ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] ) 592 | ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"]) 593 | ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"] ## HERE THE SIZE IS COMPUTED / Yuanlong LIU 594 | 595 | if(!is.null(pvalues) && !is.null(pvalue.cut)) 596 | { 597 | for(i in 1:nrow(ret)) 598 | { 599 | if(ret[i, "tag"]=="domain") 600 | { 601 | domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"] 602 | p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut ) 603 | 604 | if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary" 605 | } 606 | } 607 | } 608 | 609 | new.bdr.set = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0)) 610 | stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0)) 611 | 612 | i=1 613 | while(i <= nrow(ret)) 614 | { 615 | if( ret[i, "tag"] == "boundary" ) 616 | { 617 | stack.bdr = rbind(stack.bdr, ret[i, ]) 618 | } else if(nrow(stack.bdr)>0) { 619 | new.bdr = data.frame(chr=bins[1, "chr"], 620 | from.id = min( stack.bdr[, "from.id"]), 621 | from.coord=min(stack.bdr[, "from.coord"]), 622 | to.id = max( stack.bdr[, "to.id"]), 623 | to.coord=max(stack.bdr[, "to.coord"]), 624 | tag="boundary", 625 | size=max(stack.bdr[, "to.coord"]) - min(stack.bdr[, "from.coord"])) 626 | new.bdr.set = rbind(new.bdr.set, new.bdr) 627 | stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0)) 628 | } 629 | 630 | i = i + 1 631 | } 632 | 633 | ret = rbind( ret[ ret[, "tag"]!="boundary", ], new.bdr.set ) 634 | ret = ret[order(ret[, "to.coord"]), ] 635 | 636 | return(ret) 637 | } 638 | 639 | 640 | # @fn Convert.Bin.To.Domain 641 | # @param bins : bin information 642 | # @param signal.idx : signal index 643 | # @param signal.idx : gap index 644 | # @param pvalues : pvalue vector 645 | # @param pvalue.cut : pvalue threshold 646 | # @return dataframe storing domain information 647 | Convert.Bin.To.Domain.TMP <- function(bins, signal.idx, gap.idx, pvalues=NULL, pvalue.cut=NULL) 648 | { 649 | n_bins = nrow(bins) 650 | ret = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0)) 651 | levels( x=ret[, "tag"] ) = c("domain", "gap", "boundary") 652 | 653 | rmv.idx = setdiff(1:n_bins, gap.idx) 654 | proc.region = Which.process.region(rmv.idx, n_bins, min.size=0) 655 | from.coord = bins[proc.region[, "start"], "from.coord"] 656 | n_procs = nrow(proc.region) 657 | gap = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("gap", n_procs), size=rep(0, n_procs), stringsAsFactors=F) 658 | 659 | rmv.idx = union(signal.idx, gap.idx) 660 | proc.region = Which.process.region(rmv.idx, n_bins, min.size=0) 661 | n_procs = nrow(proc.region) 662 | from.coord = bins[proc.region[, "start"], "from.coord"] 663 | domain = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("domain", n_procs), size=rep(0, n_procs), stringsAsFactors=F) 664 | 665 | rmv.idx = setdiff(1:n_bins, signal.idx) 666 | proc.region = as.data.frame( Which.process.region(rmv.idx, n_bins, min.size=1) ) 667 | n_procs = nrow(proc.region) 668 | if(n_procs>0) 669 | { 670 | from.coord = bins[proc.region[, "start"]+1, "from.coord"] 671 | boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F) 672 | ret = rbind(ret, boundary) 673 | } 674 | 675 | ret = rbind(gap, domain) 676 | ret = ret[order(ret[,3]), ] 677 | 678 | ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"]) 679 | ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] ) 680 | ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"]) 681 | ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"] 682 | 683 | if(!is.null(pvalues) && !is.null(pvalue.cut)) 684 | { 685 | for(i in 1:nrow(ret)) 686 | { 687 | if(ret[i, "tag"]=="domain") 688 | { 689 | domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"] 690 | p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut ) 691 | 692 | if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary" 693 | } 694 | } 695 | } 696 | 697 | return(ret) 698 | } -------------------------------------------------------------------------------- /R/compartment_PCA.R: -------------------------------------------------------------------------------- 1 | 2 | get_PCs = function( mat, which ) ## fast way to compute PCs 3 | { 4 | PC_mat = crossprod(mat) 5 | res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=max(which), which = "LM" ) 6 | if(any(res_eigs_sym$values <0)) stop('Non-positive eigenvalues for A^2') 7 | PCs = mat%*%(res_eigs_sym$vectors) 8 | return( PCs[, which] ) 9 | } 10 | 11 | 12 | PC_compartment <- function(A_oe) ## compute PC and define A/B compartment. Not used currently 13 | { 14 | cA_oe = fast_cor(A_oe) 15 | PC_mat = crossprod(cA_oe) 16 | res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=2, which = "LM" ) 17 | PC1 = cA_oe%*%res_eigs_sym$vectors[,1] 18 | PC2 = cA_oe%*%res_eigs_sym$vectors[,2] 19 | 20 | borders = which(diff(1*(PC1 > 0))!=0) 21 | to_id = as.numeric(rownames(A_oe)[borders]) 22 | from_id = as.numeric(rownames(A_oe)[c(1, head(borders, length(borders)-1)+1)]) 23 | 24 | start_poses = (from_id-1)*bin_size + 1 25 | end_poses = to_id*bin_size 26 | 27 | compartment_AB = data.frame(chr=paste0('chr', chr), start_poses=start_poses, end_poses=end_poses, A_or_B=NA, zero=0, dot='.', start_poses_2=start_poses, end_poses_2=end_poses, col=NA) 28 | compartment_AB[which((1:nrow(compartment_AB))%%2==0), 'A_or_B'] = 'A' 29 | compartment_AB[which((1:nrow(compartment_AB))%%2==1), 'A_or_B'] = 'B' 30 | 31 | compartment_AB[which((1:nrow(compartment_AB))%%2==0), 'col'] = '112,128,144' 32 | compartment_AB[which((1:nrow(compartment_AB))%%2==1), 'col'] = '255,255,0' 33 | 34 | compartments_bed_files = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB_2', '.bed') 35 | write.table( compartment_AB, file=compartments_bed_files, quote=FALSE, row.names=FALSE, col.names=FALSE, sep=' ' ) 36 | } 37 | 38 | PC_compartment_slim <- function(A_oe, downsratio=NULL) ## compute and save the PC values only. Not used currently 39 | { 40 | cA_oe = fast_cor(A_oe) 41 | PC_mat = crossprod(cA_oe) 42 | res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=2, which = "LM" ) 43 | PC1 = cA_oe%*%res_eigs_sym$vectors[,1] 44 | PC2 = cA_oe%*%res_eigs_sym$vectors[,2] 45 | 46 | compartment_AB = data.frame(PC1=PC1, bin_names=rownames(A_oe)) 47 | 48 | if( is.null(downsratio) ) compartments_AB_file = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB.Rdata') 49 | if( !is.null(downsratio) ) compartments_AB_file = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB_downsratio_', downsratio, '.Rdata') 50 | 51 | save( compartment_AB, file=compartments_AB_file ) 52 | } 53 | 54 | 55 | -------------------------------------------------------------------------------- /R/compartment_data_generation_fun.R: -------------------------------------------------------------------------------- 1 | 2 | compress_mat_fast_tmp = function(input_mat, compress_size) 3 | { 4 | mat_block <- function(n, r) suppressWarnings( matrix(c(rep(1, r), rep(0, n)), n, n/r) ) 5 | n = ncol(input_mat) 6 | mat2prod = mat_block(n, compress_size) 7 | # return(t(input_mat%*%mat2prod / compress_size)) 8 | return(t(matrix_multiplication_cpp(input_mat, mat2prod))) 9 | } 10 | 11 | mat_10to40kb = function(mat2compress, bin_size, bin_size_initial) ## 10kb to 40 kb 12 | { 13 | compress_size = bin_size / bin_size_initial 14 | len = nrow(mat2compress) - nrow(mat2compress)%%compress_size 15 | mat2compress = mat2compress[1:len, 1:len] 16 | c_mat_compressed = compress_mat_fast_tmp( as.matrix(mat2compress), compress_size=compress_size ) 17 | mat_compressed = compress_mat_fast_tmp( c_mat_compressed, compress_size=compress_size ) 18 | rownames(mat_compressed) = colnames(mat_compressed) = as.character( 1:nrow(mat_compressed) ) 19 | return(mat_compressed) 20 | } 21 | 22 | # if(!file.exists(contact_mat_file)) contact_mat_file = paste0('/mnt/ndata/Yuanlong/2.Results/1.Juicer/', CELL_LINE, '/contact_mat/mat_chr', chr, '_', bin_size_initial_kb, 'kb_ob.txt.gz') 23 | 24 | ## bin_size_initial is the binsize of your input matrix, can be different from the bin_size of your planned analysis 25 | contact_mat_processing = function(contact_mat_file, bin_size, bin_size_initial=bin_size) 26 | { 27 | 28 | compress_size = ifelse(bin_size < 40E3, 1, 1) 29 | zero_ratio = 0.01 30 | 31 | combined_xk_oe_raw = data.table::fread(contact_mat_file) 32 | 33 | ## this code generates the compartment domains 34 | 35 | colnames(combined_xk_oe_raw) = c('pos_1', 'pos_2', 'val') 36 | combined_xk_oe_raw = subset(combined_xk_oe_raw, !is.na(val)) 37 | combined_xk_oe_raw[,1] = combined_xk_oe_raw[,1]/bin_size_initial 38 | combined_xk_oe_raw[,2] = combined_xk_oe_raw[,2]/bin_size_initial 39 | combined_xk_oe = combined_xk_oe_raw 40 | 41 | if(!all(combined_xk_oe[[2]] >= combined_xk_oe[[1]])) stop('\nYou provided matrix does not represent an upper triangular matrix!\n\n') 42 | 43 | oe_size = max(max(combined_xk_oe[[1]]), max(combined_xk_oe[[2]])) + 1 ## should +1 because combined_xk_oe index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19) 44 | mat_oe_sparse = Matrix::Matrix(0, nrow=oe_size, ncol=oe_size) 45 | mat_oe_sparse[cbind(combined_xk_oe[[1]]+1, combined_xk_oe[[2]]+1)] <- combined_xk_oe[[3]] 46 | 47 | rownames(mat_oe_sparse) = colnames(mat_oe_sparse) = as.character( 1:nrow(mat_oe_sparse) ) 48 | 49 | mat_oe_sparse = Matrix::forceSymmetric(mat_oe_sparse, uplo='U') 50 | if(bin_size!=bin_size_initial) mat_oe_sparse = mat_10to40kb( mat_oe_sparse, bin_size, bin_size_initial ) 51 | A_oe = remove_blank_cols(mat_oe_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A 52 | if(nrow(A_oe) < 100) A_oe = remove_blank_cols(mat_oe_sparse, sparse=TRUE, ratio=0) ## when all are dense 53 | while(min(apply(A_oe, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0 54 | { 55 | A_oe = remove_blank_cols(A_oe, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A 56 | if(nrow(A_oe) < 1) stop('ERROR IN GENERATING MEANINGFUL A_oe at the data generating step') 57 | } 58 | 59 | ########################################################## 60 | 61 | len = nrow(A_oe) - nrow(A_oe)%%compress_size 62 | A_oe_2_compress = A_oe[, 1:len] 63 | 64 | bin_names = rownames(A_oe) 65 | 66 | A_oe_compressed = compress_mat_fast( as.matrix(A_oe_2_compress), compress_size=compress_size ) 67 | colnames(A_oe_compressed) = bin_names 68 | rm(A_oe_2_compress); gc() 69 | 70 | range(A_oe_compressed) 71 | # # sum(A_oe_compressed > 1000) 72 | # # A_oe_compressed[A_oe_compressed > 1000] = 1000 73 | A_oe_compressed_sparse = A_oe_compressed 74 | A_oe_compressed = as.matrix(A_oe_compressed) 75 | A_oe_compressed_log = log2(A_oe_compressed + 1) 76 | 77 | # ######################################################### 78 | # cat('compute correlation matrix ... ') 79 | 80 | cA_oe_compressed_log = fast_cor(A_oe_compressed_log) 81 | ccA_oe_compressed_log = fast_cor(cA_oe_compressed_log) 82 | 83 | # cat('compute correlation matrix done ... ') 84 | 85 | # ######################################################### 86 | # # ccA_oe_compressed_atanh = atanh(ccA_oe_compressed - 1E-7) 87 | ccA_oe_compressed_log_atanh = atanh(ccA_oe_compressed_log / (1+1E-7)) 88 | 89 | # rm(A_oe_compressed, A_oe_compressed_sparse, cA_oe_compressed_log) 90 | gc() 91 | # ######################################################### 92 | # cat('ready to compute compartment domains\n') 93 | 94 | out = list(A_oe=A_oe, atanh_score=ccA_oe_compressed_log_atanh) 95 | 96 | return(out) 97 | } 98 | 99 | -------------------------------------------------------------------------------- /R/general_functions.R: -------------------------------------------------------------------------------- 1 | get_ccA_atanh = function(contact_mat_file, compress_size, bin_size) 2 | { 3 | contact_mat_raw = data.table::fread(contact_mat_file) 4 | contact_mat_raw = subset(contact_mat_raw, !is.na(V3)) 5 | contact_mat_raw[,1] = contact_mat_raw[,1]/bin_size 6 | contact_mat_raw[,2] = contact_mat_raw[,2]/bin_size 7 | contact_mat = contact_mat_raw 8 | 9 | colnames(contact_mat) = c('pos_1', 'pos_2', 'val') 10 | if(!all(contact_mat[[2]] >= contact_mat[[1]])) stop('\nYour provided matrix does not represent an upper triangular matrix!\n\n') 11 | 12 | n_bins_whole = max(max(contact_mat[[1]]), max(contact_mat[[2]])) + 1 ## should +1 because contact_mat index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19) 13 | contact_mat_sparse = Matrix(0, nrow=n_bins_whole, ncol=n_bins_whole) 14 | contact_mat_sparse[cbind(contact_mat[[1]]+1, contact_mat[[2]]+1)] <- contact_mat[[3]] 15 | 16 | rownames(contact_mat_sparse) = colnames(contact_mat_sparse) = as.character( 1:nrow(contact_mat_sparse) ) 17 | 18 | contact_mat_sparse = Matrix::forceSymmetric(contact_mat_sparse, uplo='U') 19 | # if(bin_size!=bin_size_initial) contact_mat_sparse = mat_10to40kb( contact_mat_sparse, bin_size, bin_size_initial ) 20 | A = remove_blank_cols(contact_mat_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A 21 | if(nrow(A) < 100) A = remove_blank_cols(contact_mat_sparse, sparse=TRUE, ratio=0) ## when all are dense 22 | while(min(apply(A, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0 23 | { 24 | A = remove_blank_cols(A, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A 25 | if(nrow(A) < 1) stop('ERROR IN GENERATING MEANINGFUL A at the data generating step') 26 | } 27 | 28 | # ######################################################### 29 | 30 | n_bins_A = nrow(A) - nrow(A)%%compress_size 31 | A_2_compress = A[, 1:n_bins_A] 32 | 33 | bin_names = rownames(A) 34 | 35 | A_compressed = compress_mat_fast( as.matrix(A_2_compress), compress_size=compress_size ) 36 | colnames(A_compressed) = bin_names 37 | rm(A_2_compress); gc() 38 | 39 | A_compressed_sparse = A_compressed 40 | A_compressed = as.matrix(A_compressed) 41 | A_compressed_log = log2(A_compressed + 1) 42 | 43 | # ######################################################### 44 | cat('Compute correlation matrix\n') 45 | 46 | cA_compressed_log = fast_cor(A_compressed_log) 47 | ccA_compressed_log = fast_cor(cA_compressed_log) 48 | 49 | cat('Finished computing correlation matrix\n') 50 | 51 | # ######################################################### 52 | # # ccA_compressed_atanh = atanh(ccA_compressed - 1E-7) 53 | ccA_atanh = atanh(ccA_compressed_log / (1+1E-7)) 54 | A = as.matrix(A) 55 | rm(A_compressed, A_compressed_sparse, cA_compressed_log, A_compressed_log); gc() 56 | # ######################################################### 57 | cat('Finished data generation\n') 58 | return(list(A=A, ccA_atanh=ccA_atanh)) 59 | } 60 | 61 | 62 | 63 | get_detailed_full_tree <- function(res_info) 64 | { 65 | # attach(res_info) 66 | for( i in 1:nrow(res_info$segmentss) ) 67 | { 68 | index_left2adj = which(igraph::V(res_info$hi_tree)$left_rescaled == res_info$segmentss_nadj[i,1]) 69 | index_right2adj = which(igraph::V(res_info$hi_tree)$right_rescaled == res_info$segmentss_nadj[i,2]) 70 | igraph::V(res_info$hi_tree)[index_left2adj]$left_rescaled = res_info$segmentss[i,1] 71 | igraph::V(res_info$hi_tree)[index_right2adj]$right_rescaled = res_info$segmentss[i,2] 72 | igraph::V(res_info$hi_tree)$width_rescaled = igraph::V(res_info$hi_tree)$right_rescaled - igraph::V(res_info$hi_tree)$left_rescaled + 1 73 | igraph::V(res_info$hi_tree)$name = paste('(',igraph::V(res_info$hi_tree)$left_rescaled, ',', igraph::V(res_info$hi_tree)$right_rescaled, ')', sep='') 74 | } 75 | 76 | ## xenocfraf: 77 | # res_inner = res_inner[sapply(res_inner, length) > 0] 78 | ## xenocfraf: 79 | # res_inner = res_inner[sapply(res_inner, length) > 0] 80 | branches = lapply( res_info$res_inner, get_tree_v0 ) 81 | for( i in 1:length(branches) ) branches[[i]] = update_branch_name(branches[[i]], root_start=res_info$segmentss[i,1]) 82 | trunk = res_info$hi_tree 83 | 84 | full_tree = xenocraft( trunk, branches ) 85 | names_tmp = do.call(rbind, strsplit(igraph::V(full_tree)$name, ',')) 86 | igraph::V(full_tree)$left = substring(names_tmp[,1], 2) 87 | igraph::V(full_tree)$right = substring(names_tmp[,2], 1, nchar(names_tmp[,2])-1) 88 | if(!is_binary_tree(full_tree)) stop("Trunk + branches do not produce a binary tree") 89 | 90 | detailed_full_tree = tree_germination(full_tree) 91 | # detach(res_info) 92 | return(detailed_full_tree) 93 | } 94 | 95 | 96 | 97 | insulation_score_fun = function(A, size=3) 98 | { 99 | insulation_score_fun_helper = function(bin_start) 100 | { 101 | bin_mid = bin_start + size -1 102 | bin_end = bin_start + 2*size - 1 103 | up = sum(A[bin_start:bin_mid, bin_start:bin_mid]) 104 | down = sum(A[(bin_mid+1):bin_end, (bin_mid+1):bin_end]) 105 | inter = 2*sum(A[bin_start:bin_mid, (bin_mid+1):bin_end]) 106 | insulation = 1 - inter / (up + down) 107 | return(insulation) 108 | } 109 | 110 | bin_starts = 1:(nrow(A) - 2*size + 1) 111 | insulations = sapply(bin_starts, function(bin_start) insulation_score_fun_helper(bin_start)) 112 | names(insulations) = rownames(A)[size:(nrow(A) - size)] 113 | return(insulations) 114 | } 115 | 116 | 117 | 118 | ## generating the compartment_segs based on A and initial_clusters 119 | 120 | generate_compartment_segs <- function( initial_clusters ) 121 | { 122 | bins_seq = unname(unlist(initial_clusters)) 123 | if(!all(diff(bins_seq)==1)) stop('check generate_compartment_segs in generate_compartment_segs.R') 124 | if(bins_seq[1]!=1) stop('check generate_compartment_segs in generate_compartment_segs.R') 125 | compartment_segs = do.call(rbind, lapply(initial_clusters, function(v) v[c(1, length(v))])) 126 | compartment_segs = data.frame( start_pos=compartment_segs[,1], end_pos=compartment_segs[,2] ) 127 | return( compartment_segs ) 128 | } 129 | 130 | ## This function gets the nodes at level k 131 | ## Yuanlong LIU 132 | ## 02_07_2018 133 | 134 | get_level_k_nodes <- function(tree, k) 135 | { 136 | nodes = igraph::ego(tree, order=k, nodes=1, mode = "out", mindist = 1)[[1]] 137 | return(nodes) 138 | } 139 | 140 | 141 | 142 | ## This function gets the TADs at level k 143 | ## Yuanlong LIU 144 | ## 18_05_2018 145 | 146 | get_level_k_TADs <- function(branches, k) 147 | { 148 | tad_sizes_ind = lapply( branches, function(branch) 149 | { 150 | branch_level_k = igraph::induced.subgraph( branch, igraph::V(branch)$depth <= k ) 151 | tad_size_ind = get_leaves(branch_level_k, 'igraph')$width 152 | }) 153 | 154 | tad_sizes = unlist(tad_sizes_ind) 155 | end_pos = cumsum(tad_sizes) 156 | start_pos = c(1, 1 + end_pos[-length(end_pos)]) 157 | tads = data.frame(start_pos=start_pos, end_pos=end_pos) 158 | return( tads ) 159 | } 160 | 161 | 162 | 163 | 164 | get_least_residue_matrix <- function(pA_sym, max_nbins, allowed_shifts) 165 | { 166 | nbins = nrow( pA_sym ) 167 | max_nbins_bp = max_nbins 168 | # remainders = nbins %% (max_nbins + (-10:10)) 169 | remainders = nbins %% (max_nbins + allowed_shifts) 170 | 171 | max_nbins = (max_nbins + allowed_shifts)[ which.min(remainders) ] 172 | n2one = floor(nbins/max_nbins) 173 | remainder = nbins %% max_nbins 174 | if(max_nbins!=max_nbins_bp) warning('Value of max_nbins has been adjusted from ', max_nbins_bp, ' to ', max_nbins, '. This results in ', remainder, ' rows/columns excluded for the analysis.') 175 | 176 | if(remainder!=0) A_final = pA_sym[-(tail(1:nbins, remainder)), -(tail(1:nbins, remainder))] 177 | if(remainder==0) A_final = pA_sym 178 | res = list(A_final=A_final, n2one=n2one, max_nbins_new=max_nbins) 179 | } 180 | 181 | 182 | 183 | ## merge clusters based on their corr_mat 184 | ## Yuanlong LIU 185 | ## 01-07-2018 186 | 187 | sim.fun <- function( corr_mat, mod1, mod2, method='mean' ) 188 | { 189 | if(method=='mean') return(mean(corr_mat[mod1, mod2])) 190 | if(method=='median') return(median(corr_mat[mod1, mod2])) 191 | if(method=='max') return(max(corr_mat[mod1, mod2])) 192 | 193 | } 194 | 195 | 196 | ############################################################ 197 | 198 | ## this function removes all-zero columns / rows 199 | rm_zeros <- function(A) ## mat is a symmetric matrix 200 | { 201 | zero_indices = apply( A, 2, function(v) all(v==0) ) 202 | if( all(zero_indices==0) ) return(A) 203 | A = A[!zero_indices, !zero_indices] 204 | return(A) 205 | } 206 | 207 | my_dist <- function(m) {mtm <- Matrix::tcrossprod(m); sq <- rowSums(m*m); res = suppressWarnings(sqrt(outer(sq,sq,"+") - 2*mtm)); res[is.nan(res)]=0; diag(res)=0; return(res)} 208 | 209 | 210 | MoC = function(P, Q) 211 | { 212 | len_p = length(P) 213 | len_q = length(Q) 214 | if((len_p==len_q) & (len_p==1)) return(1) 215 | grids = expand.grid(1:len_p, 1:len_q) 216 | vecs = apply( grids, 1, function(x) {u=x[1]; v=x[2]; length(intersect( P[[u]], Q[[v]] ))^2/length(P[[u]])/length(Q[[v]])} ) 217 | res = 1/( sqrt(len_p*len_q) - 1)*(sum(vecs) - 1) 218 | return(res) 219 | } 220 | 221 | get_stair_vecs <- function(mat, from, to) 222 | { 223 | n = nrow(mat) 224 | solve <- function(mid) c(mat[1:mid,(mid+1):n]) 225 | lapply(from:to, solve) 226 | } 227 | 228 | correct_A_fast_divide_by_mean <- function(A, remove_zero=TRUE, divide_or_substract='divide', mean_or_median='mean') 229 | { 230 | f = function(i, d) d*(d-1)/2+1+(2*d+i)*(i-1)/2 231 | 232 | n = nrow(A) 233 | upper_tri_values = A[upper.tri(A, diag=TRUE)] 234 | means_mat = indices = A*0 235 | indices[upper.tri(indices, diag=TRUE)] = 1:((n+1)*n/2) 236 | 237 | diag_values = orders = rep(list(list()), n) 238 | 239 | for(d in seq_along(orders)) orders[[d]] = f(i=seq(1, n-d+1), d=d) 240 | for(d in seq_along(orders)) diag_values[[d]] = upper_tri_values[f(i=seq(1, n-d+1), d=d)] 241 | 242 | if(mean_or_median=='mean'){ 243 | if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } )) 244 | if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v); v=v*0+m; return(v) } )) 245 | } 246 | 247 | if(mean_or_median=='median'){ 248 | if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } )) 249 | if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v); v=v*0+m; return(v) } )) 250 | } 251 | 252 | means[means==0] = 1 253 | means_mat[upper.tri(means_mat, diag=TRUE)] = means[order(unlist(orders))] 254 | if(divide_or_substract=='divide') A_corrected = A / means_mat 255 | if(divide_or_substract=='substract') A_corrected = A - means_mat 256 | A_corrected = as.matrix( Matrix::forceSymmetric(A_corrected) ) 257 | rownames( A_corrected ) = colnames( A_corrected ) = rownames( A ) 258 | return( A_corrected ) 259 | } 260 | 261 | 262 | correct_A_fast_equal_mean <- function(A, remove_zero=TRUE, divide_or_substract='divide', mean_or_median='mean') 263 | { 264 | ## may also try log normal, since the off-diagnal values follow well a log-normal distribution 265 | ## commented by Yuanlong LIU, 2018-07-26 266 | f = function(i, d) d*(d-1)/2+1+(2*d+i)*(i-1)/2 267 | 268 | n = nrow(A) 269 | upper_tri_values = A[upper.tri(A, diag=TRUE)] 270 | means_mat = indices = A*0 271 | indices[upper.tri(indices, diag=TRUE)] = 1:((n+1)*n/2) 272 | 273 | diag_values = orders = rep(list(list()), n) 274 | 275 | for(d in seq_along(orders)) orders[[d]] = f(i=seq(1, n-d+1), d=d) 276 | for(d in seq_along(orders)) diag_values[[d]] = upper_tri_values[f(i=seq(1, n-d+1), d=d)] 277 | 278 | if(mean_or_median=='mean'){ 279 | if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } )) 280 | if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=mean(v); v=v*0+m; return(v) } )) 281 | } 282 | 283 | if(mean_or_median=='median'){ 284 | if(remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v[v!=0]); v=v*0+m; v[is.na(v)]=0; return(v) } )) 285 | if(!remove_zero) means = unlist(lapply( diag_values, function(v) {m=median(v); v=v*0+m; return(v) } )) 286 | } 287 | 288 | means[means==0] = 1 289 | means_mat[upper.tri(means_mat, diag=TRUE)] = means[order(unlist(orders))] 290 | if(divide_or_substract=='divide') A_corrected = A / means_mat 291 | if(divide_or_substract=='substract') A_corrected = A - means_mat 292 | A_corrected = as.matrix( Matrix::forceSymmetric(A_corrected) ) 293 | rownames( A_corrected ) = colnames( A_corrected ) = rownames( A ) 294 | return( A_corrected ) 295 | } 296 | 297 | creat_phylo_object <- function(tree) 298 | { 299 | creat_phylo_object_inner <- function(tree) 300 | { 301 | twins = igraph::ego(tree, order=1, node=1, mode='out', mindist=1)[[1]] 302 | branches = igraph::decompose(tree - 1) 303 | left_branch = branches[[1]] 304 | right_branch = branches[[2]] 305 | left_branch_flag = igraph::vcount( left_branch ) == 1 306 | right_branch_flag = igraph::vcount( right_branch ) == 1 307 | short_walk_dist = 1 308 | igraph::diameters = sapply(branches, igraph::diameter) 309 | long_walk_dist = abs(diff(igraph::diameters)) + 1 310 | 311 | if( left_branch_flag ) left_branch_newick = igraph::V(left_branch)$name 312 | if( !left_branch_flag ) left_branch_newick = creat_phylo_object_inner(left_branch) 313 | 314 | if( right_branch_flag ) right_branch_newick = igraph::V(right_branch)$name 315 | if( !right_branch_flag ) right_branch_newick = creat_phylo_object_inner(right_branch) 316 | 317 | if( igraph::diameters[1] > igraph::diameters[2] ) tree_newick = paste( '(', left_branch_newick, ':', short_walk_dist, ',', right_branch_newick, ':', long_walk_dist, ')', sep='' ) 318 | if( igraph::diameters[1] <= igraph::diameters[2] ) tree_newick = paste( '(', left_branch_newick, ':', long_walk_dist, ',', right_branch_newick, ':', short_walk_dist, ')', sep='' ) 319 | return( tree_newick ) 320 | } 321 | tree_newick = creat_phylo_object_inner(tree) 322 | final_newick = paste(tree_newick, ';', sep='') 323 | tree = read.tree(text=final_newick) 324 | return( tree ) 325 | } 326 | 327 | 328 | dist_max_min <- function( L_diff ) 329 | { 330 | N = dim(L_diff)[1] 331 | L_diff_dist = list() 332 | for( dist in 1:(N-1) ) 333 | { 334 | maxLs = numeric( N-dist ) 335 | for( row in 1:(N-dist) ) 336 | { 337 | i = row 338 | j = row + dist 339 | maxLs[i] = L_diff[i, j] 340 | } 341 | 342 | L_diff_dist[[dist]] = maxLs 343 | } 344 | return(L_diff_dist) 345 | } 346 | 347 | 348 | divide_into_groups <- function(A, n_group) 349 | { 350 | N = nrow(A) 351 | dists = 1:(N-1) 352 | counts = sapply(dists, function(v) n_cells2compute( A, v )) 353 | average = (ceiling(sum(counts) / n_group)) ## the average computational complexity 354 | groups <- cumsum(counts) %/% average + 1 355 | borders = c(0, which(diff(groups)!=0)) 356 | binsizes = c() 357 | for( i in 1:(length(borders)-1) ) 358 | { 359 | binsizes = c(binsizes, sum(counts[ (borders[i]+1):borders[i+1] ])) 360 | } 361 | info = list(borders=borders, binsizes=binsizes) 362 | return(info) 363 | } 364 | 365 | 366 | get_leaves <- function(tree, type='name') 367 | { 368 | leaf_indices = which(igraph::degree( tree, mode='out' ) == 0) 369 | if( type=='name' ) return(igraph::V(tree)[leaf_indices]$name) 370 | if( type=='igraph_node' ) return(igraph::V(tree)[leaf_indices]) 371 | if( type=='igraph' ) return(igraph::V(tree)[leaf_indices]) 372 | if( type=='index' ) return(leaf_indices) 373 | } 374 | 375 | get_segments <- function(hi_tree, binsize_thresh, return_segmentss_tree=FALSE) 376 | { 377 | leaf_widths = get_leaves(hi_tree, type='igraph')$width_rescaled 378 | if( binsize_thresh <= min(leaf_widths) ) 379 | { 380 | warning( 'Your input binsize_thresh has a value: ', binsize_thresh, ', which is smaller than the minimum of the leaf width of hi_tree: ', min(leaf_widths) ) 381 | } 382 | 383 | nodes2rm = numeric() 384 | for( i in 2:igraph::vcount(hi_tree) ) ## root node not take into account 385 | { 386 | node = igraph::V(hi_tree)[i] 387 | width = node$right_rescaled - node$left_rescaled + 1 388 | if(width <= binsize_thresh) 389 | { 390 | parent = igraph::ego(hi_tree, order=1, nodes=node, mode = "in", mindist = 1)[[1]] 391 | width_parent = parent$right_rescaled - parent$left_rescaled + 1 392 | if(width_parent <= binsize_thresh) nodes2rm = c(nodes2rm, i) 393 | } 394 | } 395 | 396 | trimmed_tree = hi_tree - nodes2rm 397 | leaves = get_leaves(trimmed_tree, type='igraph_node') 398 | segmentss = cbind(leaves$left_rescaled, leaves$right_rescaled) 399 | if(return_segmentss_tree==TRUE) return( trimmed_tree ) 400 | 401 | return( segmentss ) 402 | } 403 | 404 | 405 | get_tree_decoration = function( single_res_info, decoration=TRUE, distr, n_parameters, imputation_num=1E2 ) 406 | { 407 | cA = single_res_info$cA 408 | # cat(dim(cA), '\n') 409 | if( !is.null(single_res_info$full_tree) ) tree=single_res_info$full_tree 410 | if( is.null(single_res_info$full_tree) ) tree = get_tree_v0(single_res_info) 411 | 412 | if(length(tree)==1) if( class(tree)=="character" & tree=='bad_tree') return( tree ) 413 | 414 | ## more decoration 415 | if( decoration==FALSE ) return( tree ) 416 | 417 | leaf_indices = get_leaves( tree, type='index' ) 418 | igraph::V(tree)$mean_diff = 0 419 | zero_ratios = numeric() 420 | for(i in setdiff(1:igraph::vcount(tree), leaf_indices)) 421 | { 422 | node = igraph::V(tree)[i] 423 | # igraph::V(tree)[i]$L = L[node$left, node$right] 424 | 425 | # A_union = cA[node$left:node$right, node$left:node$right] 426 | # igraph::V(tree)[i]$L_union = get_prob_nb( A_union[upper.tri(A_union, diag=TRUE)] ) 427 | 428 | # the p-value of: H0: unioned model; H1: hierarchical model 429 | twins = igraph::ego(tree, 1, node, mode='out', mindist=1)[[1]] 430 | mid = sort(c(as.numeric(twins[1]$left), as.numeric(twins[1]$right), as.numeric(twins[2]$left), as.numeric(twins[2]$right)))[2] 431 | 432 | # if(distr=='nb') test_info = p_likelihood_ratio_nb( cA, head=node$left, mid=mid, tail=node$right ) 433 | # if(distr=='norm') test_info = p_likelihood_ratio_norm( cA, head=node$left, mid=mid, tail=node$right ) 434 | # if(distr=='gamma') 435 | # { 436 | # test_info = p_likelihood_ratio_gamma( cA, head=node$left, mid=mid, tail=node$right, n_parameters, imputation=FALSE ) 437 | # test_info_imputation = p_likelihood_ratio_gamma( cA, head=node$left, mid=mid, tail=node$right, n_parameters, imputation=TRUE ) 438 | # igraph::V(tree)[i]$imp_p = test_info_imputation$p 439 | # igraph::V(tree)[i]$imp_Lambda = test_info_imputation$Lambda 440 | # } 441 | 442 | if(distr=='lnorm') 443 | { 444 | if(!is.null(imputation_num)) 445 | { 446 | test_info_imp = p_likelihood_ratio_lnorm( cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), n_parameters=n_parameters, imputation= TRUE, imputation_num=imputation_num ) 447 | igraph::V(tree)[i]$imp_p = test_info_imp$p 448 | igraph::V(tree)[i]$mean_diff = test_info_imp$mean_diff 449 | } 450 | 451 | test_info_nimp = p_likelihood_ratio_lnorm( cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), n_parameters=n_parameters, imputation=FALSE, imputation_num=imputation_num ) 452 | 453 | igraph::V(tree)[i]$nimp_p = test_info_nimp$p 454 | igraph::V(tree)[i]$mean_diff = test_info_imp$mean_diff 455 | # zero_ratios = c(zero_ratios, sum(cA[node$left:node$right, node$left:node$right]==0) / (length(node$left:node$right))^2) 456 | } 457 | 458 | if(distr=='wilcox') 459 | { 460 | if(i==1) test_info = p_wilcox_test( is_CD=TRUE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' ) 461 | if(i!=1) test_info = p_wilcox_test( is_CD=FALSE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' ) 462 | 463 | # if(i==1) test_info = p_wilcox_test_nested( is_CD=TRUE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' ) 464 | # if(i!=1) test_info = p_wilcox_test_nested( is_CD=FALSE, cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right), alternative='less' ) 465 | 466 | 467 | igraph::V(tree)[i]$nimp_p = igraph::V(tree)[i]$imp_p = igraph::V(tree)[i]$wilcox_p = test_info$p 468 | igraph::V(tree)[i]$mean_diff = test_info$mean_diff 469 | # zero_ratios = c(zero_ratios, sum(cA[node$left:node$right, node$left:node$right]==0) / (length(node$left:node$right))^2) 470 | } 471 | 472 | test_info_mean_diff = lognormal_mean_test( cA, head=as.numeric(node$left), mid=mid, tail=as.numeric(node$right) ) 473 | 474 | igraph::V(tree)[i]$aa_p = test_info_mean_diff$p_Aa 475 | igraph::V(tree)[i]$bb_p = test_info_mean_diff$p_Ab 476 | } 477 | 478 | igraph::V(tree)$L_diff = 0 479 | # igraph::V(tree)$L_diff = igraph::V(tree)$L - igraph::V(tree)$L_union 480 | if(!is.null(imputation_num)) igraph::V(tree)[leaf_indices]$imp_p = 1 481 | igraph::V(tree)[leaf_indices]$nimp_p = igraph::V(tree)[leaf_indices]$wilcox_p = igraph::V(tree)[leaf_indices]$imp_p = 1 482 | igraph::V(tree)[leaf_indices]$aa_p = 1 483 | igraph::V(tree)[leaf_indices]$bb_p = 1 484 | return(tree) 485 | } 486 | 487 | 488 | add_boundary_binsignal_to_decrated_branches <- function( decorated_branches, bin_signals_5_10 ) 489 | { 490 | tree_widths = sapply(decorated_branches, function(v) igraph::V(v)[1]$width) 491 | if(sum(tree_widths) != length( bin_signals_5_10 )) stop('Check add_boundary_binsignal_to_decrated_branches') 492 | for(k in 1:length(decorated_branches)) 493 | { 494 | tree = decorated_branches[[k]] 495 | leaf_indices = get_leaves( tree, type='index' ) 496 | 497 | for(i in setdiff(1:igraph::vcount(tree), leaf_indices)) 498 | { 499 | node = igraph::V(tree)[i] 500 | # igraph::V(tree)[i]$L = L[node$left, node$right] 501 | 502 | # A_union = cA[node$left:node$right, node$left:node$right] 503 | # igraph::V(tree)[i]$L_union = get_prob_nb( A_union[upper.tri(A_union, diag=TRUE)] ) 504 | 505 | # the p-value of: H0: unioned model; H1: hierarchical model 506 | twins = igraph::ego(tree, 1, node, mode='out', mindist=1)[[1]] 507 | mid = sort(c(as.numeric(twins[1]$left), as.numeric(twins[1]$right), as.numeric(twins[2]$left), as.numeric(twins[2]$right)))[2] 508 | 509 | absolute_mid = mid + sum(tree_widths[(1:k)-1]) 510 | igraph::V(tree)[i]$binsignal = bin_signals_5_10[absolute_mid] 511 | } 512 | igraph::V(tree)[leaf_indices]$binsignal = 0 513 | igraph::V(tree)[which(is.na(igraph::V(tree)$binsignal))]$binsignal = 0 514 | decorated_branches[[k]] = tree 515 | } 516 | return(decorated_branches) 517 | } 518 | 519 | get_tree_v0 <- function( single_res_info ) 520 | { 521 | ancestors = single_res_info$ancestors 522 | L = single_res_info$L 523 | N = dim(ancestors)[1] 524 | current_node = ancestors[1, N] 525 | 526 | recursive <- function(current_node) 527 | { 528 | # cat(current_node, '\n') 529 | seqs = as.numeric(strsplit(current_node,'-')[[1]]) 530 | # cat( current_node, '\n' ) 531 | 532 | left_node = ancestors[seqs[1],seqs[2]] 533 | right_node = ancestors[seqs[3],seqs[4]] 534 | 535 | flag_left = (seqs[1]!=seqs[2]) & (left_node!="") 536 | flag_right = (seqs[3]!=seqs[4]) & (right_node!="") 537 | 538 | if(is.na(flag_left)) return( 'bad_tree' ) ## added: 30-04-2020 539 | 540 | left_leaf = paste( seqs[1],seqs[2], sep='-' ) 541 | right_leaf = paste( seqs[3],seqs[4], sep='-' ) 542 | 543 | if( flag_left ) left_tree = recursive(left_node) 544 | if( !flag_left ) left_tree = left_leaf 545 | 546 | if( flag_right ) right_tree = recursive(right_node) 547 | if( !flag_right ) right_tree = right_leaf 548 | 549 | if( !flag_left ) left_node = left_leaf 550 | if( !flag_right ) right_node = right_leaf 551 | 552 | tree_raw = igraph::graph.empty() + current_node + left_tree + right_tree 553 | tree = igraph::add_edges(tree_raw, c(current_node, left_node, current_node, right_node)) 554 | 555 | return( tree ) 556 | } 557 | 558 | tree = recursive(current_node) 559 | # stop('I stop here') 560 | 561 | # if(tree=='bad_tree') return('bad_tree') ## added: 30-04-2020 562 | if(class(tree)!='igraph') return('bad_tree') 563 | 564 | ## this part tries to decorate the tree by adding various node attributes 565 | 566 | igraph::V(tree)$left = sapply( igraph::V(tree)$name, function(v) {tmp=strsplit(v,'-')[[1]]; return(as.numeric(head(tmp,1)))} ) 567 | igraph::V(tree)$right = sapply( igraph::V(tree)$name, function(v) {tmp=strsplit(v,'-')[[1]]; return(as.numeric(tail(tmp,1)))} ) 568 | igraph::V(tree)$name = sapply( igraph::V(tree)$name, function(v) {tmp=strsplit(v,'-')[[1]]; paste( '(', head(tmp,1), ',', tail(tmp,1), ')', sep='' )} ) 569 | igraph::V(tree)$width = igraph::V(tree)$right - igraph::V(tree)$left + 1 570 | 571 | return(tree) 572 | } 573 | 574 | 575 | is_binary_tree <-function(tree) 576 | { 577 | leaves = get_leaves( tree, 'index' ) 578 | not_leaves = setdiff(1:igraph::vcount(tree), leaves) 579 | degrees = igraph::degree(tree, v = not_leaves, mode = 'out') 580 | if(!igraph::is.connected( tree )) return(FALSE) 581 | if(all(degrees==2)) return(TRUE) 582 | return(FALSE) 583 | } 584 | 585 | join_left_or_right <- function(tree) 586 | { 587 | leaves = get_leaves( tree, 'igraph' ) 588 | 589 | left_or_right = numeric() 590 | left_or_right[1] = 0 591 | left_or_right[length(leaves) - 1] = 0 592 | 593 | for( i in 2:(length(leaves) - 1) ) 594 | { 595 | leave = leaves[i] 596 | parent = igraph::ego(tree, nodes=leave, order=1, mindist=1, mode='in')[[1]] 597 | if( parent$left < leave$left ) left_or_right[i] = -1 ## left 598 | if( parent$right > leave$right ) left_or_right[i] = 1 ## right 599 | } 600 | return( left_or_right ) 601 | } 602 | 603 | 604 | long_slices <- function(left_or_right, thresh=3) 605 | { 606 | dup_lens = rle( left_or_right )$lengths 607 | dup_values = rle( left_or_right )$values 608 | long_dups = which( dup_lens >= thresh ) 609 | long_dup_lens = dup_lens[long_dups] 610 | long_dup_start_pos = cumsum( dup_lens )[long_dups-1] + 1 # 1: shift to the right by one 611 | long_dup_direction = left_or_right[ long_dup_start_pos ] 612 | indices_of_slice = as.vector(unlist(mapply(function(u,v) {seq(from=u, length=v, by=1)}, long_dup_start_pos, long_dup_lens))) 613 | dup_info = list( long_slices_lens=long_dup_lens, long_slices_start_pos=long_dup_start_pos, long_dup_direction=long_dup_direction, indices_of_slice=indices_of_slice ) 614 | return(dup_info) 615 | } 616 | 617 | 618 | n_cells2compute <- function( A, dist, min_n_bins=1 ) 619 | { 620 | N = nrow(A) 621 | if(min_n_bins==1) 622 | { 623 | count = dist*(1+dist)*(2+dist)*(N-dist)/6 624 | return(count) 625 | } 626 | 627 | count = (dist - 2*min_n_bins + 2)*(dist^2 + 2*dist*min_n_bins + dist - 2*(min_n_bins - 2)*min_n_bins)*(N - dist) / 6 628 | return(count) 629 | } 630 | 631 | 632 | 633 | plot_tree <- function(tree, seg_col='blue', which_part='upper', indices_of_slice=NULL, ...) 634 | { 635 | root_node = igraph::V(tree)[1] 636 | leaves = igraph::V(tree)[which(igraph::degree( tree, mode='out' ) == 0)] 637 | plot_inner <- function( node ){ 638 | left = node$left 639 | right = node$right 640 | if(right - left == 1) 641 | { 642 | return() 643 | } 644 | 645 | x0 = left - 0.5 646 | x1 = right + 0.5 647 | if(which_part=='upper') 648 | { 649 | segments(x0, x0, x0, x1, col=seg_col, ...) 650 | segments(x0, x1, x1, x1, col=seg_col, ...) 651 | } 652 | 653 | if(which_part=='lower') 654 | { 655 | segments(x0, x0, x1, x0, col='black', ...) 656 | segments(x1, x1, x1, x0, col='black', ...) 657 | } 658 | 659 | if(which_part=='both') 660 | { 661 | segments(x0, x0, x0, x1, col=seg_col, ...) 662 | segments(x0, x1, x1, x1, col=seg_col, ...) 663 | segments(x0, x0, x1, x0, col='black', ...) 664 | segments(x1, x1, x1, x0, col='black', ...) 665 | } 666 | 667 | if( !is.null(indices_of_slice) ) 668 | { 669 | if(left %in% indices_of_slice) 670 | segments(x0, x0, x0, x1, col='yellow', ...) 671 | 672 | if(right %in% indices_of_slice) 673 | segments(x0, x1, x1, x1, col='black', ...) 674 | } 675 | 676 | if( node %in% leaves ) return() 677 | twins = igraph::ego(tree, node=node, order=1, mindist=1, mode='out')[[1]] 678 | left_node = twins[1] 679 | plot_inner(left_node) 680 | if( length(twins) > 1 ) 681 | { 682 | # cat('hello', '\n') 683 | right_node = twins[2] 684 | plot_inner(right_node) 685 | } 686 | } 687 | plot_inner(root_node) 688 | return() 689 | } 690 | 691 | remove_blank_cols <- function( mat, sparse=FALSE, row_or_col='both', ratio=0.05 ) 692 | { 693 | if(sparse==TRUE) 694 | { 695 | if(row_or_col=='row') 696 | { 697 | non_zero_indices = (Matrix::rowSums(mat != 0) / ncol(mat) > ratio) 698 | new_mat = mat[non_zero_indices, ] 699 | return(new_mat) 700 | } 701 | 702 | if(row_or_col=='col') 703 | { 704 | non_zero_indices = (Matrix::colSums(mat != 0) / nrow(mat) > ratio) 705 | new_mat = mat[, non_zero_indices] 706 | return(new_mat) 707 | } 708 | 709 | if( ratio > 0 ) ## remove col/rows with non-zero number smaller than 5% percentile 710 | { 711 | positive_num = unname(Matrix::rowSums(mat != 0)) 712 | positive_num_thresh = quantile(positive_num[positive_num > 0], ratio) 713 | non_zero_indices = (Matrix::rowSums(mat != 0) > positive_num_thresh) 714 | new_mat = mat[non_zero_indices, non_zero_indices] 715 | return(new_mat) 716 | } 717 | 718 | 719 | non_zero_indices = (Matrix::rowSums(mat != 0) / ncol(mat) > ratio) 720 | new_mat = mat[non_zero_indices, non_zero_indices] 721 | return(new_mat) 722 | } 723 | 724 | col_sum = apply( mat, 2, sum ) 725 | blank_col_indices = which( col_sum==0 ) 726 | new_mat = mat[ -blank_col_indices, -blank_col_indices ] 727 | return(new_mat) 728 | } 729 | 730 | 731 | tree_germination <- function(tree) 732 | { 733 | leaves = get_leaves( tree ) 734 | for( leaf in leaves ) 735 | { 736 | cat(leaf, '\n') 737 | if((igraph::vcount(tree) - ecount(tree)) !=1) break 738 | bins = as.character(igraph::V(tree)[leaf]$left:igraph::V(tree)[leaf]$right) 739 | if(length(bins) == 1) stop('Some leave node contains only one bin, please check') 740 | if(length(bins) == 2) ## this is an exception. added on 11/06/2018 741 | { 742 | nodes_supp = as.character(bins) 743 | edges2add = c(leaf, bins[1], leaf, bins[2]) 744 | tree = tree %>% igraph::add_vertices(length(nodes_supp), name=nodes_supp) %>% igraph::add_edges(edges2add) 745 | next 746 | } 747 | 748 | joints_supp = paste('j', bins[1:(length(bins) - 2)], sep='') 749 | joints = c(leaf, joints_supp, bins[length(bins)]) 750 | edges2add = character() 751 | for(i in 1:(length(joints)-1)) 752 | { 753 | edges2add = c(edges2add, c( joints[i], bins[i], joints[i], joints[i+1] )) 754 | } 755 | nodes_supp = c(joints_supp, bins) 756 | tree = tree %>% igraph::add_vertices(length(nodes_supp), name=nodes_supp) %>% igraph::add_edges(edges2add) 757 | } 758 | return( tree ) 759 | } 760 | 761 | 762 | trim_tree_adaptive <- function( tree, L_diff_thresh=-Inf, max_imp_p=Inf, max_nimp_p=Inf, width_thresh=-Inf, boundary_signal_thresh=Inf, peak_thresh=-Inf ) 763 | { 764 | if(igraph::vcount(tree) ==1) return(tree) ## so this can be used! 765 | 766 | width_thresh = min(width_thresh, igraph::V(tree)[1]$width-1) ## cannot merge if tree is already very small 767 | 768 | nodes2rm_Ldiff = igraph::V(tree)[which( igraph::V(tree)$L_diff <= L_diff_thresh )]$name 769 | 770 | 771 | nodes2rm_imp_p = igraph::V(tree)[which( igraph::V(tree)$imp_p >= max_imp_p )]$name 772 | nodes2rm_nimp_p = igraph::V(tree)[which( igraph::V(tree)$nimp_p >= max_nimp_p )]$name 773 | nodes2rm_boundary_signal = igraph::V(tree)[which( igraph::V(tree)$binsignal <= boundary_signal_thresh )]$name 774 | nodes2rm_not_peak = igraph::V(tree)[which( igraph::V(tree)$is_peak <= peak_thresh )]$name 775 | 776 | nodes2rm = unique( c( nodes2rm_Ldiff, nodes2rm_imp_p, nodes2rm_nimp_p, nodes2rm_boundary_signal, nodes2rm_not_peak ) ) 777 | nodes_not_2rm = setdiff( igraph::V(tree)$name, nodes2rm ) 778 | 779 | flags = sapply( nodes2rm, function(v) { children = names(unlist( igraph::ego( tree, nodes = v, order=igraph::diameter(tree), mindist=1, mode='out' ))); all(children %in% nodes2rm) } ) 780 | nodes2rm_final = names(unlist( igraph::ego(tree, nodes = nodes2rm[which(flags==TRUE)], order=igraph::diameter(tree), mindist=1, mode='out' ) ) ) 781 | 782 | if(!is.null(nodes2rm_final)) tree = tree - nodes2rm_final 783 | if(is.null(nodes2rm_final)) cat('No nodes are removed at the given p_thresh\n') 784 | 785 | if( width_thresh==-Inf ) return( tree ) 786 | 787 | tree = trim_tree_adaptive_width_thresh(tree, width_thresh) 788 | return( tree ) 789 | } 790 | 791 | trim_tree_adaptive_width_thresh <- function( tree, width_thresh ) 792 | { 793 | ## first remove all nodes of a parent if both are < width_thresh 794 | nodes2rm_width = igraph::V(tree)[which( igraph::V(tree)$width <= width_thresh )]$name 795 | parentOfnodes2rm_width = names(unlist(igraph::ego(tree, node=nodes2rm_width, order=1, mindist=1, mode='in'))) 796 | ## if two nodes have the same parent, remove them 797 | index2rm = c(which(duplicated(parentOfnodes2rm_width)), which(duplicated(fromLast=TRUE, parentOfnodes2rm_width))) 798 | nodes2rm = nodes2rm_width[index2rm] 799 | tree = tree - nodes2rm 800 | if(!is_binary_tree(tree)) stop('Error in trim_tree_adaptive_width_thresh') 801 | 802 | ## merge TADs that are too small 803 | nodes2rm_width = igraph::V(tree)[which( igraph::V(tree)$width <= width_thresh )]$name 804 | 805 | di = igraph::diameter(tree) 806 | while( length(nodes2rm_width) > 0 ) 807 | { 808 | node = nodes2rm_width[1] 809 | parent = igraph::ego( tree, nodes = node, order=1, mindist=1, mode='in' )[[1]] 810 | nodesOfSameParent = igraph::ego( tree, nodes = parent, order=di, mindist=1, mode='out' )[[1]] 811 | left_siblings = nodesOfSameParent[which(nodesOfSameParent$right < igraph::V(tree)[node]$left)] 812 | right_siblings = nodesOfSameParent[which(nodesOfSameParent$left > igraph::V(tree)[node]$right)] 813 | 814 | left_siblings2change_right = left_siblings[left_siblings$right == (igraph::V(tree)[node]$left-1)] 815 | right_siblings2change_left = right_siblings[right_siblings$left == (igraph::V(tree)[node]$right+1)] 816 | 817 | igraph::V(tree)[ left_siblings2change_right ]$right = igraph::V(tree)[node]$right 818 | igraph::V(tree)[ right_siblings2change_left ]$left = igraph::V(tree)[node]$left 819 | igraph::V(tree)$width = igraph::V(tree)$right - igraph::V(tree)$left + 1 820 | tree = tree - node 821 | nodes2rm_width = igraph::V(tree)[which( igraph::V(tree)$width <= width_thresh )]$name 822 | } 823 | igraph::V(tree)$name = paste('(',igraph::V(tree)$left, ',', igraph::V(tree)$right, ')', sep='') 824 | return(tree) 825 | } 826 | 827 | ## should be different from trim_tree_adaptive_width_thresh because binsignal is not monotonic 828 | trim_tree_adaptive_binsig_thresh <- function( tree, boundary_signal_thresh ) 829 | { 830 | 831 | ## merge TADs that are too small 832 | parent_of_nodes2rm = igraph::V(tree)[which( igraph::V(tree)$binsignal <= boundary_signal_thresh )]$name 833 | leaves = get_leaves( tree ) 834 | nodes2rm = intersect(leaves, names(unlist(igraph::ego( tree, nodes = parent_of_nodes2rm, order=1, mindist=1, mode='out' )))) 835 | 836 | while( length(nodes2rm) > 0 ) 837 | { 838 | di = igraph::diameter(tree) 839 | node = nodes2rm[1] 840 | parent = igraph::ego( tree, nodes = node, order=1, mindist=1, mode='in' )[[1]] 841 | sibling = setdiff(igraph::ego( tree, nodes = parent, order=1, mindist=1, mode='out' )[[1]]$name, node) 842 | twins_of_sibling = igraph::ego( tree, nodes = sibling, order=1, mindist=1, mode='out' )[[1]]$name 843 | nodesOfSameParent = igraph::ego( tree, nodes = parent, order=di, mindist=1, mode='out' )[[1]] 844 | left_siblings = nodesOfSameParent[which(nodesOfSameParent$right < igraph::V(tree)[node]$left)] 845 | right_siblings = nodesOfSameParent[which(nodesOfSameParent$left > igraph::V(tree)[node]$right)] 846 | 847 | left_siblings2change_right = left_siblings[left_siblings$right == (igraph::V(tree)[node]$left-1)] 848 | right_siblings2change_left = right_siblings[right_siblings$left == (igraph::V(tree)[node]$right+1)] 849 | 850 | igraph::V(tree)[ left_siblings2change_right ]$right = igraph::V(tree)[node]$right 851 | igraph::V(tree)[ right_siblings2change_left ]$left = igraph::V(tree)[node]$left 852 | 853 | 854 | if(length(twins_of_sibling) > 0) tree = igraph::add_edges(tree, c( parent$name, twins_of_sibling[1], parent$name, twins_of_sibling[2]) ) 855 | tree = tree - node - sibling 856 | 857 | igraph::V(tree)$width = igraph::V(tree)$right - igraph::V(tree)$left + 1 858 | 859 | parent_of_nodes2rm = igraph::V(tree)[which( igraph::V(tree)$binsignal <= boundary_signal_thresh )]$name 860 | leaves = get_leaves( tree ) 861 | nodes2rm = intersect(leaves, names(unlist(igraph::ego( tree, nodes = parent_of_nodes2rm, order=1, mindist=1, mode='out' )))) 862 | } 863 | igraph::V(tree)$name = paste('(',igraph::V(tree)$left, ',', igraph::V(tree)$right, ')', sep='') 864 | return(tree) 865 | } 866 | 867 | visualize_left_or_right <- function(left_or_right, ...) 868 | { 869 | len = length( left_or_right ) 870 | plot(1, type="n", xlab="", ylab="", xlim=c(0, len), ylim=c(-2, 2)) 871 | for( i in 1:len ) 872 | { 873 | if( left_or_right[i]==1 ) segments(i-0.1, 1, i+1, 1, col='red', ...) 874 | if( left_or_right[i]==-1 ) segments(i-0.1, -1, i+1, -1, col='blue', ...) 875 | } 876 | } 877 | 878 | xenocraft <- function( trunk, branches ) 879 | { 880 | 881 | xenocraft_nodes = sapply(branches, function(v) igraph::V(v)[1]$name) 882 | if(!all(xenocraft_nodes %in% igraph::V(trunk)$name)) stop("Check xenocraft function") 883 | 884 | children_xenocraft_nodes = unique(unlist(sapply(igraph::ego(trunk, order=igraph::diameter(trunk), node=xenocraft_nodes, mode='out', mindist=1), function(v) v$name))) 885 | prunned_trunck = trunk - children_xenocraft_nodes 886 | full_tree = Reduce( union, c(list(prunned_trunck), branches) ) 887 | att2delete = setdiff(vertex_attr_names(full_tree), 'name') 888 | for( att in att2delete ) full_tree = delete_vertex_attr(full_tree, att) 889 | return( full_tree ) 890 | } 891 | 892 | fast_cor <- function(mat) 893 | { 894 | res = 1/( NROW( mat ) -1)*crossprod ( scale( mat , TRUE , TRUE ) ) 895 | # scaled_mat = scale( mat , TRUE , TRUE ) 896 | # res = 1/( NROW( mat ) -1)*matrix_multiplication_sym_cpp( scaled_mat ) 897 | 898 | return(res) 899 | } 900 | 901 | fast_cor_cor <- function(mat, k) 902 | { 903 | scaled_mat = scale(mat , TRUE , TRUE ) 904 | coeff = 1/( NROW( mat ) -1) ## there is no need to multiply this coeff when computing Pearson coeff 905 | 906 | res_begin = coeff*crossprod( scaled_mat[, 1:k], scaled_mat ) 907 | 908 | for(j in (k+1):nrow(mat)) 909 | { 910 | res_slice = res_begin[-1,] 911 | res_next = coeff*crossprod( scaled_mat[, j], scaled_mat ) 912 | res_slice = rbind(res_slice, res_next) 913 | new_values = cor(t(res_slice), t(res_next)) 914 | cat(j, '\n') 915 | } 916 | } 917 | 918 | 919 | get_bin_singals_CHiP <- function(chr, hc_ordered, res, ks, ChiP_NAME, CELL_LINE, ChiP_data_already_loaded=FALSE) 920 | { 921 | 922 | ################# 923 | n_bins_of_CD = sapply(res$initial_clusters, length) 924 | pos_start_end = lapply(res$initial_clusters, function(v) 925 | { 926 | bins_ori = as.numeric(res$bin_names[v]) 927 | from_id = min(bins_ori) 928 | to_id = max(bins_ori) 929 | start_pos = (from_id-1)*bin_size + 1 930 | end_pos = to_id*bin_size 931 | return(c(start_pos, end_pos)) 932 | }) 933 | 934 | pos_start_end = do.call(rbind, pos_start_end) 935 | ################# 936 | 937 | ordered_bin_signals_ALL_k_list = lapply(ks, function(k){ 938 | 939 | hc_k_labels = get_cluser_levels(hc_ordered, k_clusters=k, balanced_4_clusters=FALSE)$cluster_labels 940 | 941 | if(is.null(ChiP_NAME)) 942 | { 943 | CD_index = labels(hc_ordered) 944 | pos_start_end = pos_start_end[match( CD_index, rownames(pos_start_end) ), ] 945 | ordered_bin_signals_ALL = data.frame(CD_index=CD_index, n_bins=n_bins_of_CD[CD_index], pos_start=pos_start_end[,1], pos_end=pos_start_end[,2], compartment=hc_k_labels[CD_index]) 946 | return(ordered_bin_signals_ALL) 947 | } 948 | 949 | cluster_signal = get_names_by_H3k4me1(chr, res, ChiP_NAME, kmeans_cluster_assigment=hc_k_labels, CELL_LINE=CELL_LINE, ChiP_data_already_loaded=ChiP_data_already_loaded) 950 | # cluster_signal = rbind(cluster_signal, sorted_coverage) 951 | ordered_bin_signals_ALL = apply(cluster_signal, 1, function(v) 952 | { 953 | bin_signals = v[hc_k_labels] 954 | names(bin_signals) = names(hc_k_labels) 955 | ordered_bin_signals = bin_signals[labels(hc_ordered)] ## order by dendro topology 956 | return(ordered_bin_signals) 957 | }) 958 | 959 | ordered_bin_signals_ALL = as.data.frame(ordered_bin_signals_ALL) 960 | # CD_index = rownames(ordered_bin_signals_ALL) 961 | CD_index = labels(hc_ordered) 962 | 963 | pos_start_end = pos_start_end[match( CD_index, rownames(pos_start_end) ), ] 964 | 965 | ordered_bin_signals_ALL = data.frame(CD_index=CD_index, n_bins=n_bins_of_CD[CD_index], pos_start=pos_start_end[,1], pos_end=pos_start_end[,2], compartment=hc_k_labels[CD_index], ordered_bin_signals_ALL) 966 | return(ordered_bin_signals_ALL) 967 | }) 968 | names(ordered_bin_signals_ALL_k_list) = paste0('clusters_', ks) 969 | names(ordered_bin_signals_ALL_k_list)[which(names(ordered_bin_signals_ALL_k_list)=="clusters_Inf")] = 'compartment_domains' 970 | return(ordered_bin_signals_ALL_k_list) 971 | } 972 | 973 | 974 | 975 | densplot = function(x,y,points = FALSE, pch=19, cex=1, xlim=c(min(x),max(x)), ylim=c(min(y),max(y)), ...){ 976 | df = data.frame(x,y) 977 | d = densCols(x,y, colramp=colorRampPalette(c("black", "white"))) 978 | df$dens = col2rgb(d)[1,] + 1L 979 | 980 | cols = colorRampPalette(c("#000099", "#00FEFF", "#45FE4F","#FCFF00", "#FF9400", "#FF3100"))(256) 981 | 982 | df$col = cols[df$dens] 983 | df=df[order(df$dens),] 984 | if(points) 985 | points(df$x,df$y, pch=pch, col=df$col, cex=cex, ...) 986 | else 987 | plot(df$x,df$y, pch=pch, col=df$col, cex=cex, xlim=xlim, ylim=ylim, ...) 988 | } 989 | 990 | 991 | 992 | my_merge = function(...) merge(..., all=TRUE) 993 | 994 | 995 | build_chr_bin_domain_fun <- function( CELL_LINE, chrs, cluster_level, p_thresh, ob_oe, downsratio=NULL, compress_size=10 ) 996 | { 997 | chrs = as.character(chrs) 998 | chr_bin_domain_tmp = lapply(chrs, function(chr) get_clusters_bins_xy(CELL_LINE, chr, cluster_level, p_thresh, ob_oe, downsratio=downsratio, compress_size=compress_size)) 999 | names( chr_bin_domain_tmp ) = chrs ## can use mapply 1000 | chr_bin_domain = lapply(chrs, function(v) 1001 | { 1002 | chr_bin_domain_ind = data.frame( chr=paste0('chr', v), bin_index=as.numeric(unlist(chr_bin_domain_tmp[[v]])), intra_domain=rep(names(chr_bin_domain_tmp[[v]]), sapply(chr_bin_domain_tmp[[v]], length)) ) 1003 | chr_bin_domain_ind = chr_bin_domain_ind[order(chr_bin_domain_ind$bin_index), ] 1004 | return(chr_bin_domain_ind) 1005 | }) 1006 | names(chr_bin_domain) = chrs 1007 | res = do.call(rbind, chr_bin_domain) 1008 | rownames(res) = NULL 1009 | return( res ) 1010 | } 1011 | 1012 | build_chr_bin_domain_fun_direct <- function( chr, initial_clusters, cluster_vec, bin_names ) ## directly from R workingspace instead of loading 1013 | { 1014 | chrs = as.character(chr) ## for historic reason, chr -- chrs 1015 | chr_bin_domain_tmp = lapply(chrs, function(chr) get_cluster_bin_names(initial_clusters, cluster_vec, bin_names)) 1016 | names( chr_bin_domain_tmp ) = chrs ## can use mapply 1017 | chr_bin_domain = lapply(chrs, function(v) data.frame( chr=paste0('chr', v), bin_index=as.numeric(unlist(chr_bin_domain_tmp[[v]])), intra_domain=rep(names(chr_bin_domain_tmp[[v]]), sapply(chr_bin_domain_tmp[[v]], length)) )) 1018 | names(chr_bin_domain) = chrs 1019 | return( do.call(rbind, chr_bin_domain) ) 1020 | } 1021 | 1022 | get_clusters_bins_xy <- function(CELL_LINE, chr, cluster_level, p_thresh, ob_oe='oe', downsratio, compress_size, sort=TRUE) 1023 | { 1024 | if(ob_oe=='oe') sub_folder = paste0('./', CELL_LINE, '/oe_chr_', chr, '_', bin_size/1E3, 'kb_', compress_size, 'to1_', p_thresh) 1025 | if(ob_oe=='ob') sub_folder = paste0('./', CELL_LINE, '/ob_chr_', chr, '_', bin_size/1E3, 'kb_', compress_size, 'to1_', p_thresh) 1026 | 1027 | if(!is.null(downsratio)) compartments_Rdata_file = paste0(sub_folder, '/chr', chr, '_compartments_atanh_log_AB', ws, 'downsratio_', downsratio, '.Rdata') 1028 | 1029 | if(is.null(downsratio)) compartments_Rdata_file = paste0(sub_folder, '/chr', chr, '_compartments_atanh_log_AB3_3.Rdata') 1030 | 1031 | load(compartments_Rdata_file) 1032 | clusters_bins = get_cluster_bin_names(sort=sort, res$initial_clusters, res$clusters[, cluster_level], res$bin_names) 1033 | rm( res ) 1034 | return(clusters_bins) 1035 | } 1036 | 1037 | 1038 | get_cluster_bin_names <- function(initial_clusters, cluster_vec, bin_names, sort=TRUE) 1039 | { 1040 | if(sort==TRUE) cluster_indices = sort(unique(cluster_vec), decreasing=TRUE) 1041 | if(sort==FALSE) cluster_indices = unique(cluster_vec) 1042 | 1043 | cluster_bins = lapply(cluster_indices, function(v) 1044 | { 1045 | indices = which(cluster_vec==v) 1046 | bin_names[unlist(initial_clusters[indices])] 1047 | }) 1048 | names(cluster_bins) = cluster_indices 1049 | return(cluster_bins) 1050 | } 1051 | 1052 | # ave_cor <- function(mat, seg_len) ## seg_len is the length of a segment 1053 | # { 1054 | # d = 1:nrow(mat) 1055 | # seq_index = split(d, ceiling(seq_along(d)/10)) 1056 | # tmp = simplify2array( lapply(seq_index[1:22], function(v) fast_cor( mat[v, ] )/10*length(v)) ) 1057 | # ave_cor_val = rowMeans(tmp, dims = 2) 1058 | # } 1059 | 1060 | # if( kmeans_cluster_assigment ): compute the cluster_level signals 1061 | 1062 | 1063 | reorder_dendro <- function(hc_object, named_weights, return_g=FALSE, aggregateFun=mean) 1064 | { 1065 | get_children <- function(g, root_node) 1066 | { 1067 | leaves = igraph::V(g)[igraph::degree(g)==1]$name 1068 | children = intersect(leaves, igraph::ego(g, mode='out', root_node, order=igraph::diameter(g))[[1]]$name) 1069 | return(children) 1070 | } 1071 | 1072 | # hc_dendro = as.dendrogram(hc_object) 1073 | # cat('hello I am here\n') 1074 | # print(class(hc_object)) 1075 | g = igraph::as.igraph(ape::as.phylo(hc_object)) 1076 | # stop('I am here') 1077 | leaves = igraph::V(g)[igraph::degree(g)==1]$name 1078 | 1079 | igraph::V(g)$weight = sapply(1:igraph::vcount(g), function(v) aggregateFun(named_weights[get_children(g, v)])) 1080 | 1081 | if(return_g==TRUE) return(g) 1082 | 1083 | swap_branches <- function(g, root_node) 1084 | { 1085 | twins = igraph::ego(g, mode='out', root_node, order=1, mindist=1)[[1]]$name 1086 | twins_weight = igraph::V(g)[twins]$weight 1087 | 1088 | if(twins_weight[1] > twins_weight[2]) 1089 | { 1090 | children_of_twin_A = get_children(g, twins[1]) 1091 | children_of_twin_B = get_children(g, twins[2]) 1092 | leaves = swap_names( leaves, children_of_twin_A, children_of_twin_B ) 1093 | } 1094 | return(leaves) 1095 | } 1096 | 1097 | swap_names <- function( leaves, names2swap_A, names2swap_B ) 1098 | { 1099 | names2swap_indices = match(c(names2swap_A, names2swap_B), leaves) 1100 | leaves[ names2swap_indices ] = c(names2swap_B, names2swap_A) 1101 | return(leaves) 1102 | } 1103 | 1104 | for( root_node in setdiff(igraph::V(g)$name, leaves) ) 1105 | { 1106 | leaves = swap_branches(g, root_node) 1107 | # cat(leaves, '\n') 1108 | } 1109 | 1110 | return(leaves) 1111 | } 1112 | 1113 | 1114 | get_cluser_assignment = function(hc, k_clusters, leaves_hclust_pc) 1115 | { 1116 | clusters_raw = cutree(hc, k_clusters) 1117 | clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v)) 1118 | 1119 | od = rank(sapply( clusters, function(v) sort(match(v, as.numeric(leaves_hclust_pc)))[1])) 1120 | 1121 | cluster_assignment = numeric(sum(sapply(clusters, length))) 1122 | for(j in 1:length(clusters)) cluster_assignment[clusters[[j]]] = od[j] 1123 | return( cluster_assignment ) 1124 | } 1125 | 1126 | 1127 | 1128 | get_cluster_boudaries <- function(hc, k_clusters, named_weights) 1129 | { 1130 | len = length(labels(hc)) 1131 | clusters_raw = cutree(hc, k_clusters) 1132 | clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v)) 1133 | clusters_pc_rank = lapply(clusters, function(v) unname(sort(rank(named_weights)[v]))) 1134 | bondaries = sapply(clusters_pc_rank, function(v) tail(v,1)) 1135 | return(setdiff(bondaries, c(1, len))) 1136 | } 1137 | 1138 | # get_cluser_vector <- function(hc, k_clusters, named_weights) ## get cluster assignment of 1,2,3,4,5... (A1, A2, ...) 1139 | # { 1140 | # clusters_raw = cutree(hc, k_clusters) 1141 | # clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v)) 1142 | # clusters_pc_rank = unname(rank(sapply(clusters, function(v) unname(sort(rank(named_weights)[v]))[1]))) 1143 | # cluster_vector = numeric() 1144 | # for( i in 1:length(clusters) ) cluster_vector[ clusters[[i]] ] = clusters_pc_rank[i] 1145 | # return( cluster_vector ) 1146 | # } 1147 | 1148 | ## if return binary tree, A1, A2, B1, B2 are forced to be returned 1149 | get_cluser_levels <- function(hc_ordered, k_clusters, balanced_4_clusters=FALSE) ## get the detailed A1, A2, ..., B1, B2... 1150 | { 1151 | assign_twins_name <- function(graph, node) 1152 | { 1153 | twins = igraph::ego(graph, node, mode='out', order=1, mindist=1)[[1]]$name 1154 | igraph::V(graph)[twins]$level_name = paste0(igraph::V(graph)[node]$level_name, '.', c(2,1)) 1155 | if(node==igraph::V(graph)[1]$name) igraph::V(graph)[twins]$level_name = c('B', 'A') 1156 | return(graph) 1157 | } 1158 | 1159 | if(k_clusters==Inf) k_clusters = length(labels(as.dendrogram(hc_ordered))) 1160 | ################################################# 1161 | # cat('haha I am here\n') 1162 | # print(class(hc_ordered)) 1163 | graph = igraph::as.igraph(ape::as.phylo(hc_ordered)) 1164 | leave_names = get_leaves(graph) 1165 | 1166 | bfs_names = igraph::bfs(graph, 1)$order$name 1167 | dfs_names = igraph::dfs(graph, 1)$order$name 1168 | 1169 | igraph::V(graph)[1]$level_name = '' 1170 | for( node in bfs_names ) 1171 | { 1172 | graph = assign_twins_name(graph, node) 1173 | # if( !any(is.na(igraph::V(graph)[common_father_name]$level_name)) ) break ## when all common_father_name have level_name 1174 | if( !any(is.na(igraph::V(graph)$level_name)) ) break ## when all common_father_name have level_name 1175 | } 1176 | 1177 | if( balanced_4_clusters==TRUE ) ##A1, A2, B1, B2 1178 | { 1179 | branch_root_name = c('A.1', 'A.2', 'B.1', 'B.2') 1180 | branch_root = match(branch_root_name, igraph::V(graph)$level_name) 1181 | 1182 | children = igraph::ego(graph, order = igraph::diameter(graph), nodes = branch_root, mode = 'out', mindist = 0) ## mindist==0, when itself is a branch 1183 | tmp = lapply( children, function(v) intersect(leave_names, v$name) ) 1184 | cluster_labels = rep(branch_root_name, sapply(tmp, length)) 1185 | names(cluster_labels) = unlist( tmp ) 1186 | cluster_labels = cluster_labels[ as.character(1:length(cluster_labels)) ] 1187 | return(cluster_labels) 1188 | } 1189 | 1190 | ################################################# 1191 | clusters_raw = cutree(hc_ordered, k_clusters)[labels(hc_ordered)] ## labels are ordered according to pc 1192 | clusters = tapply(as.numeric(names(clusters_raw)), clusters_raw, function(v) list(v))[unique(clusters_raw)] 1193 | names(clusters) = 1:k_clusters # named by pc order 1194 | 1195 | ## get the vector. Named from 1 to 5 from left leaves to right leaves 1196 | cluster_vector = rep( 1:k_clusters, sapply(clusters, length) ) 1197 | names(cluster_vector) = labels(hc_ordered) 1198 | cluster_vector = cluster_vector[ as.character( sort(as.numeric(names(cluster_vector))) ) ] 1199 | 1200 | ################################################# 1201 | 1202 | ## which node is the common father of all nodes in a cluster 1203 | common_father_index = sapply( clusters, function(u) 1204 | { 1205 | if(length(u)==1) return( which(u==igraph::V(graph)$name) ) 1206 | return(max(which(sapply(igraph::ego(graph, order = igraph::diameter(graph), nodes = igraph::V(graph), mode = 'out', mindist = 1), function(v) all(u %in% v$name) )))) 1207 | }) 1208 | 1209 | common_father_name = igraph::V(graph)[common_father_index]$name 1210 | 1211 | 1212 | 1213 | 1214 | 1215 | ## whether common_father_name are ordered 1216 | if(!all(diff(match( common_father_name, dfs_names )) >= 0)) stop('!!!!!!!!check get_all_children in header_funs.R') 1217 | 1218 | 1219 | 1220 | level_names = igraph::V(graph)[common_father_name]$level_name ## label names are ordered 1221 | if(any(sort(level_names, decreasing=TRUE)!=level_names)) warning('!!!!!!!!Need to check get_all_children in header_funs.R') 1222 | 1223 | cluster_labels = rep( level_names, sapply(clusters, length) ) 1224 | names(cluster_labels) = labels(hc_ordered) 1225 | cluster_labels = cluster_labels[ as.character( sort(as.numeric(names(cluster_labels))) ) ] 1226 | 1227 | return( list(cluster_vector=cluster_vector, cluster_labels=cluster_labels)) 1228 | } 1229 | 1230 | 1231 | 1232 | 1233 | get_hkmeans_cluser_levels <- function(hk_cluster_centers, PC1) 1234 | { 1235 | hc = hclust(as.dist(my_dist(hk_cluster_centers)), method='com') 1236 | reordered_names = as.character(rank(PC1)) 1237 | hc_ordered = dendextend::rotate(hc, reordered_names) 1238 | hk_clust_labels = get_cluser_levels(hc_ordered, k_clusters=length(PC1))$cluster_labels 1239 | return( hk_clust_labels ) 1240 | } 1241 | 1242 | 1243 | 1244 | ## names_A_final is the rownames of the A_final 1245 | ## names_A_final is matched to the names of the starting contact matrix 1246 | 1247 | get_original_tad_indices <- function(names_A_final, TADs, bin_size) 1248 | { 1249 | start_pos = as.numeric(names_A_final[TADs$start_pos]) 1250 | end_pos = as.numeric(names_A_final[TADs$end_pos]) 1251 | start_pos_ori = (start_pos - 1)*bin_size + 1 1252 | end_pos_ori = end_pos*bin_size 1253 | TADs = data.frame( start_pos_ori=start_pos_ori, end_pos_ori=end_pos_ori ) 1254 | return( TADs ) 1255 | } 1256 | 1257 | 1258 | 1259 | 1260 | ## This code updates the branch name 1261 | ## Branches obtained from branches = lapply( res_inner, get_tree_v0 ) 1262 | ## The original branch name start from 1 1263 | update_branch_name <- function(branch, root_start) 1264 | { 1265 | igraph::V(branch)$left = igraph::V(branch)$left + root_start - 1 1266 | igraph::V(branch)$right = igraph::V(branch)$right + root_start - 1 1267 | igraph::V(branch)$name = paste('(',igraph::V(branch)$left, ',', igraph::V(branch)$right, ')', sep='') 1268 | return(branch) 1269 | } 1270 | 1271 | -------------------------------------------------------------------------------- /R/generate_compartments_bed_fun.R: -------------------------------------------------------------------------------- 1 | ## Yuanlong LIU, 09-08-2018 2 | 3 | generate_compartments_bed <- function(input_mat, p_thresh, out_file_name, chr, window.sizes=3, stat_window_size=NULL, bin_size) 4 | { 5 | input_mat_extended = data.frame(chr=paste0('chr', chr), pos_start=0:(nrow(input_mat)-1), pos_end=1:nrow(input_mat), mat=input_mat) 6 | res_input_mat = TopDom_v2(input_mat_extended, window.size=NULL, NULL, T, p_thresh=p_thresh, window.sizes=window.sizes, stat_window_size=stat_window_size, domain_size_min=NULL) 7 | # return(res_input_mat) 8 | to_id = as.numeric(rownames(input_mat)[res_input_mat$domain$to.id]) 9 | from_id = as.numeric(rownames(input_mat)[res_input_mat$domain$from.id]) 10 | 11 | start_poses = (from_id-1)*bin_size + 1 12 | # end_poses = start_poses + n2one*bin_size 13 | end_poses = start_poses + bin_size 14 | 15 | input_mat_compartments_bed = data.frame(paste0('chr', chr), as.character(start_poses), as.character(end_poses) ) 16 | if(!is.null(out_file_name)) write.table( input_mat_compartments_bed, file=out_file_name, quote=FALSE, row.names=FALSE, col.names=FALSE, sep=' ' ) 17 | return( res_input_mat ) 18 | } 19 | -------------------------------------------------------------------------------- /R/post_processing_nested_domains.R: -------------------------------------------------------------------------------- 1 | 2 | get_cluster_index <- function(pos, initial_clusters, bin_names, bin_size) 3 | { 4 | compartment_segs = generate_compartment_segs( initial_clusters ) 5 | CDs = get_original_tad_indices( bin_names, compartment_segs, bin_size=bin_size ) 6 | cluster_index = which(apply( CDs, 1, function(v) (v[1] < pos)&(v[2]>pos) )==1) 7 | return(cluster_index) 8 | } 9 | 10 | get_original_tad_indices_extra <- function(names_A_final, TADs, bin_size) ## to get the TADs for the extra edges 11 | { 12 | # start_pos = as.numeric(names_A_final[TADs$start_pos]) 13 | end_pos = as.numeric(names_A_final[TADs$end_pos]) 14 | # start_pos_ori = (start_pos - 1)*bin_size + 1 15 | end_pos_ori = end_pos*bin_size 16 | TADs = data.frame( start_pos_ori=end_pos_ori+1, end_pos_ori=end_pos_ori ) 17 | return( TADs ) 18 | } 19 | 20 | 21 | LikelihoodRatioTest <- function(sub_domains_raw, ncores, remove_zero=FALSE, distr, n_parameters=3, imputation_num=1E2, A_already_corrected=FALSE) 22 | { 23 | # require( doParallel ) 24 | pA_sym = sub_domains_raw$pA_sym 25 | if(A_already_corrected==TRUE) cpA_sym = pA_sym 26 | ## CAN SPEEDUP correct_A_fast_divide_by_mean BECAUSE ONLY SOME OFF-DIAGNAL LINES NEED TO BE CORRECTED 27 | if(A_already_corrected==FALSE) cpA_sym = correct_A_fast_divide_by_mean(pA_sym, remove_zero=remove_zero) ## corrected pA_sym 28 | trees = foreach::foreach( j =1:length( sub_domains_raw$res_inner ) ) %do% 29 | { 30 | name_index = rownames(sub_domains_raw$pA_sym)[sub_domains_raw$segmentss[j,1]:sub_domains_raw$segmentss[j,2]] 31 | sub_domains_raw$res_inner[[j]]$cA = as.matrix(cpA_sym[name_index, name_index]) ## the corrected A. Use this matrix is essential for reducing the distance-based false positves 32 | 33 | ## for example, when the number of invovled bins in sub_domains_raw$res_inner[[j]] is too small 34 | # tmp = try(get_tree_decoration( sub_domains_raw$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num )) 35 | tmp = get_tree_decoration( sub_domains_raw$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num ) 36 | # if( class(tmp)=="try-error" ) 37 | if(class(tmp)!='igraph') ## if(tmp=='bad_tree') 38 | { 39 | root_tree = igraph::graph.empty() + 'root' 40 | igraph::V(root_tree)$width = nrow(sub_domains_raw$res_inner[[j]]$A) 41 | igraph::V(root_tree)$left = 1 42 | igraph::V(root_tree)$right = igraph::V(root_tree)$width 43 | tmp = root_tree ## represented by the length of the segment 44 | } 45 | tmp 46 | } 47 | return(trees) 48 | } 49 | 50 | 51 | # pipline <- function(chr, TADs_raw, TADs_type, just_save_no_nest=FALSE, bin_size) ## TADs_tmp: start end 52 | create_TADs <- function(sub_domains_raw, chr, TADs_raw, TADs_type, just_save_no_nest=FALSE, bin_size) ## TADs_tmp: start end 53 | { 54 | chr_name = paste0('chr', chr) 55 | bin_size_kb = bin_size / 1E3 56 | 57 | if(just_save_no_nest) 58 | { 59 | TADs = get_original_tad_indices( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 ) 60 | TADs = cbind(chr_name, TADs) 61 | save( TADs, file=Tads_R_File ) 62 | } 63 | TADs = get_original_tad_indices( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 ) 64 | # if(TADs_type=='extra') TADs = get_original_tad_indices_extra( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 ) 65 | 66 | # print(TADs) 67 | 68 | TADs = cbind(chr_name, TADs) 69 | return(TADs) 70 | } 71 | 72 | ##################################################################### 73 | 74 | ## INDEED, WHEN RESOLUTION == 40KB, DO NOT REMOVE 75 | clean_TADs_all = function(TADs_all_raw, CDs, bin_size) 76 | { 77 | TADs_all_end = TADs_all_raw[,2] 78 | CD_end = CDs[,2] 79 | outer_mat = outer(TADs_all_end, CD_end, "-") 80 | min_dist = apply(outer_mat, 1, function(v) min(abs(v))) 81 | # end2rm = TADs_all_end[(min_dist <= 3) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary, at least 40kb 82 | end2rm = TADs_all_end[(min_dist < 40E3/bin_size) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary, at least 40kb 83 | ## INDEED, WHEN RESOLUTION == 40KB, DO NOT REMOVE 84 | 85 | 86 | TADs_all_head = TADs_all_raw[,1] 87 | CD_head = CDs[,1] 88 | outer_mat = outer(TADs_all_head, CD_head, "-") 89 | min_dist = apply(outer_mat, 1, function(v) min(abs(v))) 90 | head2rm = TADs_all_head[(min_dist < 40E3/bin_size) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary 91 | TADs_all = subset(TADs_all_raw, (!(start_pos %in% head2rm)) & (!(end_pos %in% end2rm)) ) 92 | return(TADs_all) 93 | } 94 | 95 | post_process_sub_domains = function(chr, sub_domains_raw, ncores, out_dir, bin_size) 96 | { 97 | 98 | distr=c('lnorm', 'wilcox')[2] 99 | remove_zero = FALSE 100 | n_parameters = 3 101 | imputation_num = 1E2 102 | A_already_corrected = FALSE 103 | 104 | decorated_branches = LikelihoodRatioTest(sub_domains_raw=sub_domains_raw, ncores=ncores, remove_zero=FALSE, distr=distr, n_parameters=n_parameters, imputation_num=imputation_num, A_already_corrected=A_already_corrected) 105 | chr_name = paste0('chr', chr) 106 | 107 | 108 | mean_diff_thresh = -0.1 109 | i = 1 110 | # for(i in 1:length(p0s)) 111 | { 112 | # cat(i, '\n') 113 | # p0 = p0s[i] 114 | 115 | normal_decorated_branches = decorated_branches[sapply(decorated_branches, igraph::vcount) > 1] 116 | # ps = sort(unlist(lapply(normal_decorated_branches, function(v) igraph::V(v)$wilcox_p)), decreasing=FALSE) ## fdr correction 117 | # p0_adj = ps[min(which(p.adjust(ps, method = 'fdr') > p0))] 118 | # p0_adj = ps[min(which(p.adjust(ps, method = 'bonferroni') > p0))] 119 | p0_adj = 0.05 120 | 121 | TADs_all_raw = prunning(decorated_branches, to_correct=FALSE, p0=p0_adj, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=TRUE, CD_border_adj=FALSE, top_down=TRUE, mean_diff_thresh=mean_diff_thresh) 122 | 123 | # CDs = prunning(decorated_branches, to_correct=FALSE, p0=p0_adj, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=FALSE, CD_border_adj=FALSE, top_down=TRUE); cat(nrow(CDs), '\n') 124 | CDs = prunning(decorated_branches, to_correct=FALSE, p0=-1, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=FALSE, CD_border_adj=FALSE, top_down=TRUE, mean_diff_thresh=mean_diff_thresh) 125 | # if(nrow(CDs)!=length(res$initial_clusters)) stop('nrow(CDs)!=length(res$initial_clusters)') 126 | 127 | TADs_all = clean_TADs_all(TADs_all_raw, CDs, bin_size=bin_size) 128 | 129 | # Tad_edges <- sort(unique(c(TADs_tmp[,1], (TADs_tmp[,2]+1)))) 130 | TADs_all_edges <- sort(unique(TADs_all[,2])) 131 | CD_edges <- sort(unique(CDs[,2])) 132 | 133 | TADs_extra_edges = unique(setdiff(TADs_all_edges, CD_edges)) 134 | TADs_extra = data.frame(start_pos=TADs_extra_edges, end_pos= TADs_extra_edges) 135 | 136 | 137 | TADs_pos_all = create_TADs(sub_domains_raw=sub_domains_raw, chr, TADs_all, TADs_type='ALL', bin_size=bin_size) 138 | TADs_pos_extra = create_TADs(sub_domains_raw=sub_domains_raw, chr, TADs_extra, TADs_type='extra', bin_size=bin_size) 139 | TADs_pos_CD = create_TADs(sub_domains_raw=sub_domains_raw, chr, CDs, TADs_type='CD', bin_size=bin_size) 140 | 141 | # TADs_info = list(decorated_branches=decorated_branches, TADs_pos_all=TADs_pos_all, TADs_pos_CD=TADs_pos_CD) 142 | TADs_info = list(decorated_branches=decorated_branches, TADs_all=TADs_all, CDs=CDs, TADs_pos_all=TADs_pos_all, TADs_pos_CD=TADs_pos_CD, TADs_pos_extra=TADs_pos_extra) 143 | } 144 | 145 | 146 | options(stringsAsFactors=FALSE) 147 | level_5 = TADs_info$TADs_pos_extra[, c('chr_name', 'end_pos_ori')] 148 | colnames(level_5)[1:2] = c('chr', 'nested_boundary') 149 | sub_domain_boundary_bed_file = paste0(out_dir, '/chr', chr, '_nested_boundaries.bed') 150 | 151 | level_5_bed = data.frame(level_5, level_5[,2], '', '.', level_5, level_5[,2], '#000000') 152 | op <- options(scipen=999) 153 | write.table( level_5_bed, file=sub_domain_boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE ) 154 | 155 | on.exit(options(op)) 156 | 157 | return(NULL) 158 | } 159 | 160 | 161 | -------------------------------------------------------------------------------- /R/prunning.R: -------------------------------------------------------------------------------- 1 | trim_tree_adaptive_top_down_v2 = function( tree, wilcox_p_thresh, mean_diff_thresh ) 2 | { 3 | # leaves = get_leaves(tree) 4 | if(igraph::vcount(tree)==1) return(tree) 5 | # cat('I am in trim_tree_adaptive_top_down_v2\n') 6 | nsig_nodes = union( igraph::V(tree)[which(igraph::V(tree)$wilcox_p > wilcox_p_thresh)]$name, igraph::V(tree)[which(igraph::V(tree)$mean_diff > mean_diff_thresh)]$name ) 7 | 8 | 9 | children_of_nsig = names(unlist(igraph::ego(tree, order=1, node=nsig_nodes, mode='out', mindist=1))) 10 | if(length(children_of_nsig)!=0) trimmed_tree = tree - children_of_nsig 11 | if(length(children_of_nsig)==0) trimmed_tree = tree 12 | 13 | comps = igraph::decompose(trimmed_tree) 14 | root_index = which(sapply( comps, function(comp) igraph::V(tree)[1]$name %in% igraph::V(comp)$name )==1) 15 | trimmed_tree = comps[[root_index]] 16 | if(!is_binary_tree( trimmed_tree )) stop("trim_tree_adaptive_top_down, the resulted tree is not a binary tree") 17 | return( trimmed_tree ) 18 | } 19 | 20 | 21 | prunning = function(branches, p0, to_correct=FALSE, width_thresh=-Inf, width_thresh_CD=5, boundary_signal_thresh=-Inf, return_which='TADs', top_down=FALSE, all_levels=FALSE, CD_border_adj=FALSE, peak_thresh=NULL, mean_diff_thresh) 22 | { 23 | 24 | size2correct = sum(sapply(branches, igraph::vcount)) - sum(sapply(branches, function(v) length(get_leaves(v)))) 25 | p_thresh = p0/size2correct 26 | if(to_correct==FALSE) p_thresh=p0 27 | # if(!is.null(p0)) p_thresh = p0 28 | 29 | 30 | if(top_down==FALSE) 31 | { 32 | trimmed_branches = lapply( branches, trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=boundary_signal_thresh, peak_thresh=peak_thresh ) 33 | # size2correct = sum(sapply(trimmed_branches, igraph::vcount)) - sum(sapply(trimmed_branches, function(v) length(get_leaves(v)))) 34 | 35 | # for( i in 1:length( trimmed_branches ) ) 36 | # { 37 | # trimmed_branch = trimmed_branches[[i]] 38 | # if(igraph::vcount(trimmed_branch) > 1) trimmed_branches[[i]] = lapply( trimmed_branches[i], trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=-1 )[[1]] 39 | # } 40 | 41 | 42 | } 43 | 44 | # if(top_down==TRUE) trimmed_branches = lapply( branches, trim_tree_adaptive_top_down, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=boundary_signal_thresh ) 45 | if(top_down==TRUE) trimmed_branches = lapply( branches, function(branch) trim_tree_adaptive_top_down_v2(wilcox_p_thresh=p_thresh, mean_diff_thresh=mean_diff_thresh, tree=branch )) 46 | 47 | if(CD_border_adj==TRUE) 48 | { 49 | all_tads = get_adjusted_nested_TADs( trimmed_branches, width_thresh_CD, all_levels ) 50 | return( all_tads ) 51 | } 52 | 53 | ## get all nested TADs in trimmed_branches 54 | if(all_levels==TRUE) 55 | { 56 | all_tads = data.frame(start_pos=numeric(), end_pos=numeric()) 57 | widths = c(0, sapply(trimmed_branches, function(v) igraph::V(v)[1]$width)) 58 | for(i in 1:length(trimmed_branches)) 59 | { 60 | all_tads_i = get_all_tads_in_a_trimmed_branch(trimmed_branches[[i]], pos_shift=sum(widths[1:i])) 61 | all_tads = rbind(all_tads, all_tads_i) 62 | } 63 | return( all_tads ) 64 | } 65 | 66 | if( return_which=='trimmed_branches' ) return( trimmed_branches ) 67 | 68 | tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width ) 69 | tad_sizes = unlist(tad_sizes_ind) 70 | # tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes)) 71 | end_pos = cumsum(tad_sizes) 72 | start_pos = c(1, 1 + end_pos[-length(end_pos)]) 73 | tads = data.frame(start_pos=start_pos, end_pos=end_pos) 74 | return( tads ) 75 | } 76 | 77 | 78 | 79 | 80 | ## This function combines prunning with branches of only one node 81 | prunning_hybrid <- function(branches, ...) 82 | { 83 | names(branches) = as.character(1:length(branches)) 84 | normal_branches = branches[sapply( branches, function(v) class(v)=='igraph' )] 85 | unnormal_branches = branches[sapply( branches, function(v) class(v)!='igraph' )] ## that is reprsented as bin_start:bin_end 86 | 87 | trimmed_branches = prunning(normal_branches, return_which='trimmed_branches', ...) 88 | 89 | normal_tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width ) 90 | unormal_tad_sizes_ind = unnormal_branches 91 | tad_sizes_ind = c(normal_tad_sizes_ind, unormal_tad_sizes_ind) 92 | tad_sizes_ind = tad_sizes_ind[names(branches)] 93 | tad_sizes = unlist(tad_sizes_ind) 94 | 95 | # tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes)) 96 | end_pos = cumsum(tad_sizes) 97 | start_pos = c(1, 1 + end_pos[-length(end_pos)]) 98 | tads = data.frame(start_pos=start_pos, end_pos=end_pos) 99 | return( tads ) 100 | } 101 | 102 | 103 | get_all_tads_in_a_trimmed_branch <- function(trimmed_branch, pos_shift) 104 | { 105 | res = data.frame( start_pos=igraph::V(trimmed_branch)$left + pos_shift, end_pos=igraph::V(trimmed_branch)$right + pos_shift ) 106 | res = res[order(res[,1], res[,2]), ] 107 | return(res) 108 | } 109 | 110 | prunning_bottom_up <- function(branches, p0=NULL, width_thresh) 111 | { 112 | size2correct = sum(sapply(branches, igraph::vcount)) - sum(sapply(branches, function(v) length(get_leaves(v)))) 113 | 114 | p_thresh = 0.05/size2correct 115 | if(!is.null(p0)) p_thresh = p0 116 | 117 | trimmed_branches = lapply( branches, trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh ) 118 | 119 | tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width ) 120 | tad_sizes = unlist(tad_sizes_ind) 121 | # tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes)) 122 | end_pos = cumsum(tad_sizes) 123 | start_pos = c(1, 1 + end_pos[-length(end_pos)]) 124 | tads = data.frame(start_pos=start_pos, end_pos=end_pos) 125 | return( tads ) 126 | } 127 | 128 | 129 | trim_tree_adaptive_bottom_up <- function( tree, which_p='imp_p' ) 130 | { 131 | if(which_p=='imp_p') ps = sort(unique(igraph::V(tree)$imp_p), decreasing=TRUE) 132 | # if(which_p=='nimp_p') ps = sort(unique(igraph::V(tree)$nimp_p), decreasing=TRUE) 133 | # if(which_p=='both') ps = sort(unique(pmin(igraph::V(tree)$nimp_p, igraph::V(tree)$imp_p)), decreasing=TRUE) 134 | 135 | trimed_tree_current = tree 136 | trimmed_branch_bottom_up = vector('list', length(ps)) 137 | for(i in 1:length(ps)) 138 | { 139 | trimed_tree_current = trim_tree_adaptive( tree, L_diff_thresh=-Inf, max_imp_p=ps[i], max_nimp_p=Inf, width_thresh=-Inf ) 140 | trimmed_branch_bottom_up[[i]] = trimed_tree_current 141 | } 142 | igraph::vcounts = sapply(trimmed_branch_bottom_up, igraph::vcount) 143 | ps = ps[!duplicated(igraph::vcounts)] 144 | trimmed_branch_bottom_up = trimmed_branch_bottom_up[!duplicated(igraph::vcounts)] 145 | res = list(ps=ps, trimmed_branch_bottom_up=trimmed_branch_bottom_up) 146 | return( res ) 147 | } 148 | 149 | 150 | ## get adjusted nested TADs 151 | get_adjusted_nested_TADs <- function( trimmed_branches, width_thresh_CD, all_levels ) 152 | { 153 | widths = c(0, sapply(trimmed_branches, function(v) igraph::V(v)[1]$width)) 154 | all_tads_i_list = lapply( 1:length(trimmed_branches), function(i) get_all_tads_in_a_trimmed_branch(trimmed_branches[[i]], pos_shift=sum(widths[1:i]))) 155 | 156 | for(i in 1:length(trimmed_branches)) 157 | { 158 | all_tads_i = all_tads_i_list[[i]] 159 | if( nrow(all_tads_i) <= 1 ) next 160 | ## move the left-most border a little bit right if needed 161 | left_borders = unique(all_tads_i[,1]) 162 | min_diff_left = left_borders[2] - left_borders[1] 163 | if( min_diff_left <= width_thresh_CD ) 164 | { 165 | all_tads_i[ all_tads_i==left_borders[1] ] = left_borders[2] 166 | all_tads_i = all_tads_i[ all_tads_i[,2] > all_tads_i[,1], ] ## remove "negative" TADs 167 | all_tads_i = unique(all_tads_i[order(all_tads_i[,1], all_tads_i[,2]), ]) ## reorder the TADs 168 | 169 | all_tads_i_list[[i]] = all_tads_i 170 | 171 | ## need to modify the right border of nested TADs in previous CD if the left border of this CD is modified 172 | if(i > 1) 173 | { 174 | ## replace the max value of [i-1], i.e., the right most border, as the min of [i]-1, i.e., the left most border of [i] 175 | all_tads_i_list[[i-1]][ all_tads_i_list[[i-1]]==max(all_tads_i_list[[i-1]]) ] = min(all_tads_i_list[[i]]) - 1 176 | } 177 | } 178 | 179 | if( nrow(all_tads_i) <= 1 ) next 180 | ## move the right-most border a little bit left if needed 181 | right_borders = unique(rev(all_tads_i[,2])) 182 | min_diff_right = right_borders[1] - right_borders[2] 183 | if( min_diff_right <= width_thresh_CD ) 184 | { 185 | all_tads_i[ all_tads_i==right_borders[1] ] = right_borders[2] 186 | all_tads_i = all_tads_i[ all_tads_i[,2] > all_tads_i[,1], ] 187 | all_tads_i = unique(all_tads_i[order(all_tads_i[,1], all_tads_i[,2]), ]) ## reorder the TADs 188 | 189 | all_tads_i_list[[i]] = all_tads_i 190 | 191 | if(i < length(trimmed_branches)) 192 | { 193 | ## replace the max value of [i-1], i.e., the right most border, as the min of [i]-1, i.e., the left most border of [i] 194 | all_tads_i_list[[i+1]][ all_tads_i_list[[i+1]]==min(all_tads_i_list[[i+1]]) ] = max(all_tads_i_list[[i]]) + 1 195 | } 196 | } 197 | } 198 | 199 | if(!all_levels) all_tads_i_list = lapply( all_tads_i_list, function(v) data.frame(start_pos=head(v[,1],1), end_pos=tail(v[,2],1)) ) 200 | 201 | all_tads = do.call(rbind, all_tads_i_list) 202 | colnames(all_tads) = c('start_pos', 'end_pos') 203 | return( all_tads ) 204 | } 205 | 206 | 207 | 208 | -------------------------------------------------------------------------------- /R/zigzag_nested_domain.R: -------------------------------------------------------------------------------- 1 | ## Yuanlong LIU 2 | ## 12/04/2018 3 | ## 06/06/2018 4 | ## 16/06/2018 5 | ## This code runs zigzag search on each compartment domain as a whole without dividing into 400 6 | 7 | ## 09-10-2018 8 | 9 | ## A should be already named 10 | ## "zigzag" resembles computational path of finding the optimum 11 | # HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq, max_nbins_fine, chr, min_n_bins=2) 12 | HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq, max_nbins_fine, min_n_bins=2) 13 | { 14 | arg_list = as.list(environment()) 15 | 16 | res_folder = file.path(res_dir) 17 | dir.create(res_folder, recursive=TRUE, showWarnings = FALSE) 18 | 19 | total_execution_time_file = file.path(res_dir, 'total_execution.time') 20 | time_begin = Sys.time() 21 | cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE) 22 | 23 | ## check whether your input matrix is symmetric 24 | # A_sym = as.matrix( Matrix::forceSymmetric(data.matrix(A), uplo='U') ) 25 | # A_sym = Matrix::forceSymmetric(data.matrix(A), uplo='U') ## keep sparse, 2018-11-11 26 | A_sym = Matrix::forceSymmetric(A, uplo='U') ## keep sparse, 2018-11-11 27 | 28 | tol = 100 * .Machine$double.eps 29 | max_diff = max(abs(A_sym - A)) 30 | notSym_flag = max_diff > tol 31 | if( notSym_flag ) warning('Your input contact matrix is not symmetric. The maximum difference between symmetric values is: ', max_diff, '\nBy default, the contact profile used for downstream analysis is taken from the upper triangular part. To use the lower triangular part you can transpose the input contact matrix first') 32 | 33 | if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices') 34 | 35 | # pA_sym = rm_zeros(A_sym) ## pA_sym: positive A 36 | pA_sym = remove_blank_cols(A_sym, sparse=TRUE, ratio=0) 37 | n_zero_rows = nrow(A_sym) - nrow(pA_sym) 38 | zero_rows_flag = n_zero_rows > 0 39 | if( zero_rows_flag ) 40 | { 41 | warning('There are ', n_zero_rows, ' rows/columns in your input contact matrix that have all their values being 0. These rows/columns are removed for downstream analysis') 42 | original_row_names = rownames(A_sym) 43 | kept_row_names = rownames(pA_sym) 44 | } 45 | 46 | # res_inner = rep(list(), nrow(compartment_segs)) ## for each compartment domain 47 | # for( i in 1:nrow(compartment_segs) ) 48 | # { 49 | # seg = compartment_segs[i,1]:compartment_segs[i,2] 50 | # cat('Compute seg:', i, 'of length:', length(seg), '\n') 51 | 52 | # A_seg = as.matrix(pA_sym[seg, seg]) 53 | # res_zigzag = zigzag_loglik_ancestors_v4(A_seg, nrow(A_seg)) 54 | # res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors) 55 | # res_inner[[i]] = res_outer 56 | # cat('finished', '\n') 57 | # } 58 | 59 | ## changed to paralell, 2018-11-11 60 | cat('\n') 61 | 62 | res_inner = foreach::foreach(i=1:nrow(compartment_segs)) %do% 63 | { 64 | seg = compartment_segs[i,1]:compartment_segs[i,2] 65 | cat('\r', sprintf('Find sub-domains in %d of %d CDs | length of current CD: %d bins', i, nrow(compartment_segs), length(seg))) 66 | 67 | A_seg = as.matrix(pA_sym[seg, seg]) 68 | res_zigzag = zigzag_loglik_ancestors_v4_5(A_seg, nrow(A_seg), min_n_bins=min_n_bins) 69 | res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors, min_n_bins=min_n_bins) 70 | res_outer 71 | # res_inner[[i]] = res_outer 72 | } 73 | 74 | cat('\n') 75 | 76 | segmentss = compartment_segs 77 | res_info = list( arg_list=arg_list, pA_sym=pA_sym, A_final=pA_sym, segmentss=segmentss, res_inner=res_inner ) 78 | # res_folder_final = file.path(res_dir, 'final') 79 | # dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE) 80 | # save(res_info, file=file.path(res_folder_final, 'res_info.Rdata')) 81 | 82 | time_finish = Sys.time() 83 | cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE) 84 | cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE) 85 | 86 | return( res_info ) 87 | } 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## (Check CALDER2 with mutiple updates here: https://github.com/CSOgroup/CALDER2) 2 | 3 | 4 | 5 | 6 | 7 | # CALDER user manuel 8 | 9 | CALDER is a Hi-C analysis tool that allows: (1) compute chromatin domains from whole chromosome contacts; (2) derive their non-linear hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each chromatin domain from short-range contacts. CALDER is currently implemented in R. 10 | 11 | 12 | ![Alt text](./img/CALDER_methods.png "CALDER methods") 13 | 14 | ## (A note on the performance of Calder vs PC-based approach) 15 | 16 | * PC1 of the correlation matrix was typically used to define A/B compartment. We found Calder demonstrates superior robustness over PC-based approach in identifying meaningful compartments, particularly when faced with complex chromosomal structural variations (figure on the left) and loose interaction between the p and q arms (figure on the right) 17 | 18 | ![Alt text](./img/Hela_chr11_and_RWPE1_chr9_pq.png "Hela_chr11_and_RWPE1_chr9_pq") 19 | 20 | # Installation 21 | 22 | 23 | ## Make sure all dependencies have been installed: 24 | 25 | * R.utils (>= 2.9.0), 26 | * doParallel (>= 1.0.15), 27 | * ape (>= 5.3), 28 | * dendextend (>= 1.12.0), 29 | * fitdistrplus (>= 1.0.14), 30 | * igraph (>= 1.2.4.1), 31 | * Matrix (>= 1.2.17), 32 | * rARPACK (>= 0.11.0), 33 | * factoextra (>= 1.0.5), 34 | * maptools (>= 0.9.5), 35 | * data.table (>= 1.12.2), 36 | * fields (>= 9.8.3), 37 | * GenomicRanges (>= 1.36.0) 38 | 39 | ## Clone its repository and install it from source: 40 | 41 | `git clone https://github.com/CSOgroup/CALDER.git` 42 | 43 | `install.packages(path_to_CALDER, repos = NULL, type="source")` ## install from the cloned source file 44 | 45 | 46 | Please contact yliueagle@googlemail.com for any questions about installation. 47 | 48 | ## install CALDER and dependencies automaticly: 49 | 50 | ``` 51 | if (!requireNamespace("BiocManager", quietly = TRUE)) 52 | install.packages("BiocManager") 53 | 54 | BiocManager::install("GenomicRanges") 55 | install.packages("remotes") 56 | remotes::install_github("CSOgroup/CALDER") 57 | ``` 58 | 59 | # Usage 60 | 61 | The input data of CALDER is a three-column text file storing the contact table of a full chromosome (zipped format is acceptable, as long as it can be read by `data.table::fread`). Each row represents a contact record `pos_x, pos_y, contact_value`, which is the same format as that generated by the `dump` command of juicer (https://github.com/aidenlab/juicer/wiki/Data-Extraction): 62 | 63 | 16050000 16050000 10106.306 64 | 16050000 16060000 2259.247 65 | 16060000 16060000 7748.551 66 | 16050000 16070000 1251.3663 67 | 16060000 16070000 4456.1245 68 | 16070000 16070000 4211.7393 69 | 16050000 16080000 522.0705 70 | 16060000 16080000 983.1761 71 | 16070000 16080000 1996.749 72 | ... 73 | 74 | A demo dataset is included in the repository `CALDER/inst/extdata/mat_chr22_10kb_ob.txt.gz` and can be accessed by `system.file("extdata", "mat_chr22_10kb_ob.txt.gz", package='CALDER')` once CALDER is installed. This data contains contact values of GM12878 on chr22 binned at 10kb (Rao et al. 2014) 75 | 76 | CALDER contains three modules: (1) compute chromatin domains; (2) derive their hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each compartment domain. 77 | 78 | ### To run three modules in a single step: 79 | ``` 80 | CALDER_main(contact_mat_file, 81 | chr, 82 | bin_size, 83 | out_dir, 84 | sub_domains=TRUE, 85 | save_intermediate_data=FALSE, 86 | genome='hg19') 87 | ``` 88 | 89 | ### To run three modules in seperated steps: 90 | ``` 91 | # This will not compute sub-domains, but save the intermediate_data that can be used to compute sub-domains latter on 92 | CALDER_main(contact_mat_file, 93 | chr, 94 | bin_size, 95 | out_dir, 96 | sub_domains=FALSE, 97 | save_intermediate_data=TRUE, 98 | genome='hg19') 99 | 100 | # (optional depends on needs) Compute sub-domains using intermediate_data_file that was previous saved in the out_dir (named as chrxx_intermediate_data.Rds) 101 | CALDER_sub_domains(intermediate_data_file, 102 | chr, 103 | out_dir, 104 | bin_size) 105 | ``` 106 | 107 | ### Paramters: 108 | 109 | * `contact_mat_file`: path to the contact table of a chromosome 110 | * `chr`: chromosome number. Either numeric or character, will be pasted to the output name 111 | * `bin_size`: numeric, the size of a bin in consistent with the contact table 112 | * `out_dir`: the output directory 113 | * `sub_domains`: logical, whether to compute nested sub-domains 114 | * `save_intermediate_data`: logical. If TRUE, an intermediate_data will be saved. This file can be used for computing nested sub-domains later on 115 | * `genome`: string. Specifies the genome assembly (Default="hg19"). 116 | 117 | ### Output: 118 | 119 | #### chrxx_domain_hierachy.tsv 120 | * information of compartment domain and their hierarchical organization. The hierarchical structure is fully represented by `compartment_label`, for example, `B.2.2.2` and `B.2.2.1` are two sub-branches of `B.2.2`. The `pos_end` column specifies all compartment domain borders, except when it is marked as `gap`, which indicates it is the border of a gap chromsome region that has too few contacts and was excluded from the analysis (e.g., due to low mappability, deletion, technique flaw) 121 | 122 | #### chrxx_sub_compartments.bed 123 | * a .bed file containing the sub-compartment information, that can be visualized in IGV. Different colors were used to distinguish compartments (at the resolution of 8 sub-compartments) 124 | 125 | #### chrxx_domain_boundaries.bed 126 | * a .bed file containing the chromatin domains boundaries, that can be visualized in IGV 127 | 128 | #### chrxx_nested_boundaries.bed 129 | * a .bed file containing the nested sub-domain boundaries, that can be visualized in IGV 130 | 131 | #### chrxx_intermediate_data.Rds 132 | * an Rds file storing the intermediate_data that can be used to compute nested sub-domains (if CALDER is run in two seperated steps) 133 | 134 | #### chrxx_log.txt, chrxx_sub_domains_log.txt 135 | * log file storing the status and running time of each step 136 | 137 | 138 | 139 | ### Running time: 140 | For the computational requirement, running CALDER on the GM12878 Hi-C dataset at bin size of 40kb took **36 minutes** to derive the chromatin domains and their hierarchy for all chromosomes (i.e., CALDER Step1 and Step2); **13 minutes** to derive the nested sub-domains (i.e., CALDER Step3). At the bin size of 10kb, it took **1 h 44 minutes and 55 minutes** correspondingly (server information: 40 cores, 64GB Ram, Intel(R) Xeon(R) Silver 4210 CPU @ 2.20GHz). The evaluation was done using a single core although CALDER can be run in a parallel manner. 141 | 142 | ### Demo run: 143 | 144 | ``` 145 | library(CALDER) 146 | 147 | contact_mat_file = system.file("extdata", "mat_chr22_10kb_ob.txt.gz", package = 'CALDER') 148 | 149 | CALDER_main(contact_mat_file, chr=22, bin_size=10E3, out_dir='./GM12878', sub_domains=TRUE, save_intermediate_data=FALSE) 150 | ``` 151 | 152 | The saved .bed files can be view directly through IGV: 153 | 154 | ![Alt text](./img/IGV_results.png "IGV") 155 | 156 | # Citation 157 | 158 | If you use CALDER in your work, please cite: https://www.nature.com/articles/s41467-021-22666-3 159 | 160 | 161 | # Contact information 162 | 163 | * Author: Yuanlong LIU 164 | * Affiliation: Computational Systems Oncology group, Department of Computational Biology, University of Lausanne, Switzerland 165 | * Email: yliueagle@googlemail.com 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /Read-and-delete-me: -------------------------------------------------------------------------------- 1 | * Edit the help file skeletons in 'man', possibly combining help files 2 | for multiple functions. 3 | * Edit the exports in 'NAMESPACE', and add necessary imports. 4 | * Put any C/C++/Fortran code in 'src'. 5 | * If you have compiled code, add a useDynLib() directive to 6 | 'NAMESPACE'. 7 | * Run R CMD build to build the package tarball. 8 | * Run R CMD check to check the package tarball. 9 | 10 | Read "Writing R Extensions" for more information. 11 | -------------------------------------------------------------------------------- /img/CALDER_methods.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/img/CALDER_methods.png -------------------------------------------------------------------------------- /img/Hela_chr11_and_RWPE1_chr9_pq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/img/Hela_chr11_and_RWPE1_chr9_pq.png -------------------------------------------------------------------------------- /img/IGV_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/img/IGV_results.png -------------------------------------------------------------------------------- /inst/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/inst/.DS_Store -------------------------------------------------------------------------------- /inst/extdata/TxDb.Hsapiens.UCSC.hg19.knownGene.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/inst/extdata/TxDb.Hsapiens.UCSC.hg19.knownGene.rds -------------------------------------------------------------------------------- /inst/extdata/TxDb.Mmusculus.UCSC.mm9.knownGene.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/inst/extdata/TxDb.Mmusculus.UCSC.mm9.knownGene.rds -------------------------------------------------------------------------------- /inst/extdata/mat_chr22_10kb_ob.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/inst/extdata/mat_chr22_10kb_ob.txt.gz -------------------------------------------------------------------------------- /inst/extdata/mat_mm9_chr1_100kb_ob.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/inst/extdata/mat_mm9_chr1_100kb_ob.txt.gz -------------------------------------------------------------------------------- /man/.Rapp.history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/man/.Rapp.history -------------------------------------------------------------------------------- /man/CALDER-package.Rd: -------------------------------------------------------------------------------- 1 | \name{CALDER-package} 2 | \alias{CALDER-package} 3 | \alias{CALDER} 4 | \docType{package} 5 | \title{ 6 | \packageTitle{CALDER} 7 | } 8 | \description{ 9 | \packageDescription{CALDER} 10 | } 11 | \details{ 12 | 13 | The DESCRIPTION file: 14 | \packageDESCRIPTION{CALDER} 15 | \packageIndices{CALDER} 16 | ~~ An overview of how to use the package, including the most important ~~ 17 | ~~ functions ~~ 18 | } 19 | \author{ 20 | \packageAuthor{CALDER} 21 | 22 | Maintainer: \packageMaintainer{CALDER} 23 | } 24 | \references{ 25 | ~~ Literature or other references for background information ~~ 26 | } 27 | ~~ Optionally other standard keywords, one per line, from file KEYWORDS in ~~ 28 | ~~ the R documentation directory ~~ 29 | \keyword{ package } 30 | \seealso{ 31 | ~~ Optional links to other man pages, e.g. ~~ 32 | ~~ \code{\link[:-package]{}} ~~ 33 | } 34 | \examples{ 35 | ~~ simple examples of the most important functions ~~ 36 | } 37 | -------------------------------------------------------------------------------- /src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER/7cc59d9809d49e8bff7aa112b04d2eb3e4974b41/src/.DS_Store -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 3 | ## enable compilation with C++11 (where available) 4 | ## 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider 6 | ## availability of the package we do not yet enforce this here. It is however 7 | ## recommended for client packages to set it. 8 | ## 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP 10 | ## support within Armadillo prefers / requires it 11 | CXX_STD = CXX11 12 | 13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 15 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 3 | ## enable compilation with C++11 (where available) 4 | ## 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider 6 | ## availability of the package we do not yet enforce this here. It is however 7 | ## recommended for client packages to set it. 8 | ## 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP 10 | ## support within Armadillo prefers / requires it 11 | CXX_STD = CXX11 12 | 13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 15 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | // matrix_multiplication_cpp 10 | arma::mat matrix_multiplication_cpp(arma::mat A, arma::mat B); 11 | RcppExport SEXP _CALDER_matrix_multiplication_cpp(SEXP ASEXP, SEXP BSEXP) { 12 | BEGIN_RCPP 13 | Rcpp::RObject rcpp_result_gen; 14 | Rcpp::RNGScope rcpp_rngScope_gen; 15 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 16 | Rcpp::traits::input_parameter< arma::mat >::type B(BSEXP); 17 | rcpp_result_gen = Rcpp::wrap(matrix_multiplication_cpp(A, B)); 18 | return rcpp_result_gen; 19 | END_RCPP 20 | } 21 | // matrix_multiplication_sym_cpp 22 | arma::mat matrix_multiplication_sym_cpp(arma::mat A); 23 | RcppExport SEXP _CALDER_matrix_multiplication_sym_cpp(SEXP ASEXP) { 24 | BEGIN_RCPP 25 | Rcpp::RObject rcpp_result_gen; 26 | Rcpp::RNGScope rcpp_rngScope_gen; 27 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 28 | rcpp_result_gen = Rcpp::wrap(matrix_multiplication_sym_cpp(A)); 29 | return rcpp_result_gen; 30 | END_RCPP 31 | } 32 | // loglik_lnorm_cpp 33 | double loglik_lnorm_cpp(double sum_ln1, double sum_ln2, double p, double q); 34 | RcppExport SEXP _CALDER_loglik_lnorm_cpp(SEXP sum_ln1SEXP, SEXP sum_ln2SEXP, SEXP pSEXP, SEXP qSEXP) { 35 | BEGIN_RCPP 36 | Rcpp::RObject rcpp_result_gen; 37 | Rcpp::RNGScope rcpp_rngScope_gen; 38 | Rcpp::traits::input_parameter< double >::type sum_ln1(sum_ln1SEXP); 39 | Rcpp::traits::input_parameter< double >::type sum_ln2(sum_ln2SEXP); 40 | Rcpp::traits::input_parameter< double >::type p(pSEXP); 41 | Rcpp::traits::input_parameter< double >::type q(qSEXP); 42 | rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp(sum_ln1, sum_ln2, p, q)); 43 | return rcpp_result_gen; 44 | END_RCPP 45 | } 46 | // loglik_lnorm_cpp_vec 47 | double loglik_lnorm_cpp_vec(arma::vec vec_values); 48 | RcppExport SEXP _CALDER_loglik_lnorm_cpp_vec(SEXP vec_valuesSEXP) { 49 | BEGIN_RCPP 50 | Rcpp::RObject rcpp_result_gen; 51 | Rcpp::RNGScope rcpp_rngScope_gen; 52 | Rcpp::traits::input_parameter< arma::vec >::type vec_values(vec_valuesSEXP); 53 | rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp_vec(vec_values)); 54 | return rcpp_result_gen; 55 | END_RCPP 56 | } 57 | // get_A_len 58 | arma::mat get_A_len(arma::mat A); 59 | RcppExport SEXP _CALDER_get_A_len(SEXP ASEXP) { 60 | BEGIN_RCPP 61 | Rcpp::RObject rcpp_result_gen; 62 | Rcpp::RNGScope rcpp_rngScope_gen; 63 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 64 | rcpp_result_gen = Rcpp::wrap(get_A_len(A)); 65 | return rcpp_result_gen; 66 | END_RCPP 67 | } 68 | // get_A_ln1 69 | arma::mat get_A_ln1(arma::mat A); 70 | RcppExport SEXP _CALDER_get_A_ln1(SEXP ASEXP) { 71 | BEGIN_RCPP 72 | Rcpp::RObject rcpp_result_gen; 73 | Rcpp::RNGScope rcpp_rngScope_gen; 74 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 75 | rcpp_result_gen = Rcpp::wrap(get_A_ln1(A)); 76 | return rcpp_result_gen; 77 | END_RCPP 78 | } 79 | // get_A_ln2 80 | arma::mat get_A_ln2(arma::mat A); 81 | RcppExport SEXP _CALDER_get_A_ln2(SEXP ASEXP) { 82 | BEGIN_RCPP 83 | Rcpp::RObject rcpp_result_gen; 84 | Rcpp::RNGScope rcpp_rngScope_gen; 85 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 86 | rcpp_result_gen = Rcpp::wrap(get_A_ln2(A)); 87 | return rcpp_result_gen; 88 | END_RCPP 89 | } 90 | // loglik_lnorm_cpp_mat 91 | arma::mat loglik_lnorm_cpp_mat(arma::mat sum_ln1, arma::mat sum_ln2, arma::mat ps, arma::mat qs); 92 | RcppExport SEXP _CALDER_loglik_lnorm_cpp_mat(SEXP sum_ln1SEXP, SEXP sum_ln2SEXP, SEXP psSEXP, SEXP qsSEXP) { 93 | BEGIN_RCPP 94 | Rcpp::RObject rcpp_result_gen; 95 | Rcpp::RNGScope rcpp_rngScope_gen; 96 | Rcpp::traits::input_parameter< arma::mat >::type sum_ln1(sum_ln1SEXP); 97 | Rcpp::traits::input_parameter< arma::mat >::type sum_ln2(sum_ln2SEXP); 98 | Rcpp::traits::input_parameter< arma::mat >::type ps(psSEXP); 99 | Rcpp::traits::input_parameter< arma::mat >::type qs(qsSEXP); 100 | rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp_mat(sum_ln1, sum_ln2, ps, qs)); 101 | return rcpp_result_gen; 102 | END_RCPP 103 | } 104 | // zigzag_loglik_ancestors_v4_5 105 | List zigzag_loglik_ancestors_v4_5(arma::mat A, int k, int min_n_bins); 106 | RcppExport SEXP _CALDER_zigzag_loglik_ancestors_v4_5(SEXP ASEXP, SEXP kSEXP, SEXP min_n_binsSEXP) { 107 | BEGIN_RCPP 108 | Rcpp::RObject rcpp_result_gen; 109 | Rcpp::RNGScope rcpp_rngScope_gen; 110 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 111 | Rcpp::traits::input_parameter< int >::type k(kSEXP); 112 | Rcpp::traits::input_parameter< int >::type min_n_bins(min_n_binsSEXP); 113 | rcpp_result_gen = Rcpp::wrap(zigzag_loglik_ancestors_v4_5(A, k, min_n_bins)); 114 | return rcpp_result_gen; 115 | END_RCPP 116 | } 117 | // compute_L 118 | List compute_L(arma::mat A, arma::mat L, int k); 119 | RcppExport SEXP _CALDER_compute_L(SEXP ASEXP, SEXP LSEXP, SEXP kSEXP) { 120 | BEGIN_RCPP 121 | Rcpp::RObject rcpp_result_gen; 122 | Rcpp::RNGScope rcpp_rngScope_gen; 123 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 124 | Rcpp::traits::input_parameter< arma::mat >::type L(LSEXP); 125 | Rcpp::traits::input_parameter< int >::type k(kSEXP); 126 | rcpp_result_gen = Rcpp::wrap(compute_L(A, L, k)); 127 | return rcpp_result_gen; 128 | END_RCPP 129 | } 130 | 131 | static const R_CallMethodDef CallEntries[] = { 132 | {"_CALDER_matrix_multiplication_cpp", (DL_FUNC) &_CALDER_matrix_multiplication_cpp, 2}, 133 | {"_CALDER_matrix_multiplication_sym_cpp", (DL_FUNC) &_CALDER_matrix_multiplication_sym_cpp, 1}, 134 | {"_CALDER_loglik_lnorm_cpp", (DL_FUNC) &_CALDER_loglik_lnorm_cpp, 4}, 135 | {"_CALDER_loglik_lnorm_cpp_vec", (DL_FUNC) &_CALDER_loglik_lnorm_cpp_vec, 1}, 136 | {"_CALDER_get_A_len", (DL_FUNC) &_CALDER_get_A_len, 1}, 137 | {"_CALDER_get_A_ln1", (DL_FUNC) &_CALDER_get_A_ln1, 1}, 138 | {"_CALDER_get_A_ln2", (DL_FUNC) &_CALDER_get_A_ln2, 1}, 139 | {"_CALDER_loglik_lnorm_cpp_mat", (DL_FUNC) &_CALDER_loglik_lnorm_cpp_mat, 4}, 140 | {"_CALDER_zigzag_loglik_ancestors_v4_5", (DL_FUNC) &_CALDER_zigzag_loglik_ancestors_v4_5, 3}, 141 | {"_CALDER_compute_L", (DL_FUNC) &_CALDER_compute_L, 3}, 142 | {NULL, NULL, 0} 143 | }; 144 | 145 | RcppExport void R_init_CALDER(DllInfo *dll) { 146 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 147 | R_useDynamicSymbols(dll, FALSE); 148 | } 149 | -------------------------------------------------------------------------------- /src/matrix_multiplication_fun.cpp: -------------------------------------------------------------------------------- 1 | 2 | //http://arma.sourceforge.net/docs.html#stats_fns 3 | #include "RcppArmadillo.h" 4 | using namespace Rcpp; 5 | 6 | // [[Rcpp::export]] 7 | arma::mat matrix_multiplication_cpp(arma::mat A, arma::mat B) 8 | { 9 | return A*B; 10 | } 11 | 12 | // [[Rcpp::export]] 13 | arma::mat matrix_multiplication_sym_cpp(arma::mat A) 14 | { 15 | return A*A; 16 | } 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/zigzag_loglik_ancestors_v4.5.cpp: -------------------------------------------------------------------------------- 1 | // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; indent-tabs-mode: nil; -*- 2 | 3 | #include "RcppArmadillo.h" 4 | using namespace Rcpp; 5 | using namespace std; 6 | // #define M_PI 3.141592653589793238462643383280 /* pi */ 7 | 8 | // [[Rcpp::export]] 9 | double loglik_lnorm_cpp( double sum_ln1, double sum_ln2, double p, double q ) // several times faster than fitdistr 10 | { 11 | if( sum_ln2 < 0 ) cout << "sum_ln2 not valid in loglik_lnorm_cpp\n"; 12 | if( p < 0 ) cout << "p not valid in loglik_lnorm_cpp\n"; 13 | if( q < 0 ) cout << "q not valid in loglik_lnorm_cpp\n"; 14 | 15 | if( q <=1 ) return 0; // p = sum(x==0): the number of zero-values 16 | double alpha = 1.0*p/(p+q); 17 | double mu = sum_ln1/q; 18 | 19 | double sigma2 = sum_ln2/q + pow(mu, 2) - 2*mu*sum_ln1/q; // %: element-wise product 20 | if( abs(sigma2) <=1E-10 ) 21 | { 22 | return 0; 23 | } 24 | double loglik = -sum_ln1 - 0.5*q*(log(2*M_PI) + log(sigma2)) - (sum_ln2 + q*pow(mu, 2) - 2*mu*sum_ln1)/2/sigma2; 25 | if( p==0 ) {return loglik;} else {return p*log(alpha) + q*log(1-alpha) + loglik;} 26 | } 27 | 28 | // [[Rcpp::export]] 29 | double loglik_lnorm_cpp_vec( arma::vec vec_values ) //log-likelihood of vec_values 30 | { 31 | int p, q; 32 | int n = vec_values.n_elem; 33 | if( n < 2 ) return 0; // added on 19-07-2018 34 | 35 | double sum_ln1, sum_ln2; 36 | arma::vec positive_vec = vec_values.elem(find(vec_values > 0)); 37 | q = positive_vec.n_elem; // the number of positive values 38 | p = n - q; // the number of zeros 39 | if( q <= 1 ) return 0; 40 | 41 | sum_ln1 = sum(log(positive_vec)); 42 | sum_ln2 = sum(pow(log(positive_vec), 2)); 43 | 44 | return loglik_lnorm_cpp( sum_ln1, sum_ln2, p, q ); 45 | } 46 | 47 | // [[Rcpp::export]] 48 | arma::mat get_A_len(arma::mat A) // get matrix A_len: A_len := A*(A>0). This is used for computing the number of positive elements in a rectangle region 49 | { 50 | int n_row = A.n_rows; 51 | arma::mat A_len=arma::zeros(n_row, n_row); // for test 52 | arma::uvec ids = find(A > 0); 53 | arma::vec new_values = arma::ones(ids.n_elem); 54 | A_len.elem(ids) = new_values; 55 | return A_len; 56 | } 57 | 58 | // [[Rcpp::export]] 59 | arma::mat get_A_ln1(arma::mat A) // log(A_ij) 60 | { 61 | int n_row = A.n_rows; 62 | arma::mat A_ln1=arma::zeros(n_row, n_row); // for test 63 | arma::uvec ids = find(A > 0); 64 | arma::vec new_values = log(A.elem(ids)); 65 | A_ln1.elem(ids) = new_values; 66 | return A_ln1; 67 | } 68 | 69 | // [[Rcpp::export]] 70 | arma::mat get_A_ln2(arma::mat A) // log(A_ij)^2 71 | { 72 | int n_row = A.n_rows; 73 | arma::mat A_ln2=arma::zeros(n_row, n_row); // for test 74 | arma::uvec ids = find(A > 0); 75 | arma::vec new_values = pow(log(A.elem(ids)), 2); 76 | A_ln2.elem(ids) = new_values; 77 | return A_ln2; 78 | } 79 | 80 | // compute the loglik matrix 81 | // [[Rcpp::export]] 82 | arma::mat loglik_lnorm_cpp_mat( arma::mat sum_ln1, arma::mat sum_ln2, arma::mat ps, arma::mat qs ) // several times faster than fitdistr 83 | { 84 | int n_row = sum_ln1.n_rows; 85 | int n_col = sum_ln1.n_cols; 86 | arma::mat loglik(n_row, n_col); 87 | for(int i=0; i 0) 103 | arma::mat A_ln1 = get_A_ln1(A); // log(A_ij) 104 | arma::mat A_ln2 = get_A_ln2(A); // log(A_ij)^2 105 | StringMatrix ancestors(n_row, n_row); 106 | arma::mat L=arma::zeros(n_row, n_row); // for test 107 | Rcpp::List res; 108 | 109 | for( int k= min_n_bins-1; k<=(2*min_n_bins - 2); k++ ) // other values of L are 0 110 | { 111 | for( int v=1; v<= (n_row - k); v++ ) 112 | { 113 | arma::mat tmp_mat=A.submat(arma::span(v-1, v+k-1), arma::span(v-1, v+k-1)); // span(0,1) := 1:2 in R 114 | arma::vec upper_tri_vec = tmp_mat.elem(find(trimatu(tmp_mat))); 115 | L(v-1, v-1+k) = loglik_lnorm_cpp_vec( upper_tri_vec ); 116 | } 117 | } // Checked to be the same in R 118 | // cout << "Finished initialize L\n"; 119 | 120 | // initialize the rad_mat as the first off-diagonal values 121 | arma::mat rad_mat_current_ln1(n_row-1, 1); 122 | arma::mat rad_mat_current_ln2(n_row-1, 1); 123 | arma::mat rad_mat_current_len(n_row-1, 1); 124 | 125 | for(int i=0; i<(n_row-1); i++) // The first off-diagonal values 126 | { 127 | rad_mat_current_ln1(i, 0) = A_ln1(i, i+1); 128 | rad_mat_current_ln2(i, 0) = A_ln2(i, i+1); 129 | rad_mat_current_len(i, 0) = A_len(i, i+1); 130 | } 131 | 132 | // initialized to be two vertical cells (2 rows) 133 | arma::mat vertical_columns_next_ln1(2, n_row-2); 134 | arma::mat vertical_columns_next_ln2(2, n_row-2); 135 | arma::mat vertical_columns_next_len(2, n_row-2); 136 | 137 | for(int i=0; i<(n_row-2); i++) 138 | { 139 | vertical_columns_next_ln1.col(i) = A_ln1( arma::span(i, i+1), i+2 ); 140 | vertical_columns_next_ln2.col(i) = A_ln2( arma::span(i, i+1), i+2 ); 141 | vertical_columns_next_len.col(i) = A_len( arma::span(i, i+1), i+2 ); 142 | } 143 | 144 | for(int i=1; i<2; i++) // cumsum of the two vertical cells (i=1:1) 145 | { 146 | vertical_columns_next_ln1.row(i) = vertical_columns_next_ln1.row(i) + vertical_columns_next_ln1.row(i-1); //cumsum 147 | vertical_columns_next_ln2.row(i) = vertical_columns_next_ln2.row(i) + vertical_columns_next_ln2.row(i-1); //cumsum 148 | vertical_columns_next_len.row(i) = vertical_columns_next_len.row(i) + vertical_columns_next_len.row(i-1); //cumsum 149 | } 150 | 151 | arma::mat rad_mat_next_ln1 = rad_mat_current_ln1; 152 | arma::mat rad_mat_next_ln2 = rad_mat_current_ln2; 153 | arma::mat rad_mat_next_len = rad_mat_current_len; 154 | 155 | arma::mat vertical_columns_current_ln1 = vertical_columns_next_ln1; // this line just create the vertical_columns_current_ln 156 | arma::mat vertical_columns_current_ln2 = vertical_columns_next_ln2; // this line just create the vertical_columns_current_ln 157 | arma::mat vertical_columns_current_len = vertical_columns_next_len; // this line just create the vertical_columns_current_ln 158 | 159 | // Rcout << L << "\n"; 160 | // cout << "Begin iteration:\n"; 161 | 162 | // time complexity of this part: n^3 163 | // for(int shift=3; shift<=n_row; shift++) 164 | // each row of rad_mat represent one off-diagonal point 165 | for(int shift=3; shift<=k; shift++) 166 | { 167 | rad_mat_current_ln1 = rad_mat_next_ln1; 168 | rad_mat_current_ln2 = rad_mat_next_ln2; 169 | rad_mat_current_len = rad_mat_next_len; 170 | 171 | rad_mat_next_ln1 = arma::mat( n_row-shift+1, shift-1); 172 | rad_mat_next_ln2 = arma::mat( n_row-shift+1, shift-1); 173 | rad_mat_next_len = arma::mat( n_row-shift+1, shift-1); 174 | 175 | n_cells = arma::zeros(shift-1); // size of each rectangle 176 | for( int i=0; i< (shift-1); i++ ) n_cells(i) = (i+1)*(shift-i-1); 177 | arma::mat rad_mat_next_len_all = (arma::ones(n_row-shift+1, shift-1))*(arma::diagmat(n_cells)); // In R: rep(vec, n_row times) // Schur product: element-wise multiplication of two objects 178 | 179 | for(int i=1; i<=(n_row-shift+1); i++) 180 | { 181 | // next = current + vertical_columns_next_ln values 182 | rad_mat_next_ln1.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_ln1( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_ln1( arma::span(0, shift-2-1), i-1 ).t(); 183 | rad_mat_next_ln2.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_ln2( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_ln2( arma::span(0, shift-2-1), i-1 ).t(); 184 | rad_mat_next_len.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_len( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_len( arma::span(0, shift-2-1), i-1 ).t(); 185 | 186 | rad_mat_next_ln1(i-1, shift-1-1) = vertical_columns_next_ln1(shift-1-1, i-1); // the last new element 187 | rad_mat_next_ln2(i-1, shift-1-1) = vertical_columns_next_ln2(shift-1-1, i-1); // the last new element 188 | rad_mat_next_len(i-1, shift-1-1) = vertical_columns_next_len(shift-1-1, i-1); // the last new element 189 | } 190 | 191 | 192 | 193 | ////////////////////////////////////// compute the vertical_columns_next values 194 | if(shift < n_row) //stop when shift=n 195 | { 196 | vertical_columns_current_ln1 = vertical_columns_next_ln1; 197 | vertical_columns_current_ln2 = vertical_columns_next_ln2; 198 | vertical_columns_current_len = vertical_columns_next_len; 199 | 200 | arma::mat first_row_ln1(1, n_row-shift); 201 | arma::mat first_row_ln2(1, n_row-shift); 202 | arma::mat first_row_len(1, n_row-shift); 203 | 204 | for(int i=0; i<(n_row-shift); i++) 205 | { 206 | first_row_ln1(0, i) = A_ln1(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln 207 | first_row_ln2(0, i) = A_ln2(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln 208 | first_row_len(0, i) = A_len(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln 209 | } 210 | 211 | vertical_columns_next_ln1 = vertical_columns_current_ln1.submat(0, 1, shift-2, n_row-shift); // drop the first column 212 | vertical_columns_next_ln2 = vertical_columns_current_ln2.submat(0, 1, shift-2, n_row-shift); // drop the first column 213 | vertical_columns_next_len = vertical_columns_current_len.submat(0, 1, shift-2, n_row-shift); // drop the first column 214 | 215 | vertical_columns_next_ln1 = arma::join_cols(first_row_ln1, vertical_columns_next_ln1); 216 | vertical_columns_next_ln2 = arma::join_cols(first_row_ln2, vertical_columns_next_ln2); 217 | vertical_columns_next_len = arma::join_cols(first_row_len, vertical_columns_next_len); 218 | 219 | for(int i=1; i= 4) 229 | if(shift >= 2*min_n_bins) // 230 | { 231 | ps = rad_mat_next_len_all - rad_mat_next_len; // number of positive values 232 | loglik_tmp = loglik_lnorm_cpp_mat( rad_mat_next_ln1, rad_mat_next_ln2, ps, rad_mat_next_len ); 233 | // loglik = loglik_tmp.submat(0, 1, n_row-shift, shift-2-1); // remove first and last col because of the min_n_bins. SHOULD BE MODIFIED. submat: X.submat( first_row, first_col, last_row, last_col ), http://arma.sourceforge.net/docs.html#submat 234 | loglik = loglik_tmp.submat(0, min_n_bins-1, n_row-shift, shift-2-(min_n_bins-1)); // 2018-11-14, remove first and last min_n_bins-1 cols because of the min_n_bins. SHOULD BE MODIFIED. submat: X.submat( first_row, first_col, last_row, last_col ), http://arma.sourceforge.net/docs.html#submat 235 | 236 | 237 | arma::mat cases(1, shift-2*min_n_bins+1); // shift=5: 1:2, i.e., two cases 238 | // arma::mat loglik(n_row-shift, shift-min_n_bins); 239 | for( int row=1; row<=(n_row-shift+1); row++ ) 240 | { 241 | p = row; //7 242 | q = row + shift; // 7 + 4 = 11 243 | cases = loglik( p-1, arma::span(0, shift-2*min_n_bins) ) + L(p-1, arma::span(p-1-1 + min_n_bins, q - min_n_bins-1-1) ) + (L(arma::span(p-1 + min_n_bins, q - min_n_bins-1), q-1-1)).t(); 244 | L(p-1, q-1-1) = cases.max(); 245 | 246 | max_mid_index = cases.index_max() + 1; //The c++ offset 1 247 | max_mid = (p-1 + min_n_bins) + max_mid_index -1; // should minus one 248 | // ancestor = paste(i, max_mid, max_mid+1, j, sep='-') 249 | ancestors(p-1, q-1-1) = to_string(p) + "-" + to_string(max_mid) + "-" + to_string(max_mid+1) + "-" + to_string(q-1); 250 | // cout << "cases:" << L(p-1, q-1-1) << "\n"; 251 | } 252 | } 253 | 254 | } 255 | 256 | res["ancestors"] = ancestors; 257 | res["L"] = L; 258 | return res; 259 | } 260 | 261 | // compute the ancestors 262 | // [[Rcpp::export]] 263 | List compute_L( arma::mat A, arma::mat L, int k ) // A seems not needed here, and can be removed, Y.L, 2018-11-14 (Indeed this part is not used) 264 | { 265 | int n_row = A.n_rows; 266 | int min_n_bins = 2; 267 | StringMatrix ancestors(n_row, n_row); 268 | 269 | // should rewrite this part 270 | // for( int i= min_n_bins-1; i<=(2*min_n_bins - 2); i++ ) 271 | // for( int j=i; j