├── .gitattributes ├── .gitignore ├── CALDER2.Rproj ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R ├── CALDER_hierarchy_v2.R ├── CALDER_main.R ├── HRG_MLE_each_compartment_fun.R ├── HighResolution2Low.R ├── LikelihoodRatioTest_fun.R ├── RcppExports.R ├── bisecting_kmeans.R ├── build_comp_table_opt.R ├── call_domains.R ├── compartment_PCA.R ├── compartment_data_generation_fun.R ├── general_functions.R ├── generate_compartments_bed_fun.R ├── post_processing_nested_domains.R ├── prunning.R └── zigzag_nested_domain_v2.R ├── README.md ├── docker ├── Dockerfile └── docker_build.sh ├── img ├── CALDER_features.png ├── CALDER_methods.png ├── Hela_chr11.png ├── Hela_chr11_and_RWPE1_chr9_pq.png ├── IGV_results.png └── RWPE1_chr9_pq.png ├── inst └── extdata │ ├── hg19_all_sub_compartments.bed │ ├── hg38_all_sub_compartments.bed │ ├── mat_chr21_10kb_ob.txt.gz │ ├── mat_chr22_10kb_ob.txt.gz │ ├── mm10_all_sub_compartments.bed │ └── mm9_all_sub_compartments.bed ├── man └── CALDER-package.Rd ├── scripts ├── calder ├── clean_test.sh └── create_release.sh ├── src ├── Makevars ├── Makevars.win ├── RcppExports.cpp ├── matrix_multiplication_fun.cpp └── zigzag_loglik_ancestors_v4.5.cpp └── tests ├── create_test_data.sh ├── test_cmd_cool.sh ├── testthat.R └── testthat ├── data ├── test.chrom.sizes ├── test.cool ├── test.mcool └── test_gene_coverage.bed └── test-main.R /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### CUSTOM ### 2 | /data 3 | total_execution.time 4 | 5 | ### TESTING ### 6 | _snaps/ 7 | 8 | ### R ### 9 | 10 | # History files 11 | .Rhistory 12 | .Rapp.history 13 | 14 | # Session Data files 15 | .RData 16 | .RDataTmp 17 | 18 | # User-specific files 19 | .Ruserdata 20 | 
21 | # Example code in package build process 22 | *-Ex.R 23 | 24 | # Output files from R CMD build 25 | /*.tar.gz 26 | 27 | # Output files from R CMD check 28 | /*.Rcheck/ 29 | 30 | # RStudio files 31 | .Rproj.user/ 32 | 33 | # produced vignettes 34 | vignettes/*.html 35 | vignettes/*.pdf 36 | 37 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 38 | .httr-oauth 39 | 40 | # knitr and R markdown default cache directories 41 | *_cache/ 42 | /cache/ 43 | 44 | # Temporary files created by R markdown 45 | *.utf8.md 46 | *.knit.md 47 | 48 | # R Environment Variables 49 | .Renviron 50 | 51 | # pkgdown site 52 | docs/ 53 | 54 | # translation temp files 55 | po/*~ 56 | 57 | # RStudio Connect folder 58 | rsconnect/ 59 | 60 | ### R.Bookdown Stack ### 61 | # R package: bookdown caching files 62 | /*_files/ 63 | 64 | ### C++ ### 65 | # Prerequisites 66 | *.d 67 | 68 | # Compiled Object files 69 | *.slo 70 | *.lo 71 | *.o 72 | *.obj 73 | 74 | # Precompiled Headers 75 | *.gch 76 | *.pch 77 | 78 | # Compiled Dynamic libraries 79 | *.so 80 | *.dylib 81 | *.dll 82 | 83 | # Fortran module files 84 | *.mod 85 | *.smod 86 | 87 | # Compiled Static libraries 88 | *.lai 89 | *.la 90 | *.a 91 | *.lib 92 | 93 | # Executables 94 | *.exe 95 | *.out 96 | *.app 97 | 98 | 99 | ### macOS ### 100 | # General 101 | .DS_Store 102 | .AppleDouble 103 | .LSOverride 104 | 105 | # Icon must end with two \r 106 | Icon 107 | 108 | 109 | # Thumbnails 110 | ._* 111 | 112 | # Files that might appear in the root of a volume 113 | .DocumentRevisions-V100 114 | .fseventsd 115 | .Spotlight-V100 116 | .TemporaryItems 117 | .Trashes 118 | .VolumeIcon.icns 119 | .com.apple.timemachine.donotpresent 120 | 121 | # Directories potentially created on remote AFP share 122 | .AppleDB 123 | .AppleDesktop 124 | Network Trash Folder 125 | Temporary Items 126 | .apdisk 127 | 128 | ### macOS Patch ### 129 | # iCloud generated files 130 | *.icloud 131 | 132 | 133 | ### SublimeText ### 134 | # Cache files for 
Sublime Text 135 | *.tmlanguage.cache 136 | *.tmPreferences.cache 137 | *.stTheme.cache 138 | 139 | # Workspace files are user-specific 140 | *.sublime-workspace 141 | *.sublime-project 142 | 143 | # Project files should be checked into the repository, unless a significant 144 | # proportion of contributors will probably not be using Sublime Text 145 | # *.sublime-project 146 | 147 | # SFTP configuration file 148 | sftp-config.json 149 | sftp-config-alt*.json 150 | 151 | # Package control specific files 152 | Package Control.last-run 153 | Package Control.ca-list 154 | Package Control.ca-bundle 155 | Package Control.system-ca-bundle 156 | Package Control.cache/ 157 | Package Control.ca-certs/ 158 | Package Control.merged-ca-bundle 159 | Package Control.user-ca-bundle 160 | oscrypto-ca-bundle.crt 161 | bh_unicode_properties.cache 162 | 163 | # Sublime-github package stores a github token in this file 164 | # https://packagecontrol.io/packages/sublime-github 165 | GitHub.sublime-settings 166 | 167 | 168 | -------------------------------------------------------------------------------- /CALDER2.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: CALDER 2 | Type: Package 3 | Title: CALDER: a Hi-C analysis tool for inferring non-linear hierarchical organization of the 3D chromosome 4 | Authors@R: c( 5 | person(given = "Yuanlong", 6 | family = "LIU", 7 | role = c("aut", "cre"), 
8 | email = "yliueagle@googlemail.com"), 9 | person(given = "Luca", 10 | family = "NANNI", 11 | role = c("aut"), 12 | email = "luca.nanni@unil.ch")) 13 | Version: 2.0 14 | Date: 2023-04-04 15 | Maintainer: Yuanlong LIU <yliueagle@googlemail.com> 16 | Description: CALDER is a Hi-C analysis tool that allows: (1) compute chromatin domains from whole chromosome contacts; (2) derive their non-linear hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each chromatin domain from short-range contacts 17 | URL: https://github.com/CSOgroup/CALDER2 18 | BugReports: https://github.com/CSOgroup/CALDER2/issues 19 | License: MIT + file LICENSE 20 | Encoding: UTF-8 21 | LinkingTo: Rcpp, RcppArmadillo 22 | Depends: 23 | R (>= 3.5.2) 24 | biocViews: 25 | Imports: 26 | R.utils (>= 2.9.0), 27 | doParallel (>= 1.0.15), 28 | ape (>= 5.3), 29 | dendextend (>= 1.12.0), 30 | fitdistrplus (>= 1.0.14), 31 | igraph (>= 1.2.4), 32 | Matrix (>= 1.2.17), 33 | rARPACK (>= 0.11.0), 34 | factoextra (>= 1.0.5), 35 | data.table (>= 1.12.2), 36 | fields (>= 9.8.3), 37 | GenomicRanges (>= 1.36.0), 38 | strawr (>= 0.0.9), 39 | rhdf5 (>= 2.28.0), 40 | ggplot2 (>= 3.3.5), 41 | optparse 42 | RoxygenNote: 7.1.2 43 | Suggests: 44 | testthat (>= 3.0.0) 45 | Config/testthat/edition: 3 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Yuanlong LIU 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice
and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | useDynLib(CALDER, .registration=TRUE) 2 | importFrom(Rcpp, evalCpp) 3 | importFrom(foreach, foreach) 4 | exportPattern("^[[:alpha:]]+") 5 | 6 | -------------------------------------------------------------------------------- /R/CALDER_hierarchy_v2.R: -------------------------------------------------------------------------------- 1 | ## Compute sub-compartments, V2. 
This version uses already computed compartments as reference to derive the A/B compartment direction 2 | ## Yuanlong LIU, 16-07-2021 3 | 4 | ## This code converts the compartment into table format 5 | 6 | make_comp_tab = function(comp_raw, bin_size) 7 | { 8 | # comp_raw[V1=="chrX"]$V1 = "chr23" 9 | # comp_raw[[1]] = gsub('chr', '', comp_raw[[1]]) 10 | bin_indices_li = apply(comp_raw[, 2:3], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=bin_size)/bin_size; bins[1:(length(bins) - 1)]}) 11 | expand_len = sapply(bin_indices_li, length) 12 | 13 | n_domains = length(unique(comp_raw[[4]])) 14 | 15 | continous_rank = (n_domains + 1 - rank(unique(comp_raw[[4]]))) / n_domains 16 | names(continous_rank) = unique(comp_raw[[4]]) 17 | 18 | comp_tab = data.table::data.table(chr=rep(comp_raw[[1]], expand_len), bin_index=unlist(bin_indices_li), comp_name=rep(comp_raw[[4]], expand_len), comp_rank=rep(comp_raw[[5]], expand_len)) 19 | comp_tab$continous_rank = unname(continous_rank[comp_tab$comp_name]) 20 | 21 | comp_tab$pos_start = (comp_tab$bin_index - 1)*bin_size + 1 22 | comp_tab$pos_end = (comp_tab$bin_index)*bin_size 23 | 24 | return(comp_tab) 25 | } 26 | 27 | make_comp_tab_calder = function(comp_raw, bin_size) 28 | { 29 | `%dopar%` <- foreach::`%dopar%` 30 | `%do%` <- foreach::`%do%` 31 | 32 | colnames(comp_raw)[1] = 'chr' 33 | # comp_raw[V1=="chrX"]$V1 = "chr23" 34 | # comp_raw[[1]] = as.numeric(gsub('chr', '', comp_raw[[1]])) 35 | 36 | make_comp_tab_calder_chr = function(comp_raw_chr, bin_size) 37 | { 38 | bin_indices_li = apply(comp_raw_chr[, 2:3], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=bin_size)/bin_size; bins[1:(length(bins) - 1)]}) 39 | expand_len = sapply(bin_indices_li, length) 40 | 41 | n_domains = length(unique(comp_raw_chr[[4]])) 42 | 43 | continous_rank = (n_domains + 1 - rank(unique(comp_raw_chr[[4]]))) / n_domains 44 | names(continous_rank) = unique(comp_raw_chr[[4]]) 45 | 46 | comp_tab = data.table::data.table(chr=rep(comp_raw_chr[[1]], expand_len), 
bin_index=unlist(bin_indices_li), comp_name=rep(comp_raw_chr[[4]], expand_len), comp_rank=rep(comp_raw_chr[[5]], expand_len)) 47 | comp_tab$continous_rank = unname(continous_rank[comp_tab$comp_name]) 48 | 49 | comp_tab$pos_start = (comp_tab$bin_index - 1)*bin_size + 1 50 | comp_tab$pos_end = (comp_tab$bin_index)*bin_size 51 | return(comp_tab) 52 | } 53 | 54 | comp_tab_ALL = foreach::foreach(comp_raw_chr=split(comp_raw, by="chr"), .combine=rbind) %do% 55 | { 56 | make_comp_tab_calder_chr(comp_raw_chr, bin_size) 57 | } 58 | 59 | return(comp_tab_ALL) 60 | } 61 | 62 | generate_domain_2D_bedpe = function(sub_compartment_file, bin_size, bedpe_file, n_sub=8) 63 | { 64 | generate_domain_2D_bedpe_chr = function(comp_raw) 65 | { 66 | comp_tab = make_comp_tab(comp_raw, bin_size=bin_size) ## fix: was c(comp_raw, bin_size=10E3), which yields a plain list (nrow() is NULL and the data.table filter below fails) and ignored the bin_size argument; make_comp_tab() expands to the per-bin table (comp_name/pos_start/pos_end/chr) this helper relies on 67 | 68 | comp_tab$comp_name = substr(comp_tab$comp_name, 1, log2(n_sub)*2 - 1) 69 | n_row = nrow(comp_tab) 70 | comp_tab$is_boudary = c(1*(comp_tab$comp_name[2:n_row] != comp_tab$comp_name[1:(n_row-1)]), 0) 71 | 72 | pos_end = union( comp_tab[is_boudary==1]$pos_end, comp_tab[n_row, ]$pos_end ) 73 | pos_start = union( comp_tab[1, ]$pos_start, pos_end[1:(length(pos_end) -1)] + 1 ) 74 | pos_start = as.character(pos_start) 75 | pos_end = as.character(pos_end) 76 | 77 | chr_name = comp_tab$chr[1] 78 | bedpe = data.table::data.table(chr_name, pos_start, pos_end, chr_name, pos_start, pos_end) 79 | return(bedpe) 80 | } 81 | my_skip = as.numeric(strsplit(system(intern=TRUE, sprintf('zgrep -n "chr" %s', sub_compartment_file)), ':')[[1]][1]) 82 | comp_raw_li = split(data.table::fread(sub_compartment_file, skip=my_skip), by="chr") 83 | bedpe = do.call(rbind, lapply(comp_raw_li, generate_domain_2D_bedpe_chr)) 84 | 85 | data.table::fwrite(bedpe, file=bedpe_file, col.names=FALSE, sep="\t") 86 | return(bedpe) 87 | } 88 | 89 | 90 | 91 | 92 | 93 | ## cool format is not accepted for the current version 94 | 95 | # cool2mat <- function(cool_file, chr_num, bin_size) ## convert cool format to long format / pos_1, pos_2, val /
modified from https://rdrr.io/bioc/HiCcompare/src/R/hicpro2bedpe.R 96 | # { 97 | # dump <- rhdf5::h5dump(cool_file) 98 | # if(names(dump)=='resolutions') 99 | # { 100 | # cat('\nYour input contact matrix is in mcool format and contains resolutions of:', names(dump$resolutions), '\n') 101 | 102 | # which2keep = which(bin_size==as.numeric(names(dump$resolutions))) 103 | # dump = dump$resolutions[[which2keep]] 104 | # } 105 | 106 | # ids <- data.table::data.table(chr = dump$bins$chrom, start = dump$bins$start, id = seq(1, length(dump$bins$chrom), by = 1)) 107 | # ids = ids[chr==chr_num][, c('id', 'start')] 108 | 109 | # # make sparse matrix 110 | 111 | # mat <- data.table::data.table(bin1 = dump$pixels$bin1_id, bin2 = dump$pixels$bin2_id, val = dump$pixels$count) 112 | # mat_chr = mat[(bin1 %in% ids$id) & (bin2 %in% ids$id)] ## keep only cis contacts 113 | # colnames(ids)[2] = 'pos_1' 114 | # contact_mat = left_join(mat_chr, ids, by = c('bin1' = 'id')) 115 | # colnames(ids)[2] = 'pos_2' 116 | # contact_mat <- left_join(contact_mat, ids, by = c('bin2' = 'id')) 117 | # contact_mat = contact_mat[, c('pos_1', 'pos_2', 'val')] 118 | # return(contact_mat) 119 | # } 120 | 121 | 122 | CALDER_CD_hierarchy_v2 = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chr, bin_size_input, bin_size2look, save_dir, save_intermediate_data=FALSE, swap_AB, ref_compartment_file, black_list_bins=NULL, feature_track=NULL) 123 | { 124 | chr_num = gsub('chr', '', chr, ignore.case=TRUE) 125 | chr_name = paste0('chr', chr_num) 126 | 127 | get_cor_with_ref = function(chr_bin_pc, bin_size, ref_compartment_file=NULL, feature_track=NULL) ## correlation of PC1 with comp. 
domain rank of genome 128 | { 129 | ext_chr_bin_pc = function(chr_bin_pc) ## spand chr_bin_pc using 5kb bin 130 | { 131 | bin_indices_li = unlist(apply(chr_bin_pc[, 4:5], 1, function(v) {bins = 1 + seq(v[1]-1, v[2], by=5E3)/5E3; list(bins[1:(length(bins) - 1)])}), recursive=FALSE) 132 | expand_len = sapply(bin_indices_li, length) 133 | chr_bin_pc_ext = data.table::data.table(chr=rep(chr_bin_pc[[1]], expand_len), bin_index=unlist(bin_indices_li),PC1_val=rep(chr_bin_pc$PC1_val, expand_len)) 134 | 135 | chr_bin_pc_ext$pos_start = (chr_bin_pc_ext$bin_index - 1)*5E3 + 1 136 | chr_bin_pc_ext$pos_end = (chr_bin_pc_ext$bin_index)*5E3 137 | return(chr_bin_pc_ext) 138 | } 139 | 140 | ## function to generate binning scores // https://divingintogeneticsandgenomics.rbind.io/post/compute-averages-sums-on-granges-or-equal-length-bins/ 141 | # feature_track = data.table::data.table(chr=as.vector(GenomicRanges::seqnames(bw_val)), start=start(bw_val), end=end(bw_val), score=bw_val$score) 142 | 143 | 144 | get_binned_vals = function(feature_track_chr, bin_size=5E3) 145 | { 146 | ## helper to compute binned average // https://divingintogeneticsandgenomics.rbind.io/post/compute-averages-sums-on-granges-or-equal-length-bins/ 147 | binnedMean <- function(bins, numvar, mcolname) 148 | { 149 | stopifnot(is(bins, "GRanges")) 150 | stopifnot(is(numvar, "RleList")) 151 | stopifnot(identical(GenomeInfoDb::seqlevels(bins), names(numvar))) 152 | bins_per_chrom <- split(GenomicRanges::ranges(bins), GenomicRanges::seqnames(bins)) 153 | sums_list <- lapply(names(numvar), 154 | function(seqname) { 155 | views <- IRanges::Views(numvar[[seqname]], 156 | bins_per_chrom[[seqname]]) 157 | IRanges::viewMeans(views) 158 | }) 159 | new_mcol <- unsplit(sums_list, as.factor(GenomicRanges::seqnames(bins))) 160 | GenomicRanges::mcols(bins)[[mcolname]] <- new_mcol 161 | return(bins) 162 | } 163 | 164 | 165 | GR = GenomicRanges::makeGRangesFromDataFrame(feature_track_chr, keep.extra.columns=TRUE) 166 | GR_chrs = 
split(GR, GenomicRanges::seqnames(GR)) 167 | seq_lens = sapply(GR_chrs, function(v) max(GenomicRanges::end(v))) 168 | 169 | GR_RleList = GenomicRanges::coverage(GR, weight="score") 170 | seq_info = GenomicRanges::seqinfo(GR_RleList) 171 | GenomeInfoDb::seqlengths(seq_info) = seq_lens 172 | 173 | bins = GenomicRanges::tileGenome(seq_info, tilewidth=bin_size, cut.last.tile.in.chrom=TRUE) 174 | bins = bins[GenomicRanges::width(bins)==bin_size] 175 | bin_val_tmp = binnedMean(bins, GR_RleList, "bin_val") 176 | bin_val_tmp = data.table::as.data.table(bin_val_tmp) 177 | bin_val = data.table::data.table(chr=bin_val_tmp$seqnames, bin_index=bin_val_tmp$end / bin_size, continous_rank=log2(1 + bin_val_tmp$bin_val - min(bin_val_tmp$bin_val))) 178 | return(bin_val) 179 | } 180 | 181 | 182 | chr2query = chr_bin_pc$chr[1] 183 | 184 | if(!is.null(ref_compartment_file)) 185 | { 186 | domain_rank = data.table::fread(ref_compartment_file) 187 | colnames(domain_rank)[1] = 'chr' 188 | domain_rank_chr = domain_rank[domain_rank[[1]]==chr2query, ] 189 | 190 | ref_tab = make_comp_tab_calder(domain_rank_chr, bin_size=5E3) ## 5kb is convenient for most of the bin sizes 191 | ref_tab = ref_tab[, c(1,2,5)] 192 | } 193 | 194 | if(!is.null(feature_track)) 195 | { 196 | colnames(feature_track) = c('chr', 'start', 'end', 'score') 197 | feature_track$chr = gsub('chr', '', feature_track$chr) 198 | feature_track_chr = feature_track[feature_track$chr==chr_num, ] 199 | ref_tab = get_binned_vals(feature_track_chr) 200 | } 201 | 202 | 203 | chr_name = gsub("chrchr", "chr", paste0("chr", ref_tab$chr)) 204 | # chr_name = ifelse(chr_name=="chr23", "chrX", chr_name) 205 | ref_tab$chr = chr_name 206 | colnames(ref_tab)[2] = "bin_index" 207 | 208 | ################################# convert chr_bin_pc into 5kb 209 | 210 | chr_bin_pc_ext = ext_chr_bin_pc(chr_bin_pc) 211 | ref_and_pc = merge(ref_tab, chr_bin_pc_ext, by=c("chr", "bin_index")) 212 | cor_with_ref = cor(method='spearman', ref_and_pc[, 
c("continous_rank", "PC1_val")])[1,2] 213 | 214 | return(cor_with_ref) 215 | } 216 | 217 | ### The main function starts here 218 | 219 | 220 | time0 = Sys.time() 221 | log_file = paste0(save_dir, '/chr', chr_num, '_log.txt') 222 | warning_file = paste0(save_dir, '/WARNING_chr', chr_num, '.txt') 223 | cor_log_file = paste0(save_dir, '/cor_with_ref.txt') 224 | 225 | cat('>>>> Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE) 226 | 227 | cat('[', as.character(chr),'] Begin process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n') 228 | processed_data = contact_mat_processing_v2(contact_tab_dump, contact_file_dump=contact_file_dump, contact_file_hic=contact_file_hic, chr=chr_num, bin_size_input=bin_size_input, bin_size2look=bin_size2look, black_list_bins=black_list_bins) 229 | 230 | 231 | mat_dense = processed_data$mat_dense 232 | ccmat_dense_compressed_log_atanh = processed_data$atanh_score 233 | 234 | cat('[', as.character(chr),'] Finish process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n') 235 | cat('>>>> Finish process contact matrix and compute correlation score at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE) 236 | 237 | p_thresh = ifelse(bin_size2look < 40000, 0.05, 1) 238 | window.sizes = 3 239 | compartments = vector("list", 2) 240 | 241 | cat('>>>> Begin compute compartment domains and their hierachy at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE) 242 | cat('[', as.character(chr),'] Begin compute compartment domains and their hierachy at:', as.character(Sys.time()), '\n') 243 | 244 | compartments[[2]] = generate_compartments_bed(chr = chr, bin_size = bin_size2look, window.sizes = window.sizes, ccmat_dense_compressed_log_atanh, p_thresh, out_file_name = NULL, stat_window_size = NULL) 245 | topDom_output = compartments[[2]] 246 | bin_names = rownames(mat_dense) 247 | mat_dense = 
as.matrix(mat_dense) 248 | initial_clusters = apply(topDom_output$domain[, c("from.id", "to.id")], 1, function(v) v[1]:v[2]) 249 | 250 | if (sum(sapply(initial_clusters, length)) != max(unlist(initial_clusters))) { 251 | stop(CELL_LINE, " initial_clusters error in topDom") 252 | } 253 | 254 | n_clusters = length(initial_clusters) 255 | mat_dense_cluster_mean = HighResolution2Low_k_rectangle(mat_dense, initial_clusters, initial_clusters, sum_or_mean = "mean") 256 | 257 | trend_mean_list = lapply( 1:4, function(v) 1*(mat_dense_cluster_mean[, -(1:v)] > mat_dense_cluster_mean[, - n_clusters - 1 + (v:1)]) ) 258 | trend_mean = do.call(cbind, trend_mean_list) 259 | c_trend_mean = cor(t(trend_mean)) 260 | atanh_c_trend_mean= atanh(c_trend_mean / (1+1E-7)) 261 | 262 | 263 | # if(to_scale) 264 | { 265 | trend_mean = scale(trend_mean) 266 | c_trend_mean = scale(c_trend_mean) 267 | atanh_c_trend_mean= scale(atanh_c_trend_mean) 268 | } 269 | 270 | 271 | PC_12_atanh = get_PCs(atanh_c_trend_mean, which=1:10) 272 | PC_12_atanh[, 2:10] = PC_12_atanh[, 2:10]/5 ## xx-xx-xxxx: compress PC2 273 | rownames(PC_12_atanh) = 1:nrow(PC_12_atanh) 274 | 275 | ############################################################ 276 | 277 | PC_direction = 1 278 | 279 | ## switch PC direction based on correlation with "ground_truth" 280 | { 281 | initial_clusters_ori_bins = lapply(initial_clusters, function(v) as.numeric(bin_names[v])) 282 | chr_bin_pc = data.table::data.table(chr=chr_name, bin=unlist(initial_clusters_ori_bins), PC1_val=rep(PC_12_atanh[,1], sapply(initial_clusters_ori_bins, length))) 283 | chr_bin_pc$start = (chr_bin_pc$bin - 1)*bin_size2look + 1 284 | chr_bin_pc$end = chr_bin_pc$bin*bin_size2look 285 | 286 | # chr_bin_pc_range = makeGRangesFromDataFrame(chr_bin_pc, keep.extra.columns=TRUE) 287 | # gene_info_chr = subset(gene_info, seqnames==chr_name) 288 | 289 | # refGR = chr_bin_pc_range 290 | # testGR = gene_info_chr 291 | # hits <- findOverlaps(refGR, testGR) 292 | # overlaps <- 
pintersect(refGR[queryHits(hits)], testGR[subjectHits(hits)]) 293 | # overlaps_bins = unique(data.table::data.table(overlap_ratio=width(overlaps)/bin_size, bin=overlaps$bin)) 294 | # bin_pc_gene_coverage = merge(chr_bin_pc, overlaps_bins, all.x=TRUE) 295 | # bin_pc_gene_coverage$overlap_ratio[is.na(bin_pc_gene_coverage$overlap_ratio)] = 0 296 | 297 | # gene_density_cor = cor(method='spearman', subset(bin_pc_gene_coverage, (PC1_val < quantile(PC1_val, 0.25)) | (PC1_val > quantile(PC1_val, 0.75)) , c('PC1_val', 'overlap_ratio')))[1,2] 298 | # if(abs(gene_density_cor) < 0.2) warning('correlation between gene density and PC1 is too weak') 299 | 300 | 301 | if(!is.null(ref_compartment_file)) cor_with_ref = try(get_cor_with_ref(chr_bin_pc, bin_size2look, ref_compartment_file=ref_compartment_file)) ## get correlation with supplied "ground truth or reference" 302 | if(is.null(ref_compartment_file)) cor_with_ref = try(get_cor_with_ref(chr_bin_pc, bin_size2look, feature_track=feature_track)) 303 | 304 | 305 | 306 | if(class(cor_with_ref)=='try-error') cor_with_ref = 1 ## psudo cor 307 | if(!is.null(ref_compartment_file)) cat('[', as.character(chr),'] Correlation between PC1 and reference compartment is :', format(abs(cor_with_ref), digits=5), '\n') 308 | if(is.null(ref_compartment_file)) cat('[', as.character(chr),'] Correlation between PC1 and feature_track is :', format(abs(cor_with_ref), digits=5), '\n') 309 | 310 | PC_direction = PC_direction*sign(cor_with_ref) 311 | if(swap_AB==1) PC_direction = -PC_direction ## force swap PC direction if in some case the A/B direction is reverted 312 | PC_12_atanh = PC_12_atanh*PC_direction 313 | } 314 | 315 | 316 | project_info = project_to_major_axis(PC_12_atanh) 317 | x_pro = project_info$x_pro 318 | 319 | ############################################################ 320 | 321 | hc_disect_kmeans_PC12 = bisecting_kmeans(PC_12_atanh[, 1:10, drop=FALSE]) 322 | 323 | hc_hybrid_PC12 = hc_disect_kmeans_PC12 324 | 325 | { 326 | 
reordered_names = reorder_dendro(hc_hybrid_PC12, x_pro, aggregateFun=mean) 327 | hc_hybrid_PC12_reordered = dendextend::rotate(hc_hybrid_PC12, reordered_names) 328 | hc_hybrid_x_pro = hc_disect_kmeans_PC12 329 | reordered_names_x_pro = get_best_reorder(hc_hybrid_x_pro, x_pro) 330 | CALDER_hc = dendextend::rotate(hc_hybrid_x_pro, reordered_names_x_pro) 331 | } 332 | 333 | ############################################################ 334 | parameters = list(bin_size = bin_size2look, p_thresh = p_thresh) 335 | res = list( CALDER_hc=CALDER_hc, initial_clusters=initial_clusters, bin_names=bin_names, x_pro=x_pro, parameters=parameters) 336 | intermediate_data_file = paste0(save_dir, '/chr', chr, '_intermediate_data.Rds') 337 | 338 | hc = res$CALDER_hc 339 | hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels) 340 | bin_comp = data.table::data.table(chr=chr, bin_index=res$bin_names, comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length))) 341 | 342 | rownames(bin_comp) = NULL 343 | res$comp = bin_comp 344 | res$CDs = lapply(res$initial_clusters, function(v) res$bin_names[v]) 345 | res$mat = mat_dense 346 | res$chr = chr 347 | generate_hierachy_bed(chr=chr, res=res, save_dir=save_dir, bin_size=bin_size2look) 348 | 349 | 350 | cat('>>>> Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()), '\n', file=log_file, append=TRUE) 351 | cat('[', as.character(chr),'] Finish compute compartment domains and their hierachy at: ', as.character(Sys.time()), '\n') 352 | 353 | if(!is.null(ref_compartment_file)) cat('Correlation between PC1 and reference compartment domain rank on this chr is: ', format(abs(cor_with_ref), 1, 5), '\n', file=log_file, append=TRUE) 354 | if(is.null(ref_compartment_file)) cat('Correlation between PC1 and feature_track on this chr is: ', format(abs(cor_with_ref), 1, 5), '\n', file=log_file, append=TRUE) 355 | 356 | # cat(sprintf("chr%s: %s\n", chr, 
format(abs(cor_with_ref), 1, 5)), file=cor_log_file, append=TRUE) 357 | # if(abs(cor_with_ref) < 0.3) cat('WARNING: correlation between PC1 and reference compartment domain rank on this chr is: ', format(cor_with_ref, digits=5), ', which is a bit low. Possible reason could be that this chromosome has some big structural variations (translocation, inversion for example). We suggest to overlay the compartment track with the hic map together with histone modification or gene expression track to double check the reliability of compartment calling on this chr.', '\n', file=warning_file) 358 | 359 | time1 = Sys.time() 360 | # delta_time = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0)) 361 | 362 | delta_time <- time1 - time0 363 | timediff <- format(round(delta_time, 2), nsmall = 2) 364 | 365 | cat('\n\n', 'Total time used for computing compartment domains and their hierachy:', timediff, '\n', file=log_file, append=TRUE) 366 | # if(abs(gene_density_cor) > 0.2) cat('The gene density and PC1 correlation on this chr is: ', substr(gene_density_cor, 1, 4), '\n', file=log_file, append=TRUE) 367 | 368 | ############################################################ 369 | intermediate_data = res 370 | if(save_intermediate_data==TRUE) saveRDS(intermediate_data, file=intermediate_data_file) 371 | # cat(intermediate_data_file) 372 | return(intermediate_data_file) 373 | } 374 | 375 | 376 | 377 | 378 | 379 | project_to_major_axis <- function(PC_12_atanh) 380 | { 381 | Data = data.frame(x=PC_12_atanh[,1], y=PC_12_atanh[,2]) 382 | Data = Data[order(Data$x),] 383 | loess_fit <- loess(y ~ x, Data) 384 | 385 | more_x = seq(min(PC_12_atanh[,1]), max(PC_12_atanh[,1]), len=10*length(PC_12_atanh[,1])) 386 | major_axis = cbind(x=more_x, y=predict(loess_fit, newdata=more_x)) 387 | new_x_axis = cumsum(c(0, sqrt(diff(major_axis[,1])^2 + diff(major_axis[,2])^2))) ## the new xaxis on the curved major_axis 388 | 389 | dis = 
fields::rdist(PC_12_atanh[, 1:2], major_axis) 390 | projected_x = new_x_axis[apply(dis, 1, which.min)] 391 | names(projected_x) = rownames(PC_12_atanh) 392 | # projected_x = major_axis[apply(dis, 1, which.min)] 393 | project_info = list(x_pro=projected_x, major_axis=major_axis) 394 | return(project_info) 395 | } 396 | 397 | 398 | get_best_reorder <- function(hc_hybrid_x_pro, x_pro) 399 | { 400 | n = length(x_pro) 401 | reordered_names_x_pro_list = list() 402 | 403 | reordered_names_x_pro_list[[1]] = reorder_dendro(hc_hybrid_x_pro, (x_pro), aggregateFun=mean) ## here the clusters are assigned into A.1 A.2 B.1 B.2 404 | 405 | best_index = which.max(sapply(reordered_names_x_pro_list, function(v) cor(1:n, unname(rank(x_pro, ties.method='first')[v])))) 406 | return(reordered_names_x_pro_list[[best_index]]) ## fix: best_index was computed but never used (always returned [[1]]); identical today since only one candidate ordering exists, but correct once more orderings are added 407 | } 408 | 409 | 410 | 411 | generate_hierachy_bed = function(chr, res, save_dir, bin_size) 412 | { 413 | chr_name = paste0('chr', chr) 414 | # res = reses[[chr_name]][[CELL_LINE]] 415 | hc = res$CALDER_hc 416 | 417 | hc_k_labels_full = try(get_cluser_levels(hc, k_clusters=Inf, balanced_4_clusters=FALSE)$cluster_labels) 418 | bin_comp = data.table::data.table(chr=chr, bin_index=as.numeric(res$bin_names), comp=rep(hc_k_labels_full, sapply(res$initial_clusters, length))) 419 | chr_bin_domain = bin_comp 420 | chr_bin_domain$chr = paste0('chr', chr_bin_domain$chr) 421 | 422 | # chr_bin_domain = chr_bin_domain[order(bin_index)] 423 | 424 | compartment_info_tab = create_compartment_bed_v4(chr_bin_domain, bin_size=bin_size) 425 | 426 | boundaries = unname(sapply(res$initial_clusters, max)) 427 | boundaries_ori = as.numeric(res$bin_names[boundaries])*bin_size 428 | 429 | compartment_info_tab$is_boundary = 'gap' 430 | compartment_info_tab[compartment_info_tab$pos_end %in% boundaries_ori, 'is_boundary'] = 'boundary' 431 | 432 | colnames(compartment_info_tab)[4] = 'compartment_label' 433 | compartments_tsv_file = paste0(save_dir, '/chr', chr, '_domain_hierachy.tsv') 434 |
compartments_bed_file = paste0(save_dir, '/chr', chr, '_sub_compartments.bed') 435 | boundary_bed_file = paste0(save_dir, '/chr', chr, '_domain_boundaries.bed') 436 | 437 | options(scipen=999) 438 | write.table( compartment_info_tab, file=compartments_tsv_file, quote=FALSE, sep='\t', row.names=FALSE ) 439 | 440 | 441 | comp_cols = c("#FF0000", "#FF4848", "#FF9191", "#FFDADA", "#DADAFF", "#9191FF", "#4848FF", "#0000FF") 442 | names(comp_cols) = c('A.1.1', 'A.1.2', 'A.2.1', 'A.2.2', 'B.1.1', 'B.1.2', 'B.2.1', 'B.2.2') 443 | comp_val = (8:1)/8 444 | names(comp_val) = names(comp_cols) 445 | 446 | comp_8 = substr(compartment_info_tab$compartment_label, 1, 5) 447 | 448 | compartment_bed = data.frame(chr=paste0('chr', compartment_info_tab$chr), compartment_info_tab[, 2:4], comp_val[comp_8], '.', compartment_info_tab[, 2:3], comp_cols[comp_8]) 449 | write.table( compartment_bed, file=compartments_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE ) 450 | 451 | bounday_bed_raw = subset(compartment_info_tab, is_boundary=='boundary') 452 | bounday_bed = data.frame(chr=paste0('chr', bounday_bed_raw$chr), bounday_bed_raw[,3], bounday_bed_raw[,3], '', '.', bounday_bed_raw[,3], bounday_bed_raw[,3], '#000000') ## fix: was built from the full compartment_info_tab, so bounday_bed_raw (the rows actually flagged 'boundary') was computed but never used and gap rows were written to the boundaries bed 453 | write.table( bounday_bed, file=boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE ) 454 | } 455 | 456 | 457 | create_compartment_bed_v4 = function(chr_bin_domain, bin_size) 458 | { 459 | # for( chr in chrs ) 460 | { 461 | v = chr_bin_domain 462 | # v$intra_domain = as.character(6 - (as.numeric(v$intra_domain))) ## invert the labeling 463 | # v$intra_domain = names(cols)[(as.numeric(v$intra_domain))] 464 | v = v[order(v$bin_index), ] 465 | 466 | 467 | borders_non_consecutive = which(diff(v$bin_index)!=1) 468 | borders_domain = cumsum(rle(v$comp)$lengths) 469 | borders = sort(union(borders_non_consecutive, borders_domain)) 470 | bins = v$bin_index 471 | to_id = as.numeric(bins[borders]) 472 | from_id =
as.numeric(bins[c(1, head(borders, length(borders)-1)+1)]) 473 | 474 | pos_start = (from_id-1)*bin_size + 1 475 | pos_end = to_id*bin_size 476 | # chr = as.numeric( gsub('chr', '', v$chr) ) 477 | chr = gsub('chr', '', v$chr) ## no need for as.numeric, also makes it compatible with chrX 478 | 479 | compartment_info_tab = data.frame(chr=rep(unique(chr), length(pos_start)), pos_start=pos_start, pos_end=pos_end, domain=v$comp[borders]) 480 | } 481 | return(compartment_info_tab) 482 | } 483 | 484 | 485 | CALDER_sub_domains = function(intermediate_data_file=NULL, intermediate_data=NULL, chr, save_dir, bin_size) 486 | { 487 | time0 = Sys.time() 488 | log_file = paste0(save_dir, '/chr', chr, '_sub_domains_log.txt') 489 | 490 | cat('[', as.character(chr),']Begin compute sub-domains at:', as.character(Sys.time())) 491 | cat('>>>> Begin compute sub-domains at:', as.character(Sys.time()), '\n', file=log_file, append=FALSE) 492 | 493 | if(is.null(intermediate_data)) intermediate_data = readRDS(intermediate_data_file) 494 | { 495 | 496 | if(intermediate_data$chr!=chr) stop('intermediate_data$chr!=chr; check your input parameters\n') 497 | if( !setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) ) stop('!setequal(rownames(intermediate_data$mat), intermediate_data$bin_names) \n') 498 | compartment_segs = generate_compartment_segs( intermediate_data$initial_clusters ) 499 | 500 | cat('[', as.character(chr),'] Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n') 501 | cat('>>>> Begin compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE) 502 | 503 | sub_domains_raw = HRG_zigzag_compartment_domain_main_fun(intermediate_data$mat, './', compartment_segs, min_n_bins=2) 504 | 505 | no_output = post_process_sub_domains(chr, sub_domains_raw, ncores=1, out_dir=save_dir, bin_size=bin_size) 506 | 507 | cat('>>>> Finish compute sub-domains within each compartment domain 
at:', as.character(Sys.time()), '\n', file=log_file, append=TRUE) 508 | cat('[', as.character(chr),'] Finish compute sub-domains within each compartment domain at:', as.character(Sys.time()), '\n') 509 | 510 | time1 = Sys.time() 511 | # delta_time = gsub('Time difference of', 'Total time used for computing compartment domains and their hierachy:', print(time1 - time0)) 512 | delta_time <- time1 - time0 513 | timediff <- format(round(delta_time, 2), nsmall = 2) 514 | 515 | cat('\n\n', 'Total time used for computing sub-domains:', timediff, '\n', file=log_file, append=TRUE) 516 | } 517 | # return(NULL) 518 | } 519 | 520 | -------------------------------------------------------------------------------- /R/CALDER_main.R: -------------------------------------------------------------------------------- 1 | 2 | CALDER = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chrs, bin_size, save_dir, save_intermediate_data=FALSE, swap_AB=0, genome='others', feature_track=NULL, black_list_bins=NULL, n_cores=1, sub_domains=FALSE, single_binsize_only=FALSE) 3 | { 4 | ########################### https://stackoverflow.com/questions/30216613/how-to-use-dopar-when-only-import-foreach-in-description-of-a-package 5 | 6 | `%dopar%` <- foreach::`%dopar%` 7 | `%do%` <- foreach::`%do%` 8 | 9 | ########################### 10 | 11 | chrs = as.character(chrs) 12 | n_chrs = length(chrs) 13 | 14 | if(n_chrs > 1) ## when computing for multiple chrs, contact_tab_dump or contact_file_dump should be a list with elements matching to that of chrs 15 | { 16 | if(!is.null(contact_tab_dump)) 17 | { 18 | if(class(contact_tab_dump)!='list' | length(contact_tab_dump)!=n_chrs) stop('contact_tab_dump should be a list of contact table with its length equal to the length of chrs you specified\n') 19 | names(contact_tab_dump) = chrs 20 | } 21 | 22 | if(!is.null(contact_file_dump)) 23 | { 24 | if(class(contact_file_dump)!='list' | length(contact_file_dump)!=n_chrs) 
stop('contact_file_dump should be a list of contact table with its length equal to the length of chrs you specified\n')
            names(contact_file_dump) = chrs
        }
    }

    ## Try multiple bin_size parameters and choose the one that generates
    ## reliable compartments; finer input resolutions get coarser fallbacks.
    if(bin_size==5E3) bin_sizes = c(5E3, 10E3, 50E3, 100E3)
    if(bin_size==10E3) bin_sizes = c(10E3, 50E3, 100E3)
    if(bin_size==20E3) bin_sizes = c(20E3, 40E3, 100E3)
    if(bin_size==25E3) bin_sizes = c(25E3, 50E3, 100E3)
    if(bin_size==40E3) bin_sizes = c(40E3, 80E3)
    if(bin_size==50E3) bin_sizes = c(50E3, 100E3)
    if(!(bin_size %in% c(5E3, 10E3, 20E3, 25E3, 40E3, 50E3))) bin_sizes = bin_size

    if(genome=='others') single_binsize_only = TRUE
    if(single_binsize_only) bin_sizes = bin_size ## do not try multiple bin_sizes

    ## choose the already computed good compartment as reference to decide
    ## correctly the A/B compartment direction
    if(genome!='others') ref_compartment_file = system.file("extdata", sprintf('%s_all_sub_compartments.bed', genome), package = 'CALDER')
    if(genome=='others') ref_compartment_file = NULL

    if(is.null(ref_compartment_file) && is.null(feature_track)) stop('Should either specify genome in one of hg19, hg38, mm9, mm10, or provide a feature_track\n')

    doParallel::registerDoParallel(cores=n_cores)

    for(bin_size2look in bin_sizes)
    {
        bin_size_kb = sprintf("%skb", bin_size2look/1E3)
        save_dir_binsize = file.path(save_dir, 'intermediate_data/sub_compartments', bin_size_kb)
        dir.create(save_dir_binsize, recursive=TRUE, showWarnings=FALSE)
    }

    ## one (bin_size, chr) task per row
    para_tab = data.table::data.table(expand.grid(bin_sizes, chrs))
    for(i in 1:ncol(para_tab)) para_tab[[i]] = as.vector(para_tab[[i]])
    colnames(para_tab) = c('bin_size', 'chr')

    ## compute compartment for each bin_size and chr
    silent_out = foreach::foreach(i=1:nrow(para_tab)) %dopar%
    {
        bin_size2look = para_tab$bin_size[i]
        chr = para_tab$chr[i]
        bin_size_kb = sprintf("%skb", bin_size2look/1E3)
        save_dir_binsize = file.path(save_dir, 'intermediate_data/sub_compartments', bin_size_kb)

        ## only keep intermediate data for the user-requested bin_size
        save_intermediate_data_tmp = save_intermediate_data*(bin_size2look==bin_size)

        CALDER_CD_hierarchy_v2(contact_tab_dump=contact_tab_dump[[chr]], contact_file_dump=contact_file_dump[[chr]], contact_file_hic=contact_file_hic, chr=chr, bin_size_input=bin_size, bin_size2look=bin_size2look, save_dir=save_dir_binsize, save_intermediate_data=save_intermediate_data_tmp, swap_AB=swap_AB, ref_compartment_file=ref_compartment_file, feature_track=feature_track, black_list_bins=black_list_bins)
    }

    ## build optimal compartment from multiple resolutions
    ## BUG FIX: with_ref was '!is.null(genome)', which is always TRUE because
    ## genome is a character scalar (default 'others'); a reference compartment
    ## file only exists when genome is one of the supported genomes, matching
    ## the ref_compartment_file logic above.
    try(build_comp_table_opt(save_dir=save_dir, chrs=chrs, bin_sizes=bin_sizes, with_ref=(genome!='others')))

    ## if computing sub-domains
    if(sub_domains==TRUE)
    {
        silence_out = foreach::foreach(chr=chrs) %do% ## use one core instead of multi-cores
        {
            chr_num = gsub('chr', '', chr, ignore.case=TRUE)
            intermediate_data_file = sprintf('%s/%skb/chr%s_intermediate_data.Rds', file.path(save_dir, 'intermediate_data/sub_compartments') , bin_size/1E3, chr_num)
            save_dir_sub_domains = file.path(save_dir, 'intermediate_data/sub_domains')
            dir.create(save_dir_sub_domains, showWarnings = FALSE)
            try(CALDER_sub_domains(intermediate_data_file=intermediate_data_file, chr=chr, save_dir=save_dir_sub_domains, bin_size=bin_size))
        }

        save_dir_opt = file.path(save_dir, "sub_domains")
        dir.create(save_dir_opt, recursive=TRUE, showWarnings=FALSE)
        save_opt_file = file.path(save_dir_opt, "all_nested_boundaries.bed") ## was a no-op sprintf

        ## concatenate per-chromosome boundary beds into a single file
        opt_sub_domain_beds = sprintf('%s/chr%s_nested_boundaries.bed', file.path(save_dir, 'intermediate_data/sub_domains') , gsub('chr', '', chrs, ignore.case=TRUE))
        cmd_opt = sprintf("cat %s > %s", paste0(opt_sub_domain_beds, collapse=" "), save_opt_file)
        system(cmd_opt)
    }
}

## ==================== /R/HRG_MLE_each_compartment_fun.R ====================
## Yuanlong LIU; 12/04/2018, 23/05/2018, 29/05/2018

## Maximum-likelihood hierarchical tree for one compartment segment.
## A should be already named; A should be symmetric.
## Does not remove 0-rows/columns of A.
HRG_MLE_each_compartment_fun <- function(pA_sym, seg, allowed_max_nbins_seq=500:1000, max_nbins_fine=max(allowed_max_nbins_seq), ncores, res_dir, fast, adjust_segmentss_topdom=TRUE)
{
    min_n_bins_outer=2
    min_n_bins_inner=2
    distr = 'lnorm'
    A = pA_sym[seg, seg]
    if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices')
    arg_list = as.list(environment())
    res_folder = file.path(res_dir)
    dir.create(res_folder, recursive=TRUE, showWarnings = TRUE)

    total_execution_time_file = file.path(res_dir, 'total_execution.time')
    cat('n_core used:', ncores, '\n\n', file=total_execution_time_file, append=FALSE)
    time_begin = Sys.time()
    cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE)

    ## max_nbins is defined as the one from allowed_max_nbins_seq that has the
    ## smallest residue, from which the smallest is taken
    n_A = nrow(A)

    ## IF THE SEGMENT IS ALREADY SMALL ENOUGH: run HRG_core directly, no coarsening
    if( n_A <= max(allowed_max_nbins_seq) )
    {
        ## split the per-distance workload into balanced chunks across cores
        {
            dists = (2*min_n_bins_outer - 1):(nrow(A)-1)
            counts = sapply(dists, function(v) n_cells2compute( A, v, min_n_bins_outer ))
            split_info = split2BanlancedChunks_min_max(vec=counts,
K=min(length(counts)-1, ncores))
            chunks = c(dists[1]-1, dists[split_info$break_points])
            res_outer = HRG_core( A=A, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast )
        }

        ## this part gets the hi_tree (no rescaling needed at full resolution)
        {
            hi_tree = get_tree_v0( res_outer )
            igraph::V(hi_tree)$left_rescaled = igraph::V(hi_tree)$left
            igraph::V(hi_tree)$right_rescaled = igraph::V(hi_tree)$right
            igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right - igraph::V(hi_tree)$left + 1
            igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
        }

        full_tree = hi_tree
        res_info = list( arg_list=arg_list, trunk=NULL, segmentss_nadj=NULL, segmentss=NULL, res_outer=NULL, res_inner=list(res_outer) )
        res_info$full_tree = full_tree
        res_folder_final = file.path(res_dir, 'final')
        dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
        save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))

        time_finish = Sys.time()
        cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
        cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)
        return(res_info)
    }

    ## which candidate max_nbins leads to the least residue
    max_nbins = allowed_max_nbins_seq[ min(which(n_A%%allowed_max_nbins_seq==min(n_A%%allowed_max_nbins_seq))) ]
    residue = n_A %% max_nbins

    residue_mat_info = get_least_residue_matrix(A, max_nbins=max_nbins, allowed_shifts=0)
    A_final = residue_mat_info$A_final
    n2one = residue_mat_info$n2one

    ## A_low has no row/col name
    A_low = HighResolution2Low_k( A_final, k=max_nbins ) ## k is the dimension of A_final_low

    ## Starting from here, the original row/col names are no longer used

    ## this part runs HRG_core on the coarsened (outer) matrix
    {
        dists = (2*min_n_bins_outer - 1):(nrow(A_low)-1)
        counts = sapply(dists, function(v) n_cells2compute( A_low, v, min_n_bins_outer ))
        split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
        chunks = c(dists[1]-1, dists[split_info$break_points])
        res_outer = HRG_core( A=A_low, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast )
    }

    ## this part gets the hi_tree, with coordinates rescaled back to original bins
    {
        hi_tree = get_tree_v0( res_outer )
        igraph::V(hi_tree)$left_rescaled = n2one*(igraph::V(hi_tree)$left - 1) + 1
        igraph::V(hi_tree)$right_rescaled = n2one*igraph::V(hi_tree)$right
        node2adjust = which(igraph::V(hi_tree)$right_rescaled==max(igraph::V(hi_tree)$right_rescaled)) ## append the residue bins here
        igraph::V(hi_tree)[node2adjust]$right_rescaled = n_A

        igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right_rescaled - igraph::V(hi_tree)$left_rescaled + 1
        igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
    }

    segmentss_nadj = get_segments(hi_tree, binsize_thresh=max_nbins_fine)

    ## THIS PART ADJUSTS THE segmentss BASED ON TOPDOM BIN_SIGNAL (14-05-2018)
    ## Also adjusts the tree node coordinates accordingly
    if(adjust_segmentss_topdom==TRUE)
    {
        shift = seg[1]-1
        segmentss = segmentss_adjust_topdom(pA_sym, segmentss_nadj+shift, n2one, ws=5:20) - shift
        for( i in 1:nrow(segmentss) )
        {
            index_left2adj = which(igraph::V(hi_tree)$left_rescaled == segmentss_nadj[i,1])
            index_right2adj = which(igraph::V(hi_tree)$right_rescaled == segmentss_nadj[i,2])
            igraph::V(hi_tree)[index_left2adj]$left_rescaled = segmentss[i,1]
            igraph::V(hi_tree)[index_right2adj]$right_rescaled = segmentss[i,2]
            igraph::V(hi_tree)$width_rescaled = igraph::V(hi_tree)$right_rescaled - igraph::V(hi_tree)$left_rescaled + 1
            igraph::V(hi_tree)$name = paste('(',igraph::V(hi_tree)$left_rescaled, ',', igraph::V(hi_tree)$right_rescaled, ')', sep='')
        }
    }

    trunk = hi_tree

    ## run HRG_core inside each (adjusted) segment at full resolution
    res_inner = rep(list(list()), nrow(segmentss))
    for( i in 1:nrow(segmentss) )
    {
        cat(i,'\n')
        index = segmentss[i,1]:segmentss[i,2]
        A_part = A[index, index]
        dists = (2*min_n_bins_inner - 1):(nrow(A_part)-1)
        counts = sapply(dists, function(v) n_cells2compute( A_part, v, min_n_bins_inner ))
        split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
        chunks = c(dists[1]-1, dists[split_info$break_points])

        ## A_part is named
        res_info_inner = HRG_core( A=A_part, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_inner, distr=distr, fast=fast )
        res_inner[[i]] = res_info_inner ## minimum size filtering
        cat('finished', '\n')
    }

    res_info = list( arg_list=arg_list, trunk=trunk, segmentss_nadj=segmentss_nadj, segmentss=segmentss, res_outer=res_outer, res_inner=res_inner )
    res_folder_final = file.path(res_dir, 'final')
    dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
    save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))

    time_finish = Sys.time()
    cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
    cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)

    ## graft the per-segment trees (branches) onto the low-resolution trunk
    branches = lapply( res_inner, get_tree_v0 )
    for( i in 1:length(branches) ) branches[[i]] = update_branch_name(branches[[i]], root_start=segmentss[i,1])

    full_tree = xenocraft( trunk, branches )
    names_tmp = do.call(rbind, strsplit(igraph::V(full_tree)$name, ','))
    igraph::V(full_tree)$left = substring(names_tmp[,1], 2)
    igraph::V(full_tree)$right = substring(names_tmp[,2], 1, nchar(names_tmp[,2])-1)
    if(!is_binary_tree(full_tree)) stop("Trunk + branches do not produce a binary tree")

    res_info$full_tree = full_tree
    save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))

    return( res_info )
}

## ==================== /R/HighResolution2Low.R ====================
## These functions try to aggregate a bin-wise contact matrix into a
## domain-wise (coarser) contact matrix.

## Aggregate A into roughly max_nbins coarse bins, letting the LAST coarse bin
## pool a different number of fine bins (n2one_last) than the head bins
## (n2one_head) so that no fine bins are dropped; the last row/column is then
## rescaled to stay comparable with the head bins.
HighResolution2Low_k_complete <-function( A, max_nbins )
{
    nbins = nrow( A )

    max_nbins_bp = max_nbins
    max_nbinss = max_nbins + (-5:20)
    expand_fold_info = t(sapply(max_nbinss, fold_expand_rate, K=nbins))
    index = max(which(expand_fold_info[,'rescale_rate']==min(expand_fold_info[,'rescale_rate'])))
    max_nbins = max_nbinss[ index ]
    n2one_head = expand_fold_info[index, 'n2one_head']
    n2one_last = expand_fold_info[index, 'n2one_last']

    if(n2one_head==1) return(list())
    rescale_rate = n2one_last / n2one_head

    if(max_nbins!=max_nbins_bp) warning('Value of max_nbins has been adjusted from ', max_nbins_bp, ' to ', max_nbins, '. 
This results in rescale_rate as: ', rescale_rate, '\n')

    if(rescale_rate!=1) ## only do when rescale is not 1
    {
        ## head coarse bins pool n2one_head fine bins each; the trailing block
        ## pools n2one_last fine bins and gets rescaled afterwards
        aa_end = n2one_head*(max_nbins-1)
        bb_end = nrow(A)
        A_aa = A[1:aa_end, 1:aa_end]
        A_bb = A[(aa_end+1):bb_end, (aa_end+1):bb_end]
        A_ab = A[1:aa_end, (aa_end+1):bb_end, drop=FALSE]
        A_aa_low = HighResolution2Low_k(mat=A_aa, k=max_nbins-1)
        A_bb_low = sum( A_bb )

        ## https://stackoverflow.com/questions/15265512/summing-every-n-points-in-r
        v = apply(A_ab, 1, sum)
        A_ab_low = unname(tapply(v, (seq_along(v)-1) %/% n2one_head, sum))

        ## rescale the last row/column so its values are comparable to the head bins
        A_ab_low_new = A_ab_low / n2one_last*n2one_head
        A_bb_low_new = A_bb_low / n2one_last^2*n2one_head^2
        A_low = rbind(cbind(A_aa_low, A_ab_low_new), c( A_ab_low_new, A_bb_low_new ))
        colnames(A_low) = NULL
    }

    if(rescale_rate==1) A_low = HighResolution2Low_k(mat=A, k=max_nbins) ## k is the dimension of A_final_low
    ## A_low has no row/col name
    res = list(A_low=A_low, n2one_head=n2one_head, n2one_last=n2one_last)
    return(res)
}

## Split 1:K into max_nbins consecutive groups of (almost) equal size; also
## reports how unbalanced the two group sizes are (expand_fold_ratio).
split_info <-function( K, max_nbins )
{
    x = floor( K/max_nbins ) + 1
    n2 = max_nbins*x - K
    n1 = max_nbins - n2
    vec = c(rep(x, n1), rep(x-1, n2))
    split_vec = split(1:K, rep(1:max_nbins, vec))

    expand_fold_ratio = min(n1/n2, n2/n1)
    res = list(vec=vec, expand_fold_ratio=expand_fold_ratio, split_vec=split_vec)
    return(res)
}

## Coarsen A by an integer factor 'rescale'; trailing bins that do not fill a
## complete coarse bin are truncated. Diagonal values are summed twice.
## 'which' selects one of two equivalent block-sum implementations.
HighResolution2Low <-function( A, rescale, which=2 )
{
    nbins = nrow(A)
    nbins_low = floor(nbins/rescale)
    A_low = matrix( , nbins_low, nbins_low)
    keep_rows = nbins - nbins%%nbins_low
    A_truncated = A[ 1:keep_rows, 1:keep_rows ]

    if(which==1) A_low = matsplitter_sum(A_truncated, keep_rows/nbins_low, keep_rows/nbins_low)
    if(which==2) A_low = mat_split_sum(A_truncated, keep_rows/nbins_low, keep_rows/nbins_low)

    return( A_low )
}

## Sum each r x c sub-matrix of M; returns the matrix of block sums.
## https://stackoverflow.com/questions/24299171/function-to-split-a-matrix-into-sub-matrices-in-r
matsplitter_sum <- function(M, r, c)
{
    rg <- (row(M)-1)%/%r + 1
    cg <- (col(M)-1)%/%c + 1
    rci <- (rg-1)*max(cg) + cg
    N <- prod(dim(M))/r/c
    cv <- unlist(lapply(1:N, function(x) M[rci==x]))
    dim(cv)<-c(r, c, N)
    res = matrix(apply(cv, 3, sum), nrow(M)/r, byrow=TRUE)
    return(res)
}

## Same result as matsplitter_sum, computed via a kronecker-based block
## labelling of the (padded) matrix.
mat_split_sum <- function(M, r, c)
{
    nr <- ceiling(nrow(M)/r)
    nc <- ceiling(ncol(M)/c)
    newM <- matrix(NA, nr*r, nc*c)
    newM[1:nrow(M), 1:ncol(M)] <- M

    div_k <- kronecker(matrix(seq_len(nr*nc), nr, byrow = TRUE), matrix(1, r, c))
    matlist <- split(newM, div_k)
    res = matrix(sapply(matlist, sum), nrow(M)/r, byrow=TRUE)
    return(res)
}

## Aggregate mat into a k x k matrix by summing over nearly-equal consecutive
## chunks of rows/columns. This is much faster than the generic variants above.
HighResolution2Low_k <- function(mat, k)
{
    chunk2 <- function(x, n) split(x, cut(seq_along(x), n, labels = FALSE))
    n = nrow(mat)
    A_low = matrix( , k, k )
    indices = chunk2( 1:n, k )
    for(row_index in 1:k)
    {
        A_sub = mat[indices[[row_index]], ]
        for(col_index in 1:k) A_low[row_index, col_index] = sum( A_sub[ , indices[[col_index]]] )
    }
    return( A_low )
}

## Aggregate a rectangular matrix given explicit row/column index groups,
## using the requested summary statistic ('sum', 'mean', 'median', or
## 'p_above_one' = fraction of entries > 1) per block.
HighResolution2Low_k_rectangle <- function(mat, row_split, col_split, sum_or_mean=c('sum', 'mean', 'median', 'p_above_one'))
{
    k_row = length(row_split)
    k_col = length(col_split)

    A_low = matrix( , k_row, k_col )

    if(sum_or_mean=='sum')
    {
        for(row_index in 1:k_row)
        {
            A_sub = mat[row_split[[row_index]], , drop=FALSE ]
            for(col_index in 1:k_col) A_low[row_index, col_index] = sum( A_sub[ , col_split[[col_index]]] )
        }
    }

    if(sum_or_mean=='mean')
    {
for(row_index in 1:k_row)
        {
            A_sub = mat[row_split[[row_index]], , drop=FALSE ]
            for(col_index in 1:k_col) A_low[row_index, col_index] = mean( A_sub[ , col_split[[col_index]]], na.rm=TRUE )
        }
    }

    if(sum_or_mean=='median')
    {
        for(row_index in 1:k_row)
        {
            A_sub = mat[row_split[[row_index]], , drop=FALSE ]
            for(col_index in 1:k_col) A_low[row_index, col_index] = median( A_sub[ , col_split[[col_index]]], na.rm=TRUE )
        }
    }

    if(sum_or_mean=='p_above_one')
    {
        ## fraction of entries strictly greater than one per block
        for(row_index in 1:k_row)
        {
            A_sub = mat[row_split[[row_index]], , drop=FALSE ]
            for(col_index in 1:k_col) A_low[row_index, col_index] = mean( A_sub[ , col_split[[col_index]]] > 1 )
        }
    }
    return( A_low )
}

## For a n x n matrix, generate a n x m matrix with m*compress_size = n, where
## the (i,j)-th value is the contact of row i summed over column block j and
## divided by compress_size. 08-08-2018.
compress_mat_fast = function(input_mat, compress_size)
{
    mat_block <- function(n, r) suppressWarnings( matrix(c(rep(1, r), rep(0, n)), n, n/r) )
    n = ncol(input_mat)
    mat2prod = mat_block(n, compress_size)
    ## C++ matrix product is much faster than input_mat %*% mat2prod
    return(t(matrix_multiplication_cpp(input_mat, mat2prod)) / compress_size)
}

## ==================== /R/LikelihoodRatioTest_fun.R ====================
## The following code tries to evaluate the performance of different
## likelihood ratio-based tests. Yuanlong LIU, 01-03-2018.

## Likelihood-ratio test of splitting block [head, tail] at 'mid':
## H0 = one shared intensity distribution for the whole block,
## H1 = separate distributions for the two diagonal child blocks and the
## off-diagonal block. Returns the test statistic and chi-square p-value.
## NOTE(review): the 'num' argument is unused (legacy of a commented-out
## gamma fit); kept for caller compatibility.
p_likelihood_ratio <- function( A, head, mid, tail, num )
{
    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]

    tri_Awhole = A_whole[upper.tri(A_whole, diag=FALSE)]
    tri_Aa = A_a[upper.tri(A_a, diag=FALSE)]
    tri_Ab = A_b[upper.tri(A_b, diag=FALSE)]

    inner_a = get_prob( tri_Aa )
    inner_b = get_prob( tri_Ab )
    inter_ab = get_prob( A_ab )

    ## log likelihood of H1 (three blocks) vs H0 (single block)
    LH1 = inner_a + inner_b + inter_ab
    LH0 = get_prob( tri_Awhole )

    Lambda = -2*( LH0 - LH1 )
    df_h0 = 1*1
    df_h1 = 3*1
    df = df_h1 - df_h0

    p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
    info = list( Lambda=Lambda, p=p )
    return(info)
}

## Same likelihood-ratio split test with a negative-binomial model
## (2 parameters per block), including the diagonal entries.
p_likelihood_ratio_nb <- function( A, head, mid, tail )
{
    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]

    tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)]
    tri_Aa = A_a[upper.tri(A_a, diag=TRUE)]
    tri_Ab = A_b[upper.tri(A_b, diag=TRUE)]

    ## negative binomial likelihoods
    inner_a = get_prob_nb( tri_Aa )
    inner_b = get_prob_nb( tri_Ab )
    inter_ab = get_prob_nb( A_ab )

    ## log likelihood of H1 vs H0
    LH1 = inner_a + inner_b + inter_ab
    LH0 = get_prob_nb( tri_Awhole )

    Lambda = -2*( LH0 - LH1 )
    n_parameters = 2
    df_h0 = 1*n_parameters
    df_h1 = 3*n_parameters
    df = df_h1 - df_h0

    p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
    info = list( Lambda=Lambda, p=p )
    return(info)
}
## Likelihood-ratio split test under a normal model (mean + sd per block).
p_likelihood_ratio_norm <- function( A, head, mid, tail )
{
    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]

    tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)]
    tri_Aa = A_a[upper.tri(A_a, diag=TRUE)]
    tri_Ab = A_b[upper.tri(A_b, diag=TRUE)]

    ## normal likelihoods
    inner_a = likelihood_norm( tri_Aa )
    inner_b = likelihood_norm( tri_Ab )
    inter_ab = likelihood_norm( A_ab )

    ## log likelihood of H1 vs H0
    LH1 = inner_a + inner_b + inter_ab
    LH0 = likelihood_norm( tri_Awhole )

    Lambda = -2*( LH0 - LH1 )
    n_parameters = 2
    df_h0 = 1*n_parameters
    df_h1 = 3*n_parameters
    df = df_h1 - df_h0

    p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
    info = list( Lambda=Lambda, p=p )
    return(info)
}


## Likelihood-ratio split test under a gamma model (method-of-moments fit).
## When 'imputation' is TRUE, zero entries are accounted for by rescaling the
## likelihood of the non-zero entries.
p_likelihood_ratio_gamma <- function( A, head, mid, tail, n_parameters, imputation )
{
    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]

    tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)]
    ## added 25-03-2018. If no zero values, no imputation.
    ## BUG FIX: the original condition was 'length(tri_Awhole==0) == 0' -- the
    ## length of a logical vector, which is 0 only for an empty matrix, so
    ## imputation was never switched off. Test for the absence of zeros.
    if( !any(tri_Awhole == 0) ) imputation = FALSE

    tri_Aa = A_a[upper.tri(A_a, diag=TRUE)]
    tri_Ab = A_b[upper.tri(A_b, diag=TRUE)]

    ## gamma likelihoods (method of moments)
    inner_a = likelihood_gamma_mme( tri_Aa )
    inner_b = likelihood_gamma_mme( tri_Ab )
    inter_ab = likelihood_gamma_mme( A_ab )
    whole = likelihood_gamma_mme( tri_Awhole )

    if( imputation ) ## zero values are imputed by the estimated distribution based on non random values
    {
        inner_a = likelihood_gamma_mme( tri_Aa[tri_Aa!=0] )/length( tri_Aa[tri_Aa!=0] )*length( tri_Aa )
        inner_b = likelihood_gamma_mme( tri_Ab[tri_Ab!=0] )/length( tri_Ab[tri_Ab!=0] )*length( tri_Ab )
        ## NOTE(review): unlike the other three blocks, A_ab is NOT filtered
        ## for zeros before fitting here -- confirm whether
        ## likelihood_gamma_mme(A_ab[A_ab!=0]) was intended; left unchanged.
        inter_ab = likelihood_gamma_mme( A_ab )/length( A_ab[A_ab!=0] )*length( A_ab )
        whole = likelihood_gamma_mme( tri_Awhole[tri_Awhole!=0] )/length( tri_Awhole[tri_Awhole!=0] )*length( tri_Awhole )
        n_parameters = n_parameters - 1 ## the mixture parameter of 0 is not taken into account
    }

    ## log likelihood of H1 vs H0
    LH1 = inner_a + inner_b + inter_ab
    LH0 = whole

    Lambda = -2*( LH0 - LH1 )
    df_h0 = 1*n_parameters
    df_h1 = 3*n_parameters
    df = df_h1 - df_h0

    p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE)
    info = list( Lambda=Lambda, p=p )
    return(info)
}

## Test whether the intra-block (log-normal) mean of each child block exceeds
## the inter-block mean; returns one p-value per child block.
lognormal_mean_test <- function( cA, head, mid, tail )
{
    A_whole = cA[head:tail, head:tail]
    A_a = cA[head:mid, head:mid]
    A_b = cA[(mid+1):tail, (mid+1):tail]
    A_ab = cA[head:mid, (mid+1):tail]

    tri_Aa = A_a[upper.tri(A_a, diag=TRUE)]
    tri_Ab = A_b[upper.tri(A_b, diag=TRUE)]

    tri_Aa_vec = as.vector( tri_Aa )
    tri_Ab_vec = as.vector( tri_Ab )
    A_ab_vec = as.vector( A_ab )

    ## zeros are excluded: the log-normal fit requires positive values
    tri_Aa_vec_p = tri_Aa_vec[tri_Aa_vec!=0]
    tri_Ab_vec_p = tri_Ab_vec[tri_Ab_vec!=0]
    A_ab_vec_p = A_ab_vec[A_ab_vec!=0]

    p_Aa = p_lognormal_mean( tri_Aa_vec_p, A_ab_vec_p )
    p_Ab = p_lognormal_mean( tri_Ab_vec_p, A_ab_vec_p )
    return( list(p_Aa=p_Aa, p_Ab=p_Ab) )
}

## Z-test for comparing the means of two independent log-normal samples.
## https://www.jstor.org/stable/2533570
## ("Methods for Comparing the Means of Two Independent Log-Normal Samples")
## 03-07-2018
p_lognormal_mean <- function( vec_aa, vec_ab )
{
    ## degenerate input (all zero on either side): report p = 0
    if( all(vec_aa==0) | all(vec_ab==0) ) return(0)

    n_aa = length(vec_aa)
    n_ab = length(vec_ab)
    fited_info_aa = MASS::fitdistr(vec_aa, 'lognormal') ## intra
    mu_aa = fited_info_aa$estimate[1]
    sd_aa = fited_info_aa$estimate[2]
    s2_aa = sd_aa^2

    fited_info_ab = MASS::fitdistr(vec_ab, 'lognormal') ## inter
    mu_ab = fited_info_ab$estimate[1]
    sd_ab = fited_info_ab$estimate[2]
    s2_ab = sd_ab^2

    z_score = ( (mu_aa - mu_ab) + (1/2)*(s2_aa - s2_ab) ) / sqrt( s2_aa/n_aa + s2_ab/n_ab + (1/2)*( s2_aa^2/(n_aa-1) + s2_ab^2/(n_ab-1) ) )
    p = pnorm( z_score, lower.tail=FALSE )
    return(p)
}

## Split the square block into its upper-right corner region: values above the
## corner diagonal (y, expected high) and the rest (x, expected low).
get_corner_xy <- function(A_whole)
{
    n = nrow(A_whole)
    corner_size = floor(n/2)
    A_corner = A_whole[1:corner_size, (n - corner_size + 1 ):n]
    expected_high = A_corner[upper.tri(A_corner, diag=FALSE)]
    expected_low = A_corner[lower.tri(A_corner, diag=TRUE)]
    return(list(x=expected_low, y=expected_high))
}

## Partition a (possibly rectangular) matrix into values below the rescaled
## anti-diagonal band (x) and the remaining values (y).
get_half_mat_values <- function(mat)
{
    n1 = nrow(mat)
    n2 = ncol(mat)
    delta = n1/n2
    rows = lapply( 1:n2, function(x) (1+ceiling(x*delta)):n1 )
    rows = rows[ sapply(rows, function(v) max(v) <= n1) ]
    flag = which(diff(sapply(rows, length)) > 0)
    if(length(flag)>0) rows = rows[ 1:min(flag) ]

    row_col_indices = cbind( unlist(rows), rep(1:length(rows), sapply(rows, length)))
    x = mat[row_col_indices]
    mat[row_col_indices] = NA
    y = na.omit(as.vector(mat))
return(list(x=x, y=y))
}


## Vectorised variant of get_half_mat_values: partitions a non-symmetric
## matrix around its rescaled anti-diagonal in a single indexing pass.
## https://stackoverflow.com/questions/52990525/get-upper-triangular-matrix-from-nonsymmetric-matrix/52991508#52991508
get_half_mat_values_v2 <- function(mat)
{
    y = mat[nrow(mat) * (2 * col(mat) - 1) / (2 * ncol(mat)) - row(mat) > -1/2]
    x = mat[nrow(mat) * (2 * col(mat) - 1) / (2 * ncol(mat)) - row(mat) < -1/2]
    return(list(x=x, y=y))
}

## Wilcoxon test for a NESTED domain: first test the full split; when it is
## significant and the domain is large enough, additionally require at least
## one of the two child TADs to pass its own (corner-only) test.
p_wilcox_test_nested <- function( A, head, mid, tail, alternative, is_CD )
{
    test_1 = p_wilcox_test( A, head, mid, tail, alternative, is_CD, only_corner=FALSE ) ## corner + inter
    if( (tail - head <= 4) | (test_1$p > 0.05) ) ## when > 0.05 or it is already small, do not consider it as nested
    {
        info = list( Lambda=NULL, p=0.555555, mean_diff=0 )
        return(info)
    }

    ## try-error happens when the tad to test is too small; THEREFORE, ASSIGN
    ## P=0 TO THE TAD. inherits() is the canonical way to detect a try() failure.
    test_left_tad = try(p_wilcox_test( A, head, mid=ceiling((head+mid)/2), mid, alternative, is_CD=FALSE, only_corner=TRUE )) ## corner + inter
    if( inherits(test_left_tad, "try-error") ) test_left_tad = list(p=0)
    test_right_tad = try(p_wilcox_test( A, mid+1, mid=ceiling((mid+1+tail)/2), tail, alternative, is_CD=FALSE, only_corner=TRUE )) ## corner + inter
    if( inherits(test_right_tad, "try-error") ) test_right_tad = list(p=0)

    info = list( Lambda=NULL, p=max( test_1$p, min(test_left_tad$p, test_right_tad$p) ), mean_diff=0 )
    return(info)
}


## Wilcoxon rank tests of whether intra-domain contacts dominate inter-domain
## contacts for the split of [head, tail] at 'mid'. Returns a p-value and a
## mean-difference diagnostic (negative is good).
## only_corner tests if a domain is a TAD (no nesting).
p_wilcox_test = function( A, head, mid, tail, alternative, is_CD=FALSE, only_corner=FALSE )
{
    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]

    tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)] ## need to check whether diag should be false or true
    tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] ## need to check whether diag should be false or true
    tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] ## need to check whether diag should be false or true

    tri_Aa_vec = as.vector( tri_Aa )
    tri_Ab_vec = as.vector( tri_Ab )
    A_ab_vec = as.vector( A_ab )

    ## split the inter-block matrix into corner (expected high) and non-corner values
    corner_mat_info = get_half_mat_values_v2(A_ab)
    A_ab_corner = as.vector(corner_mat_info$y)
    A_ab_ncorner = corner_mat_info$x
    p_corner = wilcox.test(x=A_ab_corner, y=A_ab_ncorner, alternative='greater', exact=FALSE)
    p_inter = wilcox.test(x=c(tri_Ab_vec, tri_Aa_vec), y=A_ab_ncorner, alternative='greater', exact=FALSE)

    if(is_CD==FALSE) p = max(p_inter$p.value, p_corner$p.value)
    if(is_CD==TRUE) p = p_inter$p.value ## if the tested part is the CD
    mean_diff_inter = mean(A_ab_ncorner) - mean(c(tri_Ab_vec, tri_Aa_vec)) ## negative is good
    mean_diff_corner = mean(A_ab_ncorner) - mean(A_ab_corner) ## negative is good

    if(is_CD==FALSE) mean_diff = min(mean_diff_corner, mean_diff_inter)
    if(is_CD==TRUE) mean_diff = mean_diff_inter

    ## BUG FIX: the original took min(length(tri_Ab_vec), length(tri_Ab_vec))
    ## -- the same vector twice; the stated intent ("when one of the two twins
    ## is too small") requires comparing BOTH child blocks. 10: dim(4,4)
    if(min(length(tri_Aa_vec), length(tri_Ab_vec)) < 10) mean_diff = 100

    info = list( Lambda=NULL, p=p, mean_diff=mean_diff)
    return(info)
}


## Likelihood-ratio split test under a log-normal model (MLE fit).
p_likelihood_ratio_lnorm <- function( A, head, mid, tail, n_parameters, imputation, imputation_num=1E2 )
{
    likelihood_fun = likelihood_lnorm_mle
    A_whole = A[head:tail, head:tail]
    A_a = A[head:mid, head:mid]
    A_b = A[(mid+1):tail, (mid+1):tail]
    A_ab = A[head:mid, (mid+1):tail]

    tri_Awhole = A_whole[upper.tri(A_whole, diag=TRUE)]
    ## added 25-03-2018. 
If no zero values, no imputation 326 | no_zero_flag = 0 327 | if( sum(tri_Awhole==0) == 0 ) { no_zero_flag = 1; imputation = FALSE } 328 | 329 | tri_Aa = A_a[upper.tri(A_a, diag=TRUE)] 330 | tri_Ab = A_b[upper.tri(A_b, diag=TRUE)] 331 | 332 | tri_Aa_vec = as.vector( tri_Aa ) 333 | tri_Ab_vec = as.vector( tri_Ab ) 334 | A_ab_vec = as.vector( A_ab ) 335 | 336 | mean_a = mean( tri_Aa_vec[tri_Aa_vec!=0] ) 337 | mean_b = mean( tri_Ab_vec[tri_Ab_vec!=0] ) 338 | mean_ab = mean( A_ab_vec[A_ab_vec!=0] ) 339 | mean_diff = mean_ab - min(mean_a, mean_b) 340 | 341 | if( (all(tri_Aa_vec==0)) | (all(tri_Ab_vec==0)) | (all(A_ab_vec==0)) ) 342 | { 343 | info = list( Lambda=NA, p=0, mean_diff=mean_diff ) 344 | return(info) 345 | } 346 | 347 | ## lnorm 348 | if(!imputation) 349 | { 350 | inner_a = likelihood_fun( tri_Aa ) 351 | inner_b = likelihood_fun( tri_Ab ) 352 | inter_ab = likelihood_fun( A_ab ) 353 | whole = likelihood_fun( tri_Awhole ) 354 | 355 | ## log likelihood of H1 356 | LH1 = inner_a + inner_b + inter_ab 357 | LH0 = whole 358 | Lambda = -2*( LH0 - LH1 ) 359 | 360 | if( no_zero_flag ) n_parameters = n_parameters - 1 ## no alpha parameter 361 | df_h0 = 1*n_parameters ##(theta_1 = theta_2 = theta_3) 362 | df_h1 = 3*n_parameters ##(theta_1, theta_2, theta_3) 363 | df = df_h1 - df_h0 364 | 365 | p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE) 366 | info = list( Lambda=Lambda, p=p, mean_diff=mean_diff ) 367 | return(info) 368 | } 369 | 370 | if( imputation ) ## zero values are imputed by the estimated distribution based on non random values 371 | { 372 | vec_list = list( tri_Aa=tri_Aa, tri_Ab=tri_Ab, A_ab=A_ab ) 373 | imputations = imputation_list( vec_list, imputation_num ) 374 | 375 | inner_as = apply(imputations$tri_Aa, 1, likelihood_fun) 376 | inner_bs = apply(imputations$tri_Ab, 1, likelihood_fun) 377 | inter_abs = apply(imputations$A_ab, 1, likelihood_fun) 378 | 379 | wholes = apply(do.call( cbind, imputations ), 1, likelihood_fun) 380 | LH1s = inner_as + 
inner_bs + inter_abs 381 | LH0s = wholes 382 | Lambdas = -2*( LH0s - LH1s ) 383 | Lambda = mean( Lambdas ) 384 | n_parameters = n_parameters - 1 ## the mixture parameter is not taken into account 385 | 386 | # cat(Lambda, '\n') 387 | df_h0 = 1*n_parameters 388 | df_h1 = 3*n_parameters 389 | df = df_h1 - df_h0 390 | 391 | p = pchisq(Lambda, df=df, lower.tail = FALSE, log.p = FALSE) 392 | info = list( Lambda=Lambda, p=p, mean_diff=mean_diff ) 393 | return(info) 394 | 395 | # if( imputation ) ## zero values are imputed by the estimated distribution based on non random values 396 | # { 397 | # inner_a = likelihood_lnorm( tri_Aa[tri_Aa!=0] )/length( tri_Aa[tri_Aa!=0] )*length( tri_Aa ) 398 | # inner_b = likelihood_lnorm( tri_Ab[tri_Ab!=0] )/length( tri_Ab[tri_Ab!=0] )*length( tri_Ab ) 399 | # inter_ab = likelihood_lnorm( A_ab )/length( A_ab[A_ab!=0] )*length( A_ab ) 400 | # whole = likelihood_lnorm( tri_Awhole[tri_Awhole!=0] )/length( tri_Awhole[tri_Awhole!=0] )*length( tri_Awhole ) 401 | # n_parameters = n_parameters - 1 ## the mixture parameter of 0 is not taken into account 402 | # } 403 | 404 | } 405 | } 406 | 407 | imputation_list <- function( vec_list, imputation_num ) 408 | { 409 | imputations = lapply( vec_list, function(v) 410 | { 411 | if(sum(v!=0)==1) {final_vec = matrix( v[v!=0], imputation_num, length(v) ); return(final_vec)} 412 | 413 | fit = fit_lnorm(v) 414 | set.seed(1) 415 | 416 | ## THERE WILL BE ERROR IF IS.NA SDLOG 417 | if(!is.na(fit['sdlog'])) imputation_vec = matrix(rlnorm( sum(v==0)*imputation_num, fit['meanlog'], fit['sdlog'] ), nrow=imputation_num) 418 | if(is.na(fit['sdlog'])) stop("In function imputation_list, sdlog=NA encountered") 419 | 420 | ori_vec = t(replicate(imputation_num, v[v!=0])) 421 | final_vec = cbind( ori_vec, imputation_vec ) 422 | } ) 423 | return( imputations ) 424 | } 425 | 426 | 427 | fit_lnorm <- function(vec) 428 | { 429 | vec = vec[vec!=0] 430 | fit = MASS::fitdistr(vec, 'lognormal')$estimate 431 | return(fit) 432 | } 
## ======================================================================
## R/RcppExports.R
## ======================================================================
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

matrix_multiplication_cpp <- function(A, B) {
    .Call(`_CALDER_matrix_multiplication_cpp`, A, B)
}

matrix_multiplication_sym_cpp <- function(A) {
    .Call(`_CALDER_matrix_multiplication_sym_cpp`, A)
}

loglik_lnorm_cpp <- function(sum_ln1, sum_ln2, p, q) {
    .Call(`_CALDER_loglik_lnorm_cpp`, sum_ln1, sum_ln2, p, q)
}

loglik_lnorm_cpp_vec <- function(vec_values) {
    .Call(`_CALDER_loglik_lnorm_cpp_vec`, vec_values)
}

get_A_len <- function(A) {
    .Call(`_CALDER_get_A_len`, A)
}

get_A_ln1 <- function(A) {
    .Call(`_CALDER_get_A_ln1`, A)
}

get_A_ln2 <- function(A) {
    .Call(`_CALDER_get_A_ln2`, A)
}

loglik_lnorm_cpp_mat <- function(sum_ln1, sum_ln2, ps, qs) {
    .Call(`_CALDER_loglik_lnorm_cpp_mat`, sum_ln1, sum_ln2, ps, qs)
}

zigzag_loglik_ancestors_v4_5 <- function(A, k, min_n_bins = 2L) {
    .Call(`_CALDER_zigzag_loglik_ancestors_v4_5`, A, k, min_n_bins)
}

compute_L <- function(A, L, k) {
    .Call(`_CALDER_compute_L`, A, L, k)
}

## ======================================================================
## R/bisecting_kmeans.R
## ======================================================================

## k-means with replicatable seeds: fixes the RNG seed so repeated calls on the
## same data give identical clusterings. All other arguments (in particular
## `x` and `centers`) are forwarded to stats::kmeans() via `...`.
my_kmeans = function(iter.max=1E3, nstart=50, ...)
{
	set.seed(1)
	res = kmeans(iter.max=iter.max, nstart=nstart, ...)
	return(res)
}

## This function tries to adjust the height of each split, in order to generate
## a valid hclust object and with balanced compartments A.1 A.2 B.1 B.2.
## Clusters with more nodes will get bigger height in case of same height.
## l_r_h is a list of splits, each with members l (left indices), r (right
## indices) and h (split height); the first entry is the top-level split.
adjust_hs <- function(l_r_h)
{
	hs = sapply(l_r_h, function(v) v$h)
	all_names = sapply(l_r_h, function(v) paste0(collapse='_', sort(c(v$l, v$r))))
	r_names = sapply(l_r_h, function(v) paste0(collapse='_', sort(c(v$r))))

	sizes = sapply(l_r_h, function(v) length(v$l) + length(v$r)) ## number of nodes per split

	################ This part deals with duplicated heights
	hs = hs + sizes*1E-7
	################ This part tries to make the top-level left and right branch
	## have similar height, such that to make balanced A.1, A.2, B.1, B.2 compartments
	l_b = 2 ## left sub-branch (always recorded right after the top split)
	r_b = which(r_names[1]==all_names) ## right sub-branch of the top split

	l_h = hs[l_b]
	r_h = hs[r_b]
	max_h = max(l_h, r_h) ## the maximum height of the two branches
	hs_new = mean(sort(hs, decreasing=TRUE)[2:3]) ## hs_new is between the 2nd and 3rd largest heights
	hs[l_b] = ifelse(l_h > r_h, max_h, hs_new)
	hs[r_b] = ifelse(r_h > l_h, max_h, hs_new)

	if(any(duplicated(hs))) stop('ERROR: DUPLICATED HEIGHTS exist in bisecting_kmeans')
	return( hs )
}

## Bisecting k-means clustering: recursively split the rows of `data` into two
## clusters with my_kmeans(), record each split in the (global) list l_r_h, and
## assemble the splits into an hclust object (complete linkage / euclidean).
bisecting_kmeans <- function(data)
{
	dist_mat = as.matrix(stats::dist(data))
	indices = 1:nrow(data)
	l_r_h <<- list() ## global accumulator filled by the recursive helper below

	## height of a split = diameter (max pairwise distance) of the merged set
	get_h <- function(l_indices, r_indices)
	{
		combined_indices = c(l_indices, r_indices)
		idx <- as.matrix(expand.grid(combined_indices, combined_indices))
		max(dist_mat[idx]) ## diameter
	}

	## recursively bisect `indices`, appending each split to l_r_h
	get_sub_tree <- function( indices )
	{
		n_nodes = length(indices)

		if(n_nodes==1) ## a single node: nothing to split
		{
			h = NULL
			return()
		}

		############# if two or more nodes
		if(n_nodes==2) cluster=c(1,2) else cluster = my_kmeans(x=data[indices, ], centers=2)$cluster
		l_indices = indices[cluster==1]
		r_indices = indices[cluster==2]
		h = get_h(l_indices, r_indices)
		l_r_h <<- c(l_r_h, list(list(l=l_indices, r=r_indices, h=h)))

		l_branch = get_sub_tree( l_indices )
		r_branch = get_sub_tree( r_indices )
	}

	get_sub_tree(indices)

	hs = adjust_hs(l_r_h)

	## hclust merge codes: internal nodes get positive ranks by height,
	## leaves get negative indices
	r_hs = rank(hs)
	for( i in seq_along(l_r_h) ) {name=r_hs[i]; names(name)=paste0(collapse='_', sort(c(l_r_h[[i]]$l, l_r_h[[i]]$r))); l_r_h[[i]]$name=name}
	pos_names = sapply(l_r_h, function(v) v$name)
	neg_names = -(1:length(indices)); names(neg_names) = 1:length(indices); all_names = c(pos_names, neg_names)
	for( i in seq_along(l_r_h) ) {l_r_h[[i]]$l_name=unname(all_names[paste0(l_r_h[[i]]$l, collapse='_')]); l_r_h[[i]]$r_name=unname(all_names[paste0(l_r_h[[i]]$r, collapse='_')]) }

	merge_height = data.frame(l=sapply(l_r_h, function(v) v$l_name), r=sapply(l_r_h, function(v) v$r_name), h=hs)
	merge_height = merge_height[order(merge_height$h), ]
	rownames(merge_height) = NULL

	## build a small template hclust object, then overwrite its merge/height
	## NOTE(review): the template's $order slot is left untouched and so has
	## length 4 regardless of nrow(data) -- confirm this still passes
	## as.dendrogram()'s hclust validity check on recent R versions.
	data_tmp = cbind(c(0,0,1,1), c(0,1,1,0))
	hc = hclust(stats::dist(data_tmp), "com")
	hc$merge = as.matrix(unname(merge_height[,1:2]))
	hc$height = merge_height$h
	hc$labels = 1:length(indices)
	den <- as.dendrogram(hc)
	hc_r <- as.hclust(reorder(den, 1:length(indices)))
	hc_r$method = "complete"
	hc_r$dist.method = "euclidean"
	l_r_h <<- list() ## reset the global accumulator
	rm(l_r_h)
	return(hc_r)
}

## ======================================================================
## R/build_comp_table_opt.R
## ======================================================================

## Obtain the optimal compartment calling from various resolutions (bin_sizes):
## for each chromosome keep the smallest bin size whose consistency score is
## within 0.05 of the best, concatenate the corresponding per-chromosome
## sub-compartment bed files, and write summary tables plus a diagnostic plot.
##   save_dir  : output root (reads intermediate_data/sub_compartments/<binsize>/)
##   chrs      : chromosome names without the 'chr' prefix
##   bin_sizes : candidate bin sizes in bp
##   with_ref  : whether a reference ranking was used (affects plot threshold)
build_comp_table_opt = function(save_dir, chrs, bin_sizes, with_ref)
{

	`%dopar%` <- foreach::`%dopar%`
	`%do%` <- foreach::`%do%`

	## per chromosome: index of the smallest bin size whose score is within
	## 0.05 of the row maximum
	get_opt_index = function(consist_tab)
	{
		if(ncol(consist_tab)==2) ## if only one bin_size value
		{
			opt_index = rep(1, nrow(consist_tab))
			return(opt_index)
		}

		mins = apply(consist_tab[,-1], 1, max) - 0.05 ## allow at most 0.05 smaller than the maximum
		opt_index = apply(1*((consist_tab[,-1] >= mins)==1), 1, function(v) min(which(v==1)))
		return(opt_index)
	}

	################################

	identities = 'bulk'

	{
		consist_tab_li = foreach::foreach(identity=identities) %do%
		{
			## one column per bin size; the score is parsed from line 5 of the
			## per-chromosome log file
			consist_tab = foreach::foreach(bin_size=bin_sizes, .combine=merge) %do%
			{
				bin_size_kb = sprintf("%skb", bin_size/1E3)
				save_dir_binsize = file.path(save_dir, 'intermediate_data/sub_compartments', bin_size_kb)
				consist_tab_tmp = data.table::data.table(chr=paste0('chr', chrs), val=0)
				consist_tab_tmp$val = foreach::foreach(chr=chrs, .combine=c) %do%
				{
					log_file = paste0(save_dir_binsize, '/chr', chr, '_log.txt')
					cor_val = as.numeric(strsplit(readLines(log_file)[5], 'this chr is:')[[1]][2])
					cor_val
				}
				colnames(consist_tab_tmp)[2] = bin_size_kb
				consist_tab_tmp
			}

			## sort chromosomes numerically first, then alphabetically
			## https://stackoverflow.com/questions/70080294/sort-column-in-r-strings-first-alphabetically-then-numbers-numerically
			s = gsub('chr', '', consist_tab[['chr']])
			x <- suppressWarnings(as.numeric(s))
			consist_tab = consist_tab[order(x, 'is.na<-'(s, !is.na(x))), ]
			consist_tab
		}

		names(consist_tab_li) = identities

		min_consist = (sapply(consist_tab_li, function(v) min(v[,2])))

		################################ Choose the best bin size (as small as possible) and save the chosen compartments to the opt dir

		{
			## NOTE(review): `identity` is expected to still hold the last value
			## of the foreach loop variable above ('bulk') -- confirm; otherwise
			## this would resolve to base::identity.
			save_dir_opt = file.path(save_dir, "sub_compartments")
			dir.create(save_dir_opt, recursive=TRUE, showWarnings=FALSE)

			consist_tab = consist_tab_li[[identity]]
			opt_index = get_opt_index(consist_tab)
			names(opt_index) = gsub(":", "", consist_tab$chr)
			opt_bin_tab = data.table::data.table(chr=names(opt_index), opt_binsize=(colnames(consist_tab)[-1])[opt_index])

			consist_tab_tmp = cbind( consist_tab, opt_bin_tab[, 'opt_binsize'])
			colnames(consist_tab_tmp)[ncol(consist_tab_tmp)] = 'opt_binsize'

			cor_all_file = file.path(save_dir_opt, "cor_with_ref.ALL.txt")
			data.table::fwrite(consist_tab_tmp, file=cor_all_file, col.names=TRUE, sep="\t")

			## score of the chosen bin size, per chromosome
			cor_with_ref = foreach::foreach(j=1:nrow(opt_bin_tab), .combine=c) %do%
			{
				chr_query = opt_bin_tab$chr[j]
				opt_binsize = opt_bin_tab$opt_binsize[j]

				row_index = which(consist_tab$chr == chr_query)
				col_index = which(colnames(consist_tab)==opt_binsize)
				as.data.frame(consist_tab)[row_index, col_index] ## some weird behavior when using data.table. Convert to data.frame
			}

			cor_with_ref_tab = data.table::data.table(chr=consist_tab$chr, cor=cor_with_ref, chr_num=gsub("chr", "", opt_bin_tab$chr))

			## numeric-first chromosome ordering, as above
			s = cor_with_ref_tab$chr_num
			x <- suppressWarnings(as.numeric(s))
			cor_with_ref_tab = cor_with_ref_tab[order(x, 'is.na<-'(s, !is.na(x))), ]

			cor_opt_file = file.path(save_dir_opt, "cor_with_ref.txt")
			data.table::fwrite(cor_with_ref_tab[, 1:2], file=cor_opt_file, col.names=TRUE, sep="\t")

			################################ make plot

			cor_with_ref_tab$index = 1:nrow(cor_with_ref_tab)
			cor_opt_plot_file = file.path(save_dir_opt, "cor_with_ref.pdf")

			pdf(cor_opt_plot_file, width=8, height=6)
			p = ggplot2::ggplot(data=cor_with_ref_tab, ggplot2::aes(x=index, y=cor, label = format(round(cor_with_ref_tab$cor, 2), nsmall = 2))) + ggplot2::geom_line(color="red") + ggplot2::geom_point()
			p = p + ggplot2::geom_label() + ggplot2::scale_x_continuous(labels=cor_with_ref_tab$chr_num, breaks=1:nrow(cor_with_ref_tab))
			p = p + ggplot2::geom_hline(yintercept=ifelse(with_ref, 0.4, 0.15), linetype=2, color='darkblue')
			if(with_ref) p = p + ggplot2::xlab('chr') + ggplot2::ylab('Rho') + ggplot2::ggtitle('Correlation of compartment rank with reference\nRho < 0.4 indicates imprecise compartment calling')
			if(!with_ref) p = p + ggplot2::xlab('chr') + ggplot2::ylab('Rho') + ggplot2::ggtitle('Correlation of compartment rank with reference\nRho < 0.15 indicates imprecise compartment calling')
			print(p)
			dev.off()

			################################ concatenate the chosen per-chromosome bed files

			opt_sub_comp_beds = foreach::foreach(i=1:nrow(opt_bin_tab)) %do%
			{
				opt_bin_size_kb = opt_bin_tab[[2]][i]
				chr_name = opt_bin_tab[[1]][i]
				opt_sub_comp_bed_chr = sprintf("%s/%s_sub_compartments.bed", file.path(save_dir, 'intermediate_data/sub_compartments', opt_bin_size_kb), chr_name)
				opt_sub_comp_bed_chr
			}

			save_opt_file = file.path(save_dir_opt, "all_sub_compartments.bed")

			cmd_opt = sprintf("cat %s > %s", paste0(opt_sub_comp_beds, collapse=" "), save_opt_file)
			system(cmd_opt)

			comp_raw = data.table::fread(save_opt_file)
			comp_tab = make_comp_tab(comp_raw, bin_size=10E3)

			comp_tab = comp_tab[, c('chr', 'pos_start', 'pos_end', 'comp_name', 'comp_rank', 'continous_rank')]
			# comp_tab = comp_tab[, mget(cols2keep)] ## mget does not work for some reasons

			save_opt_tab_file = file.path(save_dir_opt, "all_sub_compartments.tsv")
			data.table::fwrite(comp_tab, file=save_opt_tab_file, sep='\t')
		}
	}
}

## ======================================================================
## R/call_domains.R
## ======================================================================
## Yuanlong LIU
## Scripts used to call domains having high inter-domain score while low intra-domain score. Adapted from TopDom with the following authorship

# @author : Hanjun Shin(shanjun@usc.edu)
# @credit : Harris Lazaris(Ph.D Student, NYU), Dr. 
Gangqing Hu(Staff Scientist, NIH) 6 | # @brief : TopDom.R is a software package to identify topological domains for given Hi-C contact matrix. 7 | # @version 0.0.2 8 | 9 | # @fn TopDom 10 | # @param matrixFile : string, matrixFile Address, 11 | # - Format = {chromosome, bin start, bin end, N numbers normalized value } 12 | # - N * (N + 3), where N is the number of bins 13 | # @param window.size :integer, number of bins to extend. 14 | # @param out_binSignal : string, binSignal file address to write 15 | # @param out_ext : string, ext file address to write 16 | 17 | 18 | 19 | remove_small_domains = function(domain_size_min, binSignal, domains) ## remove all domains smaller than 40E3, ADDED by Yuanlong, 2019-July-09 20 | { 21 | # binSignal = res_input_mat$binSignal 22 | # domains = domains_bp 23 | # domains = domains[, c('from.id', 'to.id', 'size')] 24 | while(1) 25 | { 26 | i = min(setdiff(which(domains$size < domain_size_min/10E3), 1)) ## do not include the first one 27 | if(i==Inf) break 28 | ids = domains[(i-1):i, 'to.id'] 29 | index2rm = ((i-1):i)[which.max(binSignal[ids, 'mean.cf'])] 30 | domains = domains[-index2rm, ] 31 | domains[-1, 'from.id'] = domains[-nrow(domains), 'to.id'] + 1 32 | domains$size = domains[, 'to.id'] - domains[, 'from.id'] + 1 33 | cat(nrow(domains), '\n') 34 | if(nrow(domains) < 10) break ## avoid exceptions 35 | } 36 | # domains[, 'chr'] = domains_bp[1, 'chr'] 37 | domains[, 'from.coord'] = domains[, 'from.id'] - 1 38 | domains[, 'to.coord'] = domains[, 'to.id'] 39 | rownames(domains) = 1:nrow(domains) 40 | # print(head(domains)) 41 | return(domains) 42 | } 43 | 44 | ## modified input file format and added several potentially useful parameter options, Yuanlong LIU 45 | TopDom_v2 <- function( A_extended, window.size, outFile=NULL, statFilter=T, p_thresh=0.05, return_peak_and_binsignal=FALSE, window.sizes=NULL, stat_window_size=NULL, domain_size_min=NULL) 46 | { 47 | if(!is.null(window.sizes)) window.size = window.sizes[1] 48 | 49 | # 
print("#########################################################################") 50 | # print("Step 0 : File Read ") 51 | # print("#########################################################################") 52 | window.size = as.numeric(window.size) 53 | matdf = A_extended 54 | 55 | if( ncol(matdf) - nrow(matdf) == 3) { 56 | colnames(matdf) <- c("chr", "from.coord", "to.coord") 57 | } else if( ncol(matdf) - nrow(matdf) ==4 ) { 58 | colnames(matdf) <- c("id", "chr", "from.coord", "to.coord") 59 | } else { 60 | print("Unknwon Type of matrix file") 61 | return(0) 62 | } 63 | n_bins = 1*nrow(matdf) 64 | mean.cf <- rep(0, n_bins) 65 | pvalue <- rep(1, n_bins) 66 | 67 | local.ext = rep(-0.5, n_bins) 68 | 69 | bins <- data.frame(id=1:n_bins, 70 | chr=matdf[, "chr"], 71 | from.coord=matdf[, "from.coord"], 72 | to.coord=matdf[, "to.coord"] ) 73 | 74 | matrix.data <- as.matrix( matdf[, (ncol(matdf) - nrow(matdf)+1 ):ncol(matdf)] ) 75 | 76 | 77 | ptm <- proc.time() 78 | 79 | ## Only compute for one track of signal 80 | if(is.null(window.sizes)){ 81 | for(i in 1:n_bins) 82 | { 83 | diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size) 84 | mean.cf[i] = mean(diamond) 85 | } 86 | } 87 | 88 | 89 | if(!is.null(window.sizes)) ## compute multiple tracks of signal 90 | { 91 | mean.cfs = matrix(, n_bins, length(window.sizes)) 92 | for(k in 1:length(window.sizes) ) 93 | { 94 | window.size.iter = window.sizes[k] 95 | for(i in 1:n_bins) 96 | { 97 | diamond = Get.Diamond.Matrix(mat.data=matrix.data, i=i, size=window.size.iter) 98 | mean.cfs[i, k] = mean(diamond) 99 | } 100 | } 101 | mean.cf = apply(mean.cfs, 1, mean) 102 | } 103 | 104 | 105 | eltm = proc.time() - ptm 106 | # print(paste("Step 1 Running Time : ", eltm[3])) 107 | # print("Step 1 : Done !!") 108 | 109 | # print("#########################################################################") 110 | # print("Step 2 : Detect TD boundaries based on binSignals") 111 | # 
print("#########################################################################") 112 | 113 | ptm = proc.time() 114 | #gap.idx = Which.Gap.Region(matrix.data=matrix.data) 115 | #gap.idx = Which.Gap.Region2(mean.cf) 116 | gap.idx = Which.Gap.Region2(matrix.data=matrix.data, window.size) 117 | 118 | proc.regions = Which.process.region(rmv.idx=gap.idx, n_bins=n_bins, min.size=3) 119 | 120 | #print(proc.regions) 121 | 122 | for( i in 1:nrow(proc.regions)) 123 | { 124 | start = proc.regions[i, "start"] 125 | end = proc.regions[i, "end"] 126 | 127 | # print(paste("Process Regions from ", start, "to", end)) 128 | 129 | local.ext[start:end] = Detect.Local.Extreme(x=mean.cf[start:end]) 130 | } 131 | 132 | local.ext_bp = local.ext 133 | 134 | eltm = proc.time() - ptm 135 | # print(paste("Step 2 Running Time : ", eltm[3])) 136 | # print("Step 2 : Done !!") 137 | 138 | if(statFilter) 139 | { 140 | # print("#########################################################################") 141 | # print("Step 3 : Statistical Filtering of false positive TD boundaries") 142 | # print("#########################################################################") 143 | 144 | if(is.null(stat_window_size)) stat_window_size=window.size 145 | ptm = proc.time() 146 | # print("-- Matrix Scaling....") 147 | scale.matrix.data = matrix.data 148 | 149 | ## This is to normalize each off diag values -- 2018-09-20 150 | # for( i in 1:(2*window.size) ) 151 | for( i in 1:(2*stat_window_size) ) 152 | { 153 | #diag(scale.matrix.data[, i:n_bins] ) = scale( diag( matrix.data[, i:n_bins] ) ) 154 | scale.matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] = scale( matrix.data[ seq(1+(n_bins*i), n_bins*n_bins, 1+n_bins) ] ) 155 | } 156 | 157 | # print("-- Compute p-values by Wilcox Ranksum Test") 158 | for( i in 1:nrow(proc.regions)) 159 | { 160 | start = proc.regions[i, "start"] 161 | end = proc.regions[i, "end"] 162 | 163 | # print(paste("Process Regions from ", start, "to", end)) 164 | # 
pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=window.size, scale=1) 165 | 166 | pvalue[start:end] <- Get.Pvalue(matrix.data=scale.matrix.data[start:end, start:end], size=stat_window_size, scale=1) 167 | } 168 | # print("-- Done!") 169 | 170 | # print("-- Filtering False Positives") 171 | local.ext[intersect( union(which( local.ext==-1), which(local.ext==-1)), which(pvalue= n_proc.set ) { 280 | proc.regions = rbind(proc.regions, c(start=start, end=proc.set[j-1]) ) 281 | break 282 | } 283 | } 284 | 285 | colnames(proc.regions) = c("start", "end") 286 | proc.regions <- proc.regions[ which( abs(proc.regions[,"end"] - proc.regions[, "start"]) >= min.size ), ] 287 | 288 | return(proc.regions) 289 | } 290 | 291 | # @fn Which.Gap.Region 292 | # @breif version 0.0.1 used 293 | # @param matrix.data : n by n matrix 294 | # @return gap index 295 | Which.Gap.Region <- function(matrix.data) 296 | { 297 | n_bins = nrow(matrix.data) 298 | gap = rep(0, n_bins) 299 | 300 | i=1 301 | while(i < n_bins) 302 | { 303 | j = i + 1 304 | while( j <= n_bins) 305 | { 306 | if( sum( matrix.data[i:j, i:j]) == 0 ) { 307 | gap[i:j] = -0.5 308 | j = j+1 309 | #if(j-i > 1) gap[i:j]=-0.5 310 | #j=j+1 311 | } else break 312 | } 313 | 314 | i = j 315 | } 316 | 317 | idx = which(gap == -0.5) 318 | return(idx) 319 | } 320 | 321 | # @fn Which.Gap.Region3 322 | # @param matrix.data : n by n matrix 323 | # @return gap index 324 | Which.Gap.Region3 <- function(mean.cf) 325 | { 326 | n_bins = length(mean.cf) 327 | gapidx = which(mean.cf==0) 328 | 329 | return(gapidx) 330 | } 331 | 332 | # @fn Which.Gap.Region2 333 | # @breif version 0.0.2 used 334 | # @param matrix.data : n by n matrix 335 | # @return gap index 336 | Which.Gap.Region2 <- function(matrix.data, w) 337 | { 338 | n_bins = nrow(matrix.data) 339 | gap = rep(0, n_bins) 340 | 341 | for(i in 1:n_bins) 342 | { 343 | if( sum( matrix.data[i, max(1, i-w):min(i+w, n_bins)] ) == 0 ) gap[i]=-0.5 344 | } 345 | 346 
| idx = which(gap == -0.5) 347 | return(idx) 348 | } 349 | 350 | # @fn Detect.Local.Extreme 351 | # @param x : original signal to find local minima 352 | # @return vector of local extrme, -1 if the index is local minimum, 1 if the index is local maxima, 0 otherwise. 353 | Detect.Local.Extreme <- function(x) 354 | { 355 | n_bins = length(x) 356 | ret = rep(0, n_bins) 357 | x[is.na(x)]=0 358 | 359 | if(n_bins <= 3) 360 | { 361 | ret[which.min(x)]=-1 362 | ret[which.max(x)]=1 363 | 364 | return(ret) 365 | } 366 | # Norm##################################################3 367 | new.point = Data.Norm(x=1:n_bins, y=x) 368 | x=new.point$y 369 | ################################################## 370 | cp = Change.Point(x=1:n_bins, y=x) 371 | 372 | if( length(cp$cp) <= 2 ) return(ret) 373 | if( length(cp$cp) == n_bins) return(ret) 374 | for(i in 2:(length(cp$cp)-1)) 375 | { 376 | if( x[cp$cp[i]] >= x[cp$cp[i]-1] && x[cp$cp[i]] >= x[cp$cp[i]+1] ) ret[cp$cp[i]] = 1 377 | else if(x[cp$cp[i]] < x[cp$cp[i]-1] && x[cp$cp[i]] < x[cp$cp[i]+1]) ret[cp$cp[i]] = -1 378 | 379 | min.val = min( x[ cp$cp[i-1] ], x[ cp$cp[i] ] ) 380 | max.val = max( x[ cp$cp[i-1] ], x[ cp$cp[i] ] ) 381 | 382 | if( min( x[cp$cp[i-1]:cp$cp[i]] ) < min.val ) ret[ cp$cp[i-1] - 1 + which.min( x[cp$cp[i-1]:cp$cp[i]] ) ] = -1 383 | if( max( x[cp$cp[i-1]:cp$cp[i]] ) > max.val ) ret[ cp$cp[i-1] - 1 + which.max( x[cp$cp[i-1]:cp$cp[i]] ) ] = 1 384 | } 385 | 386 | return(ret) 387 | } 388 | 389 | # @fn Data.Norm 390 | # @param x : x axis vector 391 | # @param x : y axis vector 392 | # @return list of normalized x and y 393 | Data.Norm <- function(x, y) 394 | { 395 | ret.x = rep(0, length(x)) 396 | ret.y = rep(0, length(y)) 397 | 398 | ret.x[1] = x[1] 399 | ret.y[1] = y[1] 400 | 401 | diff.x = diff(x) 402 | diff.y = diff(y) 403 | 404 | scale.x = 1 / mean( abs(diff(x) ) ) 405 | scale.y = 1 / mean( abs( diff(y) ) ) 406 | 407 | #print(scale.x) 408 | #print(scale.y) 409 | 410 | for(i in 2:length(x)) 411 | { 412 | ret.x[i] = 
ret.x[i-1] + (diff.x[i-1]*scale.x) 413 | ret.y[i] = ret.y[i-1] + (diff.y[i-1]*scale.y) 414 | } 415 | 416 | return(list(x=ret.x, y=ret.y)) 417 | } 418 | 419 | # @fn Change.Point 420 | # @param x : x axis vector 421 | # @param x : y axis vector 422 | # @return change point index in x vector, 423 | # Note that the first and the last point will be always change point 424 | Change.Point <- function( x, y ) 425 | { 426 | if( length(x) != length(y)) 427 | { 428 | print("ERROR : The length of x and y should be the same") 429 | return(0) 430 | } 431 | 432 | n_bins <- length(x) 433 | Fv <- rep(NA, n_bins) 434 | Ev <- rep(NA, n_bins) 435 | cp <- 1 436 | 437 | i=1 438 | Fv[1]=0 439 | while( i < n_bins ) 440 | { 441 | j=i+1 442 | Fv[j] = sqrt( (x[j]-x[i])^2 + (y[j] - y[i] )^2 ) 443 | 444 | while(j0) 581 | { 582 | from.coord = bins[proc.region[, "start"]+1, "from.coord"] 583 | boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F) 584 | ret = rbind(ret, boundary) 585 | } 586 | 587 | ret = rbind(gap, domain) 588 | ret = ret[order(ret[,3]), ] 589 | 590 | ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"]) 591 | ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] ) 592 | ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"]) 593 | ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"] ## HERE THE SIZE IS COMPUTED / Yuanlong LIU 594 | 595 | if(!is.null(pvalues) && !is.null(pvalue.cut)) 596 | { 597 | for(i in 1:nrow(ret)) 598 | { 599 | if(ret[i, "tag"]=="domain") 600 | { 601 | domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"] 602 | p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut ) 603 | 604 | if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary" 605 | } 606 | } 607 | } 608 | 609 | new.bdr.set = data.frame(chr=character(0), 
from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
	stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))

	## Merge runs of consecutive "boundary" rows into a single boundary record spanning their full extent
	i=1
	while(i <= nrow(ret))
	{
		if( ret[i, "tag"] == "boundary" )
		{
			stack.bdr = rbind(stack.bdr, ret[i, ])
		} else if(nrow(stack.bdr)>0) {
			## Collapse the stacked boundary rows into one merged row
			new.bdr = data.frame(chr=bins[1, "chr"],
						from.id = min( stack.bdr[, "from.id"]),
						from.coord=min(stack.bdr[, "from.coord"]),
						to.id = max( stack.bdr[, "to.id"]),
						to.coord=max(stack.bdr[, "to.coord"]),
						tag="boundary",
						size=max(stack.bdr[, "to.coord"]) - min(stack.bdr[, "from.coord"]))
			new.bdr.set = rbind(new.bdr.set, new.bdr)
			stack.bdr = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
		}

		i = i + 1
	}

	## Replace the individual boundary rows by the merged ones and restore genomic order
	ret = rbind( ret[ ret[, "tag"]!="boundary", ], new.bdr.set )
	ret = ret[order(ret[, "to.coord"]), ]

	return(ret)
}


# @fn Convert.Bin.To.Domain.TMP
# @param bins : bin information (data.frame with chr / from.coord / to.coord columns)
# @param signal.idx : signal (boundary-candidate) bin indices
# @param gap.idx : gap bin indices
# @param pvalues : pvalue vector (optional)
# @param pvalue.cut : pvalue threshold (optional)
# @return dataframe storing domain information, one row per gap/domain segment
Convert.Bin.To.Domain.TMP <- function(bins, signal.idx, gap.idx, pvalues=NULL, pvalue.cut=NULL)
{
	n_bins = nrow(bins)
	ret = data.frame(chr=character(0), from.id=numeric(0), from.coord=numeric(0), to.id=numeric(0), to.coord=numeric(0), tag=character(0), size=numeric(0))
	levels( x=ret[, "tag"] ) = c("domain", "gap", "boundary")  # NOTE(review): no-op on a character column (stringsAsFactors is FALSE elsewhere) -- kept from original TopDom-derived code

	## Gap segments: complements of gap.idx are removed, remaining runs are gaps
	rmv.idx = setdiff(1:n_bins, gap.idx)
	proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
	from.coord = bins[proc.region[, "start"], "from.coord"]
	n_procs = nrow(proc.region)
	gap = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("gap", n_procs), size=rep(0, n_procs), stringsAsFactors=F)

	## Domain segments: runs outside both signal and gap bins
	rmv.idx = union(signal.idx, gap.idx)
	proc.region = Which.process.region(rmv.idx, n_bins, min.size=0)
	n_procs = nrow(proc.region)
	from.coord = bins[proc.region[, "start"], "from.coord"]
	domain = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("domain", n_procs), size=rep(0, n_procs), stringsAsFactors=F)

	## Boundary segments: runs of signal bins (min.size=1)
	rmv.idx = setdiff(1:n_bins, signal.idx)
	proc.region = as.data.frame( Which.process.region(rmv.idx, n_bins, min.size=1) )
	n_procs = nrow(proc.region)
	if(n_procs>0)
	{
		from.coord = bins[proc.region[, "start"]+1, "from.coord"]
		boundary = data.frame(chr=rep( bins[1, "chr"], n_procs), from.id=rep(0, n_procs), from.coord=from.coord, to.id=rep(0, n_procs), to.coord=rep(0, n_procs), tag=rep("boundary", n_procs), size=rep(0, n_procs), stringsAsFactors=F)
		ret = rbind(ret, boundary)
	}

	## NOTE(review): this overwrites ret and drops the boundary rows appended above;
	## the sibling Convert.Bin.To.Domain has the same pattern -- confirm against upstream TopDom
	ret = rbind(gap, domain)
	ret = ret[order(ret[,3]), ]

	## Each segment ends where the next one starts; the last ends at the last bin
	ret[, "to.coord"] = c(ret[2:nrow(ret), "from.coord"], bins[n_bins, "to.coord"])
	ret[, "from.id"] = match( ret[, "from.coord"], bins[, "from.coord"] )
	ret[, "to.id"] = match(ret[, "to.coord"], bins[, "to.coord"])
	ret[, "size"] = ret[,"to.coord"]-ret[,"from.coord"]

	## Re-tag domains whose bins are ALL significant (pvalue < cutoff) as boundaries
	if(!is.null(pvalues) && !is.null(pvalue.cut))
	{
		for(i in 1:nrow(ret))
		{
			if(ret[i, "tag"]=="domain")
			{
				domain.bins.idx = ret[i, "from.id"]:ret[i, "to.id"]
				p.value.constr = which( pvalues[ domain.bins.idx ] < pvalue.cut )

				if( length(domain.bins.idx) == length(p.value.constr)) ret[i, "tag"] = "boundary"
			}
		}
	}

	return(ret)
}
--------------------------------------------------------------------------------
/R/compartment_PCA.R:
--------------------------------------------------------------------------------

## Compute the requested principal components of `mat` quickly:
## eigendecompose crossprod(mat) (= t(A) %*% A) with rARPACK, then project back.
## @param mat   numeric matrix
## @param which integer vector of PC indices to return
## @return matrix of the selected PC score columns
get_PCs = function( mat, which ) ## fast way to compute PCs
{
	PC_mat = crossprod(mat)
	res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=max(which), which = "LM" )
	if(any(res_eigs_sym$values <0)) stop('Non-positive eigenvalues for A^2')
	PCs = mat%*%(res_eigs_sym$vectors)
	return( PCs[, which] )
}


## Compute PC1/PC2 of the correlation matrix of A_oe and write alternating A/B
## compartment calls (switch points where PC1 changes sign) to a bed file.
## NOTE(review): reads globals `bin_size`, `chr` and `sub_folder` from the calling
## environment -- they are not parameters of this function.
PC_compartment <- function(A_oe) ## compute PC and define A/B compartment. Not used currently
{
	cA_oe = fast_cor(A_oe)
	PC_mat = crossprod(cA_oe)
	res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=2, which = "LM" )
	PC1 = cA_oe%*%res_eigs_sym$vectors[,1]
	PC2 = cA_oe%*%res_eigs_sym$vectors[,2]

	## Compartment borders are where the sign of PC1 flips
	borders = which(diff(1*(PC1 > 0))!=0)
	to_id = as.numeric(rownames(A_oe)[borders])
	from_id = as.numeric(rownames(A_oe)[c(1, head(borders, length(borders)-1)+1)])

	start_poses = (from_id-1)*bin_size + 1
	end_poses = to_id*bin_size

	compartment_AB = data.frame(chr=paste0('chr', chr), start_poses=start_poses, end_poses=end_poses, A_or_B=NA, zero=0, dot='.', start_poses_2=start_poses, end_poses_2=end_poses, col=NA)
	compartment_AB[which((1:nrow(compartment_AB))%%2==0), 'A_or_B'] = 'A'
	compartment_AB[which((1:nrow(compartment_AB))%%2==1), 'A_or_B'] = 'B'

	compartment_AB[which((1:nrow(compartment_AB))%%2==0), 'col'] = '112,128,144'
	compartment_AB[which((1:nrow(compartment_AB))%%2==1), 'col'] = '255,255,0'

	compartments_bed_files = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB_2', '.bed')
	write.table( compartment_AB, file=compartments_bed_files, quote=FALSE, row.names=FALSE, col.names=FALSE, sep=' ' )
}

## Compute PC1 of the correlation matrix of A_oe and save it as an Rdata file
## (no compartment calling). The output filename encodes `downsratio` when given.
## NOTE(review): reads globals `sub_folder` and `chr` -- not parameters.
PC_compartment_slim <- function(A_oe, downsratio=NULL) ## compute and save the PC values only. Not used currently
{
	cA_oe = fast_cor(A_oe)
	PC_mat = crossprod(cA_oe)
	res_eigs_sym = rARPACK::eigs_sym( PC_mat, k=2, which = "LM" )
	PC1 = cA_oe%*%res_eigs_sym$vectors[,1]
	PC2 = cA_oe%*%res_eigs_sym$vectors[,2]  # NOTE(review): PC2 is computed but never used here

	compartment_AB = data.frame(PC1=PC1, bin_names=rownames(A_oe))

	if( is.null(downsratio) ) compartments_AB_file = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB.Rdata')
	if( !is.null(downsratio) ) compartments_AB_file = paste0(sub_folder, '/chr', chr, '_compartments_PCA_AB_downsratio_', downsratio, '.Rdata')

	save( compartment_AB, file=compartments_AB_file )
}


--------------------------------------------------------------------------------
/R/compartment_data_generation_fun.R:
--------------------------------------------------------------------------------

## Compress a matrix by combining groups of `compress_size` consecutive columns
## via multiplication with a 0/1 block matrix, then transpose.
## NOTE(review): the commented-out pure-R variant divides by compress_size (an
## average) while the active cpp call does not -- confirm which is intended.
compress_mat_fast_tmp = function(input_mat, compress_size)
{
	mat_block <- function(n, r) suppressWarnings( matrix(c(rep(1, r), rep(0, n)), n, n/r) )
	n = ncol(input_mat)
	mat2prod = mat_block(n, compress_size)
	# return(t(input_mat%*%mat2prod / compress_size))
	return(t(matrix_multiplication_cpp(input_mat, mat2prod)))
}

## Rebin a contact matrix from a smaller bin size to a larger one (e.g. 10kb -> 40kb):
## trim to a multiple of the compression factor, then compress rows and columns.
mat_10to40kb = function(mat2compress, bin_size2look, bin_size_input) ## small bin size to bigger bin_size
{
	compress_size = bin_size2look / bin_size_input
	len = nrow(mat2compress) - nrow(mat2compress)%%compress_size
	mat2compress = mat2compress[1:len, 1:len]
	c_mat_compressed = compress_mat_fast_tmp( as.matrix(mat2compress), compress_size=compress_size )
	mat_compressed = compress_mat_fast_tmp( c_mat_compressed, compress_size=compress_size )
	rownames(mat_compressed) = colnames(mat_compressed) = as.character( 1:nrow(mat_compressed) )
	return(mat_compressed)
}

contact_mat_processing_v2 = function(contact_tab_dump=NULL, contact_file_dump=NULL, contact_file_hic=NULL, chr_num, bin_size_input,
bin_size2look, black_list_bins=NULL)
{
	## Load an intra-chromosomal contact list (in-memory dump table, dump file, or .hic
	## file), build a symmetric sparse matrix, optionally rebin to bin_size2look, zero out
	## blacklisted bins, drop overly sparse rows/cols, and derive the atanh-transformed
	## double-correlation score matrix used downstream.
	##
	## @param contact_tab_dump  data.frame/data.table in strawr dump format (pos1, pos2, value)
	## @param contact_file_dump path of a dump-format file readable by data.table::fread
	## @param contact_file_hic  path of a .hic file (read via strawr)
	## @param chr_num           chromosome to query (without the 'chr' prefix)
	## @param bin_size_input    resolution of the input data
	## @param bin_size2look     resolution of the analysis (>= bin_size_input)
	## @param black_list_bins   bin names whose rows/columns are zeroed out
	## @return list(mat_dense=..., atanh_score=...)
	compress_size = ifelse(bin_size2look < 40E3, 1, 1)  # NOTE(review): both branches yield 1 -- confirm the intended non-trivial value
	zero_ratio = 0.01


	if(!is.null(contact_tab_dump)) contact_mat_raw = contact_tab_dump ## if contact matrix in data.frame or data.table of strawr format is provided
	if(!is.null(contact_file_dump)) contact_mat_raw = data.table::fread(contact_file_dump) ## if contact matrix in strawr format is provided

	if(!is.null(contact_file_hic)) ## if contact matrix in hic format is provided
	{
		bin_sizes_in_hic = strawr::readHicBpResolutions(contact_file_hic)
		chrs_in_hic = as.vector(strawr::readHicChroms(contact_file_hic)[["name"]])
		chr2query = chrs_in_hic[match(toupper(chr_num), gsub('chr', '', toupper(chrs_in_hic), ignore.case=TRUE))]

		if(!(bin_size_input %in% bin_sizes_in_hic)) stop(sprintf('Your provided hic file only contains resolutions of: %s', paste0(bin_sizes_in_hic, collapse=' ')))
		if(!(chr2query %in% chrs_in_hic)) stop(sprintf('Your provided hic file only contains chrs of: %s', paste0(chrs_in_hic, collapse=' ')))

		## try different normalization (KR, then VC_SQRT, then VC) until one yields data.
		## BUGFIX: the original tested `class(x)=='try-error'`, which compares against the
		## full class vector (length 2 for data.frame results from straw) and is an invalid
		## `if` condition in R >= 4.2; use inherits() with short-circuit || instead.
		contact_mat_raw = try(strawr::straw("KR", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
		if(inherits(contact_mat_raw, 'try-error') || nrow(na.omit(contact_mat_raw)) < 100) contact_mat_raw = try(strawr::straw("VC_SQRT", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
		if(inherits(contact_mat_raw, 'try-error') || nrow(na.omit(contact_mat_raw)) < 100) contact_mat_raw = try(strawr::straw("VC", contact_file_hic, as.character(chr2query), as.character(chr2query), "BP", bin_size_input))
		if(inherits(contact_mat_raw, 'try-error') || nrow(na.omit(contact_mat_raw)) < 100) stop(sprintf('Your provided hic file does not contain information given the bin_size=%s and any of the normalization method KR/VC/VC_SQRT', bin_size_input))
		contact_mat_raw = data.table::as.data.table(contact_mat_raw)
	}

	colnames(contact_mat_raw) = c('pos_1', 'pos_2', 'val')

	## Convert genomic coordinates to 0-based bin indices
	contact_mat = subset(contact_mat_raw, !is.na(val))
	contact_mat[,1] = contact_mat[,1]/bin_size_input
	contact_mat[,2] = contact_mat[,2]/bin_size_input

	if(!all(contact_mat[[2]] >= contact_mat[[1]])) stop('\nYour provided matrix does not represent an upper triangular matrix!\n\n')

	n_bins = max(max(contact_mat[[1]]), max(contact_mat[[2]])) + 1 ## should +1 because contact_mat index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19)
	mat_sparse = Matrix::Matrix(0, nrow=n_bins, ncol=n_bins)
	mat_sparse[cbind(contact_mat[[1]]+1, contact_mat[[2]]+1)] <- contact_mat[[3]]

	rownames(mat_sparse) = colnames(mat_sparse) = as.character( 1:nrow(mat_sparse) )

	########################################################## remove black listed regions

	if(length(black_list_bins) > 0)
	{
		black_list_bins = intersect(as.character(black_list_bins), rownames(mat_sparse))
		if(length(black_list_bins) > 0) mat_sparse[black_list_bins, ] = mat_sparse[, black_list_bins] = 0
	}

	########################################################## remove bins having too sparse contacts

	mat_sparse = Matrix::forceSymmetric(mat_sparse, uplo='U')
	if(bin_size2look!=bin_size_input) mat_sparse = mat_10to40kb( mat_sparse, bin_size2look, bin_size_input )
	mat_dense = remove_blank_cols(mat_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A
	if(nrow(mat_dense) < 100) mat_dense = remove_blank_cols(mat_sparse, sparse=TRUE, ratio=0) ## when all are dense
	while(min(apply(mat_dense, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0
	{
		mat_dense = remove_blank_cols(mat_dense, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A
		if(nrow(mat_dense) < 1) stop('Error in generating mat_dense at the data generating step')
	}

	##########################################################

	nrow2keep = nrow(mat_dense) - nrow(mat_dense)%%compress_size
	mat_dense_2_compress = mat_dense[, 1:nrow2keep]

	bin_names = rownames(mat_dense)

	mat_dense_compressed = compress_mat_fast( as.matrix(mat_dense_2_compress), compress_size=compress_size )
	colnames(mat_dense_compressed) = bin_names
	rm(mat_dense_2_compress); gc()

	# range(mat_dense_compressed)
	# # sum(mat_dense_compressed > 1000)
	# # mat_dense_compressed[mat_dense_compressed > 1000] = 1000
	# mat_dense_compressed_sparse = mat_dense_compressed
	mat_dense_compressed = as.matrix(mat_dense_compressed)
	mat_dense_compressed_log = log2(mat_dense_compressed + 1)

	# #########################################################
	# cat('compute correlation matrix ... ')

	## Correlation of the log matrix, then correlation of that correlation matrix
	cmat_dense_compressed_log = fast_cor(mat_dense_compressed_log)
	ccmat_dense_compressed_log = fast_cor(cmat_dense_compressed_log)

	# cat('compute correlation matrix done ... ')

	# #########################################################
	# # ccmat_dense_compressed_atanh = atanh(ccmat_dense_compressed - 1E-7)
	## Fisher z-transform; divide by (1+1E-7) to keep values strictly inside (-1, 1)
	ccmat_dense_compressed_log_atanh = atanh(ccmat_dense_compressed_log / (1+1E-7))

	# rm(mat_dense_compressed, mat_dense_compressed_sparse, cmat_dense_compressed_log)
	gc()
	# #########################################################
	# cat('ready to compute compartment domains\n')

	out = list(mat_dense=mat_dense, atanh_score=ccmat_dense_compressed_log_atanh)

	return(out)
}



# if(!file.exists(contact_mat_file)) contact_mat_file = paste0('/mnt/ndata/Yuanlong/2.Results/1.Juicer/', CELL_LINE, '/contact_mat/mat_chr', chr, '_', bin_size_initial_kb, 'kb_ob.txt.gz')

## bin_size_initial is the binsize of your input matrix, can be different from the bin_size of your planned analysis
# contact_mat_processing = function(contact_mat_file, bin_size, bin_size_initial=bin_size)
# {

#	compress_size = ifelse(bin_size < 40E3, 1, 1)
#	zero_ratio = 0.01

#	combined_xk_oe_raw = data.table::fread(contact_mat_file)

#	## this code generates the compartment domains

#	combined_xk_oe_raw = subset(combined_xk_oe_raw, !is.na(V3))
#	combined_xk_oe_raw[,1] = combined_xk_oe_raw[,1]/bin_size_initial
#	combined_xk_oe_raw[,2] = combined_xk_oe_raw[,2]/bin_size_initial
#	combined_xk_oe = combined_xk_oe_raw

#	colnames(combined_xk_oe) = c('pos_1', 'pos_2', 'val')
#	if(!all(combined_xk_oe[[2]] >= combined_xk_oe[[1]])) stop('\nYou provided matrix does not represent an upper triangular matrix!\n\n')

#	oe_size = max(max(combined_xk_oe[[1]]), max(combined_xk_oe[[2]])) + 1 ## should +1 because combined_xk_oe index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19)
#	mat_oe_sparse = Matrix::Matrix(0, nrow=oe_size, ncol=oe_size)
#	mat_oe_sparse[cbind(combined_xk_oe[[1]]+1, combined_xk_oe[[2]]+1)] <- combined_xk_oe[[3]]

#	rownames(mat_oe_sparse) = colnames(mat_oe_sparse) = as.character( 1:nrow(mat_oe_sparse) )

#	mat_oe_sparse = Matrix::forceSymmetric(mat_oe_sparse, uplo='U')
#	if(bin_size!=bin_size_initial) mat_oe_sparse = mat_10to40kb( mat_oe_sparse, bin_size, bin_size_initial )
#	A_oe = remove_blank_cols(mat_oe_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A
#	if(nrow(A_oe) < 100) A_oe = remove_blank_cols(mat_oe_sparse, sparse=TRUE, ratio=0) ## when all are dense
#	while(min(apply(A_oe, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0
#	{
#	A_oe = remove_blank_cols(A_oe, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A
#	if(nrow(A_oe) < 1) stop('ERROR IN GENERATING MEANINGFUL A_oe at the data generating step')
#	}

#	##########################################################

#	len = nrow(A_oe) - nrow(A_oe)%%compress_size
#	A_oe_2_compress = A_oe[, 1:len]

#	bin_names = rownames(A_oe)

#	A_oe_compressed = compress_mat_fast( as.matrix(A_oe_2_compress), compress_size=compress_size )
#	colnames(A_oe_compressed) = bin_names
#	rm(A_oe_2_compress); gc()

#	range(A_oe_compressed)
#	# # sum(A_oe_compressed > 1000)
#	# # A_oe_compressed[A_oe_compressed > 1000] = 1000
#	A_oe_compressed_sparse = A_oe_compressed
#	A_oe_compressed = as.matrix(A_oe_compressed)
#	A_oe_compressed_log = log2(A_oe_compressed + 1)

#	# #########################################################
#	# cat('compute correlation matrix ... ')

#	cA_oe_compressed_log = fast_cor(A_oe_compressed_log)
#	ccA_oe_compressed_log = fast_cor(cA_oe_compressed_log)

#	# cat('compute correlation matrix done ... ')

#	# #########################################################
#	# # ccA_oe_compressed_atanh = atanh(ccA_oe_compressed - 1E-7)
#	ccA_oe_compressed_log_atanh = atanh(ccA_oe_compressed_log / (1+1E-7))

#	# rm(A_oe_compressed, A_oe_compressed_sparse, cA_oe_compressed_log)
#	gc()
#	# #########################################################
#	# cat('ready to compute compartment domains\n')

#	out = list(A_oe=A_oe, atanh_score=ccA_oe_compressed_log_atanh)

#	return(out)
# }


--------------------------------------------------------------------------------
/R/generate_compartments_bed_fun.R:
--------------------------------------------------------------------------------
## Yuanlong LIU, 09-08-2018

## Run TopDom_v2 on `input_mat` and write the resulting compartment starts/ends
## as a 3-column bed file (chr, start, end); returns the full TopDom result.
## @param input_mat        matrix with rownames giving original bin ids
## @param p_thresh         p-value threshold forwarded to TopDom_v2
## @param out_file_name    output bed path, or NULL to skip writing
## @param chr              chromosome id (without 'chr' prefix)
## @param window.sizes     window sizes forwarded to TopDom_v2
## @param stat_window_size statistics window size forwarded to TopDom_v2
## @param bin_size         bin size in bp, used to convert bin ids to coordinates
generate_compartments_bed <- function(input_mat, p_thresh, out_file_name, chr, window.sizes=3, stat_window_size=NULL, bin_size)
{

	input_mat_extended = data.frame(chr=paste0('chr', chr), pos_start=0:(nrow(input_mat)-1), pos_end=1:nrow(input_mat), mat=input_mat)

	res_input_mat = TopDom_v2(input_mat_extended, window.size=NULL, NULL, T, p_thresh=p_thresh, window.sizes=window.sizes, stat_window_size=stat_window_size, domain_size_min=NULL)
	cat('[', as.character(chr),'] Computing compartments\n')
	# return(res_input_mat)
	to_id = as.numeric(rownames(input_mat)[res_input_mat$domain$to.id])
	from_id = as.numeric(rownames(input_mat)[res_input_mat$domain$from.id])

	start_poses = (from_id-1)*bin_size + 1
	# end_poses = start_poses + n2one*bin_size
	end_poses = start_poses + bin_size

	input_mat_compartments_bed = data.frame(paste0('chr', chr), as.character(start_poses), as.character(end_poses) )
	if(!is.null(out_file_name)) write.table( input_mat_compartments_bed, file=out_file_name, quote=FALSE, row.names=FALSE, col.names=FALSE, sep=' ' )
	return( res_input_mat )
}
--------------------------------------------------------------------------------
/R/post_processing_nested_domains.R:
--------------------------------------------------------------------------------

## Return the index of the initial cluster (compartment segment) whose genomic
## interval strictly contains `pos`.
get_cluster_index <- function(pos, initial_clusters, bin_names, bin_size)
{
	compartment_segs = generate_compartment_segs( initial_clusters )
	CDs = get_original_tad_indices( bin_names, compartment_segs, bin_size=bin_size )
	cluster_index = which(apply( CDs, 1, function(v) (v[1] < pos)&(v[2]>pos) )==1)
	return(cluster_index)
}

## Build zero-width TAD records anchored at the genomic end positions of the
## extra edges (start = end + 1, so each record marks a single boundary point).
get_original_tad_indices_extra <- function(names_A_final, TADs, bin_size) ## to get the TADs for the extra edges
{
	# start_pos = as.numeric(names_A_final[TADs$start_pos])
	end_pos = as.numeric(names_A_final[TADs$end_pos])
	# start_pos_ori = (start_pos - 1)*bin_size + 1
	end_pos_ori = end_pos*bin_size
	TADs = data.frame( start_pos_ori=end_pos_ori+1, end_pos_ori=end_pos_ori )
	return( TADs )
}


## Decorate each inner-domain tree of `sub_domains_raw` with test statistics via
## get_tree_decoration; segments whose decoration fails are replaced by a trivial
## single-node 'root' tree spanning the segment.
## @param sub_domains_raw list with pA_sym, segmentss and res_inner components
## @param ncores          NOTE(review): currently unused -- the loop runs with %do% (sequential)
## @param remove_zero     forwarded to correct_A_fast_divide_by_mean
## @param distr           distribution name forwarded to get_tree_decoration
## @param A_already_corrected when TRUE, skip the matrix correction step
## @return list of decorated igraph trees, one per inner segment
LikelihoodRatioTest <- function(sub_domains_raw, ncores, remove_zero=FALSE, distr, n_parameters=3, imputation_num=1E2, A_already_corrected=FALSE)
{
	`%dopar%` <- foreach::`%dopar%`
	`%do%` <- foreach::`%do%`

	pA_sym = sub_domains_raw$pA_sym
	if(A_already_corrected==TRUE) cpA_sym = pA_sym
	## CAN SPEEDUP correct_A_fast_divide_by_mean BECAUSE ONLY SOME OFF-DIAGNAL LINES NEED TO BE CORRECTED
	if(A_already_corrected==FALSE) cpA_sym = correct_A_fast_divide_by_mean(pA_sym, remove_zero=remove_zero) ## corrected pA_sym
	trees = foreach::foreach(j=1:length( sub_domains_raw$res_inner )) %do%
	{
		name_index = rownames(sub_domains_raw$pA_sym)[sub_domains_raw$segmentss[j,1]:sub_domains_raw$segmentss[j,2]]
		sub_domains_raw$res_inner[[j]]$cA = as.matrix(cpA_sym[name_index, name_index]) ## the corrected A. Use this matrix is essential for reducing the distance-based false positves

		## for example, when the number of invovled bins in sub_domains_raw$res_inner[[j]] is too small
		# tmp = try(get_tree_decoration( sub_domains_raw$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num ))
		tmp = get_tree_decoration( sub_domains_raw$res_inner[[j]], distr=distr, n_parameters=n_parameters, imputation_num=imputation_num )
		# if( class(tmp)=="try-error" )
		if(class(tmp)!='igraph') ## if(tmp=='bad_tree')
		{
			## Fall back to a one-node tree covering the whole segment
			root_tree = igraph::graph.empty() + 'root'
			igraph::V(root_tree)$width = nrow(sub_domains_raw$res_inner[[j]]$A)
			igraph::V(root_tree)$left = 1
			igraph::V(root_tree)$right = igraph::V(root_tree)$width
			tmp = root_tree ## represented by the length of the segment
		}
		tmp
	}
	return(trees)
}


# pipline <- function(chr, TADs_raw, TADs_type, just_save_no_nest=FALSE, bin_size) ## TADs_tmp: start end
## Convert bin-index TADs to genomic coordinates and prepend the chromosome name.
## NOTE(review): when just_save_no_nest=TRUE this saves to `Tads_R_File`, which is
## not defined in this function or its arguments (free variable) -- it will error
## unless defined in the calling environment; TADs are also computed twice then.
create_TADs <- function(sub_domains_raw, chr, TADs_raw, TADs_type, just_save_no_nest=FALSE, bin_size) ## TADs_tmp: start end
{
	chr_name = paste0('chr', chr)
	bin_size_kb = bin_size / 1E3

	if(just_save_no_nest)
	{
		TADs = get_original_tad_indices( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )
		TADs = cbind(chr_name, TADs)
		save( TADs, file=Tads_R_File )
	}
	TADs = get_original_tad_indices( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )
	# if(TADs_type=='extra') TADs = get_original_tad_indices_extra( rownames(sub_domains_raw$pA_sym), TADs_raw, bin_size=bin_size_kb*1E3 )

	# print(TADs)

	TADs = cbind(chr_name, TADs)
	return(TADs)
}

#####################################################################

## INDEED, WHEN RESOLUTION == 40KB, DO NOT REMOVE
## Drop nested TAD boundaries that lie within 40kb of (but not exactly on) a
## compartment-domain boundary, on both the start and end sides.
clean_TADs_all = function(TADs_all_raw, CDs, bin_size)
{
TADs_all_end = TADs_all_raw[,2] 80 | CD_end = CDs[,2] 81 | outer_mat = outer(TADs_all_end, CD_end, "-") 82 | min_dist = apply(outer_mat, 1, function(v) min(abs(v))) 83 | # end2rm = TADs_all_end[(min_dist <= 3) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary, at least 40kb 84 | end2rm = TADs_all_end[(min_dist < 40E3/bin_size) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary, at least 40kb 85 | ## INDEED, WHEN RESOLUTION == 40KB, DO NOT REMOVE 86 | 87 | 88 | TADs_all_head = TADs_all_raw[,1] 89 | CD_head = CDs[,1] 90 | outer_mat = outer(TADs_all_head, CD_head, "-") 91 | min_dist = apply(outer_mat, 1, function(v) min(abs(v))) 92 | head2rm = TADs_all_head[(min_dist < 40E3/bin_size) & (min_dist > 0)] ## remove nested boundaris too close to CD boundary 93 | TADs_all = subset(TADs_all_raw, (!(start_pos %in% head2rm)) & (!(end_pos %in% end2rm)) ) 94 | return(TADs_all) 95 | } 96 | 97 | post_process_sub_domains = function(chr, sub_domains_raw, ncores, out_dir, bin_size) 98 | { 99 | 100 | distr=c('lnorm', 'wilcox')[2] 101 | remove_zero = FALSE 102 | n_parameters = 3 103 | imputation_num = 1E2 104 | A_already_corrected = FALSE 105 | 106 | decorated_branches = LikelihoodRatioTest(sub_domains_raw=sub_domains_raw, ncores=ncores, remove_zero=FALSE, distr=distr, n_parameters=n_parameters, imputation_num=imputation_num, A_already_corrected=A_already_corrected) 107 | chr_name = paste0('chr', chr) 108 | 109 | 110 | mean_diff_thresh = -0.1 111 | i = 1 112 | # for(i in 1:length(p0s)) 113 | { 114 | # cat(i, '\n') 115 | # p0 = p0s[i] 116 | 117 | normal_decorated_branches = decorated_branches[sapply(decorated_branches, igraph::vcount) > 1] 118 | # ps = sort(unlist(lapply(normal_decorated_branches, function(v) igraph::V(v)$wilcox_p)), decreasing=FALSE) ## fdr correction 119 | # p0_adj = ps[min(which(p.adjust(ps, method = 'fdr') > p0))] 120 | # p0_adj = ps[min(which(p.adjust(ps, method = 'bonferroni') > p0))] 121 | p0_adj = 0.05 122 | 123 | 
TADs_all_raw = prunning(decorated_branches, to_correct=FALSE, p0=p0_adj, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=TRUE, CD_border_adj=FALSE, top_down=TRUE, mean_diff_thresh=mean_diff_thresh) 124 | 125 | # CDs = prunning(decorated_branches, to_correct=FALSE, p0=p0_adj, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=FALSE, CD_border_adj=FALSE, top_down=TRUE); cat(nrow(CDs), '\n') 126 | CDs = prunning(decorated_branches, to_correct=FALSE, p0=-1, width_thresh=2, width_thresh_CD=width_thresh_CD, all_levels=FALSE, CD_border_adj=FALSE, top_down=TRUE, mean_diff_thresh=mean_diff_thresh) 127 | # if(nrow(CDs)!=length(res$initial_clusters)) stop('nrow(CDs)!=length(res$initial_clusters)') 128 | 129 | TADs_all = clean_TADs_all(TADs_all_raw, CDs, bin_size=bin_size) 130 | 131 | # Tad_edges <- sort(unique(c(TADs_tmp[,1], (TADs_tmp[,2]+1)))) 132 | TADs_all_edges <- sort(unique(TADs_all[,2])) 133 | CD_edges <- sort(unique(CDs[,2])) 134 | 135 | TADs_extra_edges = unique(setdiff(TADs_all_edges, CD_edges)) 136 | TADs_extra = data.frame(start_pos=TADs_extra_edges, end_pos= TADs_extra_edges) 137 | 138 | 139 | TADs_pos_all = create_TADs(sub_domains_raw=sub_domains_raw, chr, TADs_all, TADs_type='ALL', bin_size=bin_size) 140 | TADs_pos_extra = create_TADs(sub_domains_raw=sub_domains_raw, chr, TADs_extra, TADs_type='extra', bin_size=bin_size) 141 | TADs_pos_CD = create_TADs(sub_domains_raw=sub_domains_raw, chr, CDs, TADs_type='CD', bin_size=bin_size) 142 | 143 | # TADs_info = list(decorated_branches=decorated_branches, TADs_pos_all=TADs_pos_all, TADs_pos_CD=TADs_pos_CD) 144 | TADs_info = list(decorated_branches=decorated_branches, TADs_all=TADs_all, CDs=CDs, TADs_pos_all=TADs_pos_all, TADs_pos_CD=TADs_pos_CD, TADs_pos_extra=TADs_pos_extra) 145 | } 146 | 147 | 148 | options(stringsAsFactors=FALSE) 149 | level_5 = TADs_info$TADs_pos_extra[, c('chr_name', 'end_pos_ori')] 150 | colnames(level_5)[1:2] = c('chr', 'nested_boundary') 151 | sub_domain_boundary_bed_file 
= paste0(out_dir, '/chr', chr, '_nested_boundaries.bed') 152 | 153 | level_5_bed = data.frame(level_5, level_5[,2], '', '.', level_5, level_5[,2], '#000000') 154 | op <- options(scipen=999) 155 | write.table( level_5_bed, file=sub_domain_boundary_bed_file, quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE ) 156 | 157 | on.exit(options(op)) 158 | 159 | return(NULL) 160 | } 161 | 162 | 163 | -------------------------------------------------------------------------------- /R/prunning.R: -------------------------------------------------------------------------------- 1 | trim_tree_adaptive_top_down_v2 = function( tree, wilcox_p_thresh, mean_diff_thresh ) 2 | { 3 | # leaves = get_leaves(tree) 4 | if(igraph::vcount(tree)==1) return(tree) 5 | # cat('I am in trim_tree_adaptive_top_down_v2\n') 6 | nsig_nodes = union( igraph::V(tree)[which(igraph::V(tree)$wilcox_p > wilcox_p_thresh)]$name, igraph::V(tree)[which(igraph::V(tree)$mean_diff > mean_diff_thresh)]$name ) 7 | 8 | 9 | children_of_nsig = names(unlist(igraph::ego(tree, order=1, node=nsig_nodes, mode='out', mindist=1))) 10 | if(length(children_of_nsig)!=0) trimmed_tree = tree - children_of_nsig 11 | if(length(children_of_nsig)==0) trimmed_tree = tree 12 | 13 | comps = igraph::decompose(trimmed_tree) 14 | root_index = which(sapply( comps, function(comp) igraph::V(tree)[1]$name %in% igraph::V(comp)$name )==1) 15 | trimmed_tree = comps[[root_index]] 16 | if(!is_binary_tree( trimmed_tree )) stop("trim_tree_adaptive_top_down, the resulted tree is not a binary tree") 17 | return( trimmed_tree ) 18 | } 19 | 20 | 21 | prunning = function(branches, p0, to_correct=FALSE, width_thresh=-Inf, width_thresh_CD=5, boundary_signal_thresh=-Inf, return_which='TADs', top_down=FALSE, all_levels=FALSE, CD_border_adj=FALSE, peak_thresh=NULL, mean_diff_thresh) 22 | { 23 | 24 | size2correct = sum(sapply(branches, igraph::vcount)) - sum(sapply(branches, function(v) length(get_leaves(v)))) 25 | p_thresh = p0/size2correct 26 | 
if(to_correct==FALSE) p_thresh=p0 27 | # if(!is.null(p0)) p_thresh = p0 28 | 29 | 30 | if(top_down==FALSE) 31 | { 32 | trimmed_branches = lapply( branches, trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=boundary_signal_thresh, peak_thresh=peak_thresh ) 33 | # size2correct = sum(sapply(trimmed_branches, igraph::vcount)) - sum(sapply(trimmed_branches, function(v) length(get_leaves(v)))) 34 | 35 | # for( i in 1:length( trimmed_branches ) ) 36 | # { 37 | # trimmed_branch = trimmed_branches[[i]] 38 | # if(igraph::vcount(trimmed_branch) > 1) trimmed_branches[[i]] = lapply( trimmed_branches[i], trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=-1 )[[1]] 39 | # } 40 | 41 | 42 | } 43 | 44 | # if(top_down==TRUE) trimmed_branches = lapply( branches, trim_tree_adaptive_top_down, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh, boundary_signal_thresh=boundary_signal_thresh ) 45 | if(top_down==TRUE) trimmed_branches = lapply( branches, function(branch) trim_tree_adaptive_top_down_v2(wilcox_p_thresh=p_thresh, mean_diff_thresh=mean_diff_thresh, tree=branch )) 46 | 47 | if(CD_border_adj==TRUE) 48 | { 49 | all_tads = get_adjusted_nested_TADs( trimmed_branches, width_thresh_CD, all_levels ) 50 | return( all_tads ) 51 | } 52 | 53 | ## get all nested TADs in trimmed_branches 54 | if(all_levels==TRUE) 55 | { 56 | all_tads = data.frame(start_pos=numeric(), end_pos=numeric()) 57 | widths = c(0, sapply(trimmed_branches, function(v) igraph::V(v)[1]$width)) 58 | for(i in 1:length(trimmed_branches)) 59 | { 60 | all_tads_i = get_all_tads_in_a_trimmed_branch(trimmed_branches[[i]], pos_shift=sum(widths[1:i])) 61 | all_tads = rbind(all_tads, all_tads_i) 62 | } 63 | return( all_tads ) 64 | } 65 | 66 | if( return_which=='trimmed_branches' ) return( trimmed_branches ) 67 | 68 | tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width ) 69 | 
tad_sizes = unlist(tad_sizes_ind) 70 | # tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes)) 71 | end_pos = cumsum(tad_sizes) 72 | start_pos = c(1, 1 + end_pos[-length(end_pos)]) 73 | tads = data.frame(start_pos=start_pos, end_pos=end_pos) 74 | return( tads ) 75 | } 76 | 77 | 78 | 79 | 80 | ## This function combines prunning with branches of only one node 81 | prunning_hybrid <- function(branches, ...) 82 | { 83 | names(branches) = as.character(1:length(branches)) 84 | normal_branches = branches[sapply( branches, function(v) class(v)=='igraph' )] 85 | unnormal_branches = branches[sapply( branches, function(v) class(v)!='igraph' )] ## that is reprsented as bin_start:bin_end 86 | 87 | trimmed_branches = prunning(normal_branches, return_which='trimmed_branches', ...) 88 | 89 | normal_tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width ) 90 | unormal_tad_sizes_ind = unnormal_branches 91 | tad_sizes_ind = c(normal_tad_sizes_ind, unormal_tad_sizes_ind) 92 | tad_sizes_ind = tad_sizes_ind[names(branches)] 93 | tad_sizes = unlist(tad_sizes_ind) 94 | 95 | # tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes)) 96 | end_pos = cumsum(tad_sizes) 97 | start_pos = c(1, 1 + end_pos[-length(end_pos)]) 98 | tads = data.frame(start_pos=start_pos, end_pos=end_pos) 99 | return( tads ) 100 | } 101 | 102 | 103 | get_all_tads_in_a_trimmed_branch <- function(trimmed_branch, pos_shift) 104 | { 105 | res = data.frame( start_pos=igraph::V(trimmed_branch)$left + pos_shift, end_pos=igraph::V(trimmed_branch)$right + pos_shift ) 106 | res = res[order(res[,1], res[,2]), ] 107 | return(res) 108 | } 109 | 110 | prunning_bottom_up <- function(branches, p0=NULL, width_thresh) 111 | { 112 | size2correct = sum(sapply(branches, igraph::vcount)) - sum(sapply(branches, function(v) length(get_leaves(v)))) 113 | 114 | p_thresh = 0.05/size2correct 115 | if(!is.null(p0)) p_thresh = p0 116 | 117 | trimmed_branches = lapply( branches, 
trim_tree_adaptive, max_imp_p=p_thresh, max_nimp_p=Inf, width_thresh=width_thresh ) 118 | 119 | tad_sizes_ind = lapply( trimmed_branches, function(v) get_leaves(v, 'igraph')$width ) 120 | tad_sizes = unlist(tad_sizes_ind) 121 | # tads = split(1:sum(tad_sizes), rep(seq_along(tad_sizes), tad_sizes)) 122 | end_pos = cumsum(tad_sizes) 123 | start_pos = c(1, 1 + end_pos[-length(end_pos)]) 124 | tads = data.frame(start_pos=start_pos, end_pos=end_pos) 125 | return( tads ) 126 | } 127 | 128 | 129 | trim_tree_adaptive_bottom_up <- function( tree, which_p='imp_p' ) 130 | { 131 | if(which_p=='imp_p') ps = sort(unique(igraph::V(tree)$imp_p), decreasing=TRUE) 132 | # if(which_p=='nimp_p') ps = sort(unique(igraph::V(tree)$nimp_p), decreasing=TRUE) 133 | # if(which_p=='both') ps = sort(unique(pmin(igraph::V(tree)$nimp_p, igraph::V(tree)$imp_p)), decreasing=TRUE) 134 | 135 | trimed_tree_current = tree 136 | trimmed_branch_bottom_up = vector('list', length(ps)) 137 | for(i in 1:length(ps)) 138 | { 139 | trimed_tree_current = trim_tree_adaptive( tree, L_diff_thresh=-Inf, max_imp_p=ps[i], max_nimp_p=Inf, width_thresh=-Inf ) 140 | trimmed_branch_bottom_up[[i]] = trimed_tree_current 141 | }
## FIX(review): the dump read `igraph::vcounts = sapply(...)`. Assigning to a `pkg::name`
## target fails at runtime in R (a namespace binding cannot be replaced this way), and
## `vcounts` is not an igraph export -- it is a plain local variable that an automated
## `igraph::`-qualification pass mis-prefixed. Keep it as a local and drop the prefix in
## the two places it is read below (`igraph::vcount`, the function, stays qualified).
142 | vcounts = sapply(trimmed_branch_bottom_up, igraph::vcount) 143 | ps = ps[!duplicated(vcounts)] 144 | trimmed_branch_bottom_up = trimmed_branch_bottom_up[!duplicated(vcounts)] 145 | res = list(ps=ps, trimmed_branch_bottom_up=trimmed_branch_bottom_up) 146 | return( res ) 147 | } 148 | 149 | 150 | ## get adjusted nested TADs 151 | get_adjusted_nested_TADs <- function( trimmed_branches, width_thresh_CD, all_levels ) 152 | { 153 | widths = c(0, sapply(trimmed_branches, function(v) igraph::V(v)[1]$width)) 154 | all_tads_i_list = lapply( 1:length(trimmed_branches), function(i) get_all_tads_in_a_trimmed_branch(trimmed_branches[[i]], pos_shift=sum(widths[1:i]))) 155 | 156 | for(i in 1:length(trimmed_branches)) 157 | { 158 | all_tads_i = all_tads_i_list[[i]] 159 | if(
## NOTE(review): interior of get_adjusted_nested_TADs(): for each compartment domain (CD),
## if the outermost nested border sits within width_thresh_CD of the CD edge it is snapped
## to the next inner border, and the neighbouring CD's matching border is shifted to keep
## the TAD lists contiguous.
nrow(all_tads_i) <= 1 ) next 160 | ## move the left-most border a little bit right if needed 161 | left_borders = unique(all_tads_i[,1]) 162 | min_diff_left = left_borders[2] - left_borders[1] 163 | if( min_diff_left <= width_thresh_CD ) 164 | { 165 | all_tads_i[ all_tads_i==left_borders[1] ] = left_borders[2] 166 | all_tads_i = all_tads_i[ all_tads_i[,2] > all_tads_i[,1], ] ## remove "negative" TADs 167 | all_tads_i = unique(all_tads_i[order(all_tads_i[,1], all_tads_i[,2]), ]) ## reorder the TADs 168 | 169 | all_tads_i_list[[i]] = all_tads_i 170 | 171 | ## need to modify the right border of nested TADs in previous CD if the left border of this CD is modified 172 | if(i > 1) 173 | { 174 | ## replace the max value of [i-1], i.e., the right most border, as the min of [i]-1, i.e., the left most border of [i] 175 | all_tads_i_list[[i-1]][ all_tads_i_list[[i-1]]==max(all_tads_i_list[[i-1]]) ] = min(all_tads_i_list[[i]]) - 1 176 | } 177 | } 178 | 179 | if( nrow(all_tads_i) <= 1 ) next 180 | ## move the right-most border a little bit left if needed 181 | right_borders = unique(rev(all_tads_i[,2])) 182 | min_diff_right = right_borders[1] - right_borders[2] 183 | if( min_diff_right <= width_thresh_CD ) 184 | { 185 | all_tads_i[ all_tads_i==right_borders[1] ] = right_borders[2] 186 | all_tads_i = all_tads_i[ all_tads_i[,2] > all_tads_i[,1], ] 187 | all_tads_i = unique(all_tads_i[order(all_tads_i[,1], all_tads_i[,2]), ]) ## reorder the TADs 188 | 189 | all_tads_i_list[[i]] = all_tads_i 190 | 191 | if(i < length(trimmed_branches)) 192 | { 193 | ## replace the min value of [i+1], i.e., its left most border, with max of [i] + 1, i.e., just right of the right most border of [i] 194 | all_tads_i_list[[i+1]][ all_tads_i_list[[i+1]]==min(all_tads_i_list[[i+1]]) ] = max(all_tads_i_list[[i]]) + 1 195 | } 196 | } 197 | } 198 | 199 | if(!all_levels) all_tads_i_list = lapply( all_tads_i_list, function(v) data.frame(start_pos=head(v[,1],1), end_pos=tail(v[,2],1)) ) 200 | 201 | all_tads = do.call(rbind,
## NOTE(review): this dump chunk spans a file boundary -- end of get_adjusted_nested_TADs()
## in R/prunning.R, then the start of R/zigzag_nested_domain_v2.R. The function beginning
## below runs the zigzag sub-domain search per compartment segment on a (symmetrized,
## zero-row-stripped) contact matrix A; A must carry bin indices as dimnames.
all_tads_i_list) 202 | colnames(all_tads) = c('start_pos', 'end_pos') 203 | return( all_tads ) 204 | } 205 | 206 | 207 | 208 | -------------------------------------------------------------------------------- /R/zigzag_nested_domain_v2.R: -------------------------------------------------------------------------------- 1 | ## Yuanlong LIU 2 | ## 12/04/2018 3 | ## 06/06/2018 4 | ## 16/06/2018 5 | ## This code runs zigzag search on each compartment domain as a whole without dividing into 400 6 | 7 | ## 09-10-2018 8 | 9 | ## A should be already named 10 | ## "zigzag" resembles computational path of finding the optimum 11 | # HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq, max_nbins_fine, chr, min_n_bins=2) 12 | HRG_zigzag_compartment_domain_main_fun <- function(A, res_dir, compartment_segs, allowed_max_nbins_seq=NULL, max_nbins_fine=NULL, min_n_bins=2) 13 | { 14 | 15 | `%dopar%` <- foreach::`%dopar%` 16 | `%do%` <- foreach::`%do%` 17 | 18 | arg_list = as.list(environment()) 19 | 20 | res_folder = file.path(res_dir) 21 | dir.create(res_folder, recursive=TRUE, showWarnings = FALSE) 22 | 23 | total_execution_time_file = file.path(res_dir, 'total_execution.time') 24 | time_begin = Sys.time() 25 | cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE) 26 | 27 | ## check whether your input matrix is symmetric 28 | # A_sym = as.matrix( Matrix::forceSymmetric(data.matrix(A), uplo='U') ) 29 | # A_sym = Matrix::forceSymmetric(data.matrix(A), uplo='U') ## keep sparse, 2018-11-11 30 | A_sym = Matrix::forceSymmetric(A, uplo='U') ## keep sparse, 2018-11-11 31 | 32 | tol = 100 * .Machine$double.eps 33 | max_diff = max(abs(A_sym - A)) 34 | notSym_flag = max_diff > tol 35 | if( notSym_flag ) warning('Your input contact matrix is not symmetric. 
The maximum difference between symmetric values is: ', max_diff, '\nBy default, the contact profile used for downstream analysis is taken from the upper triangular part. To use the lower triangular part you can transpose the input contact matrix first') 36 |
## NOTE(review): this chunk starts inside the warning string above (dump line-wrapping);
## nothing before the closing quote may be altered. Below: validate dimnames, drop all-zero
## rows/columns, then run zigzag_loglik_ancestors_v4_5() on each compartment segment.
37 | if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices') 38 | 39 | # pA_sym = rm_zeros(A_sym) ## pA_sym: positive A 40 | pA_sym = as.matrix(remove_blank_cols(A_sym, sparse=TRUE, ratio=0)) 41 | n_zero_rows = nrow(A_sym) - nrow(pA_sym) 42 | zero_rows_flag = n_zero_rows > 0 43 | if( zero_rows_flag ) 44 | { 45 | warning('There are ', n_zero_rows, ' rows/columns in your input contact matrix that have all their values being 0. These rows/columns are removed for downstream analysis') 46 | original_row_names = rownames(A_sym) 47 | kept_row_names = rownames(pA_sym) 48 | } 49 | 50 | # res_inner = rep(list(), nrow(compartment_segs)) ## for each compartment domain 51 | # for( i in 1:nrow(compartment_segs) ) 52 | # { 53 | # seg = compartment_segs[i,1]:compartment_segs[i,2] 54 | # cat('Compute seg:', i, 'of length:', length(seg), '\n') 55 | 56 | # A_seg = as.matrix(pA_sym[seg, seg]) 57 | # res_zigzag = zigzag_loglik_ancestors_v4(A_seg, nrow(A_seg)) 58 | # res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors) 59 | # res_inner[[i]] = res_outer 60 | # cat('finished', '\n') 61 | # } 62 | 63 | ## changed to parallel, 2018-11-11 64 | 65 | res_inner = foreach::foreach(i=1:nrow(compartment_segs)) %do% 66 | { 67 | seg = compartment_segs[i,1]:compartment_segs[i,2] 68 | cat('\r', sprintf('Find sub-domains in %d of %d CDs | length of current CD: %d bins\n', i, nrow(compartment_segs), length(seg))) 69 | 70 | A_seg = pA_sym[seg, seg] 71 | res_zigzag = zigzag_loglik_ancestors_v4_5(A_seg, nrow(A_seg), min_n_bins=min_n_bins) 72 | res_outer = list(A=A_seg, L=res_zigzag$L, ancestors=res_zigzag$ancestors, min_n_bins=min_n_bins) 73 | res_outer 74 | # res_inner[[i]] = res_outer 75 | } 
## NOTE(review): tail of HRG_zigzag_compartment_domain_main_fun(): bundle inputs and
## per-segment zigzag results into res_info, log the wall-clock time, and return.
## The README.md content after the separator is left untouched.
76 | 77 | cat('\n') 78 | 79 | segmentss = compartment_segs 80 | res_info = list( arg_list=arg_list, pA_sym=pA_sym, A_final=pA_sym, segmentss=segmentss, res_inner=res_inner ) 81 | # res_folder_final = file.path(res_dir, 'final') 82 | # dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE) 83 | # save(res_info, file=file.path(res_folder_final, 'res_info.Rdata')) 84 | 85 | time_finish = Sys.time() 86 | cat('Execution finishes:', as.character(time_finish), '\n', file=total_execution_time_file, append=TRUE) 87 | cat('Total execution time:', capture.output( time_finish - time_begin ), '\n', file=total_execution_time_file, append=TRUE) 88 | 89 | return( res_info ) 90 | } 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/r-calder2/README.html) 2 | [![Docker Pulls](https://img.shields.io/docker/pulls/lucananni93/calder2?label=Docker%20image&logo=Docker&style=flat-square)](https://hub.docker.com/r/lucananni93/calder2) 3 | [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) 4 | [![nf-core](https://img.shields.io/badge/nf--core-calder2-green)](https://nf-co.re/modules/calder2) 5 | 6 | # CALDER user manual 7 | 8 | CALDER is a Hi-C analysis tool that allows: (1) compute chromatin domains from whole chromosome contacts; (2) derive their non-linear hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each chromatin domain from short-range contacts. CALDER is currently implemented in R. 
9 | 10 | * Overview of the CALDER method: 11 | ![Alt text](./img/CALDER_methods.png "CALDER methods") 12 | 13 | * Calder connects chromatin 3D organization to genomic function: 14 | ![Alt text](./img/CALDER_features.png "CALDER features") 15 | 16 | ## (A note on the performance of Calder vs PC-based approach) 17 | 18 | * PC1 (sometimes PC2) of the correlation matrix was typically used to define A/B compartment. We found Calder demonstrates superior robustness over PC-based approach in identifying meaningful compartments, particularly when faced with complex chromosomal structural variations (figure on the left) or loose interaction between the p and q arms (figure on the right) 19 | 20 | ![Alt text](./img/Hela_chr11_and_RWPE1_chr9_pq.png "Hela_chr11_and_RWPE1_chr9_pq") 21 | 22 | 23 | ## Multiple new features were added in version 2.0 24 | 25 | * Support for hg19, hg38, mm9, mm10 and other genomes 26 | * Support input in .hic format generated by Juicer tools (https://github.com/aidenlab/juicer) 27 | * Optimized bin_size selection for more reliable compartment identification 28 | * Aggregated all chromosome output into a single file for easier visualization in IGV 29 | * Added output in tabular .txt format at bin level for easier downstream analysis 30 | 31 | Below we introduce two main updates: 32 | 33 | ### (1) Optimized `bin_size` selection 34 | 35 | 36 | Due to reasons such as low data quality or large scale structural variation, compartments can be unreliably called at one `bin_size` (equivalent to `resolution` in the literature) but properly called at another `bin_size`. We added an optimized `bin_size` selection strategy to call reliable compartments. This strategy is based on the observation from our large scale compartment analysis (https://www.nature.com/articles/s41467-021-22666-3), that although compartments can change between different conditions, their overall correlation `cor(compartment_rank_1, compartment_rank_2)` is high (> 0.4). 37 |
38 |
39 | **The strategy**: given a `bin_size` specified by user, we call compartments with extended `bin_sizes` and choose the smallest `bin_size` such that no bigger `bin_size` can increase the compartment correlation with a reference compartment more than 0.05. For example, if correlation for `bin_size=10000` is 0.2 while for `bin_size=50000` is 0.6, we are more confident that the latter is more reliable; if correlation for `bin_size=10000` is 0.5 while for `bin_size=50000` is 0.52, we would choose the former as it has higher resolution. 40 |
41 |
42 | `bin_size` is extended in the following way thus contact matrices at any larger `bin_sizes` can be aggregated from the input contact matrices directly: 43 | ``` 44 | if(bin_size==5E3) bin_sizes = c(5E3, 10E3, 50E3, 100E3) 45 | if(bin_size==10E3) bin_sizes = c(10E3, 50E3, 100E3) 46 | if(bin_size==20E3) bin_sizes = c(20E3, 40E3, 100E3) 47 | if(bin_size==25E3) bin_sizes = c(25E3, 50E3, 100E3) 48 | if(bin_size==40E3) bin_sizes = c(40E3, 80E3) 49 | if(bin_size==50E3) bin_sizes = c(50E3, 100E3) 50 | ``` 51 | Note that this strategy is currently only available for `hg19`, `hg38`, `mm9` and `mm10` genome for which we generated high quality reference compartments using Hi-C data from: GSE63525 for `hg19`, https://data.4dnucleome.org/files-processed/4DNFI1UEG1HD for `hg38`, GSM3959427 for `mm9`, http://hicfiles.s3.amazonaws.com/external/bonev/CN_mapq30.hic for `mm10`. 52 | 53 | ### (2) Support for other genomes 54 | 55 | Although CALDER was mainly tested on human and mouse dataset, it can be applied to dataset from other genomes. One additional information is required in such case: a `feature_track` presumably positively correlated with compartment score (thus higher values in A than in B compartment). This information will be used for correctly determining the `A/B` direction. Some suggested tracks are gene density, H3K27ac, H3K4me1, H3K4me2, H3K4me3, H3K36me3 (or negative transform of H3K9me3) signals. Note that this information will not alter the hierarchical compartment/TAD structure, and can come from any external study with matched genome. An example of `feature_track` is given in the **Usage** section. 
 56 | 57 | # Installation 58 | 59 | ## Installing from conda 60 | The easiest way to get the package is to install from [Bioconda](https://bioconda.github.io/index.html): 61 | ``` 62 | conda install --channel bioconda r-calder2 63 | ``` 64 | 65 | ## Installing from source 66 | 67 | ### Make sure all dependencies have been installed: 68 | 69 | * R.utils (>= 2.9.0), 70 | * doParallel (>= 1.0.15), 71 | * ape (>= 5.3), 72 | * dendextend (>= 1.12.0), 73 | * fitdistrplus (>= 1.0.14), 74 | * igraph (>= 1.2.4.1), 75 | * Matrix (>= 1.2.17), 76 | * rARPACK (>= 0.11.0), 77 | * factoextra (>= 1.0.5), 78 | * data.table (>= 1.12.2), 79 | * fields (>= 9.8.3), 80 | * GenomicRanges (>= 1.36.0) 81 | * ggplot2 (>= 3.3.5) 82 | * strawr (>= 0.0.9) 83 | 84 | ### Clone its repository and install it from source: 85 | 86 | On the command line: 87 | ``` 88 | git clone https://github.com/CSOgroup/CALDER2.git 89 | cd CALDER2 90 | ``` 91 | Then, once inside of the `R` interpreter: 92 | ``` 93 | install.packages(".", repos = NULL, type="source") # install from the cloned source file 94 | ``` 95 | 96 | ### Install CALDER and dependencies automatically: 97 | One can also install directly from Github, together with the dependencies as follows: 98 | 99 | ``` 100 | if (!requireNamespace("BiocManager", quietly = TRUE)) 101 | install.packages("BiocManager") 102 | 103 | BiocManager::install("GenomicRanges") 104 | install.packages("remotes") 105 | remotes::install_github("CSOgroup/CALDER2.0") 106 | ``` 107 | 108 | Please contact yliueagle@googlemail.com for any questions about installation. 109 | 110 | ## Use as a docker container 111 | We provide a [Docker image](https://hub.docker.com/r/lucananni93/calder2) complete with all dependencies to run CALDER workflows. 
112 | 113 | ``` 114 | # Pull the docker image from Dockerhub 115 | docker pull lucananni93/calder2 116 | 117 | # Run the image 118 | docker run -it lucananni93/calder2 119 | 120 | # Once inside the image we can run the command line Calder tool 121 | calder [options] 122 | 123 | # or we can just enter R 124 | R 125 | 126 | # and load Calder 127 | library(CALDER) 128 | ``` 129 | 130 | 131 | # Usage 132 | 133 | CALDER contains three modules: (1) compute chromatin domains; (2) derive their hierarchical organization and obtain sub-compartments; (3) compute nested sub-domains within each compartment domain. 134 | 135 | ### Input data format 136 | 137 | CALDER works on contact matrices compatible with that generated by Juicer tools (https://github.com/aidenlab/juicer), either a .hic file, or three-column `dump` table retrieved by the juicer dump (or straw) command (https://github.com/aidenlab/juicer/wiki/Data-Extraction): 138 | 139 | 16050000 16050000 10106.306 140 | 16050000 16060000 2259.247 141 | 16060000 16060000 7748.551 142 | 16050000 16070000 1251.3663 143 | 16060000 16070000 4456.1245 144 | 16070000 16070000 4211.7393 145 | 16050000 16080000 522.0705 146 | 16060000 16080000 983.1761 147 | 16070000 16080000 1996.749 148 | ... 149 | 150 | `feature_track` should be a data.frame or data.table of 4 columns (chr, start, end, score), and can be generated directly from conventional format such as bed or wig, see the example: 151 | 152 | ``` 153 | library(rtracklayer) 154 | feature_track = import('ENCFF934YOE.bigWig') ## from ENCODE https://www.encodeproject.org/files/ENCFF934YOE/@@download/ENCFF934YOE.bigWig 155 | feature_track = data.table::as.data.table(feature_track)[, c(1:3, 6)] 156 | ``` 157 | > feature_track 158 | chr start end score 159 | chr1 534179 534353 2.80512 160 | chr1 534354 572399 0 161 | chr1 572400 572574 2.80512 162 | chr1 572575 628400 0 163 | ... ... ... ... 
164 | chrY 59031457 59032403 0 165 | chrY 59032404 59032413 0.92023 166 | chrY 59032414 59032415 0.96625 167 | chrY 59032416 59032456 0.92023 168 | chrY 59032457 59032578 0.78875 169 | 170 | ### Example usage (1): use contact matrix file in dump format as input 171 | ``` 172 | chrs = c(21:22) 173 | 174 | ## demo contact matrices in dump format 175 | contact_file_dump = as.list(system.file("extdata", sprintf("mat_chr%s_10kb_ob.txt.gz", chrs), 176 | package='CALDER')) 177 | names(contact_file_dump) = chrs 178 | 179 | ## Run CALDER to compute compartments but not nested sub-domains 180 | CALDER(contact_file_dump=contact_file_dump, 181 | chrs=chrs, 182 | bin_size=10E3, 183 | genome='hg19', 184 | save_dir=save_dir, 185 | save_intermediate_data=FALSE, 186 | n_cores=2, 187 | sub_domains=FALSE) 188 | 189 | ## Run CALDER to compute compartments and nested sub-domains / will take more time 190 | CALDER(contact_file_dump=contact_file_dump, 191 | chrs=chrs, 192 | bin_size=10E3, 193 | genome='hg19', 194 | save_dir=save_dir, 195 | save_intermediate_data=TRUE, 196 | n_cores=2, 197 | sub_domains=TRUE) 198 | ``` 199 | 200 | ### Example (2): use contact matrices stored in an R list 201 | ``` 202 | chrs = c(21:22) 203 | contact_file_dump = as.list(system.file("extdata", sprintf("mat_chr%s_10kb_ob.txt.gz", chrs), 204 | package='CALDER')) 205 | names(contact_file_dump) = chrs 206 | contact_tab_dump = lapply(contact_file_dump, data.table::fread) 207 | 208 | 209 | CALDER(contact_tab_dump=contact_tab_dump, 210 | chrs=chrs, 211 | bin_size=10E3, 212 | genome='hg19', 213 | save_dir=save_dir, 214 | save_intermediate_data=FALSE, 215 | n_cores=2, 216 | sub_domains=FALSE) 217 | ``` 218 | 219 | ### Example (3): use .hic file as input 220 | ``` 221 | chrs = c(21:22) 222 | hic_file = 'HMEC_combined_30.hic' ## can be downloaded from https://ftp.ncbi.nlm.nih.gov/geo/series/GSE63nnn/GSE63525/suppl/GSE63525_HMEC_combined_30.hic 223 | 224 | CALDER(contact_file_hic=hic_file, 225 | chrs=chrs, 226 | 
bin_size=10E3, 227 | genome='hg19', 228 | save_dir=save_dir, 229 | save_intermediate_data=FALSE, 230 | n_cores=2, 231 | sub_domains=FALSE) 232 | ``` 233 | 234 | ### Example (4): run CALDER on other genomes 235 | ``` 236 | ## prepare feature_track 237 | library(rtracklayer) 238 | feature_track_raw = import('ENCFF934YOE.bigWig') ## from ENCODE https://www.encodeproject.org/files/ENCFF934YOE/@@download/ENCFF934YOE.bigWig 239 | feature_track = data.table::as.data.table(feature_track_raw)[, c(1:3, 6)] 240 | 241 | ## Run CALDER 242 | chrs = c(21:22) 243 | contact_file_dump = as.list(system.file("extdata", sprintf("mat_chr%s_10kb_ob.txt.gz", chrs), 244 | package='CALDER')) 245 | names(contact_file_dump) = chrs 246 | 247 | CALDER(contact_file_dump=contact_file_dump, 248 | chrs=chrs, 249 | bin_size=10E3, 250 | genome='others', 251 | save_dir=save_dir, 252 | feature_track=feature_track, 253 | save_intermediate_data=FALSE, 254 | n_cores=2, 255 | sub_domains=FALSE) 256 | ``` 257 | 258 | 259 | 260 | ### Parameters: 261 | 262 | | Name | Description | 263 | | --------------------- | ----------------------- | 264 | | **chrs** | A vector of chromosome names to be analyzed, with or without 'chr'. Chromosome names should be consistent with those in `contact_file_hic` and `feature_track` if such files are provided 265 | | **contact_file_dump** |A list of contact files in dump format, named by `chrs`. Each contact file stores the contact information of the corresponding `chr`. Only one of `contact_file_dump`, `contact_tab_dump`, `contact_file_hic` should be provided 266 | | **contact_tab_dump** | A list of contact table in dump format, named by `chrs`, stored as an R object. Only one of `contact_file_dump`, `contact_tab_dump`, `contact_file_hic` should be provided 267 | | **contact_file_hic** | A hic file generated by Juicer tools. It should contain all chromosomes in `chrs`. 
Only one of `contact_file_dump`, `contact_tab_dump`, `contact_file_hic` should be provided 268 | | **genome** | One of 'hg19', 'hg38', 'mm9', 'mm10', 'others' (default). These compartments will be used as reference compartments for optimized bin_size selection. If `genome = 'others'`, a `feature_track` should be provided (see below) and no optimized bin_size selection will be performed 269 | | **save_dir** | the directory to be created for saving outputs 270 | | **bin_size** | The bin_size (resolution) to run CALDER. `bin_size` should be consistent with the data resolution in `contact_file_dump` or `contact_tab_dump` if these files are provided as input, otherwise `bin_size` should exist in `contact_file_hic`. Recommended `bin_size` is between **10000 to 100000** 271 | | **single_binsize_only** | logical. If TRUE, CALDER will compute compartments only using the bin_size specified by the user and not do bin size optimization 272 | | **feature_track** | A genomic feature track in `data.frame` or `data.table` format with 4 columns (chr, start, end, score). This track will be used for determining the A/B compartment direction when `genome='others'` and should presumably have higher values in A than in B compartment. Some suggested tracks can be gene density, H3K27ac, H3K4me1, H3K4me2, H3K4me3, H3K36me3 (or negative transform of H3K9me3 signals) 273 | | **save_intermediate_data** | logical. If TRUE, an intermediate_data will be saved. This file can be used for computing nested sub-domains later on 274 | | **n_cores** | integer. 
Number of cores to be registered for running CALDER in parallel 275 | | **sub_domains** | logical, whether to compute nested sub-domains 276 | 277 | 278 | 279 | ## Output Structure 280 | The output of the workflow is stored in the folder specified by `save_dir` and will look like this: 281 | ``` 282 | save_dir/ 283 | |-- sub_domains 284 | | `-- all_nested_boundaries.bed 285 | |-- sub_compartments 286 | | |-- cor_with_ref.txt 287 | | |-- cor_with_ref.pdf 288 | | |-- cor_with_ref.ALL.txt 289 | | |-- all_sub_compartments.tsv 290 | | `-- all_sub_compartments.bed 291 | `-- intermediate_data 292 | |-- sub_domains 293 | | |-- chr22_sub_domains_log.txt 294 | | |-- chr22_nested_boundaries.bed 295 | | |-- chr21_sub_domains_log.txt 296 | | `-- chr21_nested_boundaries.bed 297 | `-- sub_compartments 298 | |-- 50kb 299 | | |-- chr22_sub_compartments.bed 300 | | |-- chr22_log.txt 301 | | |-- chr22_domain_hierachy.tsv 302 | | |-- chr22_domain_boundaries.bed 303 | | |-- chr21_sub_compartments.bed 304 | | |-- chr21_log.txt 305 | | |-- chr21_domain_hierachy.tsv 306 | | `-- chr21_domain_boundaries.bed 307 | |-- 10kb 308 | | |-- chr22_sub_compartments.bed 309 | | |-- chr22_log.txt 310 | | |-- chr22_intermediate_data.Rds 311 | | |-- chr22_domain_hierachy.tsv 312 | | |-- chr22_domain_boundaries.bed 313 | | |-- chr21_sub_compartments.bed 314 | | |-- chr21_log.txt 315 | | |-- chr21_intermediate_data.Rds 316 | | |-- chr21_domain_hierachy.tsv 317 | | `-- chr21_domain_boundaries.bed 318 | `-- 100kb 319 | |-- chr22_sub_compartments.bed 320 | |-- chr22_log.txt 321 | |-- chr22_domain_hierachy.tsv 322 | |-- chr22_domain_boundaries.bed 323 | |-- chr21_sub_compartments.bed 324 | |-- chr21_log.txt 325 | |-- chr21_domain_hierachy.tsv 326 | `-- chr21_domain_boundaries.bed 327 | ``` 328 | 329 | ### File description: 330 | 331 | * The `sub_domains` folder stores the nested boundaries 332 | * The `sub_compartments` folder stores the optimal compartment information, including files: 333 | 334 | | Name | 
Description | 335 | | --------------------- | ----------------------- | 336 | | **all_sub_compartments.bed** | a .bed file containing the optimal compartments for all `chrs`, that can be visualized in IGV. Different colors were used to distinguish compartments (at the resolution of 8 sub-compartments) 337 | | **all_sub_compartments.tsv** | optimal compartments stored in tabular text format. Each row represents one 10kb region 338 | | **cor_with_ref.ALL.txt** | a plot of correlation between compartment rank and the reference compartment rank for each of extended `bin_sizes`, and the optimal `bin_size` that is finally selected 339 | | **cor_with_ref.pdf** | correlation of compartment rank with the reference compartment rank using the optimal `bin_size` 340 | 341 | 342 | 343 | * The `intermediate_data` folder stores information at each of the extended `bin_sizes` for each chromosome 344 | 345 | 346 | | Name | Description | 347 | | --------------------- | ----------------------- | 348 | | **chrxx_domain_hierachy.tsv** | information of compartment domain and their hierarchical organization. The hierarchical structure is fully represented by `compartment_label`, for example, `B.2.2.2` and `B.2.2.1` are two sub-branches of `B.2.2`. The `pos_end` column specifies all compartment domain borders, except when it is marked as `gap`, which indicates it is the border of a gap chromosome region that has too few contacts and was excluded from the analysis (e.g., due to low mappability, deletion, technique flaw) 349 | | **chrxx_sub_compartments.bed** | a .bed file containing the compartment information, that can be visualized in IGV. 
Different colors were used to distinguish compartments (at the resolution of 8 sub-compartments) 350 | | **chrxx_domain_boundaries.bed** | a .bed file containing the chromatin domains boundaries, that can be visualized in IGV 351 | | **chrxx_nested_boundaries.bed** | a .bed file containing the nested sub-domain boundaries, that can be visualized in IGV 352 | | **chrxx_intermediate_data.Rds** | an Rds file storing the intermediate_data that can be used to compute nested sub-domains 353 | | **chrxx_log.txt, chrxx_sub_domains_log.txt** | log files storing the status and running time of each step 354 | 355 | 356 | All .bed files can be viewed directly through IGV: 357 | 358 | ![Alt text](./img/IGV_results.png "IGV") 359 | 360 | ## Running time: 361 | For the computational requirement, running CALDER on the GM12878 Hi-C dataset at bin size of 40kb took **36 minutes** to derive the chromatin domains and their hierarchy for all chromosomes (i.e., CALDER Step1 and Step2); **13 minutes** to derive the nested sub-domains (i.e., CALDER Step3). At the bin size of 10kb, it took **1 h 44 minutes and 55 minutes** correspondingly (server information: 40 cores, 64GB Ram, Intel(R) Xeon(R) Silver 4210 CPU @ 2.20GHz). The evaluation was done using a single core although CALDER can be run in a parallel manner. 362 | 363 | # Citation 364 | 365 | If you use CALDER in your work, please cite: https://www.nature.com/articles/s41467-021-22666-3 366 | 367 | 368 | # Contact information 369 | 370 | * Author: Yuanlong LIU 371 | * Affiliation: Computational Systems Oncology group, Department of Computational Biology, University of Lausanne, Switzerland 372 | * Email: yliueagle@googlemail.com 373 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # We need Bioconductor, so we inherit from their repository 
2 | FROM bioconductor/bioconductor_docker 3 | 4 | ARG tag_name 5 | ARG repo_name 6 | 7 | ENV github_tag=$tag_name 8 | ENV github_repo=$repo_name 9 | 10 | RUN echo "Building repo ${github_repo} with tag: ${github_tag}" 11 | 12 | ################ 13 | # CALDER SETUP # 14 | ################ 15 | 16 | # Get latest release tag 17 | RUN wget https://github.com/${github_repo}/archive/refs/tags/${github_tag}.tar.gz 18 | # Untar the release 19 | RUN tar xvfs ${github_tag}.tar.gz 20 | # Adding calder script to bin 21 | ENV calder_cmd=CALDER2-${github_tag}/scripts/calder 22 | RUN ln -s $(realpath ${calder_cmd}) /bin/ 23 | # Removing tar file 24 | RUN rm ${github_tag}.tar.gz 25 | 26 | # # Install the package 27 | RUN R -e "remotes::install_local('CALDER2-${github_tag}', dependencies=T); if (!library(CALDER, logical.return=T)) quit(status=10)" 28 | 29 | # # Install other packages 30 | RUN pip install cooler 31 | 32 | CMD ["bash"] 33 | -------------------------------------------------------------------------------- /docker/docker_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DOCKERHUB_USERNAME="lucananni93" 4 | PACKAGE_NAME="calder2" 5 | GITHUB_REPO="CSOgroup/CALDER2" 6 | DOCKER_FILE="docker/Dockerfile" 7 | LATEST_TAG=$(curl --silent "https://api.github.com/repos/${GITHUB_REPO}/releases/latest" | grep '"tag_name"' | cut -d':' -f2 | tr -d '," ') 8 | 9 | echo "Building Docker for ${GITHUB_REPO} at ${DOCKERHUB_USERNAME}/${PACKAGE_NAME}:${LATEST_TAG}" 10 | 11 | docker build . 
\ 12 | -t ${DOCKERHUB_USERNAME}/${PACKAGE_NAME}:${LATEST_TAG} \ 13 | -t ${DOCKERHUB_USERNAME}/${PACKAGE_NAME}:latest \ 14 | -f ${DOCKER_FILE} \ 15 | --build-arg tag_name=${LATEST_TAG} \ 16 | --build-arg repo_name=${GITHUB_REPO} 17 | 18 | docker push ${DOCKERHUB_USERNAME}/${PACKAGE_NAME}:${LATEST_TAG} 19 | docker push ${DOCKERHUB_USERNAME}/${PACKAGE_NAME}:latest -------------------------------------------------------------------------------- /img/CALDER_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/CALDER_features.png -------------------------------------------------------------------------------- /img/CALDER_methods.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/CALDER_methods.png -------------------------------------------------------------------------------- /img/Hela_chr11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/Hela_chr11.png -------------------------------------------------------------------------------- /img/Hela_chr11_and_RWPE1_chr9_pq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/Hela_chr11_and_RWPE1_chr9_pq.png -------------------------------------------------------------------------------- /img/IGV_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/IGV_results.png -------------------------------------------------------------------------------- /img/RWPE1_chr9_pq.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/img/RWPE1_chr9_pq.png -------------------------------------------------------------------------------- /inst/extdata/mat_chr21_10kb_ob.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/inst/extdata/mat_chr21_10kb_ob.txt.gz -------------------------------------------------------------------------------- /inst/extdata/mat_chr22_10kb_ob.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/inst/extdata/mat_chr22_10kb_ob.txt.gz -------------------------------------------------------------------------------- /man/CALDER-package.Rd: -------------------------------------------------------------------------------- 1 | \name{CALDER-package} 2 | \alias{CALDER-package} 3 | \alias{CALDER} 4 | \docType{package} 5 | \title{ 6 | \packageTitle{CALDER} 7 | } 8 | \description{ 9 | \packageDescription{CALDER} 10 | } 11 | \details{ 12 | 13 | The DESCRIPTION file: 14 | \packageDESCRIPTION{CALDER} 15 | \packageIndices{CALDER} 16 | ~~ An overview of how to use the package, including the most important ~~ 17 | ~~ functions ~~ 18 | } 19 | \author{ 20 | \packageAuthor{CALDER} 21 | 22 | Maintainer: \packageMaintainer{CALDER} 23 | } 24 | \references{ 25 | ~~ Literature or other references for background information ~~ 26 | } 27 | ~~ Optionally other standard keywords, one per line, from file KEYWORDS in ~~ 28 | ~~ the R documentation directory ~~ 29 | \keyword{ package } 30 | \seealso{ 31 | ~~ Optional links to other man pages, e.g. 
~~ 32 | ~~ \code{\link[:-package]{}} ~~ 33 | } 34 | \examples{ 35 | ~~ simple examples of the most important functions ~~ 36 | } 37 | -------------------------------------------------------------------------------- /scripts/calder: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library(optparse)) 4 | suppressPackageStartupMessages(library(CALDER)) 5 | 6 | AVAILABLE_REFERENCE_TRACKS_GENOMES <- c("hg19", "hg38", "mm9", "mm10") 7 | INPUT_TYPES <- c("hic", "cool") 8 | CHROMS_TO_REMOVE <- c("ALL", "M", "chrM", "MT", "chrMT", "Y", "chrY") 9 | 10 | 11 | parse_arguments <- function(){ 12 | # Creating the argument parsing options 13 | option_list = list( 14 | make_option(c("-i", "--input"), action="store", default=NA, type='character', 15 | help="Input Hi-C contacts"), 16 | make_option(c("-t", "--type"), action="store", default='hic', type='character', 17 | help="The type of input: hic or cool [default %default]"), 18 | make_option(c("-b", "--bin_size"), action="store", default=50000, type='integer', 19 | help="Bin size to use for the analysis [default %default]"), 20 | make_option(c("-g", "--genome"), action="store", default="hg19", type='character', 21 | help="Genome assembly to use [default %default]"), 22 | make_option(c("-f", "--feature_track"), action="store", default=NA, type='character', 23 | help="Genomic feature track to be used to determine A/B compartment direction 24 | when genome == 'others'. The track should presumably have higher values 25 | in A than in B compartmnets. [default %default]"), 26 | make_option(c("-c", "--chromosomes"), action='store', default='all', type='character', 27 | help="Chromosomes to analyze, separated by comma. 
[default %default]"), 28 | make_option(c("-p", "--nproc"), action="store", default=1, type='integer', 29 | help="Number of cores to use [default %default]"), 30 | make_option(c("-o", "--outpath"), action="store", default=NA, type='character', 31 | help="Path to the output folder"), 32 | make_option(c("-k", "--keep_intermediate"), action="store_true", default=FALSE, type='logical', 33 | help="Keep intermediate data after done [default %default]"), 34 | make_option(c("-a", "--adaptive"), action="store_true", default=FALSE, type='logical', 35 | help="Use adaptive resolution choice [default %default]") 36 | ) 37 | parser <- OptionParser(usage = "%prog [options]", option_list=option_list) 38 | opt <- parse_args(parser) 39 | 40 | # Checking if input path was provided and exists on disk 41 | if(is.na(opt$input) || (!file.exists(opt$input))){ 42 | print_help(parser) 43 | stop(paste0("Input path (", opt$input,") does not exist")) 44 | } 45 | 46 | # Checking if output path is provided 47 | if(is.na(opt$outpath)){ 48 | stop("Output path was not provided") 49 | } 50 | 51 | # Check that the input type is one of the possible ones 52 | if(!(opt$type %in% INPUT_TYPES)){ 53 | stop(paste0("Input type ", opt$type, " not available")) 54 | } 55 | 56 | # Check if the provided genome is in the list of available reference genomes 57 | # or if a feature track is provided 58 | if((!(opt$genome %in% AVAILABLE_REFERENCE_TRACKS_GENOMES)) || (file.exists(opt$feature_track))){ 59 | # in this case, we just assign it the name 'others' 60 | opt$genome = "others" 61 | } 62 | 63 | writeLines(c( 64 | "*******************************", 65 | "* CALDER *", 66 | "*******************************", 67 | paste0("[Parameters] Input: ", opt$input), 68 | paste0("[Parameters] Input type: ", opt$type), 69 | paste0("[Parameters] Bin size: ", opt$bin_size), 70 | paste0("[Parameters] Genome: ", opt$genome), 71 | paste0("[Parameters] Feature Track: ", opt$feature_track), 72 | paste0("[Parameters] Chromosomes: ", opt$chromosomes), 73 | paste0("[Parameters] N.
cores: ", opt$nproc), 74 | paste0("[Parameters] Output: ", opt$outpath), 75 | paste0("[Parameters] Keep Intermediate data: ", opt$keep_intermediate), 76 | paste0("[Parameters] Use adaptive resolution: ", opt$adaptive) 77 | )) 78 | 79 | if(file.exists(opt$feature_track)){ 80 | opt$feature_track <- read.table(opt$feature_track) 81 | } 82 | 83 | return(opt) 84 | } 85 | 86 | sanitize_chroms <- function(chroms){ 87 | res <- lapply(chroms, function(x){ 88 | if(startsWith(x, "chr")){ 89 | return(substring(x, 4)) 90 | } else{ 91 | return(x) 92 | } 93 | }) 94 | return(res) 95 | } 96 | 97 | handle_input_hic <- function(opt){ 98 | suppressPackageStartupMessages(library(strawr)) 99 | chromsizes <- readHicChroms(opt$input) 100 | if(opt$chromosomes == "all"){ 101 | chroms <- chromsizes[!(toupper(chromsizes$name) %in% toupper(CHROMS_TO_REMOVE)), "name"] 102 | } 103 | else{ 104 | chrom_list <- strsplit(opt$chromosomes, ",")[[1]] 105 | chroms <- chromsizes[chromsizes$name %in% chrom_list, "name"] 106 | } 107 | chroms <- sanitize_chroms(chroms) 108 | CALDER(contact_file_hic = opt$input, 109 | chrs = chroms, 110 | bin_size = opt$bin_size, 111 | genome = opt$genome, 112 | save_dir=opt$outpath, 113 | save_intermediate_data=TRUE, 114 | feature_track=opt$feature_track, 115 | single_binsize_only=!opt$adaptive, 116 | n_cores = opt$nproc, 117 | sub_domains=TRUE) 118 | } 119 | 120 | handle_input_cool <- function(opt){ 121 | intermediate_data_dir = file.path(opt$outpath, "intermediate_data") 122 | dir.create(intermediate_data_dir, recursive=TRUE, showWarnings=FALSE) 123 | 124 | system(paste0("cooler dump --table chroms --out ", 125 | file.path(intermediate_data_dir, "chroms.txt"), 126 | " --header ", 127 | opt$input)) 128 | chroms <- read.table(file.path(intermediate_data_dir, "chroms.txt"), sep="\t", header=TRUE) 129 | if(opt$chromosomes == "all"){ 130 | chroms <- chroms[!( toupper(chroms$name) %in% toupper(CHROMS_TO_REMOVE) ), "name"] 131 | } 132 | else{ 133 | chrom_list <- 
strsplit(opt$chromosomes, ",")[[1]] 134 | chroms <- chroms[chroms$name %in% chrom_list, "name"] 135 | } 136 | 137 | dump_paths <- list() 138 | for(chrom in chroms){ 139 | cat(paste0("[Pre-processing] Dumping ", chrom, "\n")) 140 | chrom_dump_path <- file.path(intermediate_data_dir, paste0(chrom, "_dump.txt")) 141 | dump_paths <- c(dump_paths, chrom_dump_path) 142 | if(! file.exists(chrom_dump_path)){ 143 | system(paste0("cooler dump --table pixels --range ", 144 | chrom, 145 | " --join --balanced ", 146 | opt$input, 147 | " | cut -f2,5,8 | awk '{if ($3) print;}' > ", 148 | chrom_dump_path)) 149 | } 150 | } 151 | 152 | chroms <- sanitize_chroms(chroms) 153 | names(dump_paths) <- chroms 154 | 155 | CALDER(contact_file_dump=dump_paths, 156 | chrs=chroms, 157 | bin_size=opt$bin_size, 158 | genome=opt$genome, 159 | save_dir=opt$outpath, 160 | feature_track=opt$feature_track, 161 | single_binsize_only=!opt$adaptive, 162 | save_intermediate_data=TRUE, 163 | n_cores=opt$nproc, 164 | sub_domains=TRUE) 165 | 166 | } 167 | 168 | opt <- parse_arguments() 169 | 170 | 171 | if(opt$type == "hic"){ 172 | handle_input_hic(opt) 173 | } else if(opt$type == "cool"){ 174 | handle_input_cool(opt) 175 | } else { 176 | stop("Unknown input type") 177 | } 178 | 179 | # Cleaning the output 180 | intermediate_data_dir = file.path(opt$outpath, "intermediate_data") 181 | if(dir.exists(intermediate_data_dir) && (!opt$keep_intermediate)){ 182 | cat('[Post-processing] Removing intermediate data\n') 183 | unlink(intermediate_data_dir, recursive=TRUE) 184 | } 185 | exec_time_file = "./total_execution.time" 186 | if(file.exists(exec_time_file)){ 187 | cat("[Post-processing] Removing total_execution.time\n") 188 | file.remove(exec_time_file) 189 | } 190 | 191 | -------------------------------------------------------------------------------- /scripts/clean_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source 
~/miniconda3/etc/profile.d/conda.sh 4 | 5 | TEST_ENV_NAME="R-Test" 6 | 7 | mamba create --yes --name ${TEST_ENV_NAME} --channel bioconda --channel conda-forge \ 8 | r-essentials r-devtools cooler r-nloptr 9 | conda activate ${TEST_ENV_NAME} 10 | 11 | R -e "devtools::install('.', dependencies=TRUE, Ncpus = 20)" 12 | R -e "devtools::test()" 13 | 14 | # conda deactivate 15 | mamba env remove --yes --name ${TEST_ENV_NAME} 16 | -------------------------------------------------------------------------------- /scripts/create_release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | release_name="0.3" 5 | release_description="Dockerization" 6 | 7 | 8 | release_notes="CALDER2 release: ${release_description} (${release_name})" 9 | 10 | 11 | latest_tag=$(git describe --tags --abbrev=0) 12 | 13 | if [[ "${release_name}" == "${latest_tag}" ]] 14 | then 15 | echo "Overwriting existing release (${release_name})" 16 | echo "---------------------------------------------" 17 | 18 | # Deleting Github release (should prompt a confirmation) 19 | gh release delete ${release_name} 20 | # Deleting local tag 21 | git tag -d ${release_name} 22 | # Pushing the removal of the tag to Github 23 | git push origin :${release_name} 24 | fi 25 | 26 | echo "Uploading new release (${release_name})" 27 | echo "---------------------------------------------" 28 | # Creating new tag (quote the message so multi-word descriptions survive word splitting) 29 | git tag -a ${release_name} -m "${release_description}" 30 | # Pushing new tag 31 | git push origin ${release_name} 32 | # Creating release from tag 33 | gh release create ${release_name} --title "${release_description}" --notes "${release_notes}" -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 3 | ## enable compilation with C++11 (where available) 4 | 
## 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider 6 | ## availability of the package we do not yet enforce this here. It is however 7 | ## recommended for client packages to set it. 8 | ## 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP 10 | ## support within Armadillo prefers / requires it 11 | CXX_STD = CXX11 12 | 13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 15 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 3 | ## enable compilation with C++11 (where available) 4 | ## 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider 6 | ## availability of the package we do not yet enforce this here. It is however 7 | ## recommended for client packages to set it. 
8 | ## 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP 10 | ## support within Armadillo prefers / requires it 11 | CXX_STD = CXX11 12 | 13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 15 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // matrix_multiplication_cpp 15 | arma::mat matrix_multiplication_cpp(arma::mat A, arma::mat B); 16 | RcppExport SEXP _CALDER_matrix_multiplication_cpp(SEXP ASEXP, SEXP BSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 21 | Rcpp::traits::input_parameter< arma::mat >::type B(BSEXP); 22 | rcpp_result_gen = Rcpp::wrap(matrix_multiplication_cpp(A, B)); 23 | return rcpp_result_gen; 24 | END_RCPP 25 | } 26 | // matrix_multiplication_sym_cpp 27 | arma::mat matrix_multiplication_sym_cpp(arma::mat A); 28 | RcppExport SEXP _CALDER_matrix_multiplication_sym_cpp(SEXP ASEXP) { 29 | BEGIN_RCPP 30 | Rcpp::RObject rcpp_result_gen; 31 | Rcpp::RNGScope rcpp_rngScope_gen; 32 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 33 | rcpp_result_gen = Rcpp::wrap(matrix_multiplication_sym_cpp(A)); 34 | return rcpp_result_gen; 35 | END_RCPP 36 | } 37 | // loglik_lnorm_cpp 38 | double loglik_lnorm_cpp(double sum_ln1, double sum_ln2, double p, double q); 39 | RcppExport SEXP _CALDER_loglik_lnorm_cpp(SEXP sum_ln1SEXP, SEXP 
sum_ln2SEXP, SEXP pSEXP, SEXP qSEXP) { 40 | BEGIN_RCPP 41 | Rcpp::RObject rcpp_result_gen; 42 | Rcpp::RNGScope rcpp_rngScope_gen; 43 | Rcpp::traits::input_parameter< double >::type sum_ln1(sum_ln1SEXP); 44 | Rcpp::traits::input_parameter< double >::type sum_ln2(sum_ln2SEXP); 45 | Rcpp::traits::input_parameter< double >::type p(pSEXP); 46 | Rcpp::traits::input_parameter< double >::type q(qSEXP); 47 | rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp(sum_ln1, sum_ln2, p, q)); 48 | return rcpp_result_gen; 49 | END_RCPP 50 | } 51 | // loglik_lnorm_cpp_vec 52 | double loglik_lnorm_cpp_vec(arma::vec vec_values); 53 | RcppExport SEXP _CALDER_loglik_lnorm_cpp_vec(SEXP vec_valuesSEXP) { 54 | BEGIN_RCPP 55 | Rcpp::RObject rcpp_result_gen; 56 | Rcpp::RNGScope rcpp_rngScope_gen; 57 | Rcpp::traits::input_parameter< arma::vec >::type vec_values(vec_valuesSEXP); 58 | rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp_vec(vec_values)); 59 | return rcpp_result_gen; 60 | END_RCPP 61 | } 62 | // get_A_len 63 | arma::mat get_A_len(arma::mat A); 64 | RcppExport SEXP _CALDER_get_A_len(SEXP ASEXP) { 65 | BEGIN_RCPP 66 | Rcpp::RObject rcpp_result_gen; 67 | Rcpp::RNGScope rcpp_rngScope_gen; 68 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 69 | rcpp_result_gen = Rcpp::wrap(get_A_len(A)); 70 | return rcpp_result_gen; 71 | END_RCPP 72 | } 73 | // get_A_ln1 74 | arma::mat get_A_ln1(arma::mat A); 75 | RcppExport SEXP _CALDER_get_A_ln1(SEXP ASEXP) { 76 | BEGIN_RCPP 77 | Rcpp::RObject rcpp_result_gen; 78 | Rcpp::RNGScope rcpp_rngScope_gen; 79 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 80 | rcpp_result_gen = Rcpp::wrap(get_A_ln1(A)); 81 | return rcpp_result_gen; 82 | END_RCPP 83 | } 84 | // get_A_ln2 85 | arma::mat get_A_ln2(arma::mat A); 86 | RcppExport SEXP _CALDER_get_A_ln2(SEXP ASEXP) { 87 | BEGIN_RCPP 88 | Rcpp::RObject rcpp_result_gen; 89 | Rcpp::RNGScope rcpp_rngScope_gen; 90 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 91 | rcpp_result_gen = 
Rcpp::wrap(get_A_ln2(A)); 92 | return rcpp_result_gen; 93 | END_RCPP 94 | } 95 | // loglik_lnorm_cpp_mat 96 | arma::mat loglik_lnorm_cpp_mat(arma::mat sum_ln1, arma::mat sum_ln2, arma::mat ps, arma::mat qs); 97 | RcppExport SEXP _CALDER_loglik_lnorm_cpp_mat(SEXP sum_ln1SEXP, SEXP sum_ln2SEXP, SEXP psSEXP, SEXP qsSEXP) { 98 | BEGIN_RCPP 99 | Rcpp::RObject rcpp_result_gen; 100 | Rcpp::RNGScope rcpp_rngScope_gen; 101 | Rcpp::traits::input_parameter< arma::mat >::type sum_ln1(sum_ln1SEXP); 102 | Rcpp::traits::input_parameter< arma::mat >::type sum_ln2(sum_ln2SEXP); 103 | Rcpp::traits::input_parameter< arma::mat >::type ps(psSEXP); 104 | Rcpp::traits::input_parameter< arma::mat >::type qs(qsSEXP); 105 | rcpp_result_gen = Rcpp::wrap(loglik_lnorm_cpp_mat(sum_ln1, sum_ln2, ps, qs)); 106 | return rcpp_result_gen; 107 | END_RCPP 108 | } 109 | // zigzag_loglik_ancestors_v4_5 110 | List zigzag_loglik_ancestors_v4_5(arma::mat A, int k, int min_n_bins); 111 | RcppExport SEXP _CALDER_zigzag_loglik_ancestors_v4_5(SEXP ASEXP, SEXP kSEXP, SEXP min_n_binsSEXP) { 112 | BEGIN_RCPP 113 | Rcpp::RObject rcpp_result_gen; 114 | Rcpp::RNGScope rcpp_rngScope_gen; 115 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 116 | Rcpp::traits::input_parameter< int >::type k(kSEXP); 117 | Rcpp::traits::input_parameter< int >::type min_n_bins(min_n_binsSEXP); 118 | rcpp_result_gen = Rcpp::wrap(zigzag_loglik_ancestors_v4_5(A, k, min_n_bins)); 119 | return rcpp_result_gen; 120 | END_RCPP 121 | } 122 | // compute_L 123 | List compute_L(arma::mat A, arma::mat L, int k); 124 | RcppExport SEXP _CALDER_compute_L(SEXP ASEXP, SEXP LSEXP, SEXP kSEXP) { 125 | BEGIN_RCPP 126 | Rcpp::RObject rcpp_result_gen; 127 | Rcpp::RNGScope rcpp_rngScope_gen; 128 | Rcpp::traits::input_parameter< arma::mat >::type A(ASEXP); 129 | Rcpp::traits::input_parameter< arma::mat >::type L(LSEXP); 130 | Rcpp::traits::input_parameter< int >::type k(kSEXP); 131 | rcpp_result_gen = Rcpp::wrap(compute_L(A, L, k)); 132 | return 
rcpp_result_gen; 133 | END_RCPP 134 | } 135 | 136 | static const R_CallMethodDef CallEntries[] = { 137 | {"_CALDER_matrix_multiplication_cpp", (DL_FUNC) &_CALDER_matrix_multiplication_cpp, 2}, 138 | {"_CALDER_matrix_multiplication_sym_cpp", (DL_FUNC) &_CALDER_matrix_multiplication_sym_cpp, 1}, 139 | {"_CALDER_loglik_lnorm_cpp", (DL_FUNC) &_CALDER_loglik_lnorm_cpp, 4}, 140 | {"_CALDER_loglik_lnorm_cpp_vec", (DL_FUNC) &_CALDER_loglik_lnorm_cpp_vec, 1}, 141 | {"_CALDER_get_A_len", (DL_FUNC) &_CALDER_get_A_len, 1}, 142 | {"_CALDER_get_A_ln1", (DL_FUNC) &_CALDER_get_A_ln1, 1}, 143 | {"_CALDER_get_A_ln2", (DL_FUNC) &_CALDER_get_A_ln2, 1}, 144 | {"_CALDER_loglik_lnorm_cpp_mat", (DL_FUNC) &_CALDER_loglik_lnorm_cpp_mat, 4}, 145 | {"_CALDER_zigzag_loglik_ancestors_v4_5", (DL_FUNC) &_CALDER_zigzag_loglik_ancestors_v4_5, 3}, 146 | {"_CALDER_compute_L", (DL_FUNC) &_CALDER_compute_L, 3}, 147 | {NULL, NULL, 0} 148 | }; 149 | 150 | RcppExport void R_init_CALDER(DllInfo *dll) { 151 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 152 | R_useDynamicSymbols(dll, FALSE); 153 | } 154 | -------------------------------------------------------------------------------- /src/matrix_multiplication_fun.cpp: -------------------------------------------------------------------------------- 1 | 2 | //http://arma.sourceforge.net/docs.html#stats_fns 3 | #include "RcppArmadillo.h" 4 | using namespace Rcpp; 5 | 6 | // [[Rcpp::export]] 7 | arma::mat matrix_multiplication_cpp(arma::mat A, arma::mat B) 8 | { 9 | return A*B; 10 | } 11 | 12 | // [[Rcpp::export]] 13 | arma::mat matrix_multiplication_sym_cpp(arma::mat A) 14 | { 15 | return A*A; 16 | } 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /src/zigzag_loglik_ancestors_v4.5.cpp: -------------------------------------------------------------------------------- 1 | // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; indent-tabs-mode: nil; -*- 2 | 3 | #include "RcppArmadillo.h" 4 | using 
namespace Rcpp; 5 | using namespace std; 6 | // #define M_PI 3.141592653589793238462643383280 /* pi */ 7 | 8 | // [[Rcpp::export]] 9 | double loglik_lnorm_cpp( double sum_ln1, double sum_ln2, double p, double q ) // several times faster than fitdistr 10 | { 11 | if( sum_ln2 < 0 ) cout << "sum_ln2 not valid in loglik_lnorm_cpp\n"; 12 | if( p < 0 ) cout << "p not valid in loglik_lnorm_cpp\n"; 13 | if( q < 0 ) cout << "q not valid in loglik_lnorm_cpp\n"; 14 | 15 | if( q <=1 ) return 0; // p = sum(x==0): the number of zero-values 16 | double alpha = 1.0*p/(p+q); 17 | double mu = sum_ln1/q; 18 | 19 | double sigma2 = sum_ln2/q + pow(mu, 2) - 2*mu*sum_ln1/q; // %: element-wise product 20 | if( abs(sigma2) <=1E-10 ) 21 | { 22 | return 0; 23 | } 24 | double loglik = -sum_ln1 - 0.5*q*(log(2*M_PI) + log(sigma2)) - (sum_ln2 + q*pow(mu, 2) - 2*mu*sum_ln1)/2/sigma2; 25 | if( p==0 ) {return loglik;} else {return p*log(alpha) + q*log(1-alpha) + loglik;} 26 | } 27 | 28 | // [[Rcpp::export]] 29 | double loglik_lnorm_cpp_vec( arma::vec vec_values ) //log-likelihood of vec_values 30 | { 31 | int p, q; 32 | int n = vec_values.n_elem; 33 | if( n < 2 ) return 0; // added on 19-07-2018 34 | 35 | double sum_ln1, sum_ln2; 36 | arma::vec positive_vec = vec_values.elem(find(vec_values > 0)); 37 | q = positive_vec.n_elem; // the number of positive values 38 | p = n - q; // the number of zeros 39 | if( q <= 1 ) return 0; 40 | 41 | sum_ln1 = sum(log(positive_vec)); 42 | sum_ln2 = sum(pow(log(positive_vec), 2)); 43 | 44 | return loglik_lnorm_cpp( sum_ln1, sum_ln2, p, q ); 45 | } 46 | 47 | // [[Rcpp::export]] 48 | arma::mat get_A_len(arma::mat A) // get matrix A_len: A_len := A*(A>0). 
This is used for computing the number of positive elements in a rectangle region 49 | { 50 | int n_row = A.n_rows; 51 | arma::mat A_len=arma::zeros(n_row, n_row); // for test 52 | arma::uvec ids = find(A > 0); 53 | arma::vec new_values = arma::ones(ids.n_elem); 54 | A_len.elem(ids) = new_values; 55 | return A_len; 56 | } 57 | 58 | // [[Rcpp::export]] 59 | arma::mat get_A_ln1(arma::mat A) // log(A_ij) 60 | { 61 | int n_row = A.n_rows; 62 | arma::mat A_ln1=arma::zeros(n_row, n_row); // for test 63 | arma::uvec ids = find(A > 0); 64 | arma::vec new_values = log(A.elem(ids)); 65 | A_ln1.elem(ids) = new_values; 66 | return A_ln1; 67 | } 68 | 69 | // [[Rcpp::export]] 70 | arma::mat get_A_ln2(arma::mat A) // log(A_ij)^2 71 | { 72 | int n_row = A.n_rows; 73 | arma::mat A_ln2=arma::zeros(n_row, n_row); // for test 74 | arma::uvec ids = find(A > 0); 75 | arma::vec new_values = pow(log(A.elem(ids)), 2); 76 | A_ln2.elem(ids) = new_values; 77 | return A_ln2; 78 | } 79 | 80 | // compute the loglik matrix 81 | // [[Rcpp::export]] 82 | arma::mat loglik_lnorm_cpp_mat( arma::mat sum_ln1, arma::mat sum_ln2, arma::mat ps, arma::mat qs ) // several times faster than fitdistr 83 | { 84 | int n_row = sum_ln1.n_rows; 85 | int n_col = sum_ln1.n_cols; 86 | arma::mat loglik(n_row, n_col); 87 | for(int i=0; i 0) 103 | arma::mat A_ln1 = get_A_ln1(A); // log(A_ij) 104 | arma::mat A_ln2 = get_A_ln2(A); // log(A_ij)^2 105 | StringMatrix ancestors(n_row, n_row); 106 | arma::mat L=arma::zeros(n_row, n_row); // for test 107 | Rcpp::List res; 108 | 109 | for( int k= min_n_bins-1; k<=(2*min_n_bins - 2); k++ ) // other values of L are 0 110 | { 111 | for( int v=1; v<= (n_row - k); v++ ) 112 | { 113 | arma::mat tmp_mat=A.submat(arma::span(v-1, v+k-1), arma::span(v-1, v+k-1)); // span(0,1) := 1:2 in R 114 | arma::vec upper_tri_vec = tmp_mat.elem(find(trimatu(tmp_mat))); 115 | L(v-1, v-1+k) = loglik_lnorm_cpp_vec( upper_tri_vec ); 116 | } 117 | } // Checked to be the same in R 118 | // cout << "Finished 
initialize L\n"; 119 | 120 | // initialize the rad_mat as the first off-diagonal values 121 | arma::mat rad_mat_current_ln1(n_row-1, 1); 122 | arma::mat rad_mat_current_ln2(n_row-1, 1); 123 | arma::mat rad_mat_current_len(n_row-1, 1); 124 | 125 | for(int i=0; i<(n_row-1); i++) // The first off-diagonal values 126 | { 127 | rad_mat_current_ln1(i, 0) = A_ln1(i, i+1); 128 | rad_mat_current_ln2(i, 0) = A_ln2(i, i+1); 129 | rad_mat_current_len(i, 0) = A_len(i, i+1); 130 | } 131 | 132 | // initialized to be two vertical cells (2 rows) 133 | arma::mat vertical_columns_next_ln1(2, n_row-2); 134 | arma::mat vertical_columns_next_ln2(2, n_row-2); 135 | arma::mat vertical_columns_next_len(2, n_row-2); 136 | 137 | for(int i=0; i<(n_row-2); i++) 138 | { 139 | vertical_columns_next_ln1.col(i) = A_ln1( arma::span(i, i+1), i+2 ); 140 | vertical_columns_next_ln2.col(i) = A_ln2( arma::span(i, i+1), i+2 ); 141 | vertical_columns_next_len.col(i) = A_len( arma::span(i, i+1), i+2 ); 142 | } 143 | 144 | for(int i=1; i<2; i++) // cumsum of the two vertical cells (i=1:1) 145 | { 146 | vertical_columns_next_ln1.row(i) = vertical_columns_next_ln1.row(i) + vertical_columns_next_ln1.row(i-1); //cumsum 147 | vertical_columns_next_ln2.row(i) = vertical_columns_next_ln2.row(i) + vertical_columns_next_ln2.row(i-1); //cumsum 148 | vertical_columns_next_len.row(i) = vertical_columns_next_len.row(i) + vertical_columns_next_len.row(i-1); //cumsum 149 | } 150 | 151 | arma::mat rad_mat_next_ln1 = rad_mat_current_ln1; 152 | arma::mat rad_mat_next_ln2 = rad_mat_current_ln2; 153 | arma::mat rad_mat_next_len = rad_mat_current_len; 154 | 155 | arma::mat vertical_columns_current_ln1 = vertical_columns_next_ln1; // this line just create the vertical_columns_current_ln 156 | arma::mat vertical_columns_current_ln2 = vertical_columns_next_ln2; // this line just create the vertical_columns_current_ln 157 | arma::mat vertical_columns_current_len = vertical_columns_next_len; // this line just create the 
vertical_columns_current_ln 158 | 159 | // Rcout << L << "\n"; 160 | // cout << "Begin iteration:\n"; 161 | 162 | // time complexity of this part: n^3 163 | // for(int shift=3; shift<=n_row; shift++) 164 | // each row of rad_mat represent one off-diagonal point 165 | for(int shift=3; shift<=k; shift++) 166 | { 167 | rad_mat_current_ln1 = rad_mat_next_ln1; 168 | rad_mat_current_ln2 = rad_mat_next_ln2; 169 | rad_mat_current_len = rad_mat_next_len; 170 | 171 | rad_mat_next_ln1 = arma::mat( n_row-shift+1, shift-1); 172 | rad_mat_next_ln2 = arma::mat( n_row-shift+1, shift-1); 173 | rad_mat_next_len = arma::mat( n_row-shift+1, shift-1); 174 | 175 | n_cells = arma::zeros(shift-1); // size of each rectangle 176 | for( int i=0; i< (shift-1); i++ ) n_cells(i) = (i+1)*(shift-i-1); 177 | arma::mat rad_mat_next_len_all = (arma::ones(n_row-shift+1, shift-1))*(arma::diagmat(n_cells)); // In R: rep(vec, n_row times) // Schur product: element-wise multiplication of two objects 178 | 179 | for(int i=1; i<=(n_row-shift+1); i++) 180 | { 181 | // next = current + vertical_columns_next_ln values 182 | rad_mat_next_ln1.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_ln1( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_ln1( arma::span(0, shift-2-1), i-1 ).t(); 183 | rad_mat_next_ln2.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_ln2( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_ln2( arma::span(0, shift-2-1), i-1 ).t(); 184 | rad_mat_next_len.submat(i-1, 0, i-1, shift-2-1) = rad_mat_current_len( i-1, arma::span(0, shift-2-1) ) + vertical_columns_next_len( arma::span(0, shift-2-1), i-1 ).t(); 185 | 186 | rad_mat_next_ln1(i-1, shift-1-1) = vertical_columns_next_ln1(shift-1-1, i-1); // the last new element 187 | rad_mat_next_ln2(i-1, shift-1-1) = vertical_columns_next_ln2(shift-1-1, i-1); // the last new element 188 | rad_mat_next_len(i-1, shift-1-1) = vertical_columns_next_len(shift-1-1, i-1); // the last new element 189 | } 190 | 191 | 192 | 193 | 
////////////////////////////////////// compute the vertical_columns_next values 194 | if(shift < n_row) //stop when shift=n 195 | { 196 | vertical_columns_current_ln1 = vertical_columns_next_ln1; 197 | vertical_columns_current_ln2 = vertical_columns_next_ln2; 198 | vertical_columns_current_len = vertical_columns_next_len; 199 | 200 | arma::mat first_row_ln1(1, n_row-shift); 201 | arma::mat first_row_ln2(1, n_row-shift); 202 | arma::mat first_row_len(1, n_row-shift); 203 | 204 | for(int i=0; i<(n_row-shift); i++) 205 | { 206 | first_row_ln1(0, i) = A_ln1(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln 207 | first_row_ln2(0, i) = A_ln2(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln 208 | first_row_len(0, i) = A_len(i, i+shift); // off-diagonal values to be appended to vertical_columns_next_ln 209 | } 210 | 211 | vertical_columns_next_ln1 = vertical_columns_current_ln1.submat(0, 1, shift-2, n_row-shift); // drop the first column 212 | vertical_columns_next_ln2 = vertical_columns_current_ln2.submat(0, 1, shift-2, n_row-shift); // drop the first column 213 | vertical_columns_next_len = vertical_columns_current_len.submat(0, 1, shift-2, n_row-shift); // drop the first column 214 | 215 | vertical_columns_next_ln1 = arma::join_cols(first_row_ln1, vertical_columns_next_ln1); 216 | vertical_columns_next_ln2 = arma::join_cols(first_row_ln2, vertical_columns_next_ln2); 217 | vertical_columns_next_len = arma::join_cols(first_row_len, vertical_columns_next_len); 218 | 219 | for(int i=1; i= 4) 229 | if(shift >= 2*min_n_bins) // 230 | { 231 | ps = rad_mat_next_len_all - rad_mat_next_len; // number of positive values 232 | loglik_tmp = loglik_lnorm_cpp_mat( rad_mat_next_ln1, rad_mat_next_ln2, ps, rad_mat_next_len ); 233 | // loglik = loglik_tmp.submat(0, 1, n_row-shift, shift-2-1); // remove first and last col because of the min_n_bins. SHOULD BE MODIFIED. 
submat: X.submat( first_row, first_col, last_row, last_col ), http://arma.sourceforge.net/docs.html#submat 234 | loglik = loglik_tmp.submat(0, min_n_bins-1, n_row-shift, shift-2-(min_n_bins-1)); // 2018-11-14, remove first and last min_n_bins-1 cols because of the min_n_bins. SHOULD BE MODIFIED. submat: X.submat( first_row, first_col, last_row, last_col ), http://arma.sourceforge.net/docs.html#submat 235 | 236 | 237 | arma::mat cases(1, shift-2*min_n_bins+1); // shift=5: 1:2, i.e., two cases 238 | // arma::mat loglik(n_row-shift, shift-min_n_bins); 239 | for( int row=1; row<=(n_row-shift+1); row++ ) 240 | { 241 | p = row; //7 242 | q = row + shift; // 7 + 4 = 11 243 | cases = loglik( p-1, arma::span(0, shift-2*min_n_bins) ) + L(p-1, arma::span(p-1-1 + min_n_bins, q - min_n_bins-1-1) ) + (L(arma::span(p-1 + min_n_bins, q - min_n_bins-1), q-1-1)).t(); 244 | L(p-1, q-1-1) = cases.max(); 245 | 246 | max_mid_index = cases.index_max() + 1; //The c++ offset 1 247 | max_mid = (p-1 + min_n_bins) + max_mid_index -1; // should minus one 248 | // ancestor = paste(i, max_mid, max_mid+1, j, sep='-') 249 | ancestors(p-1, q-1-1) = to_string(p) + "-" + to_string(max_mid) + "-" + to_string(max_mid+1) + "-" + to_string(q-1); 250 | // cout << "cases:" << L(p-1, q-1-1) << "\n"; 251 | } 252 | } 253 | 254 | } 255 | 256 | res["ancestors"] = ancestors; 257 | res["L"] = L; 258 | return res; 259 | } 260 | 261 | // compute the ancestors 262 | // [[Rcpp::export]] 263 | List compute_L( arma::mat A, arma::mat L, int k ) // A seems not needed here, and can be removed, Y.L, 2018-11-14 (Indeed this part is not used) 264 | { 265 | int n_row = A.n_rows; 266 | int min_n_bins = 2; 267 | StringMatrix ancestors(n_row, n_row); 268 | 269 | // should rewrite this part 270 | // for( int i= min_n_bins-1; i<=(2*min_n_bins - 2); i++ ) 271 | // for( int j=i; j ${outpath}/test.chrom.sizes 18 | cooler cload pairs --chrom1 2 --pos1 3 --chrom2 4 --pos2 5 \ 19 | ${outpath}/test.chrom.sizes:${binsize} \ 20 | 
${outpath}/source.pairs.gz \ 21 | ${outpath}/test.cool 22 | rm ${outpath}/source.pairs.gz 23 | 24 | cooler balance --force --max-iters 1000 ${outpath}/test.cool 25 | } 26 | 27 | 28 | create_test_mcool(){ 29 | cool_path=$1 30 | outpath=$2 31 | binsize=$3 32 | 33 | if [[ ! -f ${outpath} ]] 34 | then 35 | cooler zoomify --resolutions ${binsize}N \ 36 | --balance \ 37 | --balance-args '--force --max-iters 1000' \ 38 | --out ${outpath} ${cool_path} 39 | fi 40 | } 41 | 42 | 43 | test_data_path="tests/testthat/data" 44 | 45 | mkdir -p ${test_data_path} 46 | 47 | # Test cool file 48 | source_cool_file="https://data.4dnucleome.org/files-processed/4DNFI2EK1IOQ/@@download/4DNFI2EK1IOQ.pairs.gz" 49 | test_cool_binsize=50000 50 | 51 | create_test_cool ${source_cool_file} ${test_data_path} ${test_cool_binsize} 52 | 53 | # Creating MCOOL from COOL 54 | create_test_mcool ${test_data_path}/test.cool ${test_data_path}/test.mcool ${test_cool_binsize} -------------------------------------------------------------------------------- /tests/test_cmd_cool.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p "tests/output" 4 | 5 | scripts/calder --input tests/data/test.cool \ 6 | --type cool \ 7 | --bin_size 50000 \ 8 | --genome hg38 \ 9 | --nproc 10 \ 10 | --outpath "tests/output/test_cmd_cool_out" -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(CALDER) 3 | 4 | test_check("CALDER") 5 | -------------------------------------------------------------------------------- /tests/testthat/data/test.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr21 46709983 2 | chr22 50818468 3 | -------------------------------------------------------------------------------- /tests/testthat/data/test.cool: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/tests/testthat/data/test.cool -------------------------------------------------------------------------------- /tests/testthat/data/test.mcool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSOgroup/CALDER2/d66c2d2d5038279207d0cf34b0c251e8ae3f99be/tests/testthat/data/test.mcool -------------------------------------------------------------------------------- /tests/testthat/test-main.R: -------------------------------------------------------------------------------- 1 | 2 | CALDER_CLI=file.path(system.file("scripts", package="CALDER"), "calder") 3 | 4 | 5 | test_that("CALDER works with cool files", { 6 | input_cool_path <- file.path(testthat::test_path("data"), "test.cool") 7 | output_path <- testthat::test_path("test-main-cool-out") 8 | 9 | CMD = paste0(CALDER_CLI, " -i ", input_cool_path, " -t cool -g hg38 -o ", output_path) 10 | system(CMD) 11 | 12 | expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.bed"), name = "TestCool_all_sub_compartments.bed") 13 | expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.tsv"), name = "TestCool_all_sub_compartments.tsv") 14 | expect_snapshot_file(file.path(output_path, "sub_domains", "all_nested_boundaries.bed"), name = "TestCool_all_nested_boundaries.bed") 15 | unlink(output_path, recursive=TRUE) 16 | }) 17 | 18 | 19 | test_that("CALDER works with dumps", { 20 | 21 | chrs = c(21:22) 22 | 23 | ## demo contact matrices in dump format 24 | contact_file_dump = as.list(system.file("extdata", sprintf("mat_chr%s_10kb_ob.txt.gz", chrs), 25 | package='CALDER')) 26 | names(contact_file_dump) = chrs 27 | output_path <- testthat::test_path("test-main-dump-out") 28 | ## Run CALDER to compute compartments but not nested sub-domains 29 
| CALDER(contact_file_dump=contact_file_dump, 30 | chrs=chrs, 31 | bin_size=10E3, 32 | genome='hg19', 33 | save_dir=output_path, 34 | save_intermediate_data=FALSE, 35 | n_cores=2, 36 | sub_domains=FALSE) 37 | expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.bed"), name = "TestDump_all_sub_compartments.bed") 38 | expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.tsv"), name = "TestDump_all_sub_compartments.tsv") 39 | unlink(output_path, recursive=TRUE) 40 | }) 41 | 42 | test_that("CALDER works with custom feature track", { 43 | input_cool_path <- file.path(testthat::test_path("data"), "test.cool") 44 | feature_track_path <- file.path(testthat::test_path("data"), "test_gene_coverage.bed") 45 | output_path <- testthat::test_path("test-main-featuretrack-out") 46 | 47 | CMD = paste0(CALDER_CLI, " -i ", input_cool_path, " -t cool -g hg38 -o ", output_path, " -f ", feature_track_path) 48 | system(CMD) 49 | 50 | expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.bed"), name = "TestFeatureTrack_all_sub_compartments.bed") 51 | expect_snapshot_file(file.path(output_path, "sub_compartments", "all_sub_compartments.tsv"), name = "TestFeatureTrack_all_sub_compartments.tsv") 52 | expect_snapshot_file(file.path(output_path, "sub_domains", "all_nested_boundaries.bed"), name = "TestFeatureTrack_all_nested_boundaries.bed") 53 | unlink(output_path, recursive=TRUE) 54 | }) --------------------------------------------------------------------------------