├── LICENSE ├── README.md ├── agg_sv.py ├── agg_sv_align_info.py ├── assembly_score.py ├── centromere_hg37.txt ├── centromere_hg38.txt ├── chrx.py ├── combine.py ├── compress_liftover.py ├── download_asm.sh ├── download_files.sh ├── filter_truth_set.py ├── fn.py ├── fn.pyc ├── func.py ├── get_align_info.py ├── get_conf_int.py ├── get_sv_bed_pos.py ├── help_func.py ├── hg37_tandem_repeats.bed ├── hg38_tandem_repeats.bed ├── liftover.sh ├── lo_assem_to_ref.py ├── lo_assem_to_ref_0.py ├── reg_dup.py ├── run_ttmars.sh ├── simu_dup ├── simu_dup_1.py └── simu_dup_2.py ├── trim_overlapping_contigs.py ├── ttmars.py ├── ttmars_old.py └── validate.py /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, ChaissonLab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TT-Mars 2 | 3 | TT-Mars: S**t**ructural Varian**t**s Assess**m**ent B**a**sed on Haplotype-**r**esolved A**s**semblies. 4 | 5 | ## Usage 6 | 7 | 0. Clone TT-Mars from github and `cd TT-Mars`. Python >= 3.8 is preferred. 8 | 1. Create environment and activate: `conda create -n ttmars` and `conda activate ttmars`. 9 | 2. Run `download_files.sh` to download required files to `./ttmars_files`. 10 | 3. Run `download_asm.sh` to download assembly files of 10 samples from HGSVC. 11 | 4. Install packages: `conda install -c bioconda pysam`, `conda install -c anaconda numpy`, `conda install -c bioconda mappy`, `conda install -c conda-forge biopython`, `conda install -c bioconda pybedtools`. 12 | 5. Run TT-Mars with following steps: `run_ttmars.sh` includes more instructions. Users can run it to run TT-Mars after setting up. 13 | 14 | The main program: run `python ttmars.py -h` for help. 15 | 16 | `python ttmars.py output_dir files_dir centro_file vcf_file reference asm_h1 asm_h2 tr_file num_X_chr` 17 | 18 | ## Positional arguments 19 | 20 | 1. `output_dir`: Output directory. 21 | 2. `files_dir`: Input files directory. `./ttmars_files/sample_name`. The directory where you store required files after running `download_files.sh`. 22 | 3. 
`centro_file`: provided centromere file. 23 | 4. `vcf_file`: callset file callset.vcf(.gz). 24 | 5. `reference`: reference file reference_genome.fasta. 25 | 6. `asm_h1`: assembly files assembly1.fa, which were downloaded after running `download_asm.sh`. 26 | 7. `asm_h2`: assembly files assembly2.fa, which were downloaded after running `download_asm.sh`. 27 | 8. `tr_file`: provided tandem repeats file. 28 | 9. `num_X_chr`: if male sample: 1; if female sample: 2. 29 | 30 | ## Optional arguments 31 | 32 | `-n/--not_hg38`: if reference is NOT hg38/chm13 (hg19). 33 | `-p/--passonly`: if consider PASS calls only. 34 | `-s/--seq_resolved`: if consider sequence resolved calls. 35 | `-w/--wrong_len`: if count wrong length calls as True. 36 | `-g/--gt_vali`: conduct genotype validation. 37 | `-i/--gt_info`: index with GT info. (For phased callsets) 38 | `-d/--phased`: take phased information. (For phased callsets) 39 | `-v/--vcf_out`: output results as vcf files (tp (true positive), fp (false positive) and na). 40 | `-f/--false_neg`: output recall, must be used together with `-t/--truth_file`. 41 | `-t/--truth_file`: input truth vcf file, must be used together with `-f/--false_neg`. 
42 | 43 | ## Example Output 44 | 45 | ttmars_combined_res.txt: 46 | |SV index| relative length| relative score| validation result| chr| start| end| Type| Genotype Match| 47 | | :----: | :----: | :----: | :----: | :----: | :----: | :----: |:----: | :----: | 48 | |0| 1.0| 3.48| True| chr1| 249912| 249912| INS| True| 49 | 50 | ## Accompanying Resources 51 | 52 | ### Liftover files 53 | | Samples | Reference Liftover Hap1 | Reference Liftover Hap2 | Assembly Liftover Hap1 | Assembly Liftover Hap2 | 54 | | :----: | :----: | :----: | :----: | :----: | 55 | | HG00096 | https://figshare.com/ndownloader/files/30817390 | https://figshare.com/ndownloader/files/30817384 | https://figshare.com/ndownloader/files/30817387 | https://figshare.com/ndownloader/files/30817381 | 56 | | HG00171 | https://figshare.com/ndownloader/files/30817402 | https://figshare.com/ndownloader/files/30817396 | https://figshare.com/ndownloader/files/30817399 | https://figshare.com/ndownloader/files/30817393 | 57 | | HG00513 | https://figshare.com/ndownloader/files/30817411 | https://figshare.com/ndownloader/files/30817405 | https://figshare.com/ndownloader/files/30817408 | https://figshare.com/ndownloader/files/30817414 | 58 | | HG00731 | https://figshare.com/ndownloader/files/30817426 | https://figshare.com/ndownloader/files/30817420 | https://figshare.com/ndownloader/files/30817423 | https://figshare.com/ndownloader/files/30817417 | 59 | | HG00732 | https://figshare.com/ndownloader/files/30817435 | https://figshare.com/ndownloader/files/30817429 | https://figshare.com/ndownloader/files/30817432 | https://figshare.com/ndownloader/files/30817438 | 60 | | HG00864 | https://figshare.com/ndownloader/files/30817450 | https://figshare.com/ndownloader/files/30817444 | https://figshare.com/ndownloader/files/30817447 | https://figshare.com/ndownloader/files/30817441 | 61 | | HG01114 | https://figshare.com/ndownloader/files/30817459 | https://figshare.com/ndownloader/files/30817453 | 
https://figshare.com/ndownloader/files/30817456 | https://figshare.com/ndownloader/files/30817462 | 62 | | HG01505 | https://figshare.com/ndownloader/files/30817471 | https://figshare.com/ndownloader/files/30817465 | https://figshare.com/ndownloader/files/30817468 | https://figshare.com/ndownloader/files/30817474 | 63 | | HG01596 | https://figshare.com/ndownloader/files/30817486 | https://figshare.com/ndownloader/files/30817480 | https://figshare.com/ndownloader/files/30817483 | https://figshare.com/ndownloader/files/30817477 | 64 | | HG03009 | https://figshare.com/ndownloader/files/30817498 | https://figshare.com/ndownloader/files/30817492 | https://figshare.com/ndownloader/files/30817495 | https://figshare.com/ndownloader/files/30817489 | 65 | | HG002 (hg19 ref) | https://figshare.com/ndownloader/files/31455682 | https://figshare.com/ndownloader/files/31455676 | https://figshare.com/ndownloader/files/31455685 | https://figshare.com/ndownloader/files/31455679 | 66 | 67 | 68 | ### Genome coverage files 69 | | Samples | Hap1 | Hap2 | 70 | | :----: | :----: | :----: | 71 | | HG00096 | https://figshare.com/ndownloader/files/30850246 | https://figshare.com/ndownloader/files/30850249 | 72 | | HG00171 | https://figshare.com/ndownloader/files/30850258 | https://figshare.com/ndownloader/files/30850261 | 73 | | HG00513 | https://figshare.com/ndownloader/files/30850639 | https://figshare.com/ndownloader/files/30850642 | 74 | | HG00731 | https://figshare.com/ndownloader/files/30850663 | https://figshare.com/ndownloader/files/30850660 | 75 | | HG00732 | https://figshare.com/ndownloader/files/30850687 | https://figshare.com/ndownloader/files/30850681 | 76 | | HG00864 | https://figshare.com/ndownloader/files/30850708 | https://figshare.com/ndownloader/files/30850711 | 77 | | HG01114 | https://figshare.com/ndownloader/files/30850726 | https://figshare.com/ndownloader/files/30850729 | 78 | | HG01505 | https://figshare.com/ndownloader/files/30850747 | 
https://figshare.com/ndownloader/files/30850744 | 79 | | HG01596 | https://figshare.com/ndownloader/files/30850768 | https://figshare.com/ndownloader/files/30850762 | 80 | | HG03009 | https://figshare.com/ndownloader/files/30850777 | https://figshare.com/ndownloader/files/30850780 | 81 | | HG002 (hg19 ref) | https://figshare.com/ndownloader/files/31455670 | https://figshare.com/ndownloader/files/31455673 | 82 | -------------------------------------------------------------------------------- /agg_sv.py: -------------------------------------------------------------------------------- 1 | import ttmars 2 | 3 | output_dir = ttmars.output_dir 4 | vcf_file = ttmars.vcf_file 5 | ref_file = ttmars.ref_file 6 | #assembly fasta files 7 | query_file1 = ttmars.query_file1 8 | query_file2 = ttmars.query_file2 9 | liftover_file1 = ttmars.liftover_file1 10 | liftover_file2 = ttmars.liftover_file2 11 | 12 | #liftover interval 13 | if_hg38 = ttmars.if_hg38 14 | #if pass_only 15 | if_pass_only = ttmars.if_pass_only 16 | #if seq_resolved 17 | seq_resolved = ttmars.seq_resolved 18 | #if include wrong length as TP 19 | wrong_len = ttmars.wrong_len 20 | #if take GT info 21 | if_gt_info = ttmars.if_gt_info 22 | #if consider phased 23 | if_phased = ttmars.if_phased 24 | #if validate GT 25 | if_gt = ttmars.if_gt 26 | 27 | # ####################################### 28 | # ####################################### 29 | import sys 30 | import csv 31 | import pysam 32 | import numpy as np 33 | import math 34 | 35 | import get_conf_int 36 | import validate 37 | import get_align_info 38 | 39 | import func 40 | import help_func 41 | 42 | import heapq 43 | 44 | import mappy 45 | import os 46 | 47 | #chr names 48 | chr_list = ttmars.chr_list 49 | #approximate length of chromosomes 50 | chr_len = ttmars.chr_len 51 | 52 | #max/min length of allowed SV not DUP 53 | memory_limit = ttmars.memory_limit 54 | memory_min = ttmars.memory_min 55 | #max length of allowed DUP 56 | dup_memory_limit = 
ttmars.dup_memory_limit 57 | dup_memory_min = ttmars.dup_memory_min 58 | #max length of allowed interspersed DUP 59 | reg_dup_upper_len = ttmars.reg_dup_upper_len 60 | #flanking regions for searching 61 | # region_len_m = 1000 62 | region_len_m = ttmars.region_len_m 63 | 64 | #valid types 65 | valid_types = ttmars.valid_types 66 | 67 | #CONST for interspersed DUP 68 | valid_ins_ratio = ttmars.valid_ins_ratio 69 | valid_aligned_portion = ttmars.valid_aligned_portion 70 | ins_rela_len_lb = ttmars.ins_rela_len_lb 71 | ins_rela_len_ub = ttmars.ins_rela_len_ub 72 | non_ins_rela_len_ub = ttmars.non_ins_rela_len_ub 73 | 74 | #alt/ref length threshold for abn SV 75 | alt_len_lb = ttmars.alt_len_lb 76 | 77 | #group SVs 78 | max_btw_dist = ttmars.max_btw_dist 79 | 80 | #max no of sv in a comb 81 | max_no_of_sv = ttmars.max_no_of_sv 82 | 83 | #max no of sv in a group 84 | max_sv_group_size = ttmars.max_sv_group_size 85 | 86 | #interval length for mapping 87 | interval = ttmars.interval 88 | 89 | ####################################### 90 | ####################################### 91 | 92 | #index SVs 93 | def idx_sv(vcf_file): 94 | f = pysam.VariantFile(vcf_file,'r') 95 | sv_list = [] 96 | 97 | for count, rec in enumerate(f.fetch()): 98 | #get sv_type 99 | try: 100 | sv_type = rec.info['SVTYPE'] 101 | except: 102 | print("invalid sv type info") 103 | continue 104 | 105 | if func.first_filter(rec, sv_type, valid_types, if_pass_only, chr_list): 106 | continue 107 | 108 | #get sv length 109 | if sv_type == 'INV': 110 | sv_len = abs(rec.stop - rec.pos + 1) 111 | else: 112 | try: 113 | sv_len = rec.info['SVLEN'][0] 114 | except: 115 | try: 116 | sv_len = rec.info['SVLEN'] 117 | except: 118 | sv_len = abs(rec.stop - rec.pos + 1) 119 | #handle del length > 0: 120 | if sv_type == 'DEL': 121 | sv_len = -abs(sv_len) 122 | 123 | if abs(sv_len) < memory_min: 124 | continue 125 | 126 | #get gt 127 | #only taking the first sample genotype 128 | if if_gt_info: 129 | sv_gt = 
rec.samples[0]["GT"] 130 | #bad genotype 131 | if sv_gt not in [(1, 1), (1, 0), (0, 1), (None, 1), (1, None)]: 132 | #test 133 | # print("not valid GT", sv_gt, rec.pos, rec.stop) 134 | sv_gt = None 135 | continue 136 | else: 137 | sv_gt = None 138 | 139 | ref_len = len(rec.ref) 140 | alt_len = len(rec.alts[0]) 141 | 142 | sv_list.append(func.struc_var(count, rec.chrom, sv_type, rec.pos, rec.stop, sv_len, sv_gt, wrong_len, ref_len, alt_len)) 143 | 144 | #add ins seq for seq-resolved insertion 145 | #no multi-allelic considered 146 | if (sv_type == 'INS') and seq_resolved: 147 | sv_list[len(sv_list)-1].ins_seq = rec.alts[0] 148 | sv_list[len(sv_list)-1].if_seq_resolved = True 149 | 150 | #add alt seq for abn DEL 151 | if (sv_type == 'DEL') and alt_len > alt_len_lb: 152 | sv_list[len(sv_list)-1].alt_seq = rec.alts[0] 153 | 154 | f.close() 155 | 156 | for sv in sv_list: 157 | #TODO: use second filter, and get_large_intervals() 158 | # func.second_filter(sv, if_hg38, dict_centromere, exclude_assem1_non_cover, exclude_assem2_non_cover) 159 | func.third_filter(sv, memory_min, memory_limit, dup_memory_min, dup_memory_limit) 160 | 161 | return sv_list 162 | 163 | 164 | ####################################### 165 | ####################################### 166 | 167 | #build sv dict by idx 168 | def build_sv_idx_dict(sv_list): 169 | sv_idx_dict = dict() 170 | for sv in sv_list: 171 | sv_idx_dict[sv.idx] = sv 172 | return sv_idx_dict 173 | 174 | #build sv comb dict by idx 175 | 176 | def build_sv_comb_idx_dict(sv_groups_combs): 177 | comb_dict = dict() 178 | for comb_sv_list in sv_groups_combs: 179 | for comb_sv in comb_sv_list: 180 | comb_dict[tuple(comb_sv.idx)] = comb_sv 181 | return comb_dict 182 | 183 | #check if two SV overlapping 184 | def check_sv_ol(sv1, sv2): 185 | x1 = sv1.sv_pos 186 | x2 = sv1.sv_stop 187 | 188 | y1 = sv2.sv_pos 189 | y2 = sv2.sv_stop 190 | 191 | if x1 <= y2 and y1 <= x2: 192 | return True 193 | 194 | return False 195 | 196 | #get sv groups: a group 
contains SVs that are closed to each other by at most max_btw_dist 197 | 198 | #groups allow overlapping 199 | 200 | class sv_group_class: 201 | def __init__(self): 202 | self.sv_idx_list = [] 203 | 204 | self.start = -1 205 | self.end = -1 206 | 207 | self.length = 0 208 | 209 | #store all combs by idx as list 210 | self.comb_list = [] 211 | 212 | #heap of top rela len 213 | #store the top k comb in terms of rela len 214 | self.no_top_comb = 5 215 | #list of tuple: (-dist_2_one, comb_key) 216 | #can be empty 217 | self.top_rela_len_comb_idx = [] 218 | 219 | def add_sv(self, sv): 220 | self.sv_idx_list.append(sv.idx) 221 | 222 | if self.start < 0: 223 | self.start = sv.sv_pos 224 | 225 | self.end = max(self.end, sv.sv_stop) 226 | self.length += 1 227 | 228 | def add_comb(self, comb_sv): 229 | self.comb_list.append(comb_sv.idx) 230 | 231 | def cal_dist_2_one(self, comb_sv): 232 | rela_len_1 = comb_sv.cal_rela_len(comb_sv.len_query_hap1, comb_sv.len_ref_hap1) 233 | rela_len_2 = comb_sv.cal_rela_len(comb_sv.len_query_hap2, comb_sv.len_ref_hap2) 234 | 235 | dist_2_one_1 = abs(rela_len_1 - 1) 236 | dist_2_one_2 = abs(rela_len_2 - 1) 237 | 238 | dist_2_one = min(dist_2_one_1, dist_2_one_2) 239 | return dist_2_one 240 | 241 | def get_top_comb(self, comb_dict): 242 | for comb_idx_list in self.comb_list: 243 | comb_key = tuple(comb_idx_list) 244 | comb_sv = comb_dict[comb_key] 245 | if (not comb_sv.analyzed_hap1) or (not comb_sv.analyzed_hap2): 246 | continue 247 | 248 | dist_2_one = round(self.cal_dist_2_one(comb_sv), 2) 249 | 250 | heapq.heappush(self.top_rela_len_comb_idx, (-dist_2_one, comb_key)) 251 | 252 | if len(self.top_rela_len_comb_idx) > self.no_top_comb: 253 | heapq.heappop(self.top_rela_len_comb_idx) 254 | 255 | def get_sv_groups(sv_list, max_btw_dist, max_sv_group_size): 256 | sv_groups = [] 257 | cur_group = sv_group_class() 258 | pre_chr = "" 259 | 260 | for sv in sv_list: 261 | #allow INS and DEL to aggregate for now 262 | if sv.sv_type not in ['INS', 
'DEL']: 263 | continue 264 | 265 | if sv.is_third_fil: 266 | continue 267 | 268 | #if move to the next chr 269 | cur_chr = sv.ref_name 270 | if cur_chr != pre_chr: 271 | if cur_group.length > 1 and cur_group.length <= max_sv_group_size: 272 | sv_groups.append(cur_group) 273 | cur_group = sv_group_class() 274 | cur_group.add_sv(sv) 275 | pre_chr = cur_chr 276 | continue 277 | 278 | if sv.sv_pos < cur_group.end + max_btw_dist: 279 | cur_group.add_sv(sv) 280 | else: 281 | if cur_group.length > 1 and cur_group.length <= max_sv_group_size: 282 | sv_groups.append(cur_group) 283 | cur_group = sv_group_class() 284 | cur_group.add_sv(sv) 285 | # test 286 | # print(sv.sv_pos, cur_group.end, sv.sv_stop, cur_group.length, cur_group.sv_idx_list, len(sv_groups)) 287 | 288 | if cur_group.length > 1 and cur_group.length <= max_sv_group_size: 289 | sv_groups.append(cur_group) 290 | 291 | return sv_groups 292 | 293 | 294 | #for each sv group, get all the valid combinations (of <= n SVs) 295 | 296 | def get_sv_comb(sv_group, if_gt_info, if_phased, sv_idx_dict, max_no_of_sv): 297 | sv_group_list = [] 298 | for idx in sv_group.sv_idx_list: 299 | sv_group_list.append(sv_idx_dict[idx]) 300 | 301 | # max_no_of_sv = 3 302 | 303 | #dfs 304 | cur_idx = 0 305 | cur_comb = [] 306 | combs = [] 307 | 308 | if if_gt_info and if_phased: 309 | #add combs of phase 0 310 | get_combs_dfs_phased(sv_group_list, cur_idx, cur_comb, combs, max_no_of_sv, 0) 311 | 312 | cur_idx = 0 313 | cur_comb = [] 314 | 315 | #add combs of phase 1 316 | get_combs_dfs_phased(sv_group_list, cur_idx, cur_comb, combs, max_no_of_sv, 1) 317 | 318 | if (not if_gt_info) or (not if_phased): 319 | get_combs_dfs(sv_group_list, cur_idx, cur_comb, combs, max_no_of_sv) 320 | 321 | return combs 322 | 323 | def get_combs_dfs(sv_group, cur_idx, cur_comb, combs, max_no_of_sv): 324 | if len(cur_comb) == len(sv_group) or len(cur_comb) == max_no_of_sv: 325 | if check_comb_valid(cur_comb): 326 | 
combs.append(help_func.idx_comb(list(cur_comb))) 327 | # combs.append(list(cur_comb)) 328 | return 329 | else: 330 | if len(cur_comb) > 1 and check_comb_valid(cur_comb): 331 | combs.append(help_func.idx_comb(list(cur_comb))) 332 | # combs.append(list(cur_comb)) 333 | 334 | for i in range(cur_idx, len(sv_group)): 335 | cur_comb.append(sv_group[i]) 336 | get_combs_dfs(sv_group, i+1, cur_comb, combs, max_no_of_sv) 337 | cur_comb.pop() 338 | 339 | def get_combs_dfs_phased(sv_group, cur_idx, cur_comb, combs, max_no_of_sv, phase): 340 | if len(cur_comb) == len(sv_group) or len(cur_comb) == max_no_of_sv: 341 | if check_comb_valid(cur_comb): 342 | combs.append(help_func.idx_comb(list(cur_comb))) 343 | # combs.append(list(cur_comb)) 344 | return 345 | else: 346 | if len(cur_comb) > 1 and check_comb_valid(cur_comb): 347 | combs.append(help_func.idx_comb(list(cur_comb))) 348 | # combs.append(list(cur_comb)) 349 | 350 | for i in range(cur_idx, len(sv_group)): 351 | #check phasing 352 | if sv_group[i].gt[phase] != 1: 353 | continue 354 | 355 | cur_comb.append(sv_group[i]) 356 | get_combs_dfs_phased(sv_group, i+1, cur_comb, combs, max_no_of_sv, phase) 357 | cur_comb.pop() 358 | 359 | def comb_overlap(comb): 360 | cur_sv = comb[0] 361 | for next_sv in comb[1:]: 362 | if check_sv_ol(cur_sv, next_sv): 363 | return True 364 | else: 365 | cur_sv = next_sv 366 | 367 | return False 368 | 369 | #check if length valid, if overlapping 370 | def check_comb_valid(comb): 371 | #check length requirments 372 | len_min = 50 373 | len_max = 500000 374 | 375 | #validate length 376 | agg_len = 0 377 | for sv in comb: 378 | agg_len += sv.length 379 | 380 | #comb sv length 381 | if abs(agg_len) < len_min or abs(agg_len) > len_max: 382 | #test 383 | # for sv in comb: 384 | # print(sv.idx) 385 | return False 386 | 387 | #length of sequence that comb sv spans 388 | if comb[-1].sv_stop - comb[0].sv_pos > len_max: 389 | return False 390 | 391 | #check overlapping 392 | if comb_overlap(comb): 393 | return 
False 394 | 395 | return True 396 | 397 | #return combs that include all possible SVs on one hap 398 | #require phased SVs 399 | def get_hap_all_comb(sv_group): 400 | #gready 401 | haps = [0, 1] 402 | combs = [] 403 | 404 | for hap in haps: 405 | cur_comb = [] 406 | for sv in sv_group: 407 | if sv.gt[hap] == 1: 408 | cur_comb.append(sv) 409 | if len(cur_comb) > 1: 410 | if check_comb_valid(cur_comb): 411 | combs.append(cur_comb) 412 | 413 | return combs 414 | 415 | #match each info needed to find res 416 | def match_sv_with_comb_res(sv, comb_sv): 417 | sv.length = comb_sv.length 418 | 419 | sv.analyzed_hap1 = comb_sv.analyzed_hap1 420 | sv.analyzed_hap2 = comb_sv.analyzed_hap2 421 | sv.len_query_hap1 = comb_sv.len_query_hap1 422 | sv.len_query_hap2 = comb_sv.len_query_hap2 423 | sv.len_ref_hap1 = comb_sv.len_ref_hap1 424 | sv.len_ref_hap2 = comb_sv.len_ref_hap2 425 | sv.score_before_hap1 = comb_sv.score_before_hap1 426 | sv.score_after_hap1 = comb_sv.score_after_hap1 427 | sv.score_before_hap2 = comb_sv.score_before_hap2 428 | sv.score_after_hap2 = comb_sv.score_after_hap2 429 | 430 | #check if the comb res is better than an sv 431 | def check_comb_better_than_sv(sv, comb_sv, if_gt): 432 | #return a tuple: (res, rela_len, rela_score, gt_validate) 433 | comb_res = comb_sv.get_vali_res(if_gt) 434 | sv_res = sv.get_vali_res(if_gt) 435 | 436 | if comb_res[0] and (not sv_res[0]): 437 | return True 438 | elif sv_res[0] and (not comb_res[0]): 439 | return False 440 | else: 441 | comb_rela_len = comb_res[1] 442 | sv_rela_len = sv_res[1] 443 | 444 | if abs(comb_rela_len - 1) <= abs(sv_rela_len - 1): 445 | return True 446 | else: 447 | return False 448 | 449 | #update sv info with comb res 450 | def update_sv_with_comb_res(sv, comb_sv, if_gt): 451 | if (not comb_sv.analyzed_hap1) or (not comb_sv.analyzed_hap2): 452 | return 453 | 454 | if (not sv.analyzed_hap1) or (not sv.analyzed_hap2): 455 | match_sv_with_comb_res(sv, comb_sv) 456 | else: 457 | if 
check_comb_better_than_sv(sv, comb_sv, if_gt): 458 | match_sv_with_comb_res(sv, comb_sv) 459 | 460 | ####################################### 461 | ####################################### 462 | 463 | 464 | # #preprocessing: liftover mapping 465 | 466 | # contig_name_list_1, contig_pos_list_1, contig_name_dict_1 = get_align_info.build_map_compress(chr_len, interval, liftover_file1, if_hg38) 467 | # contig_name_list_2, contig_pos_list_2, contig_name_dict_2 = get_align_info.build_map_compress(chr_len, interval, liftover_file2, if_hg38) 468 | 469 | 470 | ####################################### 471 | ####################################### 472 | #analysis data 473 | 474 | # #build sv groups 475 | # sv_list = idx_sv(vcf_file) 476 | # sv_idx_dict = build_sv_idx_dict(sv_list) 477 | 478 | # sv_groups = get_sv_groups(sv_list, max_btw_dist, max_sv_group_size) 479 | 480 | # assert len(sv_groups) > 0 481 | 482 | # #build sv combs 483 | # sv_groups_combs = [] 484 | # for sv_group in sv_groups: 485 | # comb_sv_list = get_sv_comb(sv_group, if_gt_info, if_phased, sv_idx_dict, max_no_of_sv) 486 | 487 | # for comb_sv in comb_sv_list: 488 | # sv_group.add_comb(comb_sv) 489 | 490 | # sv_groups_combs.append(comb_sv_list) 491 | 492 | # #build comb dict based on [idx] 493 | # comb_dict = build_sv_comb_idx_dict(sv_groups_combs) 494 | 495 | # query_fasta_file_1 = pysam.FastaFile(query_file1) 496 | # query_fasta_file_2 = pysam.FastaFile(query_file2) 497 | # ref_fasta_file = pysam.FastaFile(ref_file) 498 | # cur_ref_name = "" 499 | 500 | # #test 501 | # counter = 0 502 | 503 | # for comb_sv_list in sv_groups_combs: 504 | # for comb_sv in comb_sv_list: 505 | # #test 506 | # counter += 1 507 | # if counter % 500 == 1: 508 | # print(counter) 509 | 510 | # if cur_ref_name != comb_sv.ref_name: 511 | # cur_ref_name = comb_sv.ref_name 512 | # ref_rec = ref_fasta_file.fetch(cur_ref_name) 513 | 514 | # help_func.get_comb_vali_info_len_only(comb_sv, 1, interval, contig_name_list_1, contig_pos_list_1, 
515 | # contig_name_dict_1, if_hg38, ref_rec, query_fasta_file_1, sv_idx_dict, region_len_m) 516 | 517 | # help_func.get_comb_vali_info_len_only(comb_sv, 2, interval, contig_name_list_2, contig_pos_list_2, 518 | # contig_name_dict_2, if_hg38, ref_rec, query_fasta_file_2, sv_idx_dict, region_len_m) 519 | 520 | # if (not comb_sv.analyzed_hap1) or (not comb_sv.analyzed_hap2): 521 | # continue 522 | 523 | # # help_func.update_sv_res_len_only(comb_sv) 524 | 525 | 526 | # #for each sv group, find the top k comb in terms for the rela length (for either haps) 527 | # for cur_group in sv_groups: 528 | # #test 529 | # # print(cur_group.comb_list) 530 | # cur_group.get_top_comb(comb_dict) 531 | 532 | 533 | 534 | # cur_ref_name = "" 535 | 536 | # #test 537 | # counter = 0 538 | 539 | # for cur_group in sv_groups: 540 | # #can be empty if every comb failed to be analyzed 541 | # if len(cur_group.top_rela_len_comb_idx) > 0: 542 | 543 | # for _, comb_key in cur_group.top_rela_len_comb_idx: 544 | # #test 545 | # counter += 1 546 | # if counter % 100 == 1: 547 | # print(counter) 548 | 549 | # comb_sv = comb_dict[comb_key] 550 | 551 | # if cur_ref_name != comb_sv.ref_name: 552 | # cur_ref_name = comb_sv.ref_name 553 | # ref_rec = ref_fasta_file.fetch(cur_ref_name) 554 | 555 | # help_func.get_comb_vali_info_align_only(comb_sv, 1, interval, contig_name_list_1, contig_pos_list_1, 556 | # contig_name_dict_1, if_hg38, ref_rec, query_fasta_file_1, sv_idx_dict) 557 | # help_func.get_comb_vali_info_align_only(comb_sv, 2, interval, contig_name_list_2, contig_pos_list_2, 558 | # contig_name_dict_2, if_hg38, ref_rec, query_fasta_file_2, sv_idx_dict) 559 | 560 | # if (not comb_sv.analyzed_hap1) or (not comb_sv.analyzed_hap2): 561 | # continue 562 | 563 | 564 | # #update SV info 565 | 566 | # for cur_group in sv_groups: 567 | # #don't consider comb not in the top list 568 | # #can be empty if every comb failed to be analyzed 569 | # if len(cur_group.top_rela_len_comb_idx) > 0: 570 | # for _, 
comb_key in cur_group.top_rela_len_comb_idx: 571 | # comb_sv = comb_dict[comb_key] 572 | 573 | # #skip NA comb_sv 574 | # if (not comb_sv.analyzed_hap1) or (not comb_sv.analyzed_hap2): 575 | # continue 576 | 577 | # for sv_idx in comb_key: 578 | # update_sv_with_comb_res(sv_idx_dict[sv_idx], comb_sv, if_gt) 579 | 580 | # #write SV info 581 | # func.write_vali_info_agg(sv_list, output_dir, if_gt) -------------------------------------------------------------------------------- /agg_sv_align_info.py: -------------------------------------------------------------------------------- 1 | ####################################### 2 | ####################################### 3 | 4 | 5 | import help_func 6 | 7 | import sys 8 | import csv 9 | import pysam 10 | import numpy as np 11 | import math 12 | 13 | import get_conf_int 14 | import validate 15 | import get_align_info 16 | 17 | import func 18 | 19 | import mappy 20 | import os 21 | 22 | ####################################### 23 | ####################################### 24 | 25 | 26 | #liftover interval 27 | interval = 20 28 | if_hg38 = True 29 | if_pass_only = False 30 | seq_resolved = True 31 | wrong_len = False 32 | 33 | if_gt_info = True 34 | if_phased = True 35 | 36 | if_gt = False 37 | 38 | #chr names 39 | chr_list = [] 40 | if if_hg38: 41 | chr_list = ["chr1", "chr2", "chr3", "chr4", "chr5", 42 | "chr6", "chr7", "chr8", "chr9", "chr10", 43 | "chr11", "chr12", "chr13", "chr14", "chr15", 44 | "chr16", "chr17", "chr18", "chr19", "chr20", 45 | "chr21", "chr22", "chrX"] 46 | else: 47 | chr_list = ["1", "2", "3", "4", "5", 48 | "6", "7", "8", "9", "10", 49 | "11", "12", "13", "14", "15", 50 | "16", "17", "18", "19", "20", 51 | "21", "22", "X"] 52 | #approximate length of chromosomes 53 | chr_len = [250000000, 244000000, 199000000, 192000000, 182000000, 54 | 172000000, 160000000, 147000000, 142000000, 136000000, 55 | 136000000, 134000000, 116000000, 108000000, 103000000, 56 | 90400000, 83300000, 80400000, 59200000, 64500000, 
57 | 48200000, 51400000, 157000000, 59400000] 58 | 59 | #max/min length of allowed SV not DUP 60 | memory_limit = 100000 61 | memory_min = 10 62 | #max length of allowed DUP 63 | dup_memory_limit = 50000 64 | dup_memory_min = 10 65 | #max length of allowed interspersed DUP 66 | reg_dup_upper_len = 10000000 67 | 68 | #valid types 69 | valid_types = ['DEL', 'INS', 'INV', 'DUP:TANDEM', 'DUP'] 70 | 71 | #CONST for interspersed DUP 72 | valid_ins_ratio = 0.6 73 | valid_aligned_portion = 0.9 74 | ins_rela_len_lb = 0.7 75 | ins_rela_len_ub = 1.3 76 | non_ins_rela_len_ub = 0.4 77 | 78 | #file and dir 79 | vcf_file = "/scratch2/jianzhiy/elber_vcf/test_vcf/HG00438.minigraph.chr10.vcf" 80 | output_dir = "/scratch2/jianzhiy/ttmars/output/0413_elber_agg_test/elber_minigraph/HG00438_chr10" 81 | 82 | #liftover files 83 | liftover_file1 = "/scratch2/jianzhiy/data/assemblies/hprc/HG00438/lra/lo_pos_assem1_result_compressed.bed" 84 | liftover_file2 = "/scratch2/jianzhiy/data/assemblies/hprc/HG00438/lra/lo_pos_assem2_result_compressed.bed" 85 | 86 | #ref file 87 | ref_file = "/panfs/qcb-panasas/jianzhiy/data/reference/hg38.no_alts.fasta" 88 | 89 | #query file2 90 | query_file_1 = "/scratch2/jianzhiy/data/assemblies/hprc/HG00438/h1.fa" 91 | query_file_2 = "/scratch2/jianzhiy/data/assemblies/hprc/HG00438/h2.fa" 92 | 93 | ####################################### 94 | ####################################### 95 | 96 | #build sv dict by idx 97 | def build_sv_idx_dict(sv_list): 98 | sv_idx_dict = dict() 99 | for sv in sv_list: 100 | sv_idx_dict[sv.idx] = sv 101 | return sv_idx_dict 102 | 103 | #check if two SV overlapping 104 | def check_sv_ol(sv1, sv2): 105 | x1 = sv1.sv_pos 106 | x2 = sv1.sv_stop 107 | 108 | y1 = sv2.sv_pos 109 | y2 = sv2.sv_stop 110 | 111 | if x1 <= y2 and y1 <= x2: 112 | return True 113 | 114 | return False 115 | 116 | #get sv groups: a group contains SVs that are closed to each other by at most max_btw_dist 117 | 118 | #groups allow overlapping 119 | 120 | # class 
sv_group: 121 | # def __init__(self, sv): 122 | # self.comb_idx = [sv.idx] 123 | # self.start = sv.sv_pos 124 | # self.end = sv.sv_stop 125 | 126 | # def add(self, sv): 127 | # self.comb_idx.append(sv.idx) 128 | # self.end = sv.sv_stop 129 | 130 | def get_sv_groups(sv_list, max_btw_dist): 131 | sv_groups = [] 132 | cur_group = [] 133 | cur_stop = 0 134 | for sv in sv_list: 135 | #allow INS and DEL to aggregate for now 136 | if sv.sv_type not in ['INS', 'DEL']: 137 | continue 138 | 139 | if sv.is_third_fil: 140 | continue 141 | 142 | if sv.sv_pos < cur_stop + max_btw_dist: 143 | cur_group.append(sv) 144 | cur_stop = max(cur_stop, sv.sv_stop) 145 | else: 146 | if len(cur_group) > 1: 147 | sv_groups.append(list(cur_group)) 148 | cur_stop = sv.sv_stop 149 | cur_group = [sv] 150 | # test 151 | # print(sv.sv_pos, cur_stop, sv.sv_stop, len(cur_group), len(sv_groups)) 152 | 153 | if len(cur_group) > 1: 154 | sv_groups.append(list(cur_group)) 155 | 156 | return sv_groups 157 | 158 | #for each sv group, get all the valid combinations (of <= 5 SVs) 159 | 160 | def get_sv_comb(sv_group, if_gt_info, if_phased): 161 | max_no_of_sv = 3 162 | 163 | #dfs 164 | cur_idx = 0 165 | cur_comb = [] 166 | combs = [] 167 | 168 | if if_gt_info and if_phased: 169 | #add combs of phase 0 170 | get_combs_dfs_phased(sv_group, cur_idx, cur_comb, combs, max_no_of_sv, 0) 171 | 172 | cur_idx = 0 173 | cur_comb = [] 174 | 175 | #add combs of phase 1 176 | get_combs_dfs_phased(sv_group, cur_idx, cur_comb, combs, max_no_of_sv, 1) 177 | 178 | if (not if_gt_info) and (not if_phased): 179 | get_combs_dfs(sv_group, cur_idx, cur_comb, combs, max_no_of_sv) 180 | 181 | return combs 182 | 183 | def get_combs_dfs(sv_group, cur_idx, cur_comb, combs, max_no_of_sv): 184 | if len(cur_comb) == len(sv_group) or len(cur_comb) == max_no_of_sv: 185 | if check_comb_valid(cur_comb): 186 | combs.append(list(cur_comb)) 187 | return 188 | else: 189 | if len(cur_comb) > 1 and check_comb_valid(cur_comb): 190 | 
combs.append(list(cur_comb)) 191 | 192 | for i in range(cur_idx, len(sv_group)): 193 | cur_comb.append(sv_group[i]) 194 | get_combs_dfs(sv_group, i+1, cur_comb, combs, max_no_of_sv) 195 | cur_comb.pop() 196 | 197 | def get_combs_dfs_phased(sv_group, cur_idx, cur_comb, combs, max_no_of_sv, phase): 198 | if len(cur_comb) == len(sv_group) or len(cur_comb) == max_no_of_sv: 199 | if check_comb_valid(cur_comb): 200 | combs.append(list(cur_comb)) 201 | return 202 | else: 203 | if len(cur_comb) > 1 and check_comb_valid(cur_comb): 204 | combs.append(list(cur_comb)) 205 | 206 | for i in range(cur_idx, len(sv_group)): 207 | #check phasing 208 | if sv_group[i].gt[phase] != 1: 209 | continue 210 | 211 | cur_comb.append(sv_group[i]) 212 | get_combs_dfs_phased(sv_group, i+1, cur_comb, combs, max_no_of_sv, phase) 213 | cur_comb.pop() 214 | 215 | def comb_overlap(comb): 216 | cur_sv = comb[0] 217 | for next_sv in comb[1:]: 218 | if check_sv_ol(cur_sv, next_sv): 219 | return True 220 | else: 221 | cur_sv = next_sv 222 | 223 | return False 224 | 225 | #check if length valid, if overlapping 226 | def check_comb_valid(comb): 227 | #check length requirments 228 | len_min = 50 229 | len_max = 500000 230 | 231 | #validate length 232 | agg_len = 0 233 | for sv in comb: 234 | agg_len += sv.length 235 | 236 | if abs(agg_len) < len_min or abs(agg_len) > len_max: 237 | #test 238 | # for sv in comb: 239 | # print(sv.idx) 240 | return False 241 | 242 | #check overlapping 243 | if comb_overlap(comb): 244 | return False 245 | 246 | return True 247 | 248 | 249 | ####################################### 250 | ####################################### 251 | 252 | 253 | #match each info needed to find res 254 | def match_sv_with_comb_res(sv, comb_sv): 255 | sv.length = comb_sv.length 256 | 257 | sv.analyzed_hap1 = comb_sv.analyzed_hap1 258 | sv.analyzed_hap2 = comb_sv.analyzed_hap2 259 | sv.len_query_hap1 = comb_sv.len_query_hap1 260 | sv.len_query_hap2 = comb_sv.len_query_hap2 261 | sv.len_ref_hap1 = 
#match each info needed to find res
def match_sv_with_comb_res(sv, comb_sv):
    """Copy validation result fields from an analyzed combination onto a member SV (in place)."""
    sv.length = comb_sv.length

    sv.analyzed_hap1 = comb_sv.analyzed_hap1
    sv.analyzed_hap2 = comb_sv.analyzed_hap2
    sv.len_query_hap1 = comb_sv.len_query_hap1
    sv.len_query_hap2 = comb_sv.len_query_hap2
    sv.len_ref_hap1 = comb_sv.len_ref_hap1
    sv.len_ref_hap2 = comb_sv.len_ref_hap2
    sv.score_before_hap1 = comb_sv.score_before_hap1
    sv.score_after_hap1 = comb_sv.score_after_hap1
    sv.score_before_hap2 = comb_sv.score_before_hap2
    sv.score_after_hap2 = comb_sv.score_after_hap2

#check if the comb res is better than an sv
def check_comb_better_than_sv(sv, comb_sv, if_gt):
    """Return True when the combination's validation result should replace the SV's own.

    get_vali_res returns a tuple (res, rela_len, rela_score, gt_validate). A validated
    result beats an unvalidated one; on a tie, the result whose relative length is
    closer to 1 wins (ties favor the combination).
    """
    comb_res = comb_sv.get_vali_res(if_gt)
    sv_res = sv.get_vali_res(if_gt)

    if comb_res[0] and (not sv_res[0]):
        return True
    elif sv_res[0] and (not comb_res[0]):
        return False
    else:
        comb_rela_len = comb_res[1]
        sv_rela_len = sv_res[1]

        if abs(comb_rela_len - 1) <= abs(sv_rela_len - 1):
            return True
        else:
            return False

#update sv info with comb res
def update_sv_with_comb_res(sv, comb_sv, if_gt):
    """Overwrite sv's validation info with comb_sv's when the combination was analyzed and is better."""
    #combination must have been analyzed on both haplotypes
    if (not comb_sv.analyzed_hap1) or (not comb_sv.analyzed_hap2):
        return

    if (not sv.analyzed_hap1) or (not sv.analyzed_hap2):
        #SV itself was never analyzed: take the combination result unconditionally
        match_sv_with_comb_res(sv, comb_sv)
    else:
        if check_comb_better_than_sv(sv, comb_sv, if_gt):
            match_sv_with_comb_res(sv, comb_sv)


#######################################
#######################################
#main function
def get_agg_align_info(contig_name_list_1, contig_pos_list_1, contig_name_dict_1,
                       contig_name_list_2, contig_pos_list_2, contig_name_dict_2,
                       sv_list, if_gt_info, if_phased, query_file_1, query_file_2,
                       ref_file, interval, if_hg38, if_gt):
    """Aggregate nearby SVs, validate each combination against both haplotype assemblies,
    and write per-SV validation results.

    Relies on help_func.idx_comb / help_func.get_comb_vali_info for the per-combination
    alignment work and func.write_vali_info_agg for output.
    """
    #group SVs
    max_btw_dist = 1000

    sv_idx_dict = build_sv_idx_dict(sv_list)

    sv_groups = get_sv_groups(sv_list, max_btw_dist)

    sv_groups_combs = []
    for sv_group in sv_groups:
        combs = get_sv_comb(sv_group, if_gt_info, if_phased)
        sv_groups_combs.append(combs)

    #######################################
    #######################################
    #preprocessing: liftover mapping is supplied by the caller via the contig_* args

    query_fasta_file_1 = pysam.FastaFile(query_file_1)
    query_fasta_file_2 = pysam.FastaFile(query_file_2)

    ref_fasta_file = pysam.FastaFile(ref_file)
    cur_ref_name = ""
    ref_rec_len = 0

    comb_dict = {}

    for combs in sv_groups_combs:
        for comb in combs:
            comb_sv = help_func.idx_comb(comb)

            #fetch the reference record lazily, only when the chromosome changes
            if cur_ref_name != comb_sv.ref_name:
                cur_ref_name = comb_sv.ref_name
                ref_rec = ref_fasta_file.fetch(cur_ref_name)
                ref_rec_len = len(ref_rec)

            help_func.get_comb_vali_info(comb_sv, 1, interval, contig_name_list_1, contig_pos_list_1,
                                         contig_name_dict_1, if_hg38, ref_rec, query_fasta_file_1, sv_idx_dict)
            help_func.get_comb_vali_info(comb_sv, 2, interval, contig_name_list_2, contig_pos_list_2,
                                         contig_name_dict_2, if_hg38, ref_rec, query_fasta_file_2, sv_idx_dict)

            if (not comb_sv.analyzed_hap1) or (not comb_sv.analyzed_hap2):
                #test
                print("NA")
                continue

            comb_dict[tuple(comb_sv.idx)] = comb_sv

    #update SV info with the best combination result per SV
    for idx_tuple in comb_dict:
        comb_sv = comb_dict[idx_tuple]

        #skip NA comb_sv (defensive: only analyzed combs are inserted above)
        if (not comb_sv.analyzed_hap1) or (not comb_sv.analyzed_hap2):
            continue

        for sv_idx in idx_tuple:
            update_sv_with_comb_res(sv_idx_dict[sv_idx], comb_sv, if_gt)

    #write SV info
    func.write_vali_info_agg(sv_list, output_dir, if_gt)
#write SV info 380 | func.write_vali_info_agg(sv_list, output_dir, if_gt) -------------------------------------------------------------------------------- /assembly_score.py: -------------------------------------------------------------------------------- 1 | #Get confidence scores of assembly intervals 2 | import pysam 3 | import sys 4 | import os 5 | import csv 6 | from Bio import SeqIO 7 | 8 | class indexed_read: 9 | # constructor 10 | def __init__(self, read): 11 | self.ori_read = read 12 | self.read_cur_ref_pos = read.reference_start 13 | self.cur_cigar_tuple = 0 14 | 15 | def get_interval_cigar(cigar_tuples, start_pos, end_pos, read, consume_ref): 16 | #t 17 | #read_ref_start <= start_pos (100) 18 | #read_ref_end >= end_pos (200) - 1 19 | #cigar covering the interval 20 | read_ref_start = read.read_cur_ref_pos 21 | interval_cigar = [] 22 | #first cigar tup entering the interval 23 | first_tup = read.cur_cigar_tuple 24 | #cur_ref_pos: the end of previous stop +1 25 | #print(read_ref_start, first_tup) 26 | cur_ref_pos = read_ref_start 27 | for i in range(read.cur_cigar_tuple, len(cigar_tuples)): 28 | tup = cigar_tuples[i] 29 | if tup[0] in consume_ref: 30 | cur_ref_pos += tup[1] 31 | #if entering the interval 32 | if cur_ref_pos > start_pos: 33 | first_tup = i 34 | break 35 | #get the interval_cigar 36 | if cur_ref_pos >= end_pos: 37 | interval_cigar.append((tup[0], end_pos - start_pos)) 38 | read.read_cur_ref_pos = cur_ref_pos - tup[1] 39 | read.cur_cigar_tuple = first_tup 40 | else: 41 | interval_cigar.append((tup[0], cur_ref_pos - start_pos)) 42 | for i in range(first_tup + 1, len(cigar_tuples)): 43 | tup = cigar_tuples[i] 44 | if tup[0] not in consume_ref: 45 | interval_cigar.append(tup) 46 | else: 47 | cur_ref_pos += tup[1] 48 | if cur_ref_pos < end_pos: 49 | interval_cigar.append(tup) 50 | else: 51 | interval_cigar.append((tup[0], tup[1] - (cur_ref_pos - end_pos))) 52 | read.read_cur_ref_pos = cur_ref_pos - tup[1] 53 | read.cur_cigar_tuple = i 54 | break 
#return percentage of bad bases
def bad_base_prct(interval_cigar, interval_len):
    """Return the fraction of interval_len covered by non-match CIGAR operations.

    Only op code 7 ('=' sequence match) counts as good; everything else
    (mismatch, indel, clip) is a bad base.
    """
    bad_bases = 0
    for tup in interval_cigar:
        if tup[0] != 7:
            bad_bases += tup[1]
    return bad_bases/interval_len

#get contig seq
def getSeqRec(seq_name, file_name):
    """Fetch the sequence named seq_name from the (indexed) fasta file file_name."""
    fasta_file = pysam.FastaFile(file_name)
    seq = fasta_file.fetch(seq_name)
    return seq

#write score
def write_score(contig_name, start_pos, end_pos, score, g):
    """Append one tab-separated score record (contig, start, end, score) to open file handle g."""
    g.write(f"{contig_name}\t{start_pos}\t{end_pos}\t{score}\n")
#parse reads
def parse_reads(output_file_name, contig_list, interval_len, consume_ref, cut_off, samfile, min_depth, max_depth):
    """Score fixed-size intervals of each contig by the fraction of clean covering reads.

    For every [cur_start_pos, cur_end_pos) window, reads that fully span the window
    are inspected; a read is "bad" when its within-window bad-base fraction
    (bad_base_prct) reaches cut_off. The written score is 1 - bad/valid, or 0 when
    depth is 0, below min_depth, or above max_depth.

    contig_list: [[contig_name, contig_len], ...]; samfile: pysam.AlignmentFile
    sorted/indexed so fetch(contig) yields reads by reference_start.
    """
    with open(output_file_name, "w") as g:
        for contig in contig_list:
            chr_name = contig[0]
            print(chr_name)
            contig_len = contig[1]

            #interval is left-closed, starting from 0
            cur_start_pos = 0
            cur_end_pos = cur_start_pos + interval_len

            ol_reads = []
            read_interval_cigars = []

            #loop through the fetch iterator; its position is never reset
            read_iter = samfile.fetch(chr_name)

            while cur_end_pos < contig_len:
                #add reads to ol_reads; the last one added can be out of range
                if len(ol_reads) == 0:
                    try:
                        rec = next(read_iter)
                        ol_reads.append(indexed_read(rec))
                    except StopIteration:
                        #no reads left on this contig
                        break

                if ol_reads[-1].ori_read.reference_start < cur_end_pos:
                    for rec in read_iter:
                        ol_reads.append(indexed_read(rec))
                        #if not overlapping, stop adding
                        if ol_reads[-1].ori_read.reference_start >= cur_end_pos:
                            break

                #drop leading reads that end before the window starts; lengths differ,
                #so non-overlapping reads may remain deeper in the list and are
                #re-checked per read below
                while ol_reads[0].ori_read.reference_end < cur_start_pos:
                    ol_reads.pop(0)
                    if len(ol_reads) == 0:
                        break

                #skip if depth 0 or too large; such intervals get score 0
                if len(ol_reads) == 0 or len(ol_reads) > max_depth:
                    write_score(chr_name, cur_start_pos, cur_end_pos, 0, g)
                    cur_start_pos += interval_len
                    cur_end_pos = cur_start_pos + interval_len
                    continue

                valid_ol_reads_ctr = 0

                for cur_read in ol_reads:
                    #read must fully cover the window
                    if cur_read.ori_read.reference_end < (cur_end_pos - 1) or cur_read.ori_read.reference_start > cur_start_pos:
                        continue
                    valid_ol_reads_ctr += 1
                    cur_cigar_tuples = cur_read.ori_read.cigartuples
                    interval_cigar = get_interval_cigar(cur_cigar_tuples, cur_start_pos, cur_end_pos, cur_read, consume_ref)
                    read_interval_cigars.append(interval_cigar)

                bad_read_ctr = 0
                #coverage below threshold is written as a 0-score interval
                if len(read_interval_cigars) > min_depth:
                    for interval_cigar in read_interval_cigars:
                        #count reads whose mismatch fraction reaches the cut-off
                        if bad_base_prct(interval_cigar, interval_len) >= cut_off:
                            bad_read_ctr += 1
                    write_score(chr_name, cur_start_pos, cur_end_pos, 1 - round(bad_read_ctr/valid_ol_reads_ctr, 2), g)
                else:
                    write_score(chr_name, cur_start_pos, cur_end_pos, 0, g)

                cur_start_pos += interval_len
                cur_end_pos = cur_start_pos + interval_len

                #keep ol_reads across windows; only the per-window cigars reset
                read_interval_cigars = []
# new parse reads
def parse_reads_1(output_file_name, contig_list, interval_len, consume_ref, cut_off, samfile, min_depth, max_depth, safe_len):
    """Score fixed-size intervals by the fraction of covering reads that also extend
    safe_len beyond both window edges (clamped to the contig boundaries).

    Unlike parse_reads, no CIGAR inspection is done: a read is "good" purely by
    how far it extends past the window. Score is good/valid (rounded), or 0 when
    depth is 0, above max_depth, or valid coverage is not above min_depth.
    cut_off is accepted for signature compatibility but unused here.
    """
    with open(output_file_name, "w") as g:
        for contig in contig_list:
            chr_name = contig[0]
            print(chr_name)
            contig_len = contig[1]

            #interval is left-closed, starting from 0
            cur_start_pos = 0
            cur_end_pos = cur_start_pos + interval_len

            ol_reads = []

            #loop through the fetch iterator; its position is never reset
            read_iter = samfile.fetch(chr_name)

            while cur_end_pos < contig_len:
                #add reads to ol_reads; the last one added can be out of range
                if len(ol_reads) == 0:
                    try:
                        rec = next(read_iter)
                        ol_reads.append(indexed_read(rec))
                    except StopIteration:
                        break

                if ol_reads[-1].ori_read.reference_start < cur_end_pos:
                    for rec in read_iter:
                        ol_reads.append(indexed_read(rec))
                        # if not overlapping, stop adding
                        if ol_reads[-1].ori_read.reference_start >= cur_end_pos:
                            break

                #drop leading reads that end before the window starts
                while ol_reads[0].ori_read.reference_end < cur_start_pos:
                    ol_reads.pop(0)
                    if len(ol_reads) == 0:
                        break

                #skip if depth 0 or too large
                if len(ol_reads) == 0 or len(ol_reads) > max_depth:
                    write_score(chr_name, cur_start_pos, cur_end_pos, 0, g)
                    cur_start_pos += interval_len
                    cur_end_pos = cur_start_pos + interval_len
                    continue

                valid_ol_reads_ctr = 0
                good_reads_ctr = 0

                for cur_read in ol_reads:
                    #read must fully cover the window
                    if cur_read.ori_read.reference_end < (cur_end_pos - 1) or cur_read.ori_read.reference_start > cur_start_pos:
                        continue
                    valid_ol_reads_ctr += 1
                    #count reads extending the interval by at least safe_len as good
                    if cur_read.ori_read.reference_end >= min((cur_end_pos + safe_len), contig_len-1) and \
                       cur_read.ori_read.reference_start <= max((cur_start_pos - safe_len), 0):
                        good_reads_ctr += 1

                #coverage below threshold is written as a 0-score interval
                if valid_ol_reads_ctr > min_depth:
                    write_score(chr_name, cur_start_pos, cur_end_pos, round(good_reads_ctr/valid_ol_reads_ctr, 2), g)
                else:
                    write_score(chr_name, cur_start_pos, cur_end_pos, 0, g)

                cur_start_pos += interval_len
                cur_end_pos = cur_start_pos + interval_len
# new parse reads
def parse_reads_0831(output_file_name, contig_list, interval_len, consume_ref, cut_off, samfile, min_depth, max_depth, safe_len):
    """Write, per fixed-size interval, the raw count of reads that extend safe_len
    beyond both window edges (clamped to contig boundaries).

    Simplified variant of parse_reads_1: no depth cap, no ratio -- the score column
    is the good-read count itself (0 when no reads overlap the window).
    consume_ref / cut_off / min_depth / max_depth are accepted for signature
    compatibility but unused here.
    """
    with open(output_file_name, "w") as g:
        for contig in contig_list:
            contig_name = contig[0]
            print(contig_name)
            contig_len = contig[1]

            #interval is left-closed, starting from 0: eg: [0, 100)
            cur_start_pos = 0
            cur_end_pos = cur_start_pos + interval_len

            ol_reads = []

            #loop through the fetch iterator; its position is never reset
            read_iter = samfile.fetch(contig_name)

            while cur_end_pos < contig_len:
                #add reads to ol_reads; the last one added can be out of the current interval
                if len(ol_reads) == 0:
                    try:
                        rec = next(read_iter)
                        ol_reads.append(indexed_read(rec))
                    except StopIteration:
                        break

                if ol_reads[-1].ori_read.reference_start < cur_end_pos:
                    for rec in read_iter:
                        ol_reads.append(indexed_read(rec))
                        # if not overlapping, stop adding
                        if ol_reads[-1].ori_read.reference_start >= cur_end_pos:
                            break

                #drop leading reads that end before the window starts
                while ol_reads[0].ori_read.reference_end < cur_start_pos:
                    ol_reads.pop(0)
                    if len(ol_reads) == 0:
                        break

                #skip if depth 0
                if len(ol_reads) == 0:
                    write_score(contig_name, cur_start_pos, cur_end_pos, 0, g)
                    cur_start_pos += interval_len
                    cur_end_pos = cur_start_pos + interval_len
                    continue

                valid_ol_reads_ctr = 0
                good_reads_ctr = 0

                for cur_read in ol_reads:
                    #read has to cover the interval
                    if cur_read.ori_read.reference_end < (cur_end_pos - 1) or cur_read.ori_read.reference_start > cur_start_pos:
                        continue
                    valid_ol_reads_ctr += 1
                    #count reads extending the interval by at least safe_len as good
                    if cur_read.ori_read.reference_end >= min((cur_end_pos + safe_len), contig_len-1) and \
                       cur_read.ori_read.reference_start <= max((cur_start_pos - safe_len), 0):
                        good_reads_ctr += 1

                write_score(contig_name, cur_start_pos, cur_end_pos, good_reads_ctr, g)

                cur_start_pos += interval_len
                cur_end_pos = cur_start_pos + interval_len

def _score_haplotype(bam_file, fasta_file, output_file, interval_len, consume_ref, cut_off, min_depth, max_depth, safe_len):
    """Score one haplotype assembly: collect its contigs, then run parse_reads_0831."""
    samfile = pysam.AlignmentFile(bam_file, "rb")
    contig_list = []
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        contig_list.append([seq_record.id, len(seq_record)])
    #parse reads and get scores of assembly intervals and write results
    parse_reads_0831(output_file, contig_list, interval_len, consume_ref, cut_off, samfile, min_depth, max_depth, safe_len)

#main function
def main():
    """CLI entry point: score both haplotype assemblies from reads-to-assembly BAMs.

    argv: output_dir, hap1 bam, hap2 bam, centromere file (currently unused),
    hap1 fasta, hap2 fasta, hap1 output name, hap2 output name.
    """
    output_dir = sys.argv[1] + "/"
    #read to assembly bam files
    read2assem1_bamfile = sys.argv[2]
    read2assem2_bamfile = sys.argv[3]
    centromere_file = sys.argv[4]
    #assembly fasta files
    assem1_fasta = sys.argv[5]
    assem2_fasta = sys.argv[6]
    #output names
    output_name1 = sys.argv[7]
    output_name2 = sys.argv[8]

    interval_len = 100
    safe_len = 1000
    #if the mismatched-base fraction is below cut_off, the read is good in the current interval
    cut_off = 0.1
    #operation indices that consume reference
    consume_ref = [0,2,3,7,8]
    #operation indices that consume query (kept for reference; unused by parse_reads_0831)
    consume_query = [0,1,4,7,8]
    #min valid depth for an interval, below will have score 0
    min_depth = 15
    max_depth = 2000

    #hap1
    _score_haplotype(read2assem1_bamfile, assem1_fasta, output_dir + output_name1,
                     interval_len, consume_ref, cut_off, min_depth, max_depth, safe_len)

    #hap2
    _score_haplotype(read2assem2_bamfile, assem2_fasta, output_dir + output_name2,
                     interval_len, consume_ref, cut_off, min_depth, max_depth, safe_len)

if __name__ == "__main__":
    main()
for seq_record in SeqIO.parse(assem2_fasta, "fasta"): 476 | #print(seq_record.id) 477 | #print(repr(seq_record.seq)) 478 | #print(len(seq_record)) 479 | contig_list.append([seq_record.id, len(seq_record)]) 480 | #parse reads and get scores of assembly intervals and write results 481 | parse_reads_0831(output_file, contig_list, interval_len, consume_ref, cut_off, samfile, min_depth, max_depth, safe_len) 482 | 483 | if __name__ == "__main__": 484 | main() -------------------------------------------------------------------------------- /centromere_hg37.txt: -------------------------------------------------------------------------------- 1 | chr1 121500000 125000000 p11.1 acen 2 | chr1 125000000 128900000 q11 acen 3 | chr2 90500000 93300000 p11.1 acen 4 | chr2 93300000 96800000 q11.1 acen 5 | chr3 87900000 91000000 p11.1 acen 6 | chr3 91000000 93900000 q11.1 acen 7 | chr4 48200000 50400000 p11 acen 8 | chr4 50400000 52700000 q11 acen 9 | chr5 46100000 48400000 p11 acen 10 | chr5 48400000 50700000 q11.1 acen 11 | chr6 58700000 61000000 p11.1 acen 12 | chr6 61000000 63300000 q11.1 acen 13 | chr7 58000000 59900000 p11.1 acen 14 | chr7 59900000 61700000 q11.1 acen 15 | chr8 43100000 45600000 p11.1 acen 16 | chr8 45600000 48100000 q11.1 acen 17 | chr9 47300000 49000000 p11.1 acen 18 | chr9 49000000 50700000 q11 acen 19 | chrX 58100000 60600000 p11.1 acen 20 | chrX 60600000 63000000 q11.1 acen 21 | chrY 11600000 12500000 p11.1 acen 22 | chrY 12500000 13400000 q11.1 acen 23 | chr10 38000000 40200000 p11.1 acen 24 | chr10 40200000 42300000 q11.1 acen 25 | chr11 51600000 53700000 p11.11 acen 26 | chr11 53700000 55700000 q11 acen 27 | chr12 33300000 35800000 p11.1 acen 28 | chr12 35800000 38200000 q11 acen 29 | chr13 16300000 17900000 p11.1 acen 30 | chr13 17900000 19500000 q11 acen 31 | chr14 16100000 17600000 p11.1 acen 32 | chr14 17600000 19100000 q11.1 acen 33 | chr15 15800000 19000000 p11.1 acen 34 | chr15 19000000 20700000 q11.1 acen 35 | chr16 34600000 36600000 p11.1 acen 
36 | chr16 36600000 38600000 q11.1 acen 37 | chr17 22200000 24000000 p11.1 acen 38 | chr17 24000000 25800000 q11.1 acen 39 | chr18 15400000 17200000 p11.1 acen 40 | chr18 17200000 19000000 q11.1 acen 41 | chr19 24400000 26500000 p11 acen 42 | chr19 26500000 28600000 q11 acen 43 | chr20 25600000 27500000 p11.1 acen 44 | chr20 27500000 29400000 q11.1 acen 45 | chr21 10900000 13200000 p11.1 acen 46 | chr21 13200000 14300000 q11.1 acen 47 | chr22 12200000 14700000 p11.1 acen 48 | chr22 14700000 17900000 q11.1 acen -------------------------------------------------------------------------------- /centromere_hg38.txt: -------------------------------------------------------------------------------- 1 | chr1 121700000 123400000 p11.1 acen 2 | chr1 123400000 125100000 q11 acen 3 | chr2 91800000 93900000 p11.1 acen 4 | chr2 93900000 96000000 q11.1 acen 5 | chr3 87800000 90900000 p11.1 acen 6 | chr3 90900000 94000000 q11.1 acen 7 | chr4 48200000 50000000 p11 acen 8 | chr4 50000000 51800000 q11 acen 9 | chr5 46100000 48800000 p11 acen 10 | chr5 48800000 51400000 q11.1 acen 11 | chr6 58500000 59800000 p11.1 acen 12 | chr6 59800000 62600000 q11.1 acen 13 | chr7 58100000 60100000 p11.1 acen 14 | chr7 60100000 62100000 q11.1 acen 15 | chr8 43200000 45200000 p11.1 acen 16 | chr8 45200000 47200000 q11.1 acen 17 | chr9 42200000 43000000 p11.1 acen 18 | chr9 43000000 45500000 q11 acen 19 | chrX 58100000 61000000 p11.1 acen 20 | chrX 61000000 63800000 q11.1 acen 21 | chrY 10300000 10400000 p11.1 acen 22 | chrY 10400000 10600000 q11.1 acen 23 | chr10 38000000 39800000 p11.1 acen 24 | chr10 39800000 41600000 q11.1 acen 25 | chr11 51000000 53400000 p11.11 acen 26 | chr11 53400000 55800000 q11 acen 27 | chr12 33200000 35500000 p11.1 acen 28 | chr12 35500000 37800000 q11 acen 29 | chr13 16500000 17700000 p11.1 acen 30 | chr13 17700000 18900000 q11 acen 31 | chr14 16100000 17200000 p11.1 acen 32 | chr14 17200000 18200000 q11.1 acen 33 | chr15 17500000 19000000 p11.1 acen 34 | chr15 19000000 
20500000 q11.1 acen 35 | chr16 35300000 36800000 p11.1 acen 36 | chr16 36800000 38400000 q11.1 acen 37 | chr17 22700000 25100000 p11.1 acen 38 | chr17 25100000 27400000 q11.1 acen 39 | chr18 15400000 18500000 p11.1 acen 40 | chr18 18500000 21500000 q11.1 acen 41 | chr19 24200000 26200000 p11 acen 42 | chr19 26200000 28100000 q11 acen 43 | chr20 25700000 28100000 p11.1 acen 44 | chr20 28100000 30400000 q11.1 acen 45 | chr21 10900000 12000000 p11.1 acen 46 | chr21 12000000 13000000 q11.1 acen 47 | chr22 13700000 15000000 p11.1 acen 48 | chr22 15000000 17400000 q11.1 acen -------------------------------------------------------------------------------- /chrx.py: -------------------------------------------------------------------------------- 1 | ########################################################## 2 | ########################################################## 3 | #arguments 4 | 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser() 8 | 9 | parser.add_argument("output_dir", 10 | help="output directory") 11 | parser.add_argument("centromere_file", 12 | help="centromere file") 13 | parser.add_argument("assem1_non_cov_regions_file", 14 | help="Regions that are not covered on hap1") 15 | parser.add_argument("assem2_non_cov_regions_file", 16 | help="Regions that are not covered on hap2") 17 | parser.add_argument("vcf_file", 18 | help="input vcf file") 19 | parser.add_argument("ref_file", 20 | help="reference file") 21 | parser.add_argument("query_file1", 22 | help="assembly fasta file hap1") 23 | parser.add_argument("query_file2", 24 | help="assembly fasta file hap2") 25 | parser.add_argument("liftover_file1", 26 | help="liftover file hap1") 27 | parser.add_argument("liftover_file2", 28 | help="liftover file hap2") 29 | parser.add_argument("tandem_file", 30 | help="tandem repeats regions") 31 | # parser.add_argument("if_hg38_input", 32 | # help="if reference is hg38 or not") 33 | parser.add_argument("-n", 34 | "--not_hg38", 35 | help="if reference is NOT hg38 
parser.add_argument("-p",
                    "--passonly",
                    help="if consider PASS calls only",
                    action="store_true")
parser.add_argument("-s",
                    "--seq_resolved",
                    help="if consider sequence resolved calls (INS)",
                    action="store_true")
parser.add_argument("-w",
                    "--wrong_len",
                    help="if count wrong length calls as True",
                    action="store_true")
parser.add_argument("-g",
                    "--gt_vali",
                    help="conduct genotype validation",
                    action="store_true")
args = parser.parse_args()

import sys
import csv
import pysam
import numpy as np
import math

import get_conf_int
import validate
import get_align_info

##########################################################
##########################################################
#input
output_dir = args.output_dir + "/"
centromere_file = args.centromere_file
#regions not covered by each haplotype assembly
assem1_non_cov_regions_file = args.assem1_non_cov_regions_file
assem2_non_cov_regions_file = args.assem2_non_cov_regions_file
vcf_file = args.vcf_file
#ref fasta file
ref_file = args.ref_file
#assembly fasta files
query_file1 = args.query_file1
query_file2 = args.query_file2
liftover_file1 = args.liftover_file1
liftover_file2 = args.liftover_file2
tandem_file = args.tandem_file

##########################################################
##########################################################
#constants

#liftover interval
interval = 20
if_hg38 = not args.not_hg38
#if pass_only
if_pass_only = args.passonly
#if seq_resolved
seq_resolved = args.seq_resolved
#if include wrong length as TP
wrong_len = args.wrong_len
#chr names: this script validates chrX only
chr_list = ["chrX"] if if_hg38 else ["X"]
#approximate length of chromosomes
chr_len = [250000000, 244000000, 199000000, 192000000, 182000000,
           172000000, 160000000, 147000000, 142000000, 136000000,
           136000000, 134000000, 116000000, 108000000, 103000000,
           90400000, 83300000, 80400000, 59200000, 64500000,
           48200000, 51400000, 157000000, 59400000]

#max/min length of allowed SV not DUP
memory_limit = 100000
memory_min = 10
#max length of allowed DUP
dup_memory_limit = 50000
dup_memory_min = 10

#valid types
valid_types = ['DEL', 'INS', 'INV', 'DUP:TANDEM', 'DUP']

#tandem repeats regions file
with open(tandem_file) as f:
    reader = csv.reader(f, delimiter="\t")
    tandem_info = list(reader)

#get tandem start and end list
tandem_start_list, tandem_end_list = get_align_info.get_chr_tandem_shart_end_list(tandem_info, if_hg38)

##########################################################
##########################################################

#build lists for excluded SV positions

#Output regions on ref where its not covered by at least one of the assembly
# get_conf_int.get_non_cover_regions(output_dir, bam_file1, 1, chr_list)
# get_conf_int.get_non_cover_regions(output_dir, bam_file2, 2, chr_list)
chr_list) 153 | 154 | #Get regions where read depth > 2 * avg_read_depth 155 | #get_conf_int.get_high_depth_calls_info(output_dir, read_bam_file, vcf_file, avg_read_depth) 156 | 157 | #Output sv positions 158 | get_conf_int.get_sv_positions(output_dir, vcf_file) 159 | 160 | #Output filtered calls in non-covered regions 161 | SV_positions_file = output_dir + "SV_positions.bed" 162 | # assem1_non_cov_regions_file = output_dir + "assem1_non_cov_regions.bed" 163 | # assem2_non_cov_regions_file = output_dir + "assem2_non_cov_regions.bed" 164 | get_conf_int.output_non_cov_call_info(output_dir, SV_positions_file, assem1_non_cov_regions_file, assem2_non_cov_regions_file) 165 | 166 | #get filtered sv info, using results from get_conf_int.py 167 | exclude_assem1_non_cover, exclude_assem2_non_cover = validate.get_filtered_sv_pos(output_dir + "exclude_assem1_non_cover.bed", 168 | output_dir + "exclude_assem2_non_cover.bed") 169 | 170 | #build centromere dictionary 171 | dict_centromere = validate.build_centro_dict(centromere_file) 172 | 173 | 174 | 175 | #return False if not filtered 176 | #first_filter: type, PASS, chr_name 177 | def first_filter(sv, sv_type): 178 | #type filter 179 | if sv_type not in valid_types: 180 | return True 181 | #PASS filter 182 | if if_pass_only: 183 | if 'PASS' not in sv.filter.keys(): 184 | return True 185 | chr_name = sv.chrom 186 | #chr filter 187 | if chr_name not in chr_list: 188 | return True 189 | return False 190 | 191 | #second_filter: centromere, non-cov 192 | def second_filter(sv): 193 | index = sv.idx 194 | ref_name = sv.ref_name 195 | sv_pos = sv.sv_pos 196 | sv_stop = sv.sv_stop 197 | 198 | if if_hg38: 199 | centro_start = int(dict_centromere[ref_name][0]) 200 | centro_end = int(dict_centromere[ref_name][1]) 201 | else: 202 | centro_start = int(dict_centromere['chr'+ref_name][0]) 203 | centro_end = int(dict_centromere['chr'+ref_name][1]) 204 | 205 | #centromere 206 | if (sv_pos > centro_start and sv_pos < centro_end) or (sv_stop > 
centro_start and sv_stop < centro_end): 207 | sv.is_sec_fil = True 208 | return True 209 | 210 | #non-cov 211 | list_to_check = [str(ref_name), str(sv_pos), str(sv_stop)] 212 | #if sv in non-covered regions, skip 213 | if validate.check_exclude_chrx(list_to_check, exclude_assem1_non_cover, exclude_assem2_non_cover): 214 | sv.is_sec_fil = True 215 | return True 216 | 217 | #third_filter: size 218 | def third_filter(sv): 219 | #size 220 | if sv.sv_type not in ['DUP:TANDEM', 'DUP']: 221 | if abs(sv.length) < memory_min or abs(sv.length) > memory_limit: 222 | sv.is_third_fil = True 223 | return True 224 | else: 225 | if abs(sv.length) < dup_memory_min or abs(sv.length) > dup_memory_limit: 226 | sv.is_third_fil = True 227 | return True 228 | 229 | 230 | #get validation info 231 | def write_vali_info(sv_list): 232 | g = open(output_dir + "ttmars_chrx_res.txt", "w") 233 | for sv in sv_list: 234 | #skip if not analyzed 235 | if (not sv.analyzed_hap1) and (not sv.analyzed_hap2): 236 | continue 237 | 238 | res = sv.get_vali_res() 239 | 240 | g.write(str(sv.ref_name) + "\t") 241 | g.write(str(sv.sv_pos) + "\t") 242 | g.write(str(sv.sv_stop) + "\t") 243 | g.write(str(sv.sv_type) + "\t") 244 | g.write(str(res[1]) + "\t") 245 | g.write(str(res[2]) + "\t") 246 | g.write(str(res[0])) 247 | 248 | if args.gt_vali: 249 | g.write("\t" + str(res[3])) 250 | 251 | g.write("\n") 252 | g.close() 253 | 254 | #define class 255 | class struc_var: 256 | def __init__(self, idx, ref_name, sv_type, sv_pos, sv_stop, length, gt): 257 | self.idx = idx 258 | self.ref_name = ref_name 259 | self.sv_pos = sv_pos 260 | self.sv_stop = sv_stop 261 | self.sv_type = sv_type 262 | self.length = length 263 | self.gt = gt 264 | #if the call is part of an aggregate SV 265 | self.is_agg = False 266 | #if second filtered out 267 | self.is_sec_fil = False 268 | self.is_third_fil = False 269 | 270 | self.query_name_hap1 = "NA" 271 | self.query_name_hap2 = "NA" 272 | 273 | self.ref_start_best_hap1 = -1 274 | 
self.ref_end_best_hap1 = -1 275 | self.query_start_best_hap1 = -1 276 | self.query_end_best_hap1 = -1 277 | 278 | self.ref_start_best_hap2 = -1 279 | self.ref_end_best_hap2 = -1 280 | self.query_start_best_hap2 = -1 281 | self.query_end_best_hap2 = -1 282 | 283 | self.analyzed_hap1 = False 284 | self.analyzed_hap2 = False 285 | 286 | self.len_query_hap1 = -1 287 | self.len_ref_hap1 = -1 288 | self.len_query_hap2 = -1 289 | self.len_ref_hap2 = -1 290 | 291 | self.score_before_hap1 = -1 292 | self.score_after_hap1 = -1 293 | self.score_before_hap2 = -1 294 | self.score_after_hap2 = -1 295 | 296 | self.neg_strand_hap1 = False 297 | self.neg_strand_hap2 = False 298 | 299 | self.ins_seq = "" 300 | self.if_seq_resolved = False 301 | 302 | def check_tp(self, rela_len, rela_score): 303 | result = True 304 | if self.sv_type in ['DEL', 'DUP', 'DUP:TANDEM']: 305 | if rela_score >= 0 and rela_score <= 2.5: 306 | if rela_len >= -0.05*rela_score + 0.8 and rela_len <= 0.05*rela_score + 1.2: 307 | result = True 308 | else: 309 | result = False 310 | elif rela_score > 2.5: 311 | if rela_len >= 0.675 and rela_len <= 1.325: 312 | result = True 313 | else: 314 | result = False 315 | else: 316 | result = False 317 | elif self.sv_type == 'INS': 318 | #not seq-resolved 319 | #if len(self.ins_seq) == 0: 320 | if not self.if_seq_resolved: 321 | if rela_len < 0.675 or rela_len > 1.325: 322 | result = False 323 | #seq-resolved 324 | else: 325 | if rela_score >= 0 and rela_score <= 2.5: 326 | if rela_len >= -0.05*rela_score + 0.8 and rela_len <= 0.05*rela_score + 1.2: 327 | result = True 328 | else: 329 | result = False 330 | elif rela_score > 2.5: 331 | if rela_len >= 0.675 and rela_len <= 1.325: 332 | result = True 333 | else: 334 | result = False 335 | else: 336 | result = False 337 | 338 | elif self.sv_type == 'INV': 339 | if rela_score <= 0: 340 | result = False 341 | return result 342 | 343 | #TP when wrong length flag presents -- looser rules for TP 344 | def check_tp_wlen(self, 
rela_len, rela_score): 345 | result = True 346 | if self.sv_type in ['DEL', 'DUP', 'DUP:TANDEM']: 347 | if rela_score >= 0 and rela_score <= 2.5: 348 | if rela_len >= -0.05*rela_score + 0.6 and rela_len <= 0.05*rela_score + 1.4: 349 | result = True 350 | else: 351 | result = False 352 | elif rela_score > 2.5: 353 | if rela_len >= 0.475 and rela_len <= 1.525: 354 | result = True 355 | else: 356 | result = False 357 | else: 358 | result = False 359 | elif self.sv_type == 'INS': 360 | #not seq-resolved 361 | #if len(self.ins_seq) == 0: 362 | if not self.if_seq_resolved: 363 | if rela_len < 0.475 or rela_len > 1.525: 364 | result = False 365 | #seq-resolved 366 | else: 367 | if rela_score >= 0 and rela_score <= 2.5: 368 | if rela_len >= -0.05*rela_score + 0.6 and rela_len <= 0.05*rela_score + 1.4: 369 | result = True 370 | else: 371 | result = False 372 | elif rela_score > 2.5: 373 | if rela_len >= 0.475 and rela_len <= 1.525: 374 | result = True 375 | else: 376 | result = False 377 | else: 378 | result = False 379 | 380 | elif self.sv_type == 'INV': 381 | if rela_score <= 0: 382 | result = False 383 | return result 384 | 385 | def print_info(self): 386 | print(self.idx, self.ref_name, self.sv_pos, self.sv_stop, self.sv_type, self.length, self.gt, self.is_agg, self.is_sec_fil, self.is_third_fil) 387 | 388 | def cal_rela_score(self, score_before, score_after): 389 | if score_before > -1 and score_before < 0: 390 | tmp_score_before = -1 391 | tmp_score_after = score_after + (tmp_score_before - score_before) 392 | return round((tmp_score_after - tmp_score_before) / abs(tmp_score_before), 2) 393 | 394 | elif score_before >= 0 and score_before < 1: 395 | tmp_score_before = 1 396 | tmp_score_after = score_after + (tmp_score_before - score_before) 397 | return round((tmp_score_after - tmp_score_before) / abs(tmp_score_before), 2) 398 | 399 | else: 400 | return round((score_after - score_before) / abs(score_before), 2) 401 | 402 | def cal_rela_len(self, query_len, ref_len): 
403 | return round((query_len - ref_len) / self.length, 2) 404 | 405 | def get_vali_res(self): 406 | if (not self.analyzed_hap1) and (not self.analyzed_hap2): 407 | return -1 408 | 409 | elif self.analyzed_hap1 and self.analyzed_hap2: 410 | rela_len_1 = self.cal_rela_len(self.len_query_hap1, self.len_ref_hap1) 411 | rela_len_2 = self.cal_rela_len(self.len_query_hap2, self.len_ref_hap2) 412 | 413 | rela_score_1 = self.cal_rela_score(self.score_before_hap1, self.score_after_hap1) 414 | rela_score_2 = self.cal_rela_score(self.score_before_hap2, self.score_after_hap2) 415 | 416 | if not wrong_len: 417 | res_hap1 = self.check_tp(rela_len_1, rela_score_1) 418 | res_hap2 = self.check_tp(rela_len_2, rela_score_2) 419 | else: 420 | res_hap1 = self.check_tp_wlen(rela_len_1, rela_score_1) 421 | res_hap2 = self.check_tp_wlen(rela_len_2, rela_score_2) 422 | 423 | gt_validate = False 424 | if args.gt_vali: 425 | if res_hap1 and res_hap2: 426 | if self.gt == (1,1): 427 | gt_validate = True 428 | elif res_hap1 or res_hap2: 429 | if self.gt == (1,0) or self.gt == (0,1): 430 | gt_validate = True 431 | 432 | if res_hap1 and res_hap2: 433 | if abs(rela_len_1 - 1) <= abs(rela_len_2 - 1): 434 | return (res_hap1, rela_len_1, rela_score_1, gt_validate) 435 | else: 436 | return (res_hap2, rela_len_2, rela_score_2, gt_validate) 437 | elif res_hap1: 438 | return (res_hap1, rela_len_1, rela_score_1, gt_validate) 439 | elif res_hap2: 440 | return (res_hap2, rela_len_2, rela_score_2, gt_validate) 441 | else: 442 | if abs(rela_len_1 - 1) <= abs(rela_len_2 - 1): 443 | return (res_hap1, rela_len_1, rela_score_1, gt_validate) 444 | else: 445 | return (res_hap2, rela_len_2, rela_score_2, gt_validate) 446 | 447 | elif self.analyzed_hap1: 448 | rela_len_1 = self.cal_rela_len(self.len_query_hap1, self.len_ref_hap1) 449 | 450 | rela_score_1 = self.cal_rela_score(self.score_before_hap1, self.score_after_hap1) 451 | 452 | res_hap1 = self.check_tp(rela_len_1, rela_score_1) 453 | 454 | gt_validate = False 
455 | if args.gt_vali: 456 | if res_hap1: 457 | if self.gt == (1,0) or self.gt == (0,1): 458 | gt_validate = True 459 | 460 | return (res_hap1, rela_len_1, rela_score_1, gt_validate) 461 | elif self.analyzed_hap2: 462 | rela_len_2 = self.cal_rela_len(self.len_query_hap2, self.len_ref_hap2) 463 | 464 | rela_score_2 = self.cal_rela_score(self.score_before_hap2, self.score_after_hap2) 465 | 466 | res_hap2 = self.check_tp(rela_len_2, rela_score_2) 467 | 468 | gt_validate = False 469 | if args.gt_vali: 470 | if res_hap2: 471 | if self.gt == (1,0) or self.gt == (0,1): 472 | gt_validate = True 473 | 474 | return (res_hap2, rela_len_2, rela_score_2, gt_validate) 475 | 476 | class alignment: 477 | def __init__(self, idx, agt_rec, hap, query_length): 478 | self.idx = idx 479 | self.ref_name = 'NA' 480 | self.ref_start = -1 481 | self.ref_end = -1 482 | self.contig_name = agt_rec.reference_name 483 | self.contig_start = agt_rec.reference_start 484 | self.contig_end = agt_rec.reference_end 485 | self.query_name = agt_rec.query_name 486 | #This the index of the first base in seq that is not soft-clipped 487 | self.query_start = agt_rec.query_alignment_start 488 | self.query_end = agt_rec.query_alignment_end 489 | #the index of the last base in seq that is not soft-clipped - the index of the first base in seq that is not soft-clipped 490 | self.aligned_length = agt_rec.query_alignment_length 491 | 492 | #use query length from the fasta file instead!!! 
493 | self.query_length = query_length 494 | self.hap = hap 495 | 496 | def cal_aligned_portion(self): 497 | return self.aligned_length/self.query_length 498 | 499 | def cal_ins_portion(self): 500 | return 1 - (self.ref_end - self.ref_start)/self.aligned_length 501 | 502 | def set_ref_info(self, ref_name, ref_start, ref_end): 503 | self.ref_name = ref_name 504 | self.ref_start = ref_start 505 | self.ref_end = ref_end 506 | 507 | def print_info(self): 508 | print(self.idx, self.ref_name, self.ref_start, self.ref_end, self.contig_name, self.contig_start, self.contig_end, self.query_name, self.query_start, self.query_end,\ 509 | self.aligned_length, self.query_length, self.hap) 510 | 511 | #main function 512 | def main(): 513 | 514 | #get validation info files 515 | 516 | #build map and get validation info on hap1 517 | contig_name_list_1, contig_pos_list_1, contig_name_dict_1 = get_align_info.build_map_compress(chr_len, interval, liftover_file1, if_hg38) 518 | #build map and get validation info on hap2 519 | contig_name_list_2, contig_pos_list_2, contig_name_dict_2 = get_align_info.build_map_compress(chr_len, interval, liftover_file2, if_hg38) 520 | 521 | #index SVs 522 | f = pysam.VariantFile(vcf_file,'r') 523 | sv_list = [] 524 | for count, rec in enumerate(f.fetch()): 525 | #get sv_type 526 | try: 527 | sv_type = rec.info['SVTYPE'] 528 | except: 529 | print("invalid sv type info") 530 | continue 531 | 532 | if first_filter(rec, sv_type): 533 | continue 534 | 535 | #get sv length 536 | if sv_type == 'INV': 537 | sv_len = abs(rec.stop - rec.pos + 1) 538 | else: 539 | try: 540 | sv_len = rec.info['SVLEN'][0] 541 | except: 542 | try: 543 | sv_len = rec.info['SVLEN'] 544 | except: 545 | sv_len = abs(rec.stop - rec.pos + 1) 546 | #print("invalid sv length info") 547 | # try: 548 | # sv_len = rec.info['SVLEN'][0] 549 | # except: 550 | # sv_len = rec.info['SVLEN'] 551 | #handle del length > 0: 552 | if sv_type == 'DEL': 553 | sv_len = -abs(sv_len) 554 | 555 | if 
abs(sv_len) < memory_min: 556 | continue 557 | 558 | #get gt 559 | #only taking the first sample genotype 560 | if args.gt_vali: 561 | sv_gt = rec.samples[0]["GT"] 562 | #bad genotype 563 | if sv_gt not in [(1, 1), (1, 0), (0, 1)]: 564 | sv_gt = None 565 | else: 566 | sv_gt = None 567 | 568 | sv_list.append(struc_var(count, rec.chrom, sv_type, rec.pos, rec.stop, sv_len, sv_gt)) 569 | 570 | #add ins seq for seq-resolved insertion 571 | #no multi-allelic considered 572 | if (sv_type == 'INS') and seq_resolved: 573 | sv_list[len(sv_list)-1].ins_seq = rec.alts[0] 574 | sv_list[len(sv_list)-1].if_seq_resolved = True 575 | 576 | f.close() 577 | #index sv: second_filter: centromere, non-cov 578 | #third_filter: size 579 | 580 | for sv in sv_list: 581 | second_filter(sv) 582 | third_filter(sv) 583 | 584 | get_align_info.get_vali_info(output_dir, vcf_file, query_file1, 1, ref_file, interval, 585 | contig_name_list_1, contig_pos_list_1, contig_name_dict_1, memory_limit, if_hg38, chr_list, 586 | tandem_start_list, tandem_end_list, tandem_info, sv_list, seq_resolved) 587 | 588 | get_align_info.get_vali_info(output_dir, vcf_file, query_file2, 2, ref_file, interval, 589 | contig_name_list_2, contig_pos_list_2, contig_name_dict_2, memory_limit, if_hg38, chr_list, 590 | tandem_start_list, tandem_end_list, tandem_info, sv_list, seq_resolved) 591 | 592 | #get validation info 593 | write_vali_info(sv_list) 594 | 595 | if __name__ == "__main__": 596 | main() 597 | 598 | -------------------------------------------------------------------------------- /combine.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | 4 | import argparse 5 | 6 | import fn 7 | import ttmars 8 | 9 | # parser = argparse.ArgumentParser() 10 | # parser.add_argument("output_dir", 11 | # help="output directory") 12 | # parser.add_argument("no_X_chr", 13 | # choices=[1, 2], 14 | # help="male sample 1, female sample 2", 15 | # type=int) 16 | # 
# configuration comes from the ttmars module, which parses the CLI
# (the commented-out standalone argparse interface was removed)
output_dir = ttmars.output_dir
if_male = ttmars.if_male
if_vcf = ttmars.if_vcf
in_vcf_file = ttmars.vcf_file
output_fn = ttmars.output_fn
if output_fn:
    in_truth_file = ttmars.in_truth_file
if_gt = ttmars.if_gt


def _read_res(path):
    """Read a tab-separated TT-Mars result file into a list of rows."""
    with open(path) as fh:
        return list(csv.reader(fh, delimiter="\t"))


def _make_entry(rec):
    """Build a sv_dict value from one result row.

    Layout: [rela_len, rela_score, result, chr, start, end, type(, gt)].
    """
    entry = [rec[5], rec[6], rec[7], rec[1], int(rec[2]), int(rec[3]), rec[4]]
    if if_gt:
        entry.append(rec[8])
    return entry


def _merge_res(res_rows, sv_dict, keep_unmatched_fp):
    """Merge rows of one result file into sv_dict (keyed by SV index).

    A row replaces an existing entry only when it upgrades the
    validation result from False to True (and, with -g, when it also
    upgrades the genotype validation).  keep_unmatched_fp controls
    whether rows for previously unseen SVs are kept even when their
    result is False (True for regdup/chrx results, False for agg).
    """
    for rec in res_rows:
        if len(rec) == 0:
            continue
        sv_idx = rec[0]
        if sv_idx in sv_dict:
            prev = sv_dict[sv_idx]
            if rec[7] == 'True' and prev[2] == 'False':
                sv_dict[sv_idx] = _make_entry(rec)
            elif if_gt and rec[7] == 'True' and prev[2] == 'True':
                # NOTE(review): prev[3] is the reference name, which can
                # never equal 'False' -- condition kept as in the
                # original; the intended index was probably 7 (the GT
                # validation flag).  Confirm before changing.
                if rec[8] == 'True' and prev[3] == 'False':
                    sv_dict[sv_idx] = _make_entry(rec)
        else:
            if keep_unmatched_fp or rec[7] == 'True':
                sv_dict[sv_idx] = _make_entry(rec)


def combine_output():
    """Combine per-category TT-Mars results into one combined report,
    optionally splitting the template VCF into TP/FP/NA files."""
    other_sv_res = _read_res(output_dir + "ttmars_res.txt")

    # the regdup / agg result files only exist for some runs
    try:
        regdup_res = _read_res(output_dir + "ttmars_regdup_res.txt")
    except OSError:
        regdup_res = None
    try:
        agg_res = _read_res(output_dir + "ttmars_agg_res.txt")
    except OSError:
        agg_res = None

    # optionally report recall against a truth set
    if output_fn:
        sv_list_sorted = fn.sort_sv_list(fn.idx_sv(in_truth_file))
        cand_sv_list_sorted = fn.sort_sv_list(fn.idx_sv(in_vcf_file))
        tp_base_ctr = fn.count_tp_base_dist_only(sv_list_sorted, cand_sv_list_sorted)
        recall = tp_base_ctr / len(sv_list_sorted)
        print("Recall of candidate callset: " + str(recall))

    # results of SVs other than interspersed DUP seed the dictionary
    sv_dict = {}
    for rec in other_sv_res:
        if len(rec) == 0:
            continue
        sv_dict[rec[0]] = _make_entry(rec)

    # interspersed DUP results
    if regdup_res is not None:
        _merge_res(regdup_res, sv_dict, keep_unmatched_fp=True)
    # aggregate-SV results (unmatched rows kept only when validated)
    if agg_res is not None:
        _merge_res(agg_res, sv_dict, keep_unmatched_fp=False)
    # chr X results exist for male samples only
    if if_male:
        chrx_res = _read_res(output_dir + "ttmars_chrx_res.txt")
        _merge_res(chrx_res, sv_dict, keep_unmatched_fp=True)

    with open(output_dir + "/ttmars_combined_res.txt", "w") as g:
        for key, res in sv_dict.items():
            fields = [str(key)] + [str(x) for x in res[:7]]
            if if_gt:
                fields.append(str(res[7]))
            g.write("\t".join(fields) + "\n")

    if if_vcf:
        _write_vcf_outputs(sv_dict)


def _write_vcf_outputs(sv_dict):
    """Split the template VCF into TP / FP / NA files by combined result."""
    from pysam import VariantFile
    vcf_in = VariantFile(in_vcf_file)
    vcfh = vcf_in.header
    vcfh.add_meta('INFO', items=[('ID', "GT_vali"), ('Number', 1), ('Type', 'String'),
                                 ('Description', 'TT-Mars GT validation (require flag -g): True, False or NA')])
    vcf_out_tp = VariantFile(output_dir + "ttmars_tp.vcf", 'w', header=vcfh)
    vcf_out_fp = VariantFile(output_dir + "ttmars_fp.vcf", 'w', header=vcfh)
    vcf_out_na = VariantFile(output_dir + "ttmars_na.vcf", 'w', header=vcfh)

    # records are keyed by their 0-based position in the template VCF,
    # matching the indexing used when the results were produced
    for count, rec in enumerate(vcf_in.fetch()):
        key = str(count)
        try:
            validation_res = sv_dict[key][2]
            if validation_res == 'True':
                if if_gt:
                    rec.info['GT_vali'] = sv_dict[key][7]
                vcf_out_tp.write(rec)
            elif validation_res == 'False':
                vcf_out_fp.write(rec)
            # any other result string is silently dropped, as before
        except (KeyError, IndexError):
            vcf_out_na.write(rec)

    vcf_out_tp.close()
    vcf_out_fp.close()
    vcf_out_na.close()
    vcf_in.close()


def remove_files():
    """Delete intermediate files produced during a TT-Mars run."""
    import os
    for name in ['assem1_non_cov_regions.bed', 'assem2_non_cov_regions.bed',
                 'exclude_assem1_non_cover.bed', 'exclude_assem2_non_cover.bed',
                 'SV_positions.bed', 'ttmars_chrx_res.txt', 'ttmars_regdup_res.txt',
                 'ttmars_res.txt', 'align_info_assem1_chrall.txt', 'align_info_assem2_chrall.txt',
                 'all_reg_dup.fasta', 'all_reg_dup.fasta.fai']:
        path = output_dir + name
        if os.path.exists(path):
            os.remove(path)
/compress_liftover.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | interval = 20 4 | 5 | output_dir = sys.argv[1] + "/" 6 | in_file_name = output_dir + sys.argv[2] 7 | out_file_name = output_dir + sys.argv[3] 8 | 9 | #consider reverse strand also 10 | with open(in_file_name, "r") as file: 11 | reader = csv.reader(file, delimiter="\t") 12 | start_row = next(reader) 13 | chr_name = start_row[4] 14 | contig_name = start_row[0] 15 | chr_pos = int(start_row[5]) 16 | contig_pos = int(start_row[1]) 17 | pre_chr_pos = chr_pos 18 | pre_contig_pos = contig_pos 19 | ctr = 1 20 | forward_strand = True 21 | out_file = open(out_file_name, "w") 22 | out_file.write(chr_name + "\t" + contig_name + "\t") 23 | out_file.write(str(chr_pos)+":"+str(contig_pos)+":") 24 | 25 | for row in reader: 26 | if (row[4] == chr_name) and (row[0] == contig_name) and (int(row[5]) - pre_chr_pos == interval) and \ 27 | (int(row[1]) - pre_contig_pos == interval) and forward_strand: 28 | pre_chr_pos = int(row[5]) 29 | pre_contig_pos = int(row[1]) 30 | ctr += 1 31 | elif (row[4] == chr_name) and (row[0] == contig_name) and (int(row[5]) - pre_chr_pos == interval) and \ 32 | (int(row[1]) - pre_contig_pos == -interval) and not forward_strand: 33 | pre_chr_pos = int(row[5]) 34 | pre_contig_pos = int(row[1]) 35 | ctr += 1 36 | else: 37 | #if different chr or contig 38 | if row[4] != chr_name or row[0] != contig_name: 39 | #print(chr_name, contig_name) 40 | out_file.write(str(ctr)+":") 41 | if forward_strand: 42 | out_file.write("+;") 43 | else: 44 | out_file.write("-;") 45 | out_file.write("\n") 46 | chr_name = row[4] 47 | contig_name = row[0] 48 | chr_pos = int(row[5]) 49 | contig_pos = int(row[1]) 50 | pre_chr_pos = chr_pos 51 | pre_contig_pos = contig_pos 52 | ctr = 1 53 | forward_strand = True 54 | out_file.write(chr_name + "\t" + contig_name + "\t") 55 | out_file.write(str(chr_pos)+":"+str(contig_pos)+":") 56 | elif int(row[5]) - 
pre_chr_pos != interval: 57 | out_file.write(str(ctr)+":") 58 | if forward_strand: 59 | out_file.write("+;") 60 | else: 61 | out_file.write("-;") 62 | chr_pos = int(row[5]) 63 | contig_pos = int(row[1]) 64 | pre_chr_pos = chr_pos 65 | pre_contig_pos = contig_pos 66 | out_file.write(str(chr_pos)+":"+str(contig_pos)+":") 67 | ctr = 1 68 | forward_strand = True 69 | elif (int(row[1]) - pre_contig_pos != interval) and (int(row[1]) - pre_contig_pos != -interval): 70 | out_file.write(str(ctr)+":") 71 | if forward_strand: 72 | out_file.write("+;") 73 | else: 74 | out_file.write("-;") 75 | chr_pos = int(row[5]) 76 | contig_pos = int(row[1]) 77 | pre_chr_pos = chr_pos 78 | pre_contig_pos = contig_pos 79 | out_file.write(str(chr_pos)+":"+str(contig_pos)+":") 80 | ctr = 1 81 | forward_strand = True 82 | elif (int(row[1]) - pre_contig_pos == interval) and not forward_strand: 83 | out_file.write(str(ctr)+":") 84 | out_file.write("-;") 85 | chr_pos = int(row[5]) 86 | contig_pos = int(row[1]) 87 | pre_chr_pos = chr_pos 88 | pre_contig_pos = contig_pos 89 | out_file.write(str(chr_pos)+":"+str(contig_pos)+":") 90 | ctr = 1 91 | forward_strand = True 92 | elif (int(row[1]) - pre_contig_pos == -interval) and forward_strand: 93 | out_file.write(str(ctr)+":") 94 | out_file.write("+;") 95 | chr_pos = int(row[5]) 96 | contig_pos = int(row[1]) 97 | pre_chr_pos = chr_pos 98 | pre_contig_pos = contig_pos 99 | out_file.write(str(chr_pos)+":"+str(contig_pos)+":") 100 | ctr = 1 101 | forward_strand = False 102 | #write last lines 103 | out_file.close() -------------------------------------------------------------------------------- /download_asm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sample=HG00096_asm 4 | mkdir ttmars_files/$sample 5 | wget -O ttmars_files/$sample/h1.fa 
http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00096_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta
wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00096_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta.fai
wget -O ttmars_files/$sample/h2.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00096_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta
# BUG FIX (all sample blocks below): the haplotype-2 index was saved as
# h1.fa.fai, overwriting the haplotype-1 index and leaving h2.fa
# without its .fai; it is now saved as h2.fa.fai
wget -O ttmars_files/$sample/h2.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00096_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta.fai


sample=HG00171_asm
mkdir ttmars_files/$sample
wget -O ttmars_files/$sample/h1.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00171_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta
wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00171_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta.fai
wget -O ttmars_files/$sample/h2.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00171_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta
wget -O ttmars_files/$sample/h2.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00171_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta.fai

sample=HG00513_asm
mkdir ttmars_files/$sample
wget -O ttmars_files/$sample/h1.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00513_hgsvc_pbsq2-ccs_1000-pereg.h1-un.racon-p2.fasta
wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00513_hgsvc_pbsq2-ccs_1000-pereg.h1-un.racon-p2.fasta.fai
wget -O ttmars_files/$sample/h2.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00513_hgsvc_pbsq2-ccs_1000-pereg.h2-un.racon-p2.fasta
wget -O ttmars_files/$sample/h2.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00513_hgsvc_pbsq2-ccs_1000-pereg.h2-un.racon-p2.fasta.fai

sample=HG00731_asm
mkdir ttmars_files/$sample
wget -O ttmars_files/$sample/h1.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00731_hgsvc_pbsq2-ccs_1000-pereg.h1-un.racon-p2.fasta
wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00731_hgsvc_pbsq2-ccs_1000-pereg.h1-un.racon-p2.fasta.fai
wget -O ttmars_files/$sample/h2.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00731_hgsvc_pbsq2-ccs_1000-pereg.h2-un.racon-p2.fasta
wget -O ttmars_files/$sample/h2.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00731_hgsvc_pbsq2-ccs_1000-pereg.h2-un.racon-p2.fasta.fai

sample=HG00732_asm
mkdir ttmars_files/$sample
wget -O ttmars_files/$sample/h1.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00732_hgsvc_pbsq2-ccs_1000-pereg.h1-un.racon-p2.fasta
wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00732_hgsvc_pbsq2-ccs_1000-pereg.h1-un.racon-p2.fasta.fai
wget -O ttmars_files/$sample/h2.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00732_hgsvc_pbsq2-ccs_1000-pereg.h2-un.racon-p2.fasta
wget -O ttmars_files/$sample/h2.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200628_HHU_assembly-results_CCS_v12/assemblies/phased/v12_HG00732_hgsvc_pbsq2-ccs_1000-pereg.h2-un.racon-p2.fasta.fai

sample=HG00864_asm
mkdir ttmars_files/$sample
wget -O ttmars_files/$sample/h1.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00864_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta
wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00864_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta.fai
wget -O ttmars_files/$sample/h2.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00864_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta
wget -O ttmars_files/$sample/h2.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG00864_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta.fai

sample=HG01114_asm
mkdir ttmars_files/$sample
wget -O ttmars_files/$sample/h1.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01114_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta
wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01114_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta.fai
wget -O ttmars_files/$sample/h2.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01114_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta
wget -O ttmars_files/$sample/h2.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01114_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta.fai

sample=HG01505_asm
mkdir ttmars_files/$sample
wget -O ttmars_files/$sample/h1.fa
http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01505_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta 56 | wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01505_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta.fai 57 | wget -O ttmars_files/$sample/h2.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01505_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta 58 | wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01505_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta.fai 59 | 60 | sample=HG01596_asm 61 | mkdir ttmars_files/$sample 62 | wget -O ttmars_files/$sample/h1.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01596_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta 63 | wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01596_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta.fai 64 | wget -O ttmars_files/$sample/h2.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01596_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta 65 | wget -O ttmars_files/$sample/h1.fa.fai 
http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200612_HHU_assembly-results_CLR_v12/assemblies/phased/v12_HG01596_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta.fai 66 | 67 | sample=HG03009_asm 68 | mkdir ttmars_files/$sample 69 | wget -O ttmars_files/$sample/h1.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200828_JAX_assembly-results_CLR_v12/assemblies/phased/v12_HG03009_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta 70 | wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200828_JAX_assembly-results_CLR_v12/assemblies/phased/v12_HG03009_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1.fasta.fai 71 | wget -O ttmars_files/$sample/h2.fa http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200828_JAX_assembly-results_CLR_v12/assemblies/phased/v12_HG03009_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta 72 | wget -O ttmars_files/$sample/h1.fa.fai http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v1.0/assemblies/20200828_JAX_assembly-results_CLR_v12/assemblies/phased/v12_HG03009_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1.fasta.fai -------------------------------------------------------------------------------- /download_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir ttmars_files 4 | 5 | sample=HG00096 6 | mkdir ttmars_files/$sample 7 | wget -O ttmars_files/$sample/lo_pos_assem1_result_compressed.bed https://figshare.com/ndownloader/files/30817390 8 | wget -O ttmars_files/$sample/lo_pos_assem2_result_compressed.bed https://figshare.com/ndownloader/files/30817384 9 | wget -O ttmars_files/$sample/lo_pos_assem1_0_result_compressed.bed https://figshare.com/ndownloader/files/30817387 10 | wget -O ttmars_files/$sample/lo_pos_assem2_0_result_compressed.bed 
https://figshare.com/ndownloader/files/30817381
wget -O ttmars_files/$sample/assem1_non_cov_regions.bed https://figshare.com/ndownloader/files/30850246
wget -O ttmars_files/$sample/assem2_non_cov_regions.bed https://figshare.com/ndownloader/files/30850249

# fetch_lo SAMPLE ID1 ID2 ID3 ID4 ID5 ID6
# Downloads the four compressed liftover beds and the two non-covered-region
# beds for SAMPLE from figshare (IDs in the same order the original stanzas
# listed them).
fetch_lo () {
    mkdir ttmars_files/$1
    wget -O ttmars_files/$1/lo_pos_assem1_result_compressed.bed https://figshare.com/ndownloader/files/$2
    wget -O ttmars_files/$1/lo_pos_assem2_result_compressed.bed https://figshare.com/ndownloader/files/$3
    wget -O ttmars_files/$1/lo_pos_assem1_0_result_compressed.bed https://figshare.com/ndownloader/files/$4
    wget -O ttmars_files/$1/lo_pos_assem2_0_result_compressed.bed https://figshare.com/ndownloader/files/$5
    wget -O ttmars_files/$1/assem1_non_cov_regions.bed https://figshare.com/ndownloader/files/$6
    wget -O ttmars_files/$1/assem2_non_cov_regions.bed https://figshare.com/ndownloader/files/$7
}

fetch_lo HG00171 30817402 30817396 30817399 30817393 30850258 30850261
fetch_lo HG00513 30817411 30817405 30817408 30817414 30850639 30850642
fetch_lo HG00731 30817426 30817420 30817423 30817417 30850663 30850660
fetch_lo HG00732 30817435 30817429 30817432 30817438 30850687 30850681
fetch_lo HG00864 30817450 30817444 30817447 30817441 30850708 30850711
fetch_lo HG01114 30817459 30817453 30817456 30817462 30850726 30850729
fetch_lo HG01505 30817471 30817465 30817468 30817474 30850747 30850744

sample=HG01596
mkdir ttmars_files/$sample
wget -O ttmars_files/$sample/lo_pos_assem1_result_compressed.bed https://figshare.com/ndownloader/files/30817486
wget -O ttmars_files/$sample/lo_pos_assem2_result_compressed.bed https://figshare.com/ndownloader/files/30817480
wget -O ttmars_files/$sample/lo_pos_assem1_0_result_compressed.bed \
https://figshare.com/ndownloader/files/30817483 83 | wget -O ttmars_files/$sample/lo_pos_assem2_0_result_compressed.bed https://figshare.com/ndownloader/files/30817477 84 | wget -O ttmars_files/$sample/assem1_non_cov_regions.bed https://figshare.com/ndownloader/files/30850768 85 | wget -O ttmars_files/$sample/assem2_non_cov_regions.bed https://figshare.com/ndownloader/files/30850762 86 | 87 | sample=HG03009 88 | mkdir ttmars_files/$sample 89 | wget -O ttmars_files/$sample/lo_pos_assem1_result_compressed.bed https://figshare.com/ndownloader/files/30817498 90 | wget -O ttmars_files/$sample/lo_pos_assem2_result_compressed.bed https://figshare.com/ndownloader/files/30817492 91 | wget -O ttmars_files/$sample/lo_pos_assem1_0_result_compressed.bed https://figshare.com/ndownloader/files/30817495 92 | wget -O ttmars_files/$sample/lo_pos_assem2_0_result_compressed.bed https://figshare.com/ndownloader/files/30817489 93 | wget -O ttmars_files/$sample/assem1_non_cov_regions.bed https://figshare.com/ndownloader/files/30850777 94 | wget -O ttmars_files/$sample/assem2_non_cov_regions.bed https://figshare.com/ndownloader/files/30850780 -------------------------------------------------------------------------------- /filter_truth_set.py: -------------------------------------------------------------------------------- 1 | from Bio import SeqIO 2 | import csv 3 | import math 4 | 5 | from pysam import VariantFile 6 | 7 | from Bio.Seq import Seq 8 | from Bio import pairwise2 9 | import sys 10 | 11 | def check_ol_coor(sv_coor_list, trimmed_coor): 12 | name = sv_coor_list[0] 13 | cur_start = sv_coor_list[1] 14 | cur_end = sv_coor_list[2] 15 | for rec in trimmed_coor: 16 | if str(name) == str(rec[0]): 17 | trim_start = int(rec[1]) 18 | trim_end = int(rec[2]) 19 | cur_start = int(cur_start) 20 | cur_end = int(cur_end) 21 | 22 | if cur_start <= trim_end and trim_start <= cur_start: 23 | return True 24 | return False 25 | 26 | def filter_dipcall_for_fn_trimcoor(bcf_in_file, asm1_trimmed_file, 
asm2_trimmed_file, bcf_out_file): 27 | # print(sample_name) 28 | 29 | del_ctr = 0 30 | ins_ctr = 0 31 | 32 | bcf_in = VariantFile(bcf_in_file) 33 | 34 | ############################################## 35 | ############################################## 36 | #trim coordinate files 37 | with open(asm1_trimmed_file) as f: 38 | reader = csv.reader(f, delimiter="\t") 39 | asm1_trimmed_coor = list(reader) 40 | f.close() 41 | with open(asm2_trimmed_file) as f: 42 | reader = csv.reader(f, delimiter="\t") 43 | asm2_trimmed_coor = list(reader) 44 | f.close() 45 | ############################################## 46 | ############################################## 47 | 48 | vcfh = bcf_in.header 49 | bcf_out = VariantFile(bcf_out_file, 'w', header=vcfh) 50 | 51 | #g = open("./HG00514/HG00514_filtered.vcf", "a") 52 | for counter, rec in enumerate(bcf_in.fetch()): 53 | name = rec.chrom 54 | 55 | cur_start = rec.pos 56 | cur_end = rec.stop 57 | cur_length = 1 58 | 59 | ############################################## 60 | ############################################## 61 | #filter dipcall sv if its overlapping with trimmed coordinates 62 | if check_ol_coor([name, cur_start, cur_end], asm1_trimmed_coor): 63 | continue 64 | if check_ol_coor([name, cur_start, cur_end], asm2_trimmed_coor): 65 | continue 66 | ############################################## 67 | ############################################## 68 | 69 | bcf_out.write(rec) 70 | 71 | # if counter > 200: 72 | # break 73 | #bcf_out.write(new_rec) 74 | 75 | bcf_out.close() 76 | bcf_in.close() 77 | 78 | # print(del_ctr, ins_ctr) 79 | 80 | def main(): 81 | #hg38 samples 82 | sample_names = ["HG00096", "HG01505", "HG01596", "HG03009", "HG00731", "HG00171", "HG00864", "HG01114", "HG00513", "HG00732"] 83 | chr_list = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7', 84 | 'chr8','chr9','chr10','chr11','chr12','chr13','chr14', 85 | 'chr15','chr16','chr17','chr18','chr19','chr20','chr21', 86 | 'chr22','chrX'] 87 | 88 | truth_sv_size_lb = 
50 89 | 90 | for sample_name in sample_names: 91 | bcf_in_file = sample_name + "_dip_sv_lenfilter_chrfilter_pass.vcf" 92 | 93 | asm1_trimmed_file = "assem1_sort_trimmed_coor.bed" 94 | asm2_trimmed_file = "assem2_sort_trimmed_coor.bed" 95 | 96 | # filter_dipcall_for_fn(sample_name) 97 | filter_dipcall_for_fn_trimcoor(bcf_in_file, asm1_trimmed_file, asm2_trimmed_file, bcf_out_file) 98 | 99 | if __name__ == "__main__": 100 | main() -------------------------------------------------------------------------------- /fn.py: -------------------------------------------------------------------------------- 1 | from pysam import VariantFile 2 | 3 | #CONST 4 | ########################################################### 5 | #fn const 6 | max_dist_to_merge = 1500 7 | max_dist_search = 1000 8 | ratio_size_lb = 0.7 9 | 10 | ########################################################### 11 | #ttmars const 12 | chr_list = ["chr1", "chr2", "chr3", "chr4", "chr5", 13 | "chr6", "chr7", "chr8", "chr9", "chr10", 14 | "chr11", "chr12", "chr13", "chr14", "chr15", 15 | "chr16", "chr17", "chr18", "chr19", "chr20", 16 | "chr21", "chr22", "chrX"] 17 | 18 | memory_limit = 100000 19 | memory_min = 10 20 | 21 | #valid types 22 | valid_types = ['DEL', 'INS', 'INV', 'DUP:TANDEM', 'DUP'] 23 | #pass_only 24 | if_pass_only = True 25 | wrong_len = False 26 | gt_vali = False 27 | if_hg38 = True 28 | 29 | ########################################################### 30 | 31 | #first_filter: type, PASS, chr_name 32 | def first_filter(sv, sv_type): 33 | #type filter 34 | if sv_type not in valid_types: 35 | return True 36 | #PASS filter 37 | if if_pass_only: 38 | if 'PASS' not in sv.filter.keys(): 39 | return True 40 | chr_name = sv.chrom 41 | #chr filter 42 | if chr_name not in chr_list: 43 | return True 44 | return False 45 | 46 | 47 | #define class 48 | class struc_var: 49 | def __init__(self, idx, ref_name, sv_type, sv_pos, sv_stop, length, gt): 50 | self.idx = idx 51 | self.ref_name = ref_name 52 | 
self.sv_pos = sv_pos 53 | self.sv_stop = sv_stop 54 | self.sv_type = sv_type 55 | self.length = length 56 | self.gt = gt 57 | #if the call is part of an aggregate SV 58 | self.is_agg = False 59 | #if second filtered out 60 | self.is_sec_fil = False 61 | self.is_third_fil = False 62 | 63 | self.query_name_hap1 = "NA" 64 | self.query_name_hap2 = "NA" 65 | 66 | self.ref_start_best_hap1 = -1 67 | self.ref_end_best_hap1 = -1 68 | self.query_start_best_hap1 = -1 69 | self.query_end_best_hap1 = -1 70 | 71 | self.ref_start_best_hap2 = -1 72 | self.ref_end_best_hap2 = -1 73 | self.query_start_best_hap2 = -1 74 | self.query_end_best_hap2 = -1 75 | 76 | self.analyzed_hap1 = False 77 | self.analyzed_hap2 = False 78 | 79 | self.len_query_hap1 = -1 80 | self.len_ref_hap1 = -1 81 | self.len_query_hap2 = -1 82 | self.len_ref_hap2 = -1 83 | 84 | self.score_before_hap1 = -1 85 | self.score_after_hap1 = -1 86 | self.score_before_hap2 = -1 87 | self.score_after_hap2 = -1 88 | 89 | self.neg_strand_hap1 = False 90 | self.neg_strand_hap2 = False 91 | 92 | self.ins_seq = "" 93 | self.if_seq_resolved = False 94 | 95 | def check_tp(self, rela_len, rela_score): 96 | result = True 97 | if self.sv_type in ['DEL', 'DUP', 'DUP:TANDEM']: 98 | if rela_score >= 0 and rela_score <= 2.5: 99 | if rela_len >= -0.05*rela_score + 0.8 and rela_len <= 0.05*rela_score + 1.2: 100 | result = True 101 | else: 102 | result = False 103 | elif rela_score > 2.5: 104 | if rela_len >= 0.675 and rela_len <= 1.325: 105 | result = True 106 | else: 107 | result = False 108 | else: 109 | result = False 110 | elif self.sv_type == 'INS': 111 | #not seq-resolved 112 | #if len(self.ins_seq) == 0: 113 | if not self.if_seq_resolved: 114 | if rela_len < 0.675 or rela_len > 1.325: 115 | result = False 116 | #seq-resolved 117 | else: 118 | if rela_score >= 0 and rela_score <= 2.5: 119 | if rela_len >= -0.05*rela_score + 0.8 and rela_len <= 0.05*rela_score + 1.2: 120 | result = True 121 | else: 122 | result = False 123 | elif 
rela_score > 2.5: 124 | if rela_len >= 0.675 and rela_len <= 1.325: 125 | result = True 126 | else: 127 | result = False 128 | else: 129 | result = False 130 | 131 | elif self.sv_type == 'INV': 132 | if rela_score <= 0: 133 | result = False 134 | return result 135 | 136 | #TP when wrong length flag presents -- looser rules for TP 137 | def check_tp_wlen(self, rela_len, rela_score): 138 | result = True 139 | if self.sv_type in ['DEL', 'DUP', 'DUP:TANDEM']: 140 | if rela_score >= 0 and rela_score <= 2.5: 141 | if rela_len >= -0.05*rela_score + 0.6 and rela_len <= 0.05*rela_score + 1.4: 142 | result = True 143 | else: 144 | result = False 145 | elif rela_score > 2.5: 146 | if rela_len >= 0.475 and rela_len <= 1.525: 147 | result = True 148 | else: 149 | result = False 150 | else: 151 | result = False 152 | elif self.sv_type == 'INS': 153 | #not seq-resolved 154 | #if len(self.ins_seq) == 0: 155 | if not self.if_seq_resolved: 156 | if rela_len < 0.475 or rela_len > 1.525: 157 | result = False 158 | #seq-resolved 159 | else: 160 | if rela_score >= 0 and rela_score <= 2.5: 161 | if rela_len >= -0.05*rela_score + 0.6 and rela_len <= 0.05*rela_score + 1.4: 162 | result = True 163 | else: 164 | result = False 165 | elif rela_score > 2.5: 166 | if rela_len >= 0.475 and rela_len <= 1.525: 167 | result = True 168 | else: 169 | result = False 170 | else: 171 | result = False 172 | 173 | elif self.sv_type == 'INV': 174 | if rela_score <= 0: 175 | result = False 176 | return result 177 | 178 | def print_info(self): 179 | print(self.idx, self.ref_name, self.sv_pos, self.sv_stop, self.sv_type, self.length, self.gt, self.is_agg, self.is_sec_fil, self.is_third_fil) 180 | 181 | def cal_rela_score(self, score_before, score_after): 182 | if score_before > -1 and score_before < 0: 183 | tmp_score_before = -1 184 | tmp_score_after = score_after + (tmp_score_before - score_before) 185 | return round((tmp_score_after - tmp_score_before) / abs(tmp_score_before), 2) 186 | 187 | elif 
score_before >= 0 and score_before < 1: 188 | tmp_score_before = 1 189 | tmp_score_after = score_after + (tmp_score_before - score_before) 190 | return round((tmp_score_after - tmp_score_before) / abs(tmp_score_before), 2) 191 | 192 | else: 193 | return round((score_after - score_before) / abs(score_before), 2) 194 | 195 | def cal_rela_len(self, query_len, ref_len): 196 | return round((query_len - ref_len) / self.length, 2) 197 | 198 | def get_vali_res(self): 199 | if (not self.analyzed_hap1) or (not self.analyzed_hap2): 200 | return -1 201 | 202 | if self.analyzed_hap1 and self.analyzed_hap2: 203 | rela_len_1 = self.cal_rela_len(self.len_query_hap1, self.len_ref_hap1) 204 | rela_len_2 = self.cal_rela_len(self.len_query_hap2, self.len_ref_hap2) 205 | 206 | rela_score_1 = self.cal_rela_score(self.score_before_hap1, self.score_after_hap1) 207 | rela_score_2 = self.cal_rela_score(self.score_before_hap2, self.score_after_hap2) 208 | 209 | if not wrong_len: 210 | res_hap1 = self.check_tp(rela_len_1, rela_score_1) 211 | res_hap2 = self.check_tp(rela_len_2, rela_score_2) 212 | else: 213 | res_hap1 = self.check_tp_wlen(rela_len_1, rela_score_1) 214 | res_hap2 = self.check_tp_wlen(rela_len_2, rela_score_2) 215 | 216 | gt_validate = False 217 | # if args.gt_vali: 218 | if gt_vali: 219 | if res_hap1 and res_hap2: 220 | if self.gt == (1,1): 221 | gt_validate = True 222 | elif res_hap1 or res_hap2: 223 | if self.gt == (1,0) or self.gt == (0,1): 224 | gt_validate = True 225 | 226 | if res_hap1 and res_hap2: 227 | if abs(rela_len_1 - 1) <= abs(rela_len_2 - 1): 228 | return (res_hap1, rela_len_1, rela_score_1, gt_validate) 229 | else: 230 | return (res_hap2, rela_len_2, rela_score_2, gt_validate) 231 | elif res_hap1: 232 | return (res_hap1, rela_len_1, rela_score_1, gt_validate) 233 | elif res_hap2: 234 | return (res_hap2, rela_len_2, rela_score_2, gt_validate) 235 | else: 236 | if abs(rela_len_1 - 1) <= abs(rela_len_2 - 1): 237 | return (res_hap1, rela_len_1, rela_score_1, 
gt_validate) 238 | else: 239 | return (res_hap2, rela_len_2, rela_score_2, gt_validate) 240 | 241 | 242 | def idx_sv(input_vcf): 243 | f = VariantFile(input_vcf,'r') 244 | sv_list = [] 245 | for count, rec in enumerate(f.fetch()): 246 | #get sv_type 247 | try: 248 | sv_type = rec.info['SVTYPE'] 249 | except: 250 | print("invalid sv type info") 251 | continue 252 | 253 | if first_filter(rec, sv_type): 254 | continue 255 | 256 | #get sv length 257 | if sv_type == 'INV': 258 | sv_len = abs(rec.stop - rec.pos + 1) 259 | else: 260 | try: 261 | sv_len = rec.info['SVLEN'][0] 262 | except: 263 | try: 264 | sv_len = rec.info['SVLEN'] 265 | except: 266 | sv_len = abs(rec.stop - rec.pos + 1) 267 | #print("invalid sv length info") 268 | # try: 269 | # sv_len = rec.info['SVLEN'][0] 270 | # except: 271 | # sv_len = rec.info['SVLEN'] 272 | #handle del length > 0: 273 | if sv_type == 'DEL': 274 | sv_len = -abs(sv_len) 275 | 276 | if abs(sv_len) < memory_min: 277 | continue 278 | 279 | #get gt 280 | #only taking the first sample genotype 281 | if gt_vali: 282 | sv_gt = rec.samples[0]["GT"] 283 | #bad genotype 284 | if sv_gt not in [(1, 1), (1, 0), (0, 1)]: 285 | sv_gt = None 286 | else: 287 | sv_gt = None 288 | 289 | # if len(rec.samples.values()) != 1: 290 | # raise Exception("Wrong number of sample genotype(s)") 291 | # gts = [s['GT'] for s in rec.samples.values()] 292 | 293 | sv_list.append(struc_var(count, rec.chrom, sv_type, rec.pos, rec.stop, sv_len, sv_gt)) 294 | 295 | f.close() 296 | 297 | return sv_list 298 | 299 | 300 | # merge close SVs 301 | def merge_sv(sv_list): 302 | merged_sv_list = [] 303 | cur_idx = sv_list[0].idx 304 | cur_chrom = sv_list[0].ref_name 305 | cur_start = sv_list[0].sv_pos 306 | cur_end = sv_list[0].sv_stop 307 | cur_length = sv_list[0].length 308 | cur_gt = None 309 | 310 | for sv in sv_list[1:]: 311 | if sv.ref_name != cur_chrom or sv.sv_stop - cur_end > max_dist_to_merge: 312 | if cur_length > 0: 313 | cur_type = 'INS' 314 | else: 315 | cur_type = 
'DEL' 316 | merged_sv_list.append(struc_var(cur_idx, cur_chrom, cur_type, cur_start, cur_end, cur_length, cur_gt)) 317 | 318 | cur_idx = sv.idx 319 | cur_chrom = sv.ref_name 320 | cur_start = sv.sv_pos 321 | cur_end = sv.sv_stop 322 | cur_length = sv.length 323 | cur_gt = None 324 | 325 | continue 326 | 327 | cur_end = sv.sv_stop 328 | cur_length += sv.length 329 | 330 | merged_sv_list.append(struc_var(cur_idx, cur_chrom, cur_type, cur_start, cur_end, cur_length, cur_gt)) 331 | 332 | return merged_sv_list 333 | 334 | 335 | 336 | #return T if two sv match 337 | def match_sv(base_sv, cand_sv): 338 | if base_sv.ref_name != cand_sv.ref_name: 339 | #test 340 | # print("name") 341 | return False 342 | if base_sv.sv_type != cand_sv.sv_type: 343 | #test 344 | # print("type") 345 | return False 346 | if min(abs(base_sv.length), abs(cand_sv.length)) / max(abs(base_sv.length), abs(cand_sv.length)) < ratio_size_lb: 347 | #test 348 | # print("size") 349 | return False 350 | 351 | if base_sv.sv_pos - cand_sv.sv_stop > max_dist_search: 352 | return False 353 | 354 | if cand_sv.sv_pos - base_sv.sv_stop > max_dist_search: 355 | return False 356 | 357 | return True 358 | 359 | #index chr name: chr1-chrX: 1-23 360 | def idx_chr_name(chr_name): 361 | if not if_hg38: 362 | chr_name = 'chr' + chr_name 363 | return chr_list.index(chr_name) + 1 364 | 365 | #compare two sv position 366 | #return T if sv1 is before sv2 367 | def compare_sv_positions(sv1, sv2): 368 | sv1_chr_idx = idx_chr_name(sv1.ref_name) 369 | sv2_chr_idx = idx_chr_name(sv2.ref_name) 370 | 371 | if sv1_chr_idx < sv2_chr_idx: 372 | return True 373 | elif sv1_chr_idx > sv2_chr_idx: 374 | return False 375 | 376 | if sv1.sv_pos < sv2.sv_pos: 377 | return True 378 | else: 379 | return False 380 | 381 | #merge 2 sorted sv_list 382 | def merge_sv_list(sv_list1, sv_list2): 383 | merged_list = [] 384 | ptr1 = 0 385 | ptr2 = 0 386 | 387 | while(True): 388 | if compare_sv_positions(sv_list1[ptr1], sv_list2[ptr2]): 389 | 
merged_list.append(sv_list1[ptr1]) 390 | ptr1 += 1 391 | else: 392 | merged_list.append(sv_list2[ptr2]) 393 | ptr2 += 1 394 | 395 | if ptr1 == len(sv_list1) or ptr2 == len(sv_list2): 396 | break 397 | 398 | if ptr1 == len(sv_list1): 399 | for ptr in range(ptr2, len(sv_list2)): 400 | merged_list.append(sv_list2[ptr]) 401 | elif ptr2 == len(sv_list2): 402 | for ptr in range(ptr1, len(sv_list1)): 403 | merged_list.append(sv_list1[ptr]) 404 | 405 | return merged_list 406 | 407 | #sort SV list by chr and start pos 408 | def sort_sv_list(sv_list): 409 | if len(sv_list) == 0: 410 | return sv_list 411 | 412 | if len(sv_list) == 1: 413 | return sv_list 414 | 415 | if len(sv_list) == 2: 416 | if compare_sv_positions(sv_list[0], sv_list[1]): 417 | return sv_list 418 | else: 419 | return [sv_list[1], sv_list[0]] 420 | 421 | mid_pt = len(sv_list)//2 422 | sorted_sv_list = merge_sv_list(sort_sv_list(sv_list[:mid_pt]), sort_sv_list(sv_list[mid_pt:])) 423 | return sorted_sv_list 424 | 425 | 426 | def count_tp_base(merged_sv_list_sorted, cand_sv_list_sorted): 427 | tp_base_ctr = 0 428 | cand_start_idx = 0 429 | for count, base_sv in enumerate(merged_sv_list_sorted): 430 | 431 | cur_chrom = base_sv.ref_name 432 | cur_start = base_sv.sv_pos 433 | cur_end = base_sv.sv_stop 434 | cur_length = base_sv.length 435 | 436 | cur_cand_start_idx = cand_start_idx 437 | new_cand_start_idx = cand_start_idx 438 | 439 | for count1, cand_sv in enumerate(cand_sv_list_sorted[cur_cand_start_idx:]): 440 | if (cur_start - cand_sv.sv_stop > max_dist_search and base_sv.ref_name == cand_sv.ref_name) or (idx_chr_name(base_sv.ref_name) > idx_chr_name(cand_sv.ref_name)): 441 | new_cand_start_idx += 1 442 | continue 443 | 444 | if match_sv(base_sv, cand_sv): 445 | tp_base_ctr += 1 446 | break 447 | 448 | if (cand_sv.sv_pos - cur_end > max_dist_search and base_sv.ref_name == cand_sv.ref_name) or (idx_chr_name(base_sv.ref_name) < idx_chr_name(cand_sv.ref_name)): 449 | break 450 | 451 | cand_start_idx = 
new_cand_start_idx 452 | return tp_base_ctr 453 | 454 | 455 | def match_sv_dist_only(base_sv, cand_sv): 456 | if base_sv.ref_name != cand_sv.ref_name: 457 | #test 458 | # print("name") 459 | return False 460 | # if base_sv.sv_type != cand_sv.sv_type: 461 | # #test 462 | # # print("type") 463 | # return False 464 | # if min(abs(base_sv.length), abs(cand_sv.length)) / max(abs(base_sv.length), abs(cand_sv.length)) < ratio_size_lb: 465 | # #test 466 | # # print("size") 467 | # return False 468 | 469 | if base_sv.sv_pos - cand_sv.sv_stop > max_dist_search: 470 | return False 471 | 472 | if cand_sv.sv_pos - base_sv.sv_stop > max_dist_search: 473 | return False 474 | 475 | return True 476 | 477 | def count_tp_base_dist_only(sv_list_sorted, cand_sv_list_sorted): 478 | tp_base_ctr = 0 479 | cand_start_idx = 0 480 | for count, base_sv in enumerate(sv_list_sorted): 481 | #test 482 | # if count % 3000 == 0: 483 | # print(count, tp_base_ctr) 484 | # base_sv.print_info() 485 | 486 | cur_chrom = base_sv.ref_name 487 | cur_start = base_sv.sv_pos 488 | cur_end = base_sv.sv_stop 489 | cur_length = base_sv.length 490 | 491 | cur_cand_start_idx = cand_start_idx 492 | new_cand_start_idx = cand_start_idx 493 | 494 | for count1, cand_sv in enumerate(cand_sv_list_sorted[cur_cand_start_idx:]): 495 | if (cur_start - cand_sv.sv_stop > max_dist_search and base_sv.ref_name == cand_sv.ref_name) or (idx_chr_name(base_sv.ref_name) > idx_chr_name(cand_sv.ref_name)): 496 | new_cand_start_idx += 1 497 | continue 498 | 499 | if match_sv_dist_only(base_sv, cand_sv): 500 | tp_base_ctr += 1 501 | break 502 | 503 | if (cand_sv.sv_pos - cur_end > max_dist_search and base_sv.ref_name == cand_sv.ref_name) or (idx_chr_name(base_sv.ref_name) < idx_chr_name(cand_sv.ref_name)): 504 | break 505 | 506 | cand_start_idx = new_cand_start_idx 507 | return tp_base_ctr 508 | 509 | ########################################################### 510 | 511 | def main(): 512 | #hg38 samples 513 | chr_list = 
['chr1','chr2','chr3','chr4','chr5','chr6','chr7', 514 | 'chr8','chr9','chr10','chr11','chr12','chr13','chr14', 515 | 'chr15','chr16','chr17','chr18','chr19','chr20','chr21', 516 | 'chr22','chrX'] 517 | 518 | sv_list = idx_sv(truth_vcf) 519 | cand_sv_list = idx_sv(cand_vcf) 520 | 521 | sv_list_sorted = sort_sv_list(sv_list) 522 | cand_sv_list_sorted = sort_sv_list(cand_sv_list) 523 | 524 | tp_base_ctr = count_tp_base_dist_only(sv_list_sorted, cand_sv_list_sorted) 525 | recall = tp_base_ctr / len(sv_list_sorted) 526 | 527 | for sample_name in sample_names: 528 | bcf_in_file = sample_name + "_dip_sv_lenfilter_chrfilter_pass.vcf" 529 | 530 | asm1_trimmed_file = "assem1_sort_trimmed_coor.bed" 531 | asm2_trimmed_file = "assem2_sort_trimmed_coor.bed" 532 | 533 | filter_dipcall_for_fn_trimcoor(bcf_in_file, asm1_trimmed_file, asm2_trimmed_file, bcf_out_file) 534 | 535 | if __name__ == "__main__": 536 | main() 537 | -------------------------------------------------------------------------------- /fn.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChaissonLab/TT-Mars/303f63792971f2d9a04ceb8f9d38428c4c40e10f/fn.pyc -------------------------------------------------------------------------------- /get_conf_int.py: -------------------------------------------------------------------------------- 1 | #Get regions where there are reads < k bp 2 | import pysam 3 | import sys 4 | import os 5 | import numpy as np 6 | import pybedtools 7 | 8 | #TODO: optimize SV filters 9 | 10 | #Get regions on ref where its not covered by at least one of the assembly 11 | def get_non_cover_regions(output_dir, assem_bam_file, hap, chr_list): 12 | #hap is an int = 1/2 13 | samfile = pysam.AlignmentFile(assem_bam_file, "rb") 14 | g = open(output_dir + "assem" + str(hap) + "_non_cov_regions.bed", "w") 15 | for chr_name in chr_list: 16 | #test 17 | #print(chr_name) 18 | #stop when parsed no_of_reads 19 | #no_of_reads = 400000 20 | 
cur_end = 0 21 | #loop through iter, index will not be reset 22 | #if if_hg38: 23 | # ref_name = "chr" + chr_name 24 | #else: 25 | # ref_name = chr_name 26 | ref_name = chr_name 27 | iter = samfile.fetch(ref_name) 28 | 29 | for rec in iter: 30 | if rec.reference_start > cur_end: 31 | g.write(str(ref_name) + "\t") 32 | g.write(str(cur_end) + "\t") 33 | g.write(str(rec.reference_start)) 34 | g.write("\n") 35 | cur_end = rec.reference_end 36 | else: 37 | if rec.reference_end > cur_end: 38 | cur_end = rec.reference_end 39 | g.close() 40 | 41 | #Get SV positions 42 | def get_sv_positions(output_dir, vcf_file): 43 | f = pysam.VariantFile(vcf_file, 'r') 44 | #create bedfile contains SVs' positions 45 | g = open(output_dir + "SV_positions.bed", "w") 46 | for counter, rec in enumerate(f.fetch()): 47 | ref_name = rec.chrom 48 | sv_type = rec.info['SVTYPE'] 49 | sv_len = rec.rlen 50 | #TODOL double check the start for different types 51 | sv_pos = rec.pos 52 | sv_end = rec.stop 53 | if sv_type not in ['DEL', 'INS', 'INV', 'DUP']: 54 | continue 55 | g.write(str(ref_name) + "\t") 56 | g.write(str(sv_pos) + "\t") 57 | g.write(str(sv_end)) 58 | g.write("\n") 59 | g.close() 60 | f.close() 61 | 62 | #Output filtered calls' info in non-covered regions 63 | def output_non_cov_call_info(output_dir, SV_positions_file, assem1_non_cov_regions_file, assem2_non_cov_regions_file): 64 | SV_positions = pybedtools.BedTool(SV_positions_file) 65 | assem1_non_cov_regions = pybedtools.BedTool(assem1_non_cov_regions_file) 66 | assem2_non_cov_regions = pybedtools.BedTool(assem2_non_cov_regions_file) 67 | 68 | exclude_assem1_non_cover = SV_positions.intersect(assem1_non_cov_regions, u = True) 69 | exclude_assem2_non_cover = SV_positions.intersect(assem2_non_cov_regions, u = True) 70 | 71 | exclude_assem1_non_cover.saveas(output_dir + 'exclude_assem1_non_cover.bed') 72 | exclude_assem2_non_cover.saveas(output_dir + 'exclude_assem2_non_cover.bed') 73 | 74 | 75 | #Get regions where read depth > 2 * 
avg_read_depth 76 | #For now, we filter calls by read depth 77 | #In other words, here output calls having high read depth 78 | #TODO: Too slow here 79 | def get_high_depth_calls_info(output_dir, read_bam_file, vcf_file, avg_read_depth): 80 | sv_len_limit = 100000 81 | avg_read_depth = float(avg_read_depth) 82 | 83 | samfile = pysam.AlignmentFile(read_bam_file, "rb") 84 | f = pysam.VariantFile(vcf_file,'r') 85 | #TODO: change open condition 86 | g = open(output_dir + "exclude_high_depth.bed", "w") 87 | for counter, rec in enumerate(f.fetch()): 88 | #test 89 | #if counter > 250: 90 | # break 91 | 92 | #get ref start and ref end 93 | name = rec.chrom 94 | sv_pos = rec.pos 95 | sv_end = rec.stop 96 | sv_type = rec.info['SVTYPE'] 97 | if sv_type not in ['DEL', 'INS', 'INV', 'DUP']: 98 | continue 99 | 100 | sv_len = abs(rec.info['SVLEN'][0]) 101 | if sv_len > sv_len_limit: 102 | continue 103 | 104 | #print(sv_end - sv_pos) 105 | res = samfile.count_coverage(name, sv_pos, sv_end+1, quality_threshold = 0) 106 | #print(res) 107 | if round(np.sum(res)/(sv_end+1-sv_pos), 2) > 2*avg_read_depth: 108 | #test 109 | #print(counter, round(np.sum(res)/(sv_end+1-sv_pos), 2)) 110 | g.write(str(name) + "\t") 111 | g.write(str(sv_pos) + "\t") 112 | g.write(str(sv_end)) 113 | g.write("\n") 114 | g.close() 115 | f.close() 116 | 117 | def main(): 118 | #get command line input 119 | #n = len(sys.argv) 120 | output_dir = sys.argv[1] + "/" 121 | #assembly bam file 122 | bam_file1 = sys.argv[2] 123 | bam_file2 = sys.argv[3] 124 | if_hg38_str = sys.argv[4] 125 | # #if_hg38 = True 126 | # avg_read_depth = sys.argv[5] 127 | # #reads bam file 128 | # read_bam_file = sys.argv[6] 129 | # #callset file 130 | # vcf_file = sys.argv[7] 131 | 132 | #constants 133 | if if_hg38_str == "True": 134 | if_hg38 = True 135 | else: 136 | if_hg38 = False 137 | 138 | chr_list = [] 139 | if if_hg38: 140 | chr_list = ["chr1", "chr2", "chr3", "chr4", "chr5", 141 | "chr6", "chr7", "chr8", "chr9", "chr10", 142 | 
"chr11", "chr12", "chr13", "chr14", "chr15", 143 | "chr16", "chr17", "chr18", "chr19", "chr20", 144 | "chr21", "chr22", "chrX"] 145 | else: 146 | chr_list = ["1", "2", "3", "4", "5", 147 | "6", "7", "8", "9", "10", 148 | "11", "12", "13", "14", "15", 149 | "16", "17", "18", "19", "20", 150 | "21", "22", "X"] 151 | 152 | #Output regions on ref where its not covered by at least one of the assembly 153 | get_non_cover_regions(output_dir, bam_file1, 1, chr_list) 154 | get_non_cover_regions(output_dir, bam_file2, 2, chr_list) 155 | 156 | #Output sv positions 157 | # get_sv_positions(output_dir, vcf_file) 158 | 159 | #Output filtered calls in non-covered regions 160 | # SV_positions_file = output_dir + "SV_positions.bed" 161 | # assem1_non_cov_regions_file = output_dir + "assem1_non_cov_regions.bed" 162 | # assem2_non_cov_regions_file = output_dir + "assem2_non_cov_regions.bed" 163 | # output_non_cov_call_info(output_dir, SV_positions_file, assem1_non_cov_regions_file, assem2_non_cov_regions_file) 164 | 165 | #Get regions where read depth > 2 * avg_read_depth 166 | # get_high_depth_calls_info(output_dir, read_bam_file, vcf_file, avg_read_depth) 167 | 168 | if __name__ == "__main__": 169 | main() -------------------------------------------------------------------------------- /get_sv_bed_pos.py: -------------------------------------------------------------------------------- 1 | # %% 2 | #bedtools filter out SVs that are too close to end of contigs 3 | 4 | #Generate plots 5 | import csv 6 | import math 7 | #pysam: https://pysam.readthedocs.io/en/latest/usage.html 8 | import pysam 9 | #print(pysam.__version__) 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | 14 | import sys 15 | 16 | #get command line input 17 | #n = len(sys.argv) 18 | output_dir = sys.argv[1] 19 | vcf_file = sys.argv[2] 20 | 21 | 22 | # %% 23 | #process liftover results 24 | ''' 25 | with open(output_dir + "contig_positions_assem1_draft_result.bed") as f: 26 | reader = csv.reader(f, 
delimiter="\t") 27 | contig_positions_assem1_result = list(reader) 28 | f.close() 29 | 30 | for record in contig_positions_assem1_result: 31 | name = record[0] 32 | start = record[1] 33 | end = record[2] 34 | if int(start) >= int(end): 35 | continue 36 | g = open(output_dir + "contig_positions_assem1.bed", "a") 37 | g.write(str(name) + "\t") 38 | g.write(str(start) + "\t") 39 | g.write(str(end)) 40 | g.write("\n") 41 | g.close() 42 | 43 | with open(output_dir + "contig_positions_assem2_draft_result.bed") as f: 44 | reader = csv.reader(f, delimiter="\t") 45 | contig_positions_assem2_result = list(reader) 46 | f.close() 47 | 48 | for record in contig_positions_assem2_result: 49 | name = record[0] 50 | start = record[1] 51 | end = record[2] 52 | if int(start) >= int(end): 53 | continue 54 | g = open(output_dir + "contig_positions_assem2.bed", "a") 55 | g.write(str(name) + "\t") 56 | g.write(str(start) + "\t") 57 | g.write(str(end)) 58 | g.write("\n") 59 | g.close() 60 | 61 | ''' 62 | 63 | #bedtools filter out SVs that are too close to each other 64 | 65 | #create bedfiles 66 | 67 | f = pysam.VariantFile(vcf_file, 'r') 68 | 69 | #create bedfile contains SVs' positions 70 | g = open(output_dir + "SV_positions.bed", "w") 71 | for counter, rec in enumerate(f.fetch()): 72 | ref_name = rec.chrom 73 | sv_type = rec.info['SVTYPE'] 74 | sv_len = rec.rlen 75 | #TODOL double check the start for different types 76 | sv_pos = rec.pos 77 | sv_end = rec.stop 78 | 79 | g.write(str(ref_name) + "\t") 80 | g.write(str(sv_pos) + "\t") 81 | g.write(str(sv_end)) 82 | g.write("\n") 83 | g.close() 84 | 85 | -------------------------------------------------------------------------------- /help_func.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import csv 3 | from Bio import Align 4 | 5 | import func 6 | import get_align_info 7 | from Bio.Seq import Seq 8 | 9 | #constants 10 | import ttmars 11 | 12 | memory_min = ttmars.memory_min 13 
if_hg38 = ttmars.if_hg38
#bug fix: this previously read `ttmars.if_hg38` (copy/paste error), which tied
#the PASS-only filter to the reference build instead of the user's setting
if_pass_only = ttmars.if_pass_only

valid_types = ttmars.valid_types

#not consider chrY calls
chr_list = ttmars.chr_list

#modified from ttmars.py to take sv dict object as input
def first_filter(sv, sv_type, valid_types, if_pass_only, chr_list):
    """Return True if `sv` should be skipped: unsupported type, non-PASS
    record (when if_pass_only is set), or chromosome outside chr_list."""
    #type filter
    if sv_type not in valid_types:
        return True
    #PASS filter
    if if_pass_only:
        if 'PASS' not in sv.filter.keys():
            return True
    chr_name = sv.chrom
    #chr filter
    if chr_name not in chr_list:
        return True
    return False


#functions

#build sv dict
def build_dict(vcf):
    """Index every SV in `vcf` that passes first_filter and the memory_min
    size cutoff; see the value-layout comment below for the stored fields."""
    sv_dict = dict()
    f = pysam.VariantFile(vcf)
    for counter, rec in enumerate(f.fetch()):
        ref_name = rec.chrom

        sv_type = rec.info['SVTYPE']

        if first_filter(rec, sv_type, valid_types, if_pass_only, chr_list):
            continue

        #INV records carry no usable SVLEN; derive length from the span
        if sv_type == 'INV':
            sv_len = abs(rec.stop - rec.pos + 1)
        else:
            #SVLEN may be a tuple, a scalar, or absent depending on the caller
            try:
                sv_len = rec.info['SVLEN'][0]
            except:
                try:
                    sv_len = rec.info['SVLEN']
                except:
                    sv_len = abs(rec.stop - rec.pos + 1)

        if sv_type == 'DEL':
            sv_len = -abs(sv_len)

        if abs(sv_len) < memory_min:
            continue

        #TODOL double check the start for different types
        sv_pos = rec.pos
        sv_end = rec.stop

        #flag records whose REF/ALT allele lengths look inconsistent with their type
        n = 5
        if_abnormal = False
        if len(rec.ref) > n:
            if sv_type == 'INS':
                if_abnormal = True
        if len(rec.alts[0]) > n:
            if sv_type == 'DEL':
                if_abnormal = True

        #dict to store sv: {(chr, start, end, type): [vapor score, vapor gt, vapor result, truvari result, ttmars result, rela_len, rela_score, len, assem_score, wlen result]}
sv_dict[(ref_name, int(sv_pos), int(sv_end), sv_type)] = ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', int(sv_len), 'NA', 'NA', rec.filter.keys(), if_abnormal] 91 | #test 92 | #else: 93 | # print(sv_type) 94 | f.close() 95 | return sv_dict 96 | 97 | 98 | 99 | def add_ttmars_res(sv_dict, ttmars_res): 100 | with open(ttmars_res) as f: 101 | reader = csv.reader(f, delimiter="\t") 102 | ttmars_result = list(reader) 103 | f.close() 104 | 105 | for rec in ttmars_result: 106 | key = (rec[0], int(rec[1]), int(rec[2]), rec[3]) 107 | try: 108 | sv_dict[key] 109 | except: 110 | continue 111 | if rec[6] == 'True': 112 | sv_dict[key][4] = 'tp' 113 | else: 114 | sv_dict[key][4] = 'fp' 115 | sv_dict[key][5] = float(rec[4]) 116 | sv_dict[key][6] = float(rec[5]) 117 | 118 | def tuple_2_list(input_tup): 119 | output_list = [] 120 | for ele in input_tup: 121 | output_list.append(ele) 122 | return output_list 123 | 124 | 125 | #in ttmars.py: 126 | #call get_align_info.get_vali_info for both haplotypes 127 | #then func.write_vali_info(sv_list, output_dir, if_gt) 128 | 129 | #idx comb sv 130 | #requires seq resolved here 131 | def idx_comb(comb): 132 | ref_name = comb[0].ref_name 133 | wrong_len = False 134 | 135 | sv_len = 0 136 | ref_len = 0 137 | alt_len = 0 138 | 139 | sv_pos = float('inf') 140 | sv_stop = 0 141 | 142 | sv_idx = [] 143 | 144 | for sv in comb: 145 | sv_idx.append(sv.idx) 146 | 147 | sv_len += sv.length 148 | 149 | ref_len += sv.ref_len 150 | alt_len += sv.alt_len 151 | 152 | sv_pos = min(sv_pos, sv.sv_pos) 153 | sv_stop = max(sv_stop, sv.sv_stop) 154 | 155 | sv_type = "" 156 | if sv_len > 0: 157 | sv_type = "INS" 158 | else: 159 | sv_type = "DEL" 160 | 161 | sv_gt = None 162 | 163 | comb_sv = func.struc_var(sv_idx, ref_name, sv_type, sv_pos, sv_stop, sv_len, sv_gt, wrong_len, ref_len, alt_len) 164 | comb_sv.if_seq_resolved = True 165 | 166 | return comb_sv 167 | 168 | # def idx_comb_len_only(comb): 169 | # ref_name = comb[0].ref_name 170 | # wrong_len = False 171 | 172 
| # sv_len = 0 173 | # ref_len = 0 174 | # alt_len = 0 175 | 176 | # sv_pos = float('inf') 177 | # sv_stop = 0 178 | 179 | # sv_idx = [] 180 | 181 | # for sv in comb: 182 | # sv_idx.append(sv.idx) 183 | 184 | # sv_len += sv.length 185 | 186 | # ref_len += sv.ref_len 187 | # alt_len += sv.alt_len 188 | 189 | # sv_pos = min(sv_pos, sv.sv_pos) 190 | # sv_stop = max(sv_stop, sv.sv_stop) 191 | 192 | # sv_type = "" 193 | # if sv_len > 0: 194 | # sv_type = "INS" 195 | # else: 196 | # sv_type = "DEL" 197 | 198 | # sv_gt = None 199 | 200 | # comb_sv = func.struc_var(sv_idx, ref_name, sv_type, sv_pos, sv_stop, sv_len, sv_gt, wrong_len, ref_len, alt_len) 201 | 202 | # return comb_sv 203 | 204 | def get_comb_vali_info_align_only(comb_sv, hap, interval, contig_name_list, contig_pos_list, contig_name_dict, if_hg38, ref_rec, query_fasta_file, sv_idx_dict): 205 | if hap == 1: 206 | query_rec = query_fasta_file.fetch(comb_sv.query_name_hap1) 207 | ref_start = comb_sv.ref_start_best_hap1 208 | ref_end = comb_sv.ref_end_best_hap1 209 | query_start = comb_sv.query_start_best_hap1 210 | query_end = comb_sv.query_end_best_hap1 211 | neg_strand = comb_sv.neg_strand_hap1 212 | elif hap == 2: 213 | query_rec = query_fasta_file.fetch(comb_sv.query_name_hap2) 214 | ref_start = comb_sv.ref_start_best_hap2 215 | ref_end = comb_sv.ref_end_best_hap2 216 | query_start = comb_sv.query_start_best_hap2 217 | query_end = comb_sv.query_end_best_hap2 218 | neg_strand = comb_sv.neg_strand_hap2 219 | 220 | if query_start >= len(query_rec) or query_end >= len(query_rec): 221 | message = "bad_query_pos" 222 | #write_err(output_file_name, message, g) 223 | if hap == 1: 224 | comb_sv.analyzed_hap1 = False 225 | if hap == 2: 226 | comb_sv.analyzed_hap2 = True 227 | 228 | return False 229 | 230 | #definitely need alignment score for comb_sv 231 | 232 | query_frag = query_rec[query_start:query_end] 233 | ref_frag = ref_rec[ref_start:ref_end] 234 | ref_after_sv_frag = get_comb_ref_frag_after_sv(comb_sv, ref_rec, 
sv_idx_dict, ref_start, ref_end) 235 | 236 | #neg strand 237 | if neg_strand: 238 | seq = Seq(query_frag) 239 | query_frag = seq.reverse_complement() 240 | 241 | #get to upper case 242 | ref_frag = ref_frag.upper() 243 | query_frag = query_frag.upper() 244 | ref_after_sv_frag = ref_after_sv_frag.upper() 245 | 246 | #test 247 | # print(len(ref_frag), len(ref_after_sv_frag), len(ref_after_sv_frag)-len(ref_frag)) 248 | 249 | #TODO: find appropriate alignment parameters 250 | #paras: match, mismatch, open gap, extend gap 251 | alignment_beforeSV, alignment_afterSV = align_before_after_comb(query_frag, ref_frag, ref_after_sv_frag) 252 | 253 | if hap == 1: 254 | comb_sv.analyzed_hap1 = True 255 | comb_sv.score_before_hap1 = alignment_beforeSV 256 | comb_sv.score_after_hap1 = alignment_afterSV 257 | # comb_sv.len_query_hap1 = query_end - query_start + 1 258 | # comb_sv.len_ref_hap1 = ref_end - ref_start + 1 259 | elif hap == 2: 260 | comb_sv.analyzed_hap2 = True 261 | comb_sv.score_before_hap2 = alignment_beforeSV 262 | comb_sv.score_after_hap2 = alignment_afterSV 263 | # comb_sv.len_query_hap2 = query_end - query_start + 1 264 | # comb_sv.len_ref_hap2 = ref_end - ref_start + 1 265 | 266 | return True 267 | 268 | def get_comb_vali_info_len_only(comb_sv, hap, interval, contig_name_list, contig_pos_list, contig_name_dict, if_hg38, ref_rec, query_fasta_file, sv_idx_dict, region_len_m): 269 | 270 | if not get_comb_intervals_len_only(comb_sv, if_hg38, interval, contig_name_list, contig_pos_list, contig_name_dict, hap, ref_rec, region_len_m): 271 | return False 272 | 273 | if hap == 1: 274 | # query_rec = query_fasta_file.fetch(comb_sv.query_name_hap1) 275 | ref_start = comb_sv.ref_start_best_hap1 276 | ref_end = comb_sv.ref_end_best_hap1 277 | query_start = comb_sv.query_start_best_hap1 278 | query_end = comb_sv.query_end_best_hap1 279 | neg_strand = comb_sv.neg_strand_hap1 280 | elif hap == 2: 281 | # query_rec = query_fasta_file.fetch(comb_sv.query_name_hap2) 282 | ref_start 
= comb_sv.ref_start_best_hap2 283 | ref_end = comb_sv.ref_end_best_hap2 284 | query_start = comb_sv.query_start_best_hap2 285 | query_end = comb_sv.query_end_best_hap2 286 | neg_strand = comb_sv.neg_strand_hap2 287 | 288 | # if query_start >= len(query_rec) or query_end >= len(query_rec): 289 | # message = "bad_query_pos" 290 | # #write_err(output_file_name, message, g) 291 | # return False 292 | 293 | #skip alignment in this mode (although alignment score is suggested for comb_sv) 294 | 295 | # query_frag = query_rec[query_start:query_end] 296 | # ref_frag = ref_rec[ref_start:ref_end] 297 | 298 | if hap == 1: 299 | comb_sv.analyzed_hap1 = True 300 | # comb_sv.score_before_hap1 = alignment_beforeSV 301 | # comb_sv.score_after_hap1 = alignment_afterSV 302 | comb_sv.len_query_hap1 = query_end - query_start + 1 303 | comb_sv.len_ref_hap1 = ref_end - ref_start + 1 304 | elif hap == 2: 305 | comb_sv.analyzed_hap2 = True 306 | # comb_sv.score_before_hap2 = alignment_beforeSV 307 | # comb_sv.score_after_hap2 = alignment_afterSV 308 | comb_sv.len_query_hap2 = query_end - query_start + 1 309 | comb_sv.len_ref_hap2 = ref_end - ref_start + 1 310 | 311 | return True 312 | 313 | def get_comb_intervals_len_only(sv, if_hg38, interval, contig_name_list, contig_pos_list, contig_name_dict, hap, ref_rec, region_len_m): 314 | ref_rec_len = len(ref_rec) 315 | # region_len_m = 1000 316 | 317 | #test: 318 | # print(sv.sv_pos, sv.sv_stop) 319 | 320 | ref_start_start = max(get_align_info.getRefStart(sv.sv_pos, interval) - region_len_m, 0) 321 | ref_start_end = get_align_info.getRefStart(sv.sv_pos, interval) 322 | 323 | ref_end_start = get_align_info.getRefEnd(sv.sv_stop, interval) 324 | ref_end_end = min(get_align_info.getRefEnd(sv.sv_stop, interval) + region_len_m, get_align_info.getRefEnd(ref_rec_len, interval) - interval) 325 | 326 | #first level key: chr index as an int 327 | int_ref_name = get_align_info.get_int_chr_name(sv.ref_name, if_hg38) 328 | lo_list_index = int_ref_name - 1 329 
| first_key = lo_list_index 330 | 331 | #second level key: ref list_pos 332 | min_diff_len = float('inf') 333 | ref_start_best = ref_start_end 334 | ref_end_best = ref_end_start 335 | query_start_best = None 336 | query_end_best = None 337 | start_contig_name_ctr = -1 338 | end_contig_name_ctr = -1 339 | neg_strand = False 340 | 341 | #test 342 | # print(ref_end_end, ref_end_start, ref_end_start - ref_end_end) 343 | # print(ref_start_start, ref_start_end, ref_start_start - ref_start_end) 344 | 345 | for ref_end_cand in range(ref_end_end, ref_end_start, -interval): 346 | for ref_start_cand in range(ref_start_start, ref_start_end, interval): 347 | if ref_end_cand <= ref_start_cand: 348 | continue 349 | 350 | #second_key_start = str(ref_start_cand) 351 | #second_key_end = str(ref_end_cand) 352 | second_key_start = ref_start_cand//interval 353 | second_key_end = ref_end_cand//interval 354 | 355 | start_contig_name_ctr_cand = contig_name_list[first_key][second_key_start] 356 | end_contig_name_ctr_cand = contig_name_list[first_key][second_key_end] 357 | 358 | if start_contig_name_ctr_cand == -1 or end_contig_name_ctr_cand == -1: 359 | #print("wrong second key") 360 | message = "wrong_sec_key" 361 | #write_err(output_file_name, message, g) 362 | continue 363 | 364 | if start_contig_name_ctr_cand != end_contig_name_ctr_cand: 365 | #print("Not same contig") 366 | message = "not_same_contig" 367 | #write_err(output_file_name, message, g) 368 | continue 369 | 370 | query_start = contig_pos_list[first_key][second_key_start] 371 | query_end = contig_pos_list[first_key][second_key_end] 372 | 373 | neg_strand_tep = False 374 | #in case: negtive strand 375 | if query_end < query_start: 376 | tem = query_start 377 | query_start = query_end 378 | query_end = tem 379 | neg_strand_tep = True 380 | 381 | #take the best relative length to be the optimal interval 382 | if abs((query_end - query_start) - (ref_end_cand - ref_start_cand) - sv.length) < min_diff_len: 383 | min_diff_len = 
abs((query_end - query_start) - (ref_end_cand - ref_start_cand) - sv.length) 384 | ref_start_best = ref_start_cand 385 | ref_end_best = ref_end_cand 386 | query_start_best = query_start 387 | query_end_best = query_end 388 | start_contig_name_ctr = start_contig_name_ctr_cand 389 | end_contig_name_ctr = end_contig_name_ctr_cand 390 | 391 | if neg_strand_tep: 392 | neg_strand = True 393 | else: 394 | neg_strand = False 395 | 396 | if query_start_best == None or query_end_best == None: 397 | #print("Wrong query pos") 398 | message = "Wrong_query_pos" 399 | #write_err(output_file_name, message, g) 400 | return False 401 | 402 | if ref_start_best == sv.sv_pos: 403 | ref_start_best = ref_start_best - 1 404 | if ref_end_best == sv.sv_stop: 405 | ref_end_best = ref_end_best + 1 406 | if query_end_best == query_start_best: 407 | query_end_best += 1 408 | 409 | #test 410 | # print(ref_start_best, ref_end_best) 411 | # print(query_start_best, query_end_best) 412 | 413 | if hap == 1: 414 | sv.ref_start_best_hap1 = ref_start_best 415 | sv.ref_end_best_hap1 = ref_end_best 416 | sv.query_start_best_hap1 = query_start_best 417 | sv.query_end_best_hap1 = query_end_best 418 | sv.query_name_hap1 = contig_name_dict[start_contig_name_ctr] 419 | sv.neg_strand_hap1 = neg_strand 420 | elif hap == 2: 421 | sv.ref_start_best_hap2 = ref_start_best 422 | sv.ref_end_best_hap2 = ref_end_best 423 | sv.query_start_best_hap2 = query_start_best 424 | sv.query_end_best_hap2 = query_end_best 425 | sv.query_name_hap2 = contig_name_dict[start_contig_name_ctr] 426 | sv.neg_strand_hap2 = neg_strand 427 | 428 | return True 429 | 430 | def align_before_after_comb(query_seq, ref_seq_1, ref_seq_2): 431 | #used third_fil when build groups 432 | aligner = Align.PairwiseAligner() 433 | aligner.mode = 'global' 434 | #aligner.mode = 'local' 435 | aligner.match_score = 1 436 | aligner.mismatch_score = -1 437 | aligner.open_gap_score = -1 438 | aligner.extend_gap_score = -0.5 439 | #aligner.score_only = True 440 | 
alignment_beforeSV = aligner.score(query_seq, ref_seq_1) 441 | alignment_afterSV = aligner.score(query_seq, ref_seq_2) 442 | 443 | return alignment_beforeSV, alignment_afterSV 444 | 445 | def get_comb_vali_info(comb_sv, hap, interval, contig_name_list, contig_pos_list, contig_name_dict, if_hg38, ref_rec, query_fasta_file, sv_idx_dict, region_len_m): 446 | if not get_comb_intervals_len_only(comb_sv, if_hg38, interval, contig_name_list, contig_pos_list, contig_name_dict, hap, ref_rec, region_len_m): 447 | return False 448 | 449 | if hap == 1: 450 | query_rec = query_fasta_file.fetch(comb_sv.query_name_hap1) 451 | ref_start = comb_sv.ref_start_best_hap1 452 | ref_end = comb_sv.ref_end_best_hap1 453 | query_start = comb_sv.query_start_best_hap1 454 | query_end = comb_sv.query_end_best_hap1 455 | neg_strand = comb_sv.neg_strand_hap1 456 | elif hap == 2: 457 | query_rec = query_fasta_file.fetch(comb_sv.query_name_hap2) 458 | ref_start = comb_sv.ref_start_best_hap2 459 | ref_end = comb_sv.ref_end_best_hap2 460 | query_start = comb_sv.query_start_best_hap2 461 | query_end = comb_sv.query_end_best_hap2 462 | neg_strand = comb_sv.neg_strand_hap2 463 | 464 | if query_start >= len(query_rec) or query_end >= len(query_rec): 465 | message = "bad_query_pos" 466 | #write_err(output_file_name, message, g) 467 | return False 468 | 469 | #definitely need alignment score for comb_sv 470 | 471 | query_frag = query_rec[query_start:query_end] 472 | ref_frag = ref_rec[ref_start:ref_end] 473 | ref_after_sv_frag = get_comb_ref_frag_after_sv(comb_sv, ref_rec, sv_idx_dict, ref_start, ref_end) 474 | 475 | #neg strand 476 | if neg_strand: 477 | seq = Seq(query_frag) 478 | query_frag = seq.reverse_complement() 479 | 480 | #get to upper case 481 | ref_frag = ref_frag.upper() 482 | query_frag = query_frag.upper() 483 | ref_after_sv_frag = ref_after_sv_frag.upper() 484 | 485 | #test 486 | # print(len(ref_frag), len(ref_after_sv_frag), len(ref_after_sv_frag)-len(ref_frag)) 487 | 488 | #TODO: find 
appropriate alignment parameters 489 | #paras: match, mismatch, open gap, extend gap 490 | alignment_beforeSV, alignment_afterSV = align_before_after_comb(query_frag, ref_frag, ref_after_sv_frag) 491 | 492 | if hap == 1: 493 | comb_sv.analyzed_hap1 = True 494 | comb_sv.score_before_hap1 = alignment_beforeSV 495 | comb_sv.score_after_hap1 = alignment_afterSV 496 | comb_sv.len_query_hap1 = query_end - query_start + 1 497 | comb_sv.len_ref_hap1 = ref_end - ref_start + 1 498 | elif hap == 2: 499 | comb_sv.analyzed_hap2 = True 500 | comb_sv.score_before_hap2 = alignment_beforeSV 501 | comb_sv.score_after_hap2 = alignment_afterSV 502 | comb_sv.len_query_hap2 = query_end - query_start + 1 503 | comb_sv.len_ref_hap2 = ref_end - ref_start + 1 504 | 505 | return True 506 | 507 | def get_comb_ref_frag_after_sv(comb_sv, ref_rec, sv_idx_dict, ref_start, ref_end): 508 | cur_seq = "" 509 | cur_pos = ref_start 510 | 511 | #test 512 | # print("ref_start", ref_start) 513 | 514 | #note comb_sv sv are not overlapping 515 | sv_idx_list = comb_sv.idx 516 | for idx in sv_idx_list: 517 | sv = sv_idx_dict[idx] 518 | sv_pos = sv.sv_pos 519 | sv_stop = sv.sv_stop 520 | 521 | cur_seq += ref_rec[cur_pos:sv_pos] 522 | 523 | #test 524 | # print("cur_pos", cur_pos, "sv_pos", sv_pos, "cur_len", len(cur_seq), len(ref_rec)) 525 | 526 | if sv.sv_type == "INS": 527 | cur_seq += sv.ins_seq 528 | #also skipped ref seq of INS, if any 529 | cur_pos = sv_stop 530 | elif sv.sv_type == "DEL": 531 | #skipped deleted seq 532 | cur_pos = sv_stop 533 | 534 | #test 535 | # print("cur_pos", cur_pos, "ref_end", ref_end) 536 | assert cur_pos <= ref_end 537 | assert len(ref_rec) >= ref_end 538 | 539 | cur_seq += ref_rec[cur_pos:ref_end] 540 | return cur_seq 541 | 542 | 543 | 544 | ################################################################## 545 | ################################################################## 546 | #functions from agg_sv 547 | 
#-------------------------------------------------------------------
# liftover.sh
#-------------------------------------------------------------------
#######################################################
#######################################################
#1. Align assembly to reference

#use lra (https://github.com/ChaissonLab/LRA) to align asm to ref
#fix: variable expansions are quoted so paths containing spaces do not word-split
lra index -CONTIG "$reference"
lra align -CONTIG "$reference" h1.fa -t 16 -p s | samtools sort -o assem1_sort.bam
lra align -CONTIG "$reference" h2.fa -t 16 -p s | samtools sort -o assem2_sort.bam

#index bam file
samtools index assem1_sort.bam
samtools index assem2_sort.bam

#######################################################
#######################################################
#2. Trim overlapping contigs

#trim overlapping contigs
python trim_overlapping_contigs.py assem1_sort.bam "$output_dir" "$if_hg38"
python trim_overlapping_contigs.py assem2_sort.bam "$output_dir" "$if_hg38"

#sort trimmed bam file
samtools sort "$output_dir/assem1_sort_nool.bam" -o "$output_dir/assem1_nool_sort.bam"
samtools sort "$output_dir/assem2_sort_nool.bam" -o "$output_dir/assem2_nool_sort.bam"

#index sorted trimmed file
samtools index "$output_dir/assem1_nool_sort.bam"
samtools index "$output_dir/assem2_nool_sort.bam"

#######################################################
#######################################################
#3. Liftover

#convert to sam file
samtools view -h "$output_dir/assem1_nool_sort.bam" | samtools sort -O sam -o "$output_dir/assem1_nool_sort.sam"
samtools view -h "$output_dir/assem2_nool_sort.bam" | samtools sort -O sam -o "$output_dir/assem2_nool_sort.sam"

#liftover using samLiftover (https://github.com/mchaisso/mcutils): ref to asm lo
python lo_assem_to_ref.py "$output_dir" "$output_dir/assem1_nool_sort.bam" "$output_dir/assem2_nool_sort.bam"

samLiftover "$output_dir/assem1_nool_sort.sam" "$output_dir/lo_pos_assem1.bed" "$output_dir/lo_pos_assem1_result.bed" --dir 1
samLiftover "$output_dir/assem2_nool_sort.sam" "$output_dir/lo_pos_assem2.bed" "$output_dir/lo_pos_assem2_result.bed" --dir 1

#liftover using samLiftover (https://github.com/mchaisso/mcutils): asm to ref lo
python lo_assem_to_ref_0.py "$output_dir" "$output_dir/assem1_nool_sort.bam" "$output_dir/assem2_nool_sort.bam"

samLiftover "$output_dir/assem1_nool_sort.sam" "$output_dir/lo_pos_assem1_0.bed" "$output_dir/lo_pos_assem1_0_result.bed" --dir 0
samLiftover "$output_dir/assem2_nool_sort.sam" "$output_dir/lo_pos_assem2_0.bed" "$output_dir/lo_pos_assem2_0_result.bed" --dir 0

#######################################################
#######################################################
#4. Compress liftover files

python compress_liftover.py "$output_dir" lo_pos_assem1_result.bed lo_pos_assem1_result_compressed.bed
python compress_liftover.py "$output_dir" lo_pos_assem2_result.bed lo_pos_assem2_result_compressed.bed
python compress_liftover.py "$output_dir" lo_pos_assem1_0_result.bed lo_pos_assem1_0_result_compressed.bed
python compress_liftover.py "$output_dir" lo_pos_assem2_0_result.bed lo_pos_assem2_0_result_compressed.bed

#######################################################
#######################################################
#5.
Get non-covered regions 62 | 63 | python get_conf_int.py $output_dir $output_dir/assem1_nool_sort.bam $output_dir/assem2_nool_sort.bam $if_hg38 64 | -------------------------------------------------------------------------------- /lo_assem_to_ref.py: -------------------------------------------------------------------------------- 1 | # %% 2 | #generate bed file that contains positions on ref to be lifted to contigs 3 | import csv 4 | import math 5 | #pysam: https://pysam.readthedocs.io/en/latest/usage.html 6 | import pysam 7 | #print(pysam.__version__) 8 | import sys 9 | 10 | #get command line input 11 | #n = len(sys.argv) 12 | output_dir = sys.argv[1] + "/" 13 | assembly_bam_file_hap1 = sys.argv[2] 14 | assembly_bam_file_hap2 = sys.argv[3] 15 | #print("\nName of Python script:", sys.argv[0]) 16 | 17 | interval = 20 18 | 19 | #check if k-th bit of a given number is set or not 20 | def isKthBitSet(n, k): 21 | if n & (1 << (k - 1)): 22 | return True 23 | else: 24 | return False 25 | 26 | #for assem1 27 | #generate bed file liftover 28 | 29 | #TODO: change output 30 | g = open(output_dir + "lo_pos_assem1.bed", "w") 31 | 32 | samfile = pysam.AlignmentFile(assembly_bam_file_hap1, "rb") 33 | for record in samfile: 34 | #TODO: solve the unmapped segment problem??? 
    #skip unmapped records (FLAG bit 0x4)
    if not isKthBitSet(record.flag, 3):
        #samLiftover's bed file needs ref name for both directions
        ref_name = record.reference_name
        query_name = record.query_name
        #TODO: check not matching with paf
        query_start = record.query_alignment_start
        query_end = record.query_alignment_end
        ref_start = record.reference_start
        ref_end = record.reference_end
        #round the aligned span inward to multiples of `interval`
        query_start = math.ceil(query_start/interval) * interval
        query_end = math.floor(query_end/interval) * interval
        ref_start = math.ceil(ref_start/interval) * interval
        ref_end = math.floor(ref_end/interval) * interval

        #emit one 1-bp BED record per interval step on the reference;
        #column 4 records which contig the position should lift onto
        for i in range(ref_start, ref_end + interval, interval):
            g.write(ref_name + "\t")
            g.write(str(i) + "\t")
            g.write(str(i+1) + "\t")
            g.write(query_name + "\t")
            g.write("\n")
g.close()

#for assem2: same scan over the hap2 alignments
g = open(output_dir + "lo_pos_assem2.bed", "w")

samfile = pysam.AlignmentFile(assembly_bam_file_hap2, "rb")
for record in samfile:
    #TODO: solve the unmapped segment problem???
    #skip unmapped records (FLAG bit 0x4)
    if not isKthBitSet(record.flag, 3):
        #samLiftover's bed file needs ref name for both directions
        ref_name = record.reference_name
        query_name = record.query_name
        #TODO: check not matching with paf
        query_start = record.query_alignment_start
        query_end = record.query_alignment_end
        ref_start = record.reference_start
        ref_end = record.reference_end
        #round the aligned span inward to multiples of `interval`
        query_start = math.ceil(query_start/interval) * interval
        query_end = math.floor(query_end/interval) * interval
        ref_start = math.ceil(ref_start/interval) * interval
        ref_end = math.floor(ref_end/interval) * interval

        #one 1-bp BED record per interval step on the reference
        for i in range(ref_start, ref_end + interval, interval):
            g.write(ref_name + "\t")
            g.write(str(i) + "\t")
            g.write(str(i+1) + "\t")
            g.write(query_name + "\t")
            g.write("\n")

g.close()
-------------------------------------------------------------------------------- /lo_assem_to_ref_0.py: --------------------------------------------------------------------------------
# %%
#generate bed file that contains positions on ref to be lifted to contigs
#(this "_0" variant emits contig positions to be lifted back to the reference)
import csv
import math
#pysam: https://pysam.readthedocs.io/en/latest/usage.html
import pysam
#print(pysam.__version__)
import sys

#get command line input
#n = len(sys.argv)
output_dir = sys.argv[1] + "/"
assembly_bam_file_hap1 = sys.argv[2]
assembly_bam_file_hap2 = sys.argv[3]
#print("\nName of Python script:", sys.argv[0])

interval = 20

#check if k-th bit of a given number is set or not
#used below with k=3 to test SAM FLAG 0x4 (segment unmapped)
def isKthBitSet(n, k):
    if n & (1 << (k - 1)):
        return True
    else:
        return False

#for assem1
#generate bed file liftover

#TODO: change output
g = open(output_dir + "lo_pos_assem1_0.bed", "w")

samfile = pysam.AlignmentFile(assembly_bam_file_hap1, "rb")
for record in samfile:
    #TODO: solve the unmapped segment problem???
    #skip unmapped records (FLAG bit 0x4)
    if not isKthBitSet(record.flag, 3):
        #samLiftover's bed file needs ref name for both directions
        ref_name = record.reference_name
        query_name = record.query_name
        #TODO: check not matching with paf
        query_start = record.query_alignment_start
        query_end = record.query_alignment_end
        ref_start = record.reference_start
        ref_end = record.reference_end
        query_start = math.ceil(query_start/interval) * interval
        query_end = math.floor(query_end/interval) * interval
        ref_start = math.ceil(ref_start/interval) * interval
        ref_end = math.floor(ref_end/interval) * interval

        #if start with hard clip
        #CIGAR op 5 == H; shift query coordinates by the clip length
        #(presumably to express positions in whole-contig coordinates — TODO confirm)
        if record.cigartuples[0][0] == 5:
            query_start += record.cigartuples[0][1]
            query_end += record.cigartuples[0][1]

        #swap ends if the rounding above inverted the interval
        if query_end < query_start:
            tem = query_start
            query_start = query_end
            query_end = tem

        #one 1-bp BED record per interval step on the contig;
        #column 4 records the reference chromosome to lift back onto
        for i in range(query_start, query_end + interval, interval):
            g.write(query_name + "\t")
            g.write(str(i) + "\t")
            g.write(str(i+1) + "\t")
            g.write(ref_name + "\t")
            g.write("\n")
g.close()

#for assem2: same scan over the hap2 alignments
g = open(output_dir + "lo_pos_assem2_0.bed", "w")

samfile = pysam.AlignmentFile(assembly_bam_file_hap2, "rb")
for record in samfile:
    #TODO: solve the unmapped segment problem???
    #skip unmapped records (FLAG bit 0x4)
    if not isKthBitSet(record.flag, 3):
        #samLiftover's bed file needs ref name for both directions
        ref_name = record.reference_name
        query_name = record.query_name
        #TODO: check not matching with paf
        query_start = record.query_alignment_start
        query_end = record.query_alignment_end
        ref_start = record.reference_start
        ref_end = record.reference_end
        query_start = math.ceil(query_start/interval) * interval
        query_end = math.floor(query_end/interval) * interval
        ref_start = math.ceil(ref_start/interval) * interval
        ref_end = math.floor(ref_end/interval) * interval

        #if start with hard clip
        #CIGAR op 5 == H; shift query coordinates by the clip length
        if record.cigartuples[0][0] == 5:
            query_start += record.cigartuples[0][1]
            query_end += record.cigartuples[0][1]

        #swap ends if rounding inverted the interval
        if query_end < query_start:
            tem = query_start
            query_start = query_end
            query_end = tem

        for i in range(query_start, query_end + interval, interval):
            g.write(query_name + "\t")
            g.write(str(i) + "\t")
            g.write(str(i+1) + "\t")
            g.write(ref_name + "\t")
            g.write("\n")
g.close()
-------------------------------------------------------------------------------- /run_ttmars.sh: --------------------------------------------------------------------------------
#!/bin/bash

# one-time environment setup:
# conda create -n ttmars
# conda activate ttmars
# conda install -c bioconda pysam
# conda install -c anaconda numpy
# conda install -c bioconda mappy
# conda install -c conda-forge biopython
# conda install -c bioconda pybedtools

#sample: HG00096 HG00171 HG00513 HG00731 HG00732 HG00864 HG01114 HG01505 HG01596 HG03009
sample=HG00096
reference=path-to-reference_file/hg38.no_alts.fasta
vcf_file=path-to-target-vcf/callset.vcf
#assembly files, can be downloaded by download_asm.sh
asm_h1=path-to-assemblies/h1.fa
asm_h2=path-to-assemblies/h2.fa
output_dir=output-directory

#path to
# downloaded ttmars files: default ./ttmars_files/sample
files_dir=./ttmars_files/$sample

#provided centromere file
centro_file=centromere_hg38.txt
#provided tandem repeats file
tr_file=hg38_tandem_repeats.bed
#1: if male sample; 2: if female sample
num_X_chr=1

# Run TT-Mars. BUGFIX: the original command passed "$asm_dir"/h1.fa and
# "$asm_dir"/h2.fa, but $asm_dir is never defined in this script; the assembly
# paths are held in $asm_h1/$asm_h2 defined above, so use those instead
# (otherwise the command expands to the nonexistent path "/h1.fa" / "/h2.fa").
python ttmars.py "$output_dir" "$files_dir" "$centro_file" "$vcf_file" "$reference" "$asm_h1" "$asm_h2" "$tr_file" "$num_X_chr" -s -g -w -d -i -v

# positional arguments:
#   output_dir            output directory
#   files_dir             input directory that stores files used in tt-mars for the current sample
#                         Should include:
#                         assem1_non_cov_regions_file
#                             Regions that are not covered on hap1
#                         assem2_non_cov_regions_file
#                             Regions that are not covered on hap2
#                         liftover_file1        liftover file hap1
#                         liftover_file2        liftover file hap2
#                         liftover_file1_0      liftover file hap1 asm to ref
#                         liftover_file2_0      liftover file hap2 asm to ref
#   centromere_file       centromere file, default is provided by tt-mars
#   vcf_file              input vcf file
#   ref_file              reference file
#   query_file1           assembly fasta file hap1
#   query_file2           assembly fasta file hap2
#   tandem_file           tandem repeats regions, default is provided by tt-mars
#   region_len_m          region_len_m
#   {1,2}                 male sample 1, female sample 2

# optional arguments:
#   -h, --help            show this help message and exit
#   -n, --not_hg38        if reference is NOT hg38 (hg19)
#   -p, --passonly        if consider PASS calls only
#   -s, --seq_resolved    if consider sequence resolved calls (INS)
#   -w, --wrong_len       if count wrong length calls as True
#   -g, --gt_vali         conduct genotype validation
#   -i, --gt_info         index with GT info
#   -d, --phased          take phased information
#   -v, --vcf_out         output results as vcf files, must be used together with -f/--vcf_file
#   -f, --false_neg       output false negative, must be used together with -t/--truth_file
#                         and -f/--vcf_file
#   -t TRUTH_FILE, --truth_file TRUTH_FILE
#                         input truth vcf file, must be used together with -n/--false_neg
-------------------------------------------------------------------------------- /trim_overlapping_contigs.py: --------------------------------------------------------------------------------
import pysam
import sys
import os
import csv
#import copy

#input: input bam file (sorted bam), output directory, if_hg38
#output: output directory/output bam file (unsorted)

file_name = sys.argv[1]
infile = pysam.AlignmentFile(file_name, "rb")
#file not sorted

#outfile_name = "mm2_hg38_asm5_woSed_assem2_nool.bam"
#outfile_name = sys.argv[2]

#output name: <input basename without extension>_nool.bam
infile_name_base=os.path.basename(file_name)
outfile_name_wo_ext = os.path.splitext(infile_name_base)[0]

outfile_name = outfile_name_wo_ext + "_nool.bam"

#output_dir = "assemblies/HG00096"
output_dir = sys.argv[2] + "/"

outfile = pysam.AlignmentFile(output_dir + outfile_name, "wb", template=infile)

#if_hg38 = True
#third argument selects hg38 ("True") vs hg19 chromosome naming
if_hg38_str = sys.argv[3]
if if_hg38_str == "True":
    if_hg38 = True
else:
    if_hg38 = False

#sort contigs by length from short to long
#in-place merge sort of pysam AlignedSegment records,
#keyed by query_alignment_length (ascending)
def mergesort_contigs(unsorted_list):
    if len(unsorted_list) > 1:
        #floor division
        mid = len(unsorted_list)//2
        #Divide
        L_list = unsorted_list[:mid]
        R_list = unsorted_list[mid:]
        #Conquer
        mergesort_contigs(L_list)
        mergesort_contigs(R_list)
        #Merge
        i = j = k = 0
        while i < len(L_list) and j < len(R_list):
            if L_list[i].query_alignment_length < R_list[j].query_alignment_length:
                unsorted_list[k] = L_list[i]
                i += 1
            else:
                unsorted_list[k] = R_list[j]
                j += 1
            k += 1

        #drain whichever half still has elements
        while i < len(L_list):
            unsorted_list[k] = L_list[i]
            i += 1
            k += 1

        while j < len(R_list):
            unsorted_list[k] = R_list[j]
            j += 1
            k += 1


#map a reference coordinate to the corresponding contig (query) coordinate by
#walking the contig's CIGAR; returns -1 when ref_pos lies outside the alignment
def find_query_pos(contig, ref_pos):
    #note -1?
    cur_contig_pos = contig.query_alignment_start
    cur_ref_pos = contig.reference_start
    cigar_tuples = contig.cigartuples

    #here3 [(4, 1), (0, 1195), (5, 239224)]
    #here4 197757984 197756789 197757984
    #test
    #print("here4", ref_pos, contig.reference_start, contig.reference_end)
    #bad input ref_pos
    if ref_pos < contig.reference_start or ref_pos > contig.reference_end:
        return -1
    #loop through cigar tuples
    for tup in cigar_tuples:

        #if softclip
        # if tup[0] == 4:
        #     #or skip this tuple: we had cur_contig_pos = contig.query_alignment_start
        #     cur_contig_pos = tup[1] - 1
        #     continue

        #if tup consumes both contig and ref (ops M/=/X), and reached the target ref_pos
        if tup[0] in [0, 7, 8] and cur_ref_pos < ref_pos and cur_ref_pos + tup[1] >= ref_pos:
            return cur_contig_pos + (ref_pos - cur_ref_pos)
        #elif tup consemes ref only (ops D/N), and reached the target ref_pos
        elif tup[0] in [2, 3] and cur_ref_pos < ref_pos and cur_ref_pos + tup[1] >= ref_pos:
            return cur_contig_pos
        #if consume contig, have not reached the target ref_pos
        # if tup[0] in [0, 1, 4, 7, 8]:
        #no SC is considered
        if tup[0] in [0, 1, 7, 8]:
            #the start pos of the next cigar tup
            cur_contig_pos += tup[1]
        #if consume ref, have not reached the target ref_pos
        if tup[0] in [0, 2, 3, 7, 8]:
            cur_ref_pos += tup[1]
    #bad return
    #test
    #print("here5")
    return -1

#TODO: check +1/-1 position
#rewrite contig's CIGAR so that the overlapped span is soft-clipped away:
#left=True trims from the left end up to ref position end_pos;
#left=False trims from ref position start_pos to the right end;
#end_pos == -1 means the whole alignment is soft-clipped (contig fully covered)
def modify_cigar(contig, start_pos, end_pos, left):

    #left = True if trimming from left
    #modi_contig = copy.deepcopy(contig)
    #if read is covered by another read
    cigar_tuples = contig.cigartuples

    if_hc_first = False
    if_hc_last = False
    hc_first_tuple = []
    hc_last_tuple = []

    #peel off hard clips (op 5) at either end so they can be re-attached later
    if cigar_tuples[0][0] == 5:
        if_hc_first = True
        hc_first_tuple.append(cigar_tuples[0])
        cigar_tuples = cigar_tuples[1:]
    if cigar_tuples[-1][0] == 5:
        if_hc_last = True
        hc_last_tuple.append(cigar_tuples[-1])
        cigar_tuples = cigar_tuples[0:len(cigar_tuples)-1]

    #test
    # print(cigar_tuples)

    new_cigars = []
    #end_pos == -1: contig fully covered by a longer one -> soft clip everything
    if end_pos == -1:
        if if_hc_first:
            new_cigars.append(hc_first_tuple[0])
        new_cigars.append((4, contig.infer_query_length()))
        if if_hc_last:
            new_cigars.append(hc_last_tuple[0])
        contig.cigar = new_cigars
    #start_pos, end_pos: trimming start and end on the ref
    #trim from left
    elif left:
        #the no. of bases to be trimmed AFTER soft clip
        trim_base = 0
        #trim start and pos on contig, 0 base
        trim_start_pos = 0
        trim_end_pos = find_query_pos(contig, end_pos)

        if trim_end_pos == contig.infer_query_length():
            trim_end_pos -= 1

        trim_base = trim_end_pos - trim_start_pos + 1

        #test
        #print("here1", trim_base, trim_end_pos)
        #passed_base
        passed_base = 0
        if if_hc_first:
            new_cigars.append(hc_first_tuple[0])
        #trimmed prefix becomes one soft-clip tuple
        new_cigars.append((4, trim_base))
        #the first index of cigar tuple that will be kept
        trim_index = 0
        for tup_index, tup in enumerate(cigar_tuples):
            #if consume contig, have reached the target ref_pos
            if tup[0] in [0, 1, 4, 7, 8] and passed_base < trim_base and passed_base + tup[1] >= trim_base:
                if passed_base + tup[1] > trim_base:
                    #test
                    #print("here2", (tup[0], tup[1] - (trim_base - passed_base)))
                    #keep the untrimmed remainder of this partially-trimmed tuple
                    new_cigars.append((tup[0], passed_base + tup[1] - trim_base))
                    #new_cigars.append((tup[0], trim_base - passed_base))
                    trim_index = tup_index + 1
                else:
                    trim_index = tup_index + 1
                break
            #if consume contig
            if tup[0] in [0, 1, 4, 7, 8]:
                passed_base += tup[1]

        if trim_index <= len(cigar_tuples) - 1:
            new_cigars.extend(cigar_tuples[trim_index:])
        if if_hc_last:
            new_cigars.append(hc_last_tuple[0])
        contig.cigar = new_cigars
        #test
        #print("here3", new_cigars)
        #alignment now starts right after the trimmed reference span
        contig.reference_start = end_pos + 1
    #trim from right
    elif not left:
        trim_base = 0
        #print("here0", start_pos, end_pos)
        trim_start_pos = find_query_pos(contig, start_pos)
        trim_end_pos = contig.infer_query_length() - 1

        if trim_start_pos == contig.infer_query_length():
            trim_start_pos -= 1

        trim_base = trim_end_pos - trim_start_pos + 1
        #test
        #print("here1", trim_base, trim_end_pos, trim_start_pos)
        #passed_base
        passed_base = 0
        if if_hc_last:
            new_cigars.append(hc_last_tuple[0])
        #trimmed suffix becomes one soft-clip tuple
        new_cigars = [(4, trim_base)] + new_cigars
        trim_index = 0
        #walk the CIGAR from the right-hand end
        for i in range(len(cigar_tuples)-1, -1, -1):
            tup = cigar_tuples[i]
            #if consume contig, have reached the target ref_pos
            if tup[0] in [0, 1, 4, 7, 8] and passed_base < trim_base and passed_base + tup[1] >= trim_base:
                if passed_base + tup[1] > trim_base:
                    #test
                    #print("here2", (tup[0], tup[1] - (trim_base - passed_base)))
                    new_cigars = [(tup[0], tup[1] - (trim_base - passed_base))] + new_cigars
                    trim_index = i - 1
                else:
                    trim_index = i - 1
                break
            #if consume contig
            if tup[0] in [0, 1, 4, 7, 8]:
                passed_base += tup[1]

        if trim_index != -1:
            new_cigars = cigar_tuples[:trim_index+1] + new_cigars
        if if_hc_first:
            new_cigars = hc_first_tuple + new_cigars
        contig.cigar = new_cigars

        #test
        #print("here3", new_cigars)
        #return modi_contig



#trim `contig` wherever it overlaps a contig later in contig_list
#(the list is sorted by ascending aligned length, so later entries are longer);
#contigs fully covered by a longer one are flagged in remove_list
def trim_by_ol(contig, index, contig_list, remove_list):
    #loop through contig_list looking for overlapping
    for i in range(index+1, len(contig_list)):
        contig_start = contig.reference_start
        contig_end = contig.reference_end - 1
        cur_start = contig_list[i].reference_start
        cur_end = contig_list[i].reference_end - 1

        #if the short contig is covered by current long contig
        if contig_start >= cur_start and contig_end <= cur_end:
            modify_cigar(contig, 0, -1, False)
            remove_list[index] = 1
            break
        #if overlapping
        elif contig_start <= cur_end and contig_end > cur_end:

            modify_cigar(contig, contig_start, cur_end, True)


        elif contig_start < cur_start and contig_end >= cur_start:

            modify_cigar(contig, cur_start, contig_end, False)
    #may not needed
    #NOTE(review): modify_cigar mutates `contig` in place, so this write-back
    #appears redundant — confirm before removing
    contig_list[index] = contig


if if_hg38:
    chr_list = ["chr1", "chr2", "chr3", "chr4", "chr5",
                "chr6", "chr7", "chr8", "chr9", "chr10",
                "chr11", "chr12", "chr13", "chr14", "chr15",
                "chr16", "chr17", "chr18", "chr19", "chr20",
                "chr21", "chr22", "chrX", "chrY"]
else:
    chr_list = ["1", "2", "3", "4", "5",
                "6", "7", "8", "9", "10",
                "11", "12", "13", "14", "15",
                "16", "17", "18", "19", "20",
                "21", "22", "X", "Y"]
#chr_list = ["2"]

#per chromosome: collect contigs, sort by length, trim overlaps, write survivors
for chr_index in chr_list:
    if_first = True
    #sort fetched contigs by asending length (.query_alignment_length)
    contig_list = []
    #test
    #for rec in infile.fetch(chr_index, 197556789, 198056789):
    #for rec in infile.fetch(chr_index, 598250, 740208):
    #for rec in infile.fetch(chr_index, 240063977, 240298717):
    for rec in infile.fetch(chr_index):
        #print(rec.reference_start)
        contig_list.append(rec)
    #sort contig_list by length
    mergesort_contigs(contig_list)

    remove_list = [0] * len(contig_list)

    #sanity pass: count query-consuming CIGAR bases per contig (debug only,
    #result is not used here — the print below is commented out)
    for index, contig in enumerate(contig_list):
        cigar = contig.cigartuples
        consume_query = [0,1,7,8]
        consumed_base = 0
        for tup in cigar:
            if tup[0] in consume_query:
                consumed_base += tup[1]
        #print("nooooooooo", consumed_base, contig.query_alignment_end - contig.query_alignment_start)
        #print(contig.reference_start, contig.reference_end)
        #print(cigar)

    #loop through sorted contig list, trim the shorter contig if overlapping
    for index, contig in enumerate(contig_list):
        trim_by_ol(contig, index, contig_list, remove_list)

    #test
    print(chr_index, len(contig_list))
    #print(remove_list)
    for index, contig in enumerate(contig_list):
        if remove_list[index] != 1:
            outfile.write(contig)
            #test: verify the rewritten CIGAR still consumes the aligned length
            cigar = contig.cigartuples
            consume_query = [0,1,7,8]
            consumed_base = 0
            for tup in cigar:
                if tup[0] in consume_query:
                    consumed_base += tup[1]
            if consumed_base != contig.query_alignment_end - contig.query_alignment_start:
                print("nooooooooo", consumed_base, contig.query_alignment_end - contig.query_alignment_start)
            #print(contig.reference_start, contig.reference_end)
            #print(cigar)
    #for i in range(0, 100):
    #    print(contig_list[i].query_alignment_length)

infile.close()
outfile.close()
-------------------------------------------------------------------------------- /validate.py: --------------------------------------------------------------------------------
## %%

# Validation

import csv
import math
#pysam: https://pysam.readthedocs.io/en/latest/usage.html
import pysam
#print(pysam.__version__)

import numpy as np
import sys


#build centromere position dictionary
#returns {chrom_name: (start, end)} with positions kept as strings;
#consecutive rows of the same chromosome are merged into one span
def build_centro_dict(centromere_file):
    #centromere file
    centromere_raw = []
    with open(centromere_file) as f:
        reader = csv.reader(f, delimiter="\t")
        centromere_raw = list(reader)
    f.close()

    #build dictionay
    dict_centromere = dict()
    pre_centro_chr = ""
    start_centro_pos = ""
    for record in centromere_raw:
        if record[0] == pre_centro_chr:
            end_centro_pos = str(record[2])
            dict_centromere[record[0]] = (start_centro_pos, end_centro_pos)
        else:
            start_centro_pos = str(record[1])
            pre_centro_chr = record[0]
    return dict_centromere

#build lists for excluded SV positions
#returns the parsed rows of the two non-covered-region BED files (hap1, hap2)
def get_filtered_sv_pos(exclude_assem1_non_cover_file, exclude_assem2_non_cover_file):

    with open(exclude_assem1_non_cover_file) as f:
        reader = csv.reader(f, delimiter="\t")
        exclude_assem1_non_cover = list(reader)
    f.close()

    with open(exclude_assem2_non_cover_file) as f:
        reader = csv.reader(f, delimiter="\t")
        exclude_assem2_non_cover = list(reader)
    f.close()

    '''
    with open(output_dir + "exclude_assem1_short_reads_250000.bed") as f:
        reader = csv.reader(f, delimiter="\t")
        exclude_assem1_short_reads = list(reader)
    f.close()

    with open(output_dir + "exclude_assem2_short_reads_250000.bed") as f:
        reader = csv.reader(f, delimiter="\t")
        exclude_assem2_short_reads = list(reader)
    f.close()
    '''

    # with open(exclude_high_depth_file) as f:
    # #with open("/panfs/qcb-panasas/jianzhiy/illuVeri/use_mcutils/output/v4.6.1/lumpy/HG002/exclude_high_depth.bed") as f:
    #     reader = csv.reader(f, delimiter="\t")
    #     exclude_high_depth = list(reader)
    # f.close()

    return exclude_assem1_non_cover, exclude_assem2_non_cover

#check if current cases is in the excluded list
#excluded if the [chrom, start, end] triple appears in EITHER haplotype's list
def check_exclude(list_to_check, exclude_assem1_non_cover, exclude_assem2_non_cover):
    if (list_to_check in exclude_assem1_non_cover) or (list_to_check in exclude_assem2_non_cover):
        return True
    #With overlapping contigs trimmed,
    #elif (list_to_check in exclude_assem1_short_reads) or (list_to_check in exclude_assem2_short_reads):
    #    return True
    # elif list_to_check in exclude_high_depth:
    #     return True
    else:
        return False

#check if current cases is in the excluded list for male chr X
#stricter than check_exclude: requires BOTH haplotypes to be non-covered
def check_exclude_chrx(list_to_check, exclude_assem1_non_cover, exclude_assem2_non_cover):
    if (list_to_check in exclude_assem1_non_cover) and (list_to_check in exclude_assem2_non_cover):
        return True
    #With overlapping contigs trimmed,
    #elif (list_to_check in exclude_assem1_short_reads) or (list_to_check in exclude_assem2_short_reads):
    #    return True
    # elif list_to_check in exclude_high_depth:
    #     return True
    else:
        return False

#get depth of coverage given position of a chromosome
#runs `samtools depth -r chrom:pos-1-pos` via pysam and parses the depth
#from the last tab-separated field; returns 0 when there is no output
def get_depth(ref_name, ref_pos, bam_file):
    pos_str = ref_name + ':' + str(int(ref_pos) - 1) + '-' + str(ref_pos)
    res = pysam.depth("-r", pos_str, bam_file)
    if res=='':
        return 0
    start = 0
    end = len(res) - 1
    #scan backwards for the final tab; the field after it is the depth value
    for i in range(len(res) - 1, -1, -1):
        if res[i] == '\t':
            start = i + 1
            break
    return int(res[start:end])

#check if true positive or not
#decision rule on (relative length, relative score) per SV type:
#DEL/DUP: a score-dependent band around rela_len == 1;
#INS: rela_len within [0.675, 1.325]; INV: rela_score must be positive
def check_tp(rela_len, rela_score, sv_type):
    result = True
    if sv_type in ['DEL', 'DUP', 'DUP:TANDEM']:
        if rela_score >= 0 and rela_score <= 2.5:
            #band narrows linearly as the score improves
            if rela_len >= -0.05*rela_score + 0.8 and rela_len <= 0.05*rela_score + 1.2:
                result = True
            else:
                result = False
        elif rela_score > 2.5:
            if rela_len >= 0.675 and rela_len <= 1.325:
                result = True
            else:
                result = False
        else:
            result = False
    elif sv_type == 'INS':
        if rela_len < 0.675 or rela_len > 1.325:
            result = False
    elif sv_type == 'INV':
        if rela_score <= 0:
            result = False
    return result

# build dictionary for validation
#merge alignment-info records into dict_score keyed by SV id (record[0]),
#filtering out excluded/centromeric/zero-score entries and keeping the
#"better" record per SV (per-type rules below); returns the updated dict
def updateDict(dict_score, align_info, exclude_assem1_non_cover, exclude_assem2_non_cover, dict_centromere, chr_list, if_hg38):
    for record in align_info:
        #len > 30
        if int(record[3]) > 0 and abs(int(record[8])) > 30 and str(record[9]) in chr_list:
        # if int(record[3]) > 0 and
        # abs(int(record[8])) >= 10 and abs(int(record[8])) <= 30 and str(record[9]) in chr_list:
            #filter out centromere cases
            index = str(record[0])
            ref_name = str(record[9])
            #interval start and end
            ref_start = int(record[12])
            ref_end = int(record[13])

            #centromere dict keys carry the "chr" prefix, so prepend it for hg19 names
            if if_hg38:
                centro_start = int(dict_centromere[ref_name][0])
                centro_end = int(dict_centromere[ref_name][1])
            else:
                centro_start = int(dict_centromere['chr'+ref_name][0])
                centro_end = int(dict_centromere['chr'+ref_name][1])

            #if SV in the exclude_list: start or end of a contig
            sv_pos = int(record[10])
            sv_end = int(record[11])
            list_to_check = [str(ref_name), str(sv_pos), str(sv_end)]
            #if sv in high-depth regions or non-covered regions, skip
            if check_exclude(list_to_check, exclude_assem1_non_cover, exclude_assem2_non_cover):
                continue

            #if ref start or ref end in centromere, skip
            if (ref_start > centro_start and ref_start < centro_end) or (ref_end > centro_start and ref_end < centro_end):
                continue

            #start to build a dictionay: merge validation information from 2 haplotypes
            #TODO: solve the score/length = 0 problem
            #skip before score/length = 0
            if float(record[1]) == 0 or float(record[8]) == 0:
                continue

            if record[0] not in dict_score:
                dict_score[record[0]] = record[1:len(record)]
            else:
                #TODO: solve the score = 0 problem
                #choose the better relative score one:
                #if one of the before score is 0, choose the better relative length one:
                '''
                if float(dict_score[record[0]][0]) == 0 or float(record[1]) == 0:
                    old_rela_len = (float(dict_score[record[0]][5]) - float(dict_score[record[0]][6]))/abs(float(dict_score[record[0]][7]))
                    new_rela_len = (float(record[6]) - float(record[7]))/abs(float(record[8]))
                    if new_rela_len < old_rela_len:
                        dict_score.update({record[0]: record[1:len(record)]})
                    continue
                old_rela_change = (float(dict_score[record[0]][1]) - float(dict_score[record[0]][0]))/abs(float(dict_score[record[0]][0]))
                new_rela_change = (float(record[2]) - float(record[1]))/abs(float(record[1]))
                if new_rela_change > old_rela_change:
                    dict_score.update({record[0]: record[1:len(record)]})
                '''
                #if INS or DEL choose the tp one
                #if both tp/fp, choose the better relative length one
                if record[4] in ['INS', 'DEL', 'DUP', 'DUP:TANDEM']:
                    old_rela_score = (float(dict_score[record[0]][1]) - float(dict_score[record[0]][0]))/abs(float(dict_score[record[0]][0]))
                    new_rela_score = (float(record[2]) - float(record[1]))/abs(float(record[1]))
                    old_rela_len = (float(dict_score[record[0]][5]) - float(dict_score[record[0]][6]))/(float(dict_score[record[0]][7]))
                    new_rela_len = (float(record[6]) - float(record[7]))/(float(record[8]))

                    old_res = check_tp(old_rela_len, old_rela_score, record[4])
                    new_res = check_tp(new_rela_len, new_rela_score, record[4])

                    if new_res and not old_res:
                        dict_score.update({record[0]: record[1:len(record)]})
                    elif old_res and not new_res:
                        continue
                    else:
                        #both TP or both FP: keep the record whose rela_len is closer to 1
                        if abs(new_rela_len - 1) < abs(old_rela_len - 1):
                            dict_score.update({record[0]: record[1:len(record)]})
                #if INV choose the better relative score one:
                elif record[4] == 'INV':
                    if float(dict_score[record[0]][0]) == 0 or float(record[1]) == 0:
                        continue
                    else:
                        old_rela_score = (float(dict_score[record[0]][1]) - float(dict_score[record[0]][0]))/abs(float(dict_score[record[0]][0]))
                        new_rela_score = (float(record[2]) - float(record[1]))/abs(float(record[1]))

                        if old_rela_score < new_rela_score:
                            dict_score.update({record[0]: record[1:len(record)]})
    return dict_score

#validate by both haplotypes
#reads the per-haplotype alignment info files and merges them into one dict
#via updateDict (hap1 first, then hap2 overrides where its record is better)
def vali_info(output_dir, exclude_assem1_non_cover, exclude_assem2_non_cover, assem1_info_file, assem2_info_file, dict_centromere, chr_list, if_hg38):
    with open(output_dir + assem1_info_file) as f:
        reader = csv.reader(f, delimiter="\t")
        align_info_assem1 = list(reader)
    f.close()

    with open(output_dir + assem2_info_file) as f:
        reader = csv.reader(f, delimiter="\t")
        align_info_assem2 = list(reader)
    f.close()

    dict_comb = dict()
    dict_comb = updateDict(dict_comb, align_info_assem1, exclude_assem1_non_cover, exclude_assem2_non_cover, dict_centromere, chr_list, if_hg38)
    dict_comb = updateDict(dict_comb, align_info_assem2, exclude_assem1_non_cover, exclude_assem2_non_cover, dict_centromere, chr_list, if_hg38)

    return dict_comb


#TODO: output tp/fp as vcf files
#output a text file
#CHR POS END SVTYPE rela_len rela_score validation_res

#write output
#writes ttmars_res.txt: one row per SV with relative length/score and
#the check_tp verdict; records with a zero "before" score/length are skipped
def write_output(output_dir, dict_comb):
    g = open(output_dir + "ttmars_res.txt", "w")
    for record in dict_comb:
        #TODO: solve 0 score problem
        #if zero score
        if float(dict_comb[record][0]) == 0 or float(dict_comb[record][7]) == 0:
            #rela_score = 0.01
            continue
        rela_score = round((float(dict_comb[record][1]) - float(dict_comb[record][0]))/abs(float(dict_comb[record][0])), 2)
        rela_len = round((float(dict_comb[record][5]) - float(dict_comb[record][6]))/float(dict_comb[record][7]), 2)

        g.write(str(dict_comb[record][8]) + "\t")
        g.write(str(dict_comb[record][9]) + "\t")
        g.write(str(dict_comb[record][10]) + "\t")
        g.write(str(dict_comb[record][3]) + "\t")
        g.write(str(rela_len) + "\t")
        g.write(str(rela_score) + "\t")
        g.write(str(check_tp(rela_len, rela_score, str(dict_comb[record][3]))))
        g.write("\n")
    g.close()

#main function
#usage: python validate.py <output_dir> <if_hg38:True|False> <centromere_file>
def main():
    #input
    output_dir = sys.argv[1] + "/"
    if_hg38_input = sys.argv[2]
    centromere_file = sys.argv[3]
    #exclude_assem1_non_cover_file = sys.argv[4]
    #exclude_assem2_non_cover_file = sys.argv[5]
    #exclude_high_depth_file = sys.argv[6]

    #constants
    interval = 20
    if_hg38 = False
    if if_hg38_input == "True":
        if_hg38 = True
    chr_list = []
    if if_hg38:
        chr_list = ["chr1", "chr2", "chr3", "chr4", "chr5",
                    "chr6", "chr7", "chr8", "chr9", "chr10",
                    "chr11", "chr12", "chr13", "chr14", "chr15",
                    "chr16", "chr17", "chr18", "chr19", "chr20",
                    "chr21", "chr22", "chrX", "chrY"]
    else:
        chr_list = ["1", "2", "3", "4", "5",
                    "6", "7", "8", "9", "10",
                    "11", "12", "13", "14", "15",
                    "16", "17", "18", "19", "20",
                    "21", "22", "X", "Y"]

    #build centromere dictionary
    dict_centromere = build_centro_dict(centromere_file)

    #build lists for excluded SV positions
    exclude_assem1_non_cover, exclude_assem2_non_cover = get_filtered_sv_pos(output_dir + "exclude_assem1_non_cover.bed",
                                                                             output_dir + "exclude_assem2_non_cover.bed")

    #validate by both haplotypes
    dict_comb = vali_info(output_dir,
                          exclude_assem1_non_cover,
                          exclude_assem2_non_cover,
                          "align_info_assem1_chrall.txt",
                          "align_info_assem2_chrall.txt",
                          dict_centromere,
                          chr_list,
                          if_hg38)

    #write output
    write_output(output_dir, dict_comb)

if __name__ == "__main__":
    main()