├── minda ├── __init__.py ├── __version__.py ├── truthset.py ├── stats.py ├── main.py ├── ensemble.py └── decompose.py ├── requirements.txt ├── .gitignore ├── environment.yml ├── scripts ├── add_vaf_severus.py └── add_vaf_new_colo829.py ├── minda.py ├── setup.py ├── annotation ├── README.md └── minda_stratify.py ├── LICENSE ├── README.md └── data └── colo829_benchmark_grch38_amended_vaf.vcf /minda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /minda/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.2" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | pandas 3 | numpy 4 | pybedtools -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | *.swp 3 | *.swo 4 | .DS_Store 5 | __pycache__ 6 | ._* 7 | *.pyc 8 | *.egg-info 9 | build 10 | dist 11 | minda/.ipynb_checkpoints 12 | minda/__pycache__ 13 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: minda 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python>=3.10 8 | - pandas>=2.1.1 9 | - numpy>=1.26.0 10 | - pybedtools>=0.9.1 11 | -------------------------------------------------------------------------------- /scripts/add_vaf_severus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import sys 4 | import pysam 5 | import statistics 6 | 7 | vcf_file = sys.argv[1] 8 | 9 | out_vcf = vcf_file.replace('.vcf' , '_vaf.vcf') 10 | vcf_in=pysam.VariantFile(vcf_file,"r") 11 | vcf_in.header.info.add("VAF",1,"Float","variant_allele_frequency") 12 | vcf_out = pysam.VariantFile(out_vcf, 'w', header=vcf_in.header) 13 | 14 | for record in vcf_in: 15 | sample_id = record.samples.keys()[0] 16 | record.info['VAF'] = record.samples[sample_id]['VAF'] 17 | vcf_out.write(record) 18 | vcf_out.close() -------------------------------------------------------------------------------- /minda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #(c) 2023 by Authors 4 | #This file is a part of Minda. 5 | #Released under the BSD license (see LICENSE file) 6 | 7 | """ 8 | This script sets up environment paths 9 | and invokes Minda without installation. 
10 | """ 11 | 12 | import os 13 | import sys 14 | 15 | def main(): 16 | #Setting executable paths 17 | minda_root = os.path.dirname(os.path.realpath(__file__)) 18 | sys.path.insert(0, minda_root) 19 | 20 | #Minda entry point 21 | from minda.main import main 22 | sys.exit(main()) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() -------------------------------------------------------------------------------- /scripts/add_vaf_new_colo829.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import sys 4 | import pysam 5 | import statistics 6 | 7 | vcf_file = sys.argv[1] 8 | 9 | out_vcf = vcf_file.replace('.vcf' , '_vaf.vcf') 10 | vcf_in=pysam.VariantFile(vcf_file,"r") 11 | vcf_in.header.info.add("VAF",1,"Float","variant_allele_frequency") 12 | vcf_out = pysam.VariantFile(out_vcf, 'w', header=vcf_in.header) 13 | 14 | for record in vcf_in: 15 | support = list(record.info.values()[4]) 16 | support = [sample.split("|") for sample in support] 17 | sample_vafs = [float(sample[2]) for sample in support] 18 | vaf = statistics.median(sample_vafs) 19 | record.info['VAF'] = vaf 20 | vcf_out.write(record) 21 | vcf_out.close() -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import shutil 5 | 6 | try: 7 | import setuptools 8 | except ImportError: 9 | sys.exit("setuptools package not found. " 10 | "Please use 'pip install setuptools' first") 11 | 12 | from setuptools import setup 13 | 14 | # Make sure we're running from the setup.py directory. 15 | script_dir = os.path.dirname(os.path.realpath(__file__)) 16 | if script_dir != os.getcwd(): 17 | os.chdir(script_dir) 18 | 19 | from minda.__version__ import __version__ 20 | 21 | 22 | setup(name='minda', 23 | version=__version__, 24 | description='A tool for somatic structural variant calling using long reads', 25 | url='https://github.com/KolmogorovLab/minda', 26 | author='Asher Bryant', 27 | author_email = 'asher.bryant@nih.gov', 28 | license='BSD-3-Clause', 29 | packages=['minda'], 30 | entry_points={'console_scripts': ['minda = minda.main:main']}, 31 | ) -------------------------------------------------------------------------------- /annotation/README.md: -------------------------------------------------------------------------------- 1 | Minda SV Annotation 2 | =================== 3 | 4 | This folder contain a set of scripts that annotates SV calls ensemble 5 | generated by Minda with various challenging scenarios. We currently annotate the following categories. 6 | More detailed description is availabe in the 7 | [Severus manuscript](https://www.medrxiv.org/content/10.1101/2024.03.22.24304756v1) 8 | 9 | * Insertions 10 | * SVs with breakpoints at matching homologous repeats 11 | * SVs inside segmental duplications 12 | * SVs inside VNTRs 13 | * SVs with low allelic frequency 14 | * Short SVs (<100bp) 15 | * Duplicated SVs 16 | * SV chains 17 | 18 | To run annotation, you'll need Minda output directory and a directory 19 | with annotation for your reference genome. 
Links to download existing 20 | annotations are below: 21 | 22 | ``` 23 | grch38 https://zenodo.org/records/11992284/files/annotation_grch38.tar.gz 24 | ``` 25 | 26 | For example, if you are using grch38: 27 | 28 | ``` 29 | wget https://zenodo.org/records/11992284/files/annotation_grch38.tar.gz 30 | tar -xvf annotation_grch38.tar.gz 31 | ./minda_stratify.py annotation_grch38 minda_support.tsv 11 32 | ``` 33 | 34 | where `minda_support.tsv` is a support file generated by Minda, and `11` is the number of callsets 35 | used to create the Minda ensemble. 36 | 37 | Currently, genome annotations include the following: 38 | * Chromosome lengths (produces using `samtools faidx`) 39 | * Common repeat annotations (produces using `RepeatMasker`) 40 | * Segmental duplications annotation 41 | * VNTR annotations (produced using [findTandemRepeats](https://github.com/PacificBiosciences/pbsv/tree/master/annotations) 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024, National Institutes of Health 2 | License: BSD-3-Clause 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the National Institutes of Health nor the 17 | names of its contributors may be used to endorse or promote products 18 | derived from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 21 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 24 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 25 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 26 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 27 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /minda/truthset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | def get_base_df(decomposed_dfs_list, tolerance, multimatch): 5 | dfs_1 = [dfs_list[0] for dfs_list in decomposed_dfs_list] 6 | dfs_2 = [dfs_list[1] for dfs_list in decomposed_dfs_list] 7 | 8 | # create collective comparison df 9 | start_dfs = pd.concat(dfs_1) 10 | end_dfs = pd.concat(dfs_2) 11 | 12 | # create base df 13 | base_1_df = dfs_1[-1] 14 | base_2_df = dfs_2[-1] 15 | 16 | # find which comparison start loci are within tolerance range of base start loci 17 | base_1_loci = list(zip(base_1_df['#CHROM'], base_1_df['POS'])) 18 | start_loci = list(zip(start_dfs['#CHROM'], start_dfs['POS'])) 19 | base_2_loci = list(zip(base_2_df['#CHROM'], base_2_df['POS'])) 20 | end_loci = list(zip(end_dfs['#CHROM'], end_dfs['POS'])) 21 | 22 | start_index_lists = [] 23 | for i in range(len(base_1_loci)): 24 | base_locus = base_1_loci[i] 25 | index_list = [] 26 | 27 | for j in range(len(start_loci)): # in order to get the correct index cannot use "for start_locus in start_loci" 28 | 29 | start_locus = start_loci[j] 30 | if base_locus[0] == start_locus[0]: 31 | 32 | distance = abs(base_locus[1] - start_locus[1]) 33 | if distance <= tolerance: 34 | start_index = j 35 | index_list.append(start_index) 36 | 37 | start_index_lists.append(index_list) 38 | if len(start_index_lists) != (i+1): # ensure each base record has a list even if no comp calls within tolerance range 39 | start_index_lists.append([]) 40 | 41 | # if start loci within tolerance range, check that end also is 42 | running_list = [] 43 | comp_minda_ids = start_dfs.Minda_ID.to_list() 44 | minda_id_lists = [] 45 | for i in range(len(start_index_lists)): 46 | index_list = start_index_lists[i] 47 | base_locus = base_2_loci[i] 48 | 49 | 50 | minda_id_list = [] 51 | for index in index_list: 52 | end_locus = end_loci[index] 53 | if base_locus[0] == end_locus[0]: 54 | #print(base_locus, end_locus) 55 | distance = abs(base_locus[1] - end_locus[1]) 56 | if distance <= tolerance: 57 | minda_id = comp_minda_ids[index] 58 | 59 | if multimatch == False: 60 | caller = minda_id.rsplit('_', 1)[0] 61 | if any(id.startswith(caller) for id in minda_id_list) == False and minda_id not in running_list: 62 | minda_id_list.append(minda_id) 63 | running_list.append(minda_id) 64 | else: 65 | minda_id_list.append(minda_id) 66 | minda_id_lists.append(minda_id_list) 67 | if len( minda_id_lists) != (i+1): # ensure each base record has a list even if no comp calls within tolerance range 68 | minda_id_lists.append([]) 69 | 70 | # merge start & end base dfs & create column of Minda IDs for calls within tolerance range 71 | base_df = base_1_df.merge(base_2_df, left_index=True, right_index=True) 72 | base_df["Minda_IDs"] = minda_id_lists 73 | 74 | return base_df 75 | 76 | 77 | def get_support_df(base_df, caller_names, vaf, out_dir, sample_name): 78 | 79 | minda_id_lists = base_df.Minda_IDs.to_list() 80 | # create call columns for each caller 81 | for caller_name in caller_names: 82 | caller_column = [] 83 | for minda_id_list in minda_id_lists: 84 | 85 | call_boolean = any(value.startswith(caller_name) for value in minda_id_list) 86 | caller_column.append(call_boolean) 87 | base_df[f'{caller_name}'] = caller_column 88 | 89 | # if vaf == None: 90 | # base_df['VAF_x'] = np.nan 91 | 92 | column_names = ['#CHROM_x', 'POS_x', 'ID_x', 'INFO_x', \ 93 | 
'#CHROM_y', 'POS_y', 'ID_y', 'INFO_y', \ 94 | 'SVTYPE_x', 'SVLEN', 'VAF_x', 'Minda_ID_x','Minda_IDs'] + [caller_names[-1]] + caller_names[:-1] 95 | 96 | support_df = base_df[column_names].rename(columns={'SVTYPE_x':'SVTYPE', 'VAF_x':'VAF', 'Minda_ID_x': 'Minda_ID'}).copy() 97 | 98 | support_df.to_csv(f'{out_dir}/{sample_name}_support.tsv', sep='\t', index=False) 99 | 100 | return support_df 101 | 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Minda 2 | ###### Note: This tool is under active devlopment. 3 | 4 | Minda is a tool for evaluating structural variant (SV) callers that 5 | * standardizes VCF records for compatibility with both germline and somatic SV callers, 6 | * benchmarks against a single VCF input file, or 7 | * benchmarks against an ensemble call set created from multiple VCF input files. 8 | 9 | ## Installation 10 | 11 | Clone the repository and install the dependencies via conda: 12 | 13 | ``` 14 | git clone https://github.com:KolmogorovLab/minda 15 | cd minda 16 | conda env create --name minda --file environment.yml 17 | conda activate minda 18 | ./minda.py 19 | ``` 20 | 21 | ## Quick Usage 22 | 23 | Benchmarking several vcfs against a truth set vcf: 24 | 25 | ``` 26 | ./minda.py truthset --base truthset.vcf --vcfs caller_1.vcf caller_2.vcf caller_3.vcf --out_dir minda_out 27 | ``` 28 | 29 | Creating an ensemble from several vcfs and benchmarking against ensemble calls: 30 | 31 | ``` 32 | ./minda.py ensemble --vcfs caller_1.vcf caller_2.vcf caller_3.vcf --out_dir minda_out 33 | ``` 34 | 35 | ## Inputs and Parameters 36 | 37 | ### Required 38 | 39 | #### Truthset 40 | 41 | ``` 42 | --out_dir path to out directory 43 | --base path of base VCF 44 | --tsv | --vcfs tsv file path 45 | -OR- 46 | vcf file path(s) 47 | ``` 48 | #### Ensemble 49 | ``` 50 | --out_dir path to out directory 51 | --tsv | --vcfs tsv file path 52 | -OR- 53 | vcf file path(s) 54 | --min_support | minimumn number of callers required to support an ensemble call 55 | --conditions -OR- 56 | specific conditions to support a call 57 | ``` 58 | 59 | ### Optional 60 | ``` 61 | --bed path to bed file for filtering records with BedTool intersect 62 | --filter filter records by FILTER column; default="['PASS']" 63 | --min_size filter records by SVLEN in INFO column 64 | --tolerance maximum allowable bp distance between base and caller breakpoint; default=500 65 | --sample_name name of sample 66 | --vaf filter out records below a given VAF treshold 67 | --multimatch allow more than one record from the same caller VCF to match a single truthset/ensemble record 68 | ``` 69 | ##### VCF Input 70 | Minda standardizes input VCFs by decomposing every SV into start and end records. Records are handled in one of two following ways: 71 | 1. For records having a CHROM:POS pattern in the `ALT` field, the `#CHROM` and `POS` fields are considered the start. Minda then searches for the end record matching the `ALT` field among other records. Alternatively, the `MATEID` from the `INFO` field may be used to find the end record. If no end record is found, the details from the `ALT` field are used to create one. 72 | 2. All other records Minda considers start records. The corresponding end records use the start `#CHROM` and `POS` is calculated by adding the start `POS` with absolute value of `SVLEN` or is extracted from the `END` integer in the `INFO` field. 
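To make these two cases concrete, the sketch below reduces a single record to its start and end breakpoints (a simplified illustration of the logic described above, not Minda's own code; the `breakpoints` helper and its arguments are hypothetical):

```
import re

def breakpoints(chrom, pos, alt, info):
    """Return ((start_chrom, start_pos), (end_chrom, end_pos)) for one record."""
    bnd = re.search(r'[\[\]]([^\[\]:]+):(\d+)[\[\]]', alt)
    if bnd:                              # case 1: breakend-style ALT, e.g. N[chr15:84141972[
        end_chrom, end_pos = bnd.group(1), int(bnd.group(2))
    else:                                # case 2: symbolic ALT such as <DEL>
        end_chrom = chrom
        end_pos = int(info['END']) if 'END' in info else pos + abs(int(info['SVLEN']))
    return (chrom, pos), (end_chrom, end_pos)

print(breakpoints('chr1', 207807889, '<DEL>', {'SVLEN': -33584}))   # (('chr1', 207807889), ('chr1', 207841473))
print(breakpoints('chr7', 151049571, 'N[chr15:84141972[', {}))      # (('chr7', 151049571), ('chr15', 84141972))
```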
73 | Minda has been tested on VCFs produced by the following callers: 74 | 75 | * Severus 76 | * SAVANA 77 | * nanomonsv 78 | * Sniffles2 79 | * cuteSV 80 | * SVIM 81 | * GRIPSS 82 | * manta 83 | * SvABA 84 | 85 | If you encounter issues with these or other VCF files, please [let us know](https://github.com/KolmogorovLab/minda/issues). 86 | 87 | ##### TSV Input 88 | The `--tsv` file has one required column and up to three columns in total. The columns should be as follows: 89 | 
90 |   1. VCF paths (required)
91 |   2. caller name
92 |   3. prefix
93 | 
94 | If a caller name is not provided, the name listed in the source field of the VCF will be used. If more than one VCF with the same caller name is provided, prefixes disambiguate ID and column names in Minda output files. If prefixes are not provided by the user, Minda automatically assigns a letter prefix in ascending alphabetical order (i.e. A, B, C, etc.). 95 | 96 | An example of TSV contents: 97 | ``` 98 | /path/to/severus_ONT.vcf Severus ONT 99 | /path/to/severus_PB.vcf Severus PB 100 | /path/to/manta.vcf manta ILL 101 | ``` 102 | ##### Specific Conditions 103 | The `--conditions` parameter enables specific user-defined conditions to be met for each ensemble call. Provide a list in double quotation marks that contains: 104 | 105 | 
106 |   1. a (nested) list of caller names, each name in single quotation marks, with prefixes if necessary
107 |   2. an operator in single quotation marks
108 |   3. a number
109 | 
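Internally, each caller gets a True/False column in Minda's support table and the condition is evaluated against those columns. The snippet below is a toy illustration of that idea only (not Minda's internal code; the three-record table and caller names are invented):

```
import pandas as pd

# one boolean column per caller, one row per candidate ensemble record
support = pd.DataFrame({'ONT_Severus': [True, True, False],
                        'PB_Severus':  [True, False, False],
                        'ILL_manta':   [False, True, True]})

condition = [['ONT_Severus', 'PB_Severus'], '>=', 2]   # callers, operator, number
count = support[condition[0]].sum(axis=1)              # supporting callers per record
ensemble = eval(f'count {condition[1]} {condition[2]}')
print(ensemble.tolist())                               # [True, False, False]
```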
110 | 111 | For example, from the TSV contents above, to require that an ensemble call be supported by both the ONT and PB callsets, when using `--tsv` input, specify: 112 | ``` 113 | "[[['ONT_Severus', 'PB_Severus'], '>=', 2]]" 114 | ``` 115 | OR when using `--vcfs` or `--tsv` input: 116 | ``` 117 | "[[caller_names[:2], '>=', 2]]" 118 | ``` 119 | 120 | To combine multiple conditions, add `'&'` or `'|'` between each condition. 121 | For example, to require at least one long-read call and one short-read call to agree, specify for `--tsv` input: 122 | ``` 123 | "[[['ONT_Severus', 'PB_Severus'], '>=', 1], '&', [['ILL_manta'], '==', 1]]" 124 | ``` 125 | OR for `--vcfs` or `--tsv` input: 126 | ``` 127 | "[[caller_names[:2], '>=', 1], '&', [caller_names[2:], '==', 1]]" 128 | ``` 129 | ##### VAF Filtering 130 | ###### Note: This requires preprocessing of the VCF files. See [scripts](scripts). 131 | To run Minda with the `--vaf` parameter, ensure the VCF files have a `VAF` value in the INFO field. 132 | 133 | ## Output Files 134 | Both `truthset` and `ensemble` output: 135 | * tp.tsv for each caller 136 | * fp.tsv for each caller 137 | * fn.tsv for each caller 138 | * support.tsv - lists which callers called which truthset/ensemble records 139 | * results.txt - for each caller, lists the overall precision, recall, and F1 scores, as well as the number of TP, FN, and FP calls, overall and by SVTYPE and SVLEN 140 | * removed_records.txt - list of caller IDs of records not evaluated after removing singletons and filtering by FILTER, SVLEN, and VAF 141 | 142 | `ensemble` also outputs: 143 | * ensemble.vcf 144 | 145 | License 146 | ------- 147 | 148 | Minda is distributed under a BSD license. See the [LICENSE](LICENSE) for details. 149 | 150 | Citation 151 | ------- 152 | Keskus, A.G., Bryant, A., Ahmad, T. et al. **Severus detects somatic structural variation and complex rearrangements in cancer genomes using long-read sequencing.** *Nature Biotechnology* (2025). https://doi.org/10.1038/s41587-025-02618-8 153 | 154 | Credits 155 | ------- 156 | 157 | Minda is being developed in the Kolmogorov Lab at the National Cancer Institute. 158 | 159 | Key contributors: 160 | 161 | * Asher Bryant 162 | * Ayse Keskus 163 | * Mikhail Kolmogorov 164 | 165 | --- 166 | ### Contact 167 | If you experience any problems or would like to make a suggestion, please submit an [issue](https://github.com/KolmogorovLab/minda/issues). 168 | To contact the developer directly, email asher.bryant@nih.gov. 
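As a quick sanity check on the output files listed above, the per-caller TP/FN/FP tables can be reloaded to recompute the summary metrics Minda reports (the paths, sample name, and caller name below are illustrative; Minda's own results remain authoritative):

```
import pandas as pd

def n_records(path):
    return len(pd.read_csv(path, sep='\t'))     # one row per record in the TSV

tp = n_records('minda_out/sample_Severus_tp.tsv')
fn = n_records('minda_out/sample_Severus_fn.tsv')
fp = n_records('minda_out/sample_Severus_fp.tsv')

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print(f'precision={precision:.3f} recall={recall:.3f} F1={f1:.3f}')
```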
169 | -------------------------------------------------------------------------------- /minda/stats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | import pandas as pd 5 | from collections import Counter 6 | 7 | def _get_tp_fn_fp(support_df, decomposed_dfs, caller_name, vaf, command): 8 | 9 | paired_df = decomposed_dfs[0].merge(decomposed_dfs[1], on='Minda_ID') 10 | 11 | if command == "ensemble": 12 | base_df = support_df[support_df['ensemble'] == True] 13 | else: 14 | base_df = support_df[support_df.iloc[:,13] == True] 15 | 16 | tp_ids = [id for ids in base_df["Minda_IDs"].to_list() for id in ids] 17 | 18 | # create tp, fn, fp dfs 19 | if command == "ensemble": 20 | fn_columns = ['#CHROM_x', 'POS_x', 'locus_group_x', 'ID_list_x', \ 21 | '#CHROM_y', 'POS_y', 'locus_group_y', 'ID_list_y', \ 22 | 'SVTYPE', 'SVLEN', 'VAF', 'Minda_IDs'] 23 | 24 | if command == "truthset": 25 | fn_columns = ['#CHROM_x', 'POS_x', 'ID_x', 'INFO_x', \ 26 | '#CHROM_y', 'POS_y', 'ID_y', 'INFO_y', \ 27 | 'SVTYPE', 'SVLEN', 'VAF', 'Minda_IDs'] 28 | 29 | fn_df = base_df[base_df[f'{caller_name}'] == False][fn_columns] 30 | tp_df = paired_df[paired_df['Minda_ID'].isin(tp_ids)] 31 | fp_df = paired_df[~paired_df['Minda_ID'].isin(tp_ids)] 32 | 33 | 34 | return tp_df, fn_df, fp_df, base_df, paired_df 35 | 36 | def _get_stats_df(tp_df, fn_df, fp_df, paired_df, base_df, caller_name, max_len, out_dir, sample_name, command, vaf, version): 37 | 38 | tp = tp_df.shape[0] 39 | fn = fn_df.shape[0] 40 | fp = fp_df.shape[0] 41 | 42 | # make tsv 43 | tp_df.to_csv(f'{out_dir}/{sample_name}_{caller_name}_tp.tsv', sep='\t', index=False) 44 | fn_df.to_csv(f'{out_dir}/{sample_name}_{caller_name}_fn.tsv', sep='\t', index=False) 45 | fp_df.to_csv(f'{out_dir}/{sample_name}_{caller_name}_fp.tsv', sep='\t', index=False) 46 | # dfs = [tp_df, fn_df, fp_df] 47 | # df_names = ["tp", "fn", "fp"] 48 | # date = datetime.today().strftime('%Y-%m-%d') 49 | # for i in range(len(dfs)): 50 | # df = dfs[i] 51 | # df_name = df_names[i] 52 | # with open(f'{out_dir}/{sample_name}_{caller_name}_{df_name}.vcf', 'w') as file: 53 | # file.write(f'##fileformat=VCFv4.2\n##fileDate={date}\n##source=MindaV{version}\n') 54 | # file.write('##ALT=\n##ALT=\n##ALT=\n##ALT=\n') 55 | # file.write('##FILTER=\n') 56 | # file.write('##INFO=\n##INFO=\n##INFO=\n') 57 | # if vaf != None: 58 | # file.write('##INFO=\n') 59 | # command_str = " ".join(sys.argv) 60 | # file.write(f"cmd: {command_str}\n") 61 | # df.to_csv(file, sep="\t", index=False) 62 | 63 | # caluluate stats 64 | if tp+fp == 0: 65 | sys.exit(f"{caller_name} has no TP or FP records. Please double check input files.") 66 | precision = tp/(tp+fp) 67 | if tp+fn == 0: 68 | sys.exit(f"{caller_name} has no TP or FN records. 
Please double check input files.") 69 | recall = tp/(tp+fn) 70 | f1 = (2*precision*recall)/(precision+recall) 71 | 72 | caller_len = len(paired_df) 73 | base_len = len(base_df) 74 | 75 | # overall df 76 | columns = ['True Positives', 'False Negatives', 'False Positives', 'Precision', 'Recall', 'F1 Score', 'Caller Records', 'Ensemble Records'] 77 | data = [[tp, fn, fp, precision, recall, f1, caller_len, base_len]] 78 | overall_df = pd.DataFrame(data, columns=columns, index=[caller_name]) 79 | 80 | # SV type dfs 81 | tp_type_df = tp_df['SVTYPE_y'].value_counts().to_frame(name=caller_name).rename_axis("SVTYPE").T.sort_index(axis=1) 82 | fn_type_df = fn_df['SVTYPE'].value_counts().to_frame(name=caller_name).T.sort_index(axis=1) 83 | fp_type_df = fp_df['SVTYPE_y'].value_counts().to_frame(name=caller_name).rename_axis("SVTYPE").T.sort_index(axis=1) 84 | 85 | 86 | # SV len dfs 87 | ranges = [ -1, 0, 50, 100, 1000, 10000]#, max_len] 88 | # ensure bins must increase monotonically 89 | ranges = [x for x in ranges if x < max_len] + [max_len] 90 | tp_len_df = tp_df['SVLEN'].value_counts(bins=ranges, sort=False).to_frame(name=caller_name).rename_axis("SVLEN").T 91 | fn_len_df = fn_df['SVLEN'].value_counts(bins=ranges, sort=False).to_frame(name=caller_name).T 92 | fp_len_df = fp_df['SVLEN'].value_counts(bins=ranges, sort=False).to_frame(name=caller_name).rename_axis("SVLEN").T 93 | 94 | return overall_df, tp_type_df, fn_type_df, fp_type_df, tp_len_df, fn_len_df, fp_len_df 95 | 96 | 97 | def get_results(decomposed_dfs_list, base_dfs, caller_names, out_dir, sample_name, max_len, tolerance, vaf, command, args, version): 98 | 99 | # tp, fn, fp dfs for each caller 100 | stats_dfs_list = [] 101 | for i in range(len(decomposed_dfs_list)): 102 | 103 | decomposed_dfs = decomposed_dfs_list[i] 104 | caller_name = caller_names[i] 105 | tp_df, fn_df, fp_df, base_df, paired_df = _get_tp_fn_fp(base_dfs, decomposed_dfs, caller_name, vaf, command) 106 | stats_dfs = _get_stats_df(tp_df, fn_df, fp_df, paired_df, base_df, caller_name, max_len, out_dir, sample_name, command, vaf, version) 107 | stats_dfs_list.append(stats_dfs) 108 | 109 | overall_results_df = pd.concat([df[0] for df in stats_dfs_list]) 110 | tp_type_results_df = pd.concat([df[1] for df in stats_dfs_list]).fillna(0).astype(int) 111 | fn_type_results_df = pd.concat([df[2] for df in stats_dfs_list]).fillna(0).astype(int) 112 | fp_type_results_df = pd.concat([df[3] for df in stats_dfs_list]).fillna(0).astype(int) 113 | tp_len_results_df = pd.concat([df[4] for df in stats_dfs_list]).fillna(0).astype(int) 114 | fn_len_results_df = pd.concat([df[5] for df in stats_dfs_list]).fillna(0).astype(int) 115 | fp_len_results_df = pd.concat([df[6] for df in stats_dfs_list]).fillna(0).astype(int) 116 | 117 | results_dfs = [overall_results_df, tp_type_results_df, fn_type_results_df, fp_type_results_df, \ 118 | tp_len_results_df, fn_len_results_df, fp_len_results_df] 119 | 120 | #headings = ['OVERALL\n\n', '\n\nSV TYPE RESULTS\nTrue Positives\n\n', 'False Negatives\n', 'False Positives\n',\ 121 | #'\n\nSV LENGTH RESULTS\nTrue Positives\n', 'False Negatives\n', 'False Positives\n'] 122 | #user_input = ", ".join([f"{key}={value}" for key, value in vars(args).items() if value is not None and key!= "func"]) 123 | # with open(f'{out_dir}/{sample_name}_minda_results.txt', 'w') as file: 124 | # file.write("MINDA ENSEMBLE RESULTS\n\n") 125 | # for i in range(len(results_dfs)): 126 | # heading = headings[i] 127 | # df = results_dfs[i] 128 | # file.write(headings[i]) 129 | # if 
df.isna().all().all(): 130 | # file.write("None" + '\n\n') 131 | # else: 132 | # file.write(df.to_string() + '\n\n') 133 | # file.write(f'##minda_args: {user_input}\n') 134 | 135 | file_names = ['overall', 'SV_type_TP', 'SV_type_FN', 'SV_type_FP',\ 136 | 'SV_length_TP', 'SV_length_FN', 'SV_length_FP'] 137 | 138 | if not os.path.isdir(args.out_dir + "/results"): 139 | os.makedirs(args.out_dir + "/results") 140 | 141 | for i in range(len(results_dfs)): 142 | file_name = file_names[i] 143 | df = results_dfs[i].copy() 144 | df.insert(0, "Caller", caller_names) 145 | df.to_csv(args.out_dir + f"/results/{file_name}.tsv", sep='\t', index=False) 146 | 147 | 148 | return overall_results_df, tp_type_results_df, fn_type_results_df, fp_type_results_df, tp_len_results_df, fn_len_results_df, fp_len_results_df, paired_df -------------------------------------------------------------------------------- /minda/main.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | import sys 3 | import os 4 | import argparse 5 | import logging 6 | import pandas as pd 7 | 8 | from minda.__version__ import __version__ 9 | from minda.decompose import get_caller_name, get_df, get_intersected_df, get_decomposed_dfs 10 | from minda.ensemble import get_support_df as get_ensemble_support_df 11 | from minda.truthset import get_support_df as get_truthset_support_df 12 | from minda.truthset import get_base_df 13 | from minda.stats import get_results 14 | 15 | logger = logging.getLogger() 16 | 17 | 18 | def _enable_logging(log_file, debug, overwrite): 19 | """ 20 | Turns on logging, sets debug levels and assigns a log file 21 | """ 22 | log_formatter = logging.Formatter("[%(asctime)s] %(name)s: %(levelname)s: " 23 | "%(message)s", "%Y-%m-%d %H:%M:%S") 24 | console_formatter = logging.Formatter("[%(asctime)s] %(levelname)s: " 25 | "%(message)s", "%Y-%m-%d %H:%M:%S") 26 | console_log = logging.StreamHandler() 27 | console_log.setFormatter(console_formatter) 28 | 29 | if overwrite: 30 | open(log_file, "w").close() 31 | file_handler = logging.FileHandler(log_file, mode="a") 32 | file_handler.setFormatter(log_formatter) 33 | 34 | if not debug: 35 | level = logging.INFO 36 | 37 | console_log.setLevel(level) 38 | file_handler.setLevel(level) 39 | 40 | logger.setLevel(logging.DEBUG) 41 | logger.addHandler(console_log) 42 | logger.addHandler(file_handler) 43 | 44 | 45 | def _version(): 46 | return __version__ 47 | 48 | def run(args): 49 | if not os.path.isdir(args.out_dir): 50 | os.makedirs(args.out_dir) 51 | 52 | log_file = os.path.join(args.out_dir, "minda.log") 53 | _enable_logging(log_file, debug=False, overwrite=True) 54 | 55 | version = _version() 56 | logger.info("cmd: %s", " ".join(sys.argv)) 57 | logger.info("MindaV" + version) 58 | 59 | if args.command == 'truthset': 60 | base = args.base 61 | 62 | # check whether input is tsv or list of vcfs 63 | if args.tsv != None: 64 | vcf_df = pd.read_csv(args.tsv, sep='\t', header=None) 65 | 66 | vcf_list = vcf_df.iloc[:,0].to_list() 67 | tsv_directory = os.path.abspath(args.tsv) 68 | vcf_list = [os.path.abspath(path) if not os.path.isabs(path) else path for path in vcf_list] 69 | 70 | column_count = vcf_df.shape[1] 71 | if column_count >= 2: 72 | caller_names = vcf_df.iloc[:,1].fillna("unknown").to_list() 73 | else: 74 | caller_names = [] 75 | 76 | if column_count == 3: 77 | prefixes = vcf_df.iloc[:,2].fillna("unk").to_list() 78 | caller_names = [prefixes[i] + "_" + caller_names[i] for i in range(len(caller_names))] 79 | 80 | 
else: 81 | column_count = 1 82 | vcf_list = args.vcfs 83 | caller_names = [] 84 | 85 | 86 | if args.command == "ensemble" and len(vcf_list) < 2: 87 | sys.exit("Provide a minimum of 2 VCF files.") 88 | elif args.command == "ensemble" and args.min_support != None and len(vcf_list) < args.min_support: 89 | sys.exit("Number of VCF files should be less than or equal minimum number of support.") 90 | elif len(vcf_list) < 1 and args.command == "truthset": 91 | sys.exit("Provide a minimum of 1 comparison VCF file.") 92 | 93 | if args.command == 'truthset': 94 | vcf_list.append(base) 95 | if len(caller_names) > 0: 96 | caller_names.append("base") 97 | 98 | if caller_names == []: 99 | for i in range(len(vcf_list)): 100 | vcf = vcf_list[i] 101 | caller_name = get_caller_name(vcf) 102 | caller_names.append(caller_name) 103 | 104 | if len(caller_names) > len(set(caller_names)): 105 | caller_names = [chr(ord('A') + i) + "_" + caller_names[i] for i in range(len(caller_names))] 106 | prefixed = True 107 | elif len(caller_names) == len(set(caller_names)) and column_count == 3: 108 | prefixed = True 109 | else: 110 | prefixed = False 111 | 112 | max_svlengths =[] 113 | decomposed_dfs_list = [] 114 | for i in range(len(vcf_list)): 115 | caller_name = caller_names[i] 116 | vcf = vcf_list[i] 117 | if args.bed == None: 118 | df = get_df(vcf) 119 | else: 120 | df = get_intersected_df(vcf, args.bed) 121 | decomposed_dfs = get_decomposed_dfs(caller_name, df, args.filter, args.min_size, prefixed, args.vaf, args.sample_name, args.out_dir) 122 | decomposed_dfs_list.append(decomposed_dfs[:2]) 123 | max_svlengths.append(decomposed_dfs[2]) 124 | 125 | max_len = max(max_svlengths) 126 | 127 | if args.command == 'ensemble' and args.conditions != None: 128 | conditions = eval(args.conditions) 129 | support_df = get_ensemble_support_df(vcf_list, decomposed_dfs_list, caller_names, args.tolerance, conditions, args.vaf, args.command, args.out_dir, args.sample_name, args, version, args.multimatch) 130 | results = get_results(decomposed_dfs_list, support_df, caller_names, args.out_dir, args.sample_name, max_len, args.tolerance, args.vaf, args.command, args, version) 131 | logger.info(f"\n{results[0]}") 132 | 133 | elif args.command == 'ensemble' and args.min_support != None: 134 | conditions = eval(f"[[caller_names,'>=', {args.min_support}]]") 135 | support_df = get_ensemble_support_df(vcf_list, decomposed_dfs_list, caller_names, args.tolerance, conditions, args.vaf, args.command, args.out_dir, args.sample_name, args, version, args.multimatch) 136 | results = get_results(decomposed_dfs_list, support_df, caller_names, args.out_dir, args.sample_name, max_len, args.tolerance, args.vaf, args.command, args, version) 137 | logger.info(f"\n{results[0]}") 138 | 139 | else: 140 | base_df = get_base_df(decomposed_dfs_list, args.tolerance, args.multimatch) 141 | support_df = get_truthset_support_df(base_df, caller_names, args.vaf, args.out_dir, args.sample_name) 142 | results = get_results(decomposed_dfs_list, support_df, caller_names, args.out_dir, args.sample_name, max_len, args.tolerance, args.vaf,args.command, args, version) 143 | logger.info(f"\n{results[0]}") 144 | 145 | 146 | def main(): 147 | parser=argparse.ArgumentParser(description="Minda - VCF evaluation tool for germline and somatic structural variant callers") 148 | subparser=parser.add_subparsers(dest="command") 149 | 150 | #defaults ------------------------------------------------ 151 | FILTER = ["PASS"] 152 | TOLERANCE = 500 153 | 154 | # TRUTHSET 
------------------------------------------------ 155 | truthset = subparser.add_parser("truthset", help='benchmark VCF(s) against a base VCF') 156 | 157 | # required arguements 158 | truthset.add_argument("--out_dir", help='path to out directory', dest="out_dir", type=str, required=True) 159 | truthset.add_argument("--base", help='path of base VCF', dest="base", type=str, required=True) 160 | 161 | # mutally exclusive arguments 162 | truthset_input = truthset.add_mutually_exclusive_group(required=True) 163 | truthset_input.add_argument('--tsv', action="store", dest="tsv", help="tsv file path") 164 | truthset_input.add_argument('--vcfs', action="store", dest="vcfs", nargs="+", help="vcf file path(s)") 165 | 166 | # # optional arguments 167 | truthset.add_argument("--bed", help=f'path to bed file for filtering records with BedTool intersect', dest="bed", type=str) 168 | truthset.add_argument("--filter", help=f'filter records by FILTER column; default="{FILTER}"', dest="filter", type=str, nargs="*", default=FILTER) 169 | truthset.add_argument("--min_size", help=f'filter records by SVSIZE in INFO column', dest="min_size", type=int) 170 | truthset.add_argument("--tolerance", help=f'maximum allowable bp distance between base and caller breakpoint; default={TOLERANCE}', dest="tolerance", type=int, default=TOLERANCE) 171 | truthset.add_argument("--sample_name", help=f'name of sample', dest="sample_name", type=str) 172 | truthset.add_argument("--vaf", help=f'filter out records below a given VAF treshold', dest="vaf", type=float) 173 | truthset.add_argument("--multimatch", help=f'allow more than one record from the same caller to match a single truthset record', dest="multimatch", action='store_true') 174 | 175 | # ENSEMBLE ------------------------------------------------ 176 | ensemble = subparser.add_parser("ensemble", help='create an ensemble call list from multiple VCF and, optionally, benchmark each VCF against') 177 | 178 | # required arguements 179 | ensemble.add_argument("--out_dir", help='path to out directory', dest="out_dir", type=str, required=True) 180 | 181 | # mutally exclusive arguments 182 | ensemble_input = ensemble.add_mutually_exclusive_group(required=True) 183 | ensemble_input.add_argument('--tsv', action="store", dest="tsv", help="tsv file path") 184 | ensemble_input.add_argument('--vcfs', action="store", dest="vcfs", nargs="+", help="vcf file path(s)") 185 | 186 | ensemble_support = ensemble.add_mutually_exclusive_group(required=True) 187 | ensemble_support.add_argument("--conditions", help=f'specific conditions to support a call', dest="conditions", type=str) 188 | ensemble_support.add_argument("--min_support", help=f'minimumn number of callers to support a call', dest="min_support", type=int) 189 | 190 | # optional arguments 191 | ensemble.add_argument("--bed", help=f'path to bed file for filtering records with BedTool intersect', dest="bed", type=str) 192 | ensemble.add_argument("--filter", help=f'filter records by FILTER column; default="{FILTER}"', dest="filter", type=str, nargs="*", default=FILTER) 193 | ensemble.add_argument("--min_size", help=f'filter records by SVSIZE in INFO column', dest="min_size", type=int) 194 | ensemble.add_argument("--tolerance", help=f'maximum allowable bp distance between base and caller breakpoint; default={TOLERANCE}', dest="tolerance", type=int, default=TOLERANCE) 195 | ensemble.add_argument("--sample_name", help=f'name of sample', dest="sample_name", type=str) 196 | ensemble.add_argument("--vaf", help=f'filter out records below a given 
VAF treshold', dest="vaf", type=float) 197 | ensemble.add_argument("--multimatch", help=f'allow more than one record from the same caller to match a single ensemble record', dest="multimatch", action='store_true') 198 | 199 | # ------------------------------------------------ 200 | args, remaining_args = parser.parse_known_args() 201 | parser.set_defaults(func=run) 202 | args=parser.parse_args() 203 | args.func(args) 204 | -------------------------------------------------------------------------------- /data/colo829_benchmark_grch38_amended_vaf.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##FILTER= 3 | ##contig= 4 | ##contig= 5 | ##contig= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | ##contig= 16 | ##contig= 17 | ##contig= 18 | ##contig= 19 | ##contig= 20 | ##contig= 21 | ##contig= 22 | ##contig= 23 | ##contig= 24 | ##contig= 25 | ##contig= 26 | ##contig= 27 | ##ALT= 28 | ##ALT= 29 | ##ALT= 30 | ##ALT= 31 | ##ALT= 32 | ##FORMAT= 33 | ##INFO= 34 | ##INFO= 35 | ##INFO= 36 | ##INFO= 37 | ##INFO= 38 | ##INFO= 39 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT COLO829SV 40 | chr1 207807889 COLO829_SOMATIC_SV01 N . PASS SVTYPE=DEL;SVLEN=33584;END=207841473;CHR2=chr1;SUPPORT=GSC_COLO829|0/1|0.23,ONT_COLO829|0/1|0.36,PBR_COLO829|0/1|0.27,VAI_COLO829|0/1|0.32;VAF=0.295 GT 0/1 41 | chr1 224458901 COLO829_SOMATIC_SV02 N . PASS SVTYPE=DUP;SVLEN=153518;END=224612419;CHR2=chr1;SUPPORT=GSC_COLO829|0/0|0.11,ONT_COLO829|0/1|0.36,PBR_COLO829|0/1|0.38,VAI_COLO829|0/0|0.21;VAF=0.285 GT 0/1 42 | chr1 224595103 COLO829_SOMATIC_SV03 N . PASS SVTYPE=DEL;SVLEN=3215;END=224598318;CHR2=chr1;SUPPORT=GSC_COLO829|0/1|0.29,ONT_COLO829|0/1|0.29,PBR_COLO829|0/1|0.41,VAI_COLO829|0/1|0.31;VAF=0.3 GT 0/1 43 | chr1 236097113 COLO829_SOMATIC_SV04 N . PASS SVTYPE=INS;SVLEN=11007;END=236097113;CHR2=chr1;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 44 | chr3 24523615 COLO829_SOMATIC_SV05 N . PASS SVTYPE=INV;SVLEN=1073;END=24524688;CHR2=chr3;SUPPORT=GSC_COLO829|0/1|0.39,ONT_COLO829|0/1|0.37,PBR_COLO829|0/1|0.40,VAI_COLO829|0/1|0.39;VAF=0.39 GT 0/1 45 | chr3 26622432 COLO829_SOMATIC_SV06 N . PASS SVTYPE=INV;SVLEN=577;END=26623009;CHR2=chr3;SUPPORT=GSC_COLO829|0/1|0.30,ONT_COLO829|0/1|0.32,PBR_COLO829|0/1|0.32,VAI_COLO829|0/1|0.33;VAF=0.32 GT 0/1 46 | chr3 60147027 COLO829_SOMATIC_SV07 N . PASS SVTYPE=DEL;SVLEN=71809;END=60218836;CHR2=chr3;SUPPORT=GSC_COLO829|0/1|0.26,ONT_COLO829|0/1|0.30,PBR_COLO829|0/1|0.39,VAI_COLO829|0/1|0.25;VAF=0.28 GT 0/1 47 | chr3 60886452 COLO829_SOMATIC_SV08 N . PASS SVTYPE=DEL;SVLEN=141091;END=61027543;CHR2=chr3;SUPPORT=GSC_COLO829|0/1|0.35,ONT_COLO829|0/1|0.29,PBR_COLO829|0/1|0.42,VAI_COLO829|0/1|0.28;VAF=0.32 GT 0/1 48 | chr4 65346239 COLO829_SOMATIC_SV09 N . PASS SVTYPE=DUP;SVLEN=80;END=65346319;CHR2=chr4;SUPPORT=GSC_COLO829|0/1|0.50,ONT_COLO829|0/1|0.48,PBR_COLO829|0/1|0.50,VAI_COLO829|0/1|0.56;VAF=0.5 GT 0/1 49 | chr4 187075168 COLO829_SOMATIC_SV10 N . PASS SVTYPE=DUP;SVLEN=60;END=187075228;CHR2=chr4;SUPPORT=GSC_COLO829|0/1|0.46,ONT_COLO829|0/1|0.42,PBR_COLO829|0/1|0.47,VAI_COLO829|0/1|0.37;VAF=0.44 GT 0/1 50 | chr5 28787890 COLO829_SOMATIC_SV11 N . PASS SVTYPE=DEL;SVLEN=175099;END=28962989;CHR2=chr5;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|0/1|0.74,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 51 | chr7 57403875 COLO829_SOMATIC_SV12 N . 
PASS SVTYPE=DEL;SVLEN=32528;END=57436403;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.65,ONT_COLO829|0/1|0.76,PBR_COLO829|1/1|0.87,VAI_COLO829|0/1|0.64;VAF=0.705 GT 0/1 52 | chr7 75595250 COLO829_SOMATIC_SV13 N . PASS SVTYPE=INS;SVLEN=54;END=75595250;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.34,ONT_COLO829|0/0|0.09,PBR_COLO829|0/1|0.55,VAI_COLO829|0/1|0.22;VAF=0.28 GT 0/1 53 | chr7 78352523 COLO829_SOMATIC_SV14 N . PASS SVTYPE=DEL;SVLEN=100409;END=78452932;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.36,ONT_COLO829|0/1|0.26,PBR_COLO829|0/1|0.30,VAI_COLO829|0/1|0.26;VAF=0.28 GT 0/1 54 | chr7 78560891 COLO829_SOMATIC_SV15 N . PASS SVTYPE=DEL;SVLEN=67480;END=78628371;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.31,ONT_COLO829|0/1|0.25,PBR_COLO829|0/1|0.50,VAI_COLO829|0/0|0.20;VAF=0.28 GT 0/1 55 | chr7 86215352 COLO829_SOMATIC_SV16 N . PASS SVTYPE=DUP;SVLEN=9032;END=86224384;CHR2=chr7;SUPPORT=GSC_COLO829|0/0|0.14,ONT_COLO829|0/0|0.11,PBR_COLO829|0/0|0.21,VAI_COLO829|0/0|0.14;VAF=0.14 GT 0/0 56 | chr7 104844620 COLO829_SOMATIC_SV17 N . PASS SVTYPE=DUP;SVLEN=127236;END=104971856;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.34,ONT_COLO829|0/1|0.39,PBR_COLO829|0/1|0.35,VAI_COLO829|0/1|0.36;VAF=0.355 GT 0/1 57 | chr7 110753277 COLO829_SOMATIC_SV18 N . PASS SVTYPE=DEL;SVLEN=1128;END=110754405;CHR2=chr7;SUPPORT=GSC_COLO829|0/0|0.19,ONT_COLO829|0/1|0.26,PBR_COLO829|0/1|0.24,VAI_COLO829|0/1|0.26;VAF=0.25 GT 0/1 58 | chr7 126106070 COLO829_SOMATIC_SV19 N . PASS SVTYPE=DEL;SVLEN=420777;END=126526847;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.55,ONT_COLO829|0/1|0.73,PBR_COLO829|0/1|0.70,VAI_COLO829|0/1|0.34;VAF=0.625 GT 0/1 59 | chr7 126458434 COLO829_SOMATIC_SV20 N . PASS SVTYPE=INV;SVLEN=68953;END=126527387;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.38,ONT_COLO829|0/1|0.38,PBR_COLO829|0/1|0.34,VAI_COLO829|0/1|0.35;VAF=0.365 GT 0/1 60 | chr7 144262134 COLO829_SOMATIC_SV21 N . PASS SVTYPE=DEL;SVLEN=129567;END=144391701;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.27,ONT_COLO829|0/1|0.27,PBR_COLO829|0/1|0.55,VAI_COLO829|0/1|0.26;VAF=0.27 GT 0/1 61 | chr7 151049571 COLO829_SOMATIC_SV22 N N[chr15:84141972[ . PASS SVTYPE=BND;SVLEN=.;END=N[chr15:84141972[;CHR2=chr15;SUPPORT=GSC_COLO829|0/1|0.55,ONT_COLO829|0/1|0.63,PBR_COLO829|1/1|0.81,VAI_COLO829|0/1|0.61;VAF=0.62 GT 0/1 62 | chr7 158335312 COLO829_SOMATIC_SV23 N . PASS SVTYPE=DEL;SVLEN=46;END=158335358;CHR2=chr7;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|0/0|0.07,VAI_COLO829|1/1|0.82;VAF=0.91 GT 0/1 63 | chr9 28031838 COLO829_SOMATIC_SV24 N . PASS SVTYPE=INV;SVLEN=27305;END=28059143;CHR2=chr9;SUPPORT=GSC_COLO829|0/1|0.45,ONT_COLO829|0/1|0.50,PBR_COLO829|0/1|0.44,VAI_COLO829|0/1|0.42;VAF=0.445 GT 0/1 64 | chr9 28031865 COLO829_SOMATIC_SV25 N . PASS SVTYPE=INV;SVLEN=2604;END=28034469;CHR2=chr9;SUPPORT=GSC_COLO829|0/1|0.24,ONT_COLO829|0/1|0.33,PBR_COLO829|0/1|0.31,VAI_COLO829|0/1|0.34;VAF=0.32 GT 0/1 65 | chr9 28034301 COLO829_SOMATIC_SV26 N . PASS SVTYPE=DEL;SVLEN=123393;END=28157694;CHR2=chr9;SUPPORT=GSC_COLO829|0/1|0.50,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|0.81,VAI_COLO829|1/1|1.00;VAF=0.905 GT 1/1 66 | chr10 7090915 COLO829_SOMATIC_SV27 N N[chr19:17286830[ . PASS SVTYPE=BND;SVLEN=.;END=N[chr19:17286830[;CHR2=chr19;SUPPORT=GSC_COLO829|0/1|0.37,ONT_COLO829|0/1|0.33,PBR_COLO829|0/1|0.26,VAI_COLO829|0/1|0.25;VAF=0.295 GT 0/1 67 | chr10 7592410 COLO829_SOMATIC_SV28 N N]chr18:9868619] . PASS SVTYPE=BND;SVLEN=.;END=N]chr18:9868619];CHR2=chr18;SUPPORT=GSC_COLO829|0/0|0.09,ONT_COLO829|0/0|0.15,PBR_COLO829|0/1|0.25,VAI_COLO829|0/0|0.21;VAF=0.18 GT 0/0 68 | chr10 87940543 COLO829_SOMATIC_SV29 N . 
PASS SVTYPE=DEL;SVLEN=11829;END=87952372;CHR2=chr10;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|0.80,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 69 | chr11 81074560 COLO829_SOMATIC_SV30 N . PASS SVTYPE=DEL;SVLEN=308177;END=81382737;CHR2=chr11;SUPPORT=GSC_COLO829|0/1|0.53,ONT_COLO829|1/1|0.79,PBR_COLO829|0/1|0.58,VAI_COLO829|0/1|0.76;VAF=0.67 GT 0/1 70 | chr12 129287232 COLO829_SOMATIC_SV31 N . PASS SVTYPE=INS;SVLEN=274;END=129287232;CHR2=chr12;SUPPORT=GSC_COLO829|0/1|0.73,ONT_COLO829|0/1|0.62,PBR_COLO829|0/1|0.65,VAI_COLO829|0/1|0.73;VAF=0.69 GT 0/1 71 | chr14 34545584 COLO829_SOMATIC_SV32 N . PASS SVTYPE=INS;SVLEN=2501;END=34545584;CHR2=chr14;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|0.97,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 72 | chr14 72548005 COLO829_SOMATIC_SV33 N . PASS SVTYPE=INS;SVLEN=98;END=72548005;CHR2=chr14;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|0.96,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 73 | chr14 104093751 COLO829_SOMATIC_SV34 N . PASS SVTYPE=DEL;SVLEN=55;END=104093806;CHR2=chr14;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 74 | chr15 23586515 COLO829_SOMATIC_SV35 N . PASS SVTYPE=INV;SVLEN=161042;END=23747557;CHR2=chr15;SUPPORT=GSC_COLO829|0/1|0.30,ONT_COLO829|0/1|0.39,PBR_COLO829|0/1|0.36,VAI_COLO829|0/1|0.27;VAF=0.33 GT 0/1 75 | chr15 41329096 COLO829_SOMATIC_SV36 N . PASS SVTYPE=DUP;SVLEN=7212;END=41336308;CHR2=chr15;SUPPORT=GSC_COLO829|0/1|0.44,ONT_COLO829|0/0|0.11,PBR_COLO829|0/1|0.43,VAI_COLO829|0/0|0.16;VAF=0.295 GT 0/1 76 | chr16 58590641 COLO829_SOMATIC_SV37 N . PASS SVTYPE=DEL;SVLEN=38786;END=58629427;CHR2=chr16;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 77 | chr16 78894743 COLO829_SOMATIC_SV38 N . PASS SVTYPE=DEL;SVLEN=166361;END=79061104;CHR2=chr16;SUPPORT=GSC_COLO829|0/1|0.65,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|0.79,VAI_COLO829|1/1|1.00;VAF=0.895 GT 1/1 78 | chr18 68712224 COLO829_SOMATIC_SV39 N . PASS SVTYPE=DEL;SVLEN=3365;END=68715589;CHR2=chr18;SUPPORT=GSC_COLO829|0/0|0.21,ONT_COLO829|0/1|0.26,PBR_COLO829|0/0|0.21,VAI_COLO829|0/0|0.18;VAF=0.21 GT 0/0 79 | chr19 17286003 COLO829_SOMATIC_SV40 N N[chr10:7017548[ . PASS SVTYPE=BND;SVLEN=.;END=N[chr10:7017548[;CHR2=chr10;SUPPORT=GSC_COLO829|0/1|0.29,ONT_COLO829|0/1|0.32,PBR_COLO829|0/0|0.17,VAI_COLO829|0/1|0.24;VAF=0.265 GT 0/1 80 | chr20 13180081 COLO829_SOMATIC_SV41 N . PASS SVTYPE=DEL;SVLEN=3372;END=13183453;CHR2=chr20;SUPPORT=GSC_COLO829|0/1|0.49,ONT_COLO829|0/1|0.48,PBR_COLO829|0/1|0.45,VAI_COLO829|0/1|0.49;VAF=0.485 GT 0/1 81 | chr20 14982313 COLO829_SOMATIC_SV42 N . PASS SVTYPE=DEL;SVLEN=50989;END=15033302;CHR2=chr20;SUPPORT=GSC_COLO829|0/1|0.28,ONT_COLO829|0/1|0.27,PBR_COLO829|0/1|0.28,VAI_COLO829|0/1|0.33;VAF=0.28 GT 0/1 82 | chr20 15019977 COLO829_SOMATIC_SV43 N . PASS SVTYPE=DEL;SVLEN=13219;END=15033196;CHR2=chr20;SUPPORT=GSC_COLO829|0/1|0.37,ONT_COLO829|0/1|0.23,PBR_COLO829|0/1|0.42,VAI_COLO829|0/1|0.28;VAF=0.325 GT 0/1 83 | chr22 33363264 COLO829_SOMATIC_SV44 N . PASS SVTYPE=DEL;SVLEN=79149;END=33442413;CHR2=chr22;SUPPORT=GSC_COLO829|0/1|0.27,ONT_COLO829|0/1|0.25,PBR_COLO829|0/0|0.20,VAI_COLO829|0/1|0.38;VAF=0.26 GT 0/1 84 | chrX 31178837 COLO829_SOMATIC_SV45 N . PASS SVTYPE=DEL;SVLEN=19254;END=31198091;CHR2=chrX;SUPPORT=GSC_COLO829|0/1|0.58,ONT_COLO829|0/1|0.70,PBR_COLO829|0/1|0.61,VAI_COLO829|0/1|0.56;VAF=0.595 GT 0/1 85 | chrX 31283087 COLO829_SOMATIC_SV46 N . 
PASS SVTYPE=DEL;SVLEN=737546;END=32020633;CHR2=chrX;SUPPORT=GSC_COLO829|1/1|0.90,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 86 | chrX 32059739 COLO829_SOMATIC_SV47 N . PASS SVTYPE=DEL;SVLEN=215241;END=32274980;CHR2=chrX;SUPPORT=GSC_COLO829|1/1|0.81,ONT_COLO829|0/1|0.65,PBR_COLO829|1/1|1.00,VAI_COLO829|0/1|0.70;VAF=0.755 GT 0/1 87 | chrX 32080417 COLO829_SOMATIC_SV48 N . PASS SVTYPE=DEL;SVLEN=102717;END=32183134;CHR2=chrX;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 88 | chrX 34041661 COLO829_SOMATIC_SV49 N . PASS SVTYPE=DEL;SVLEN=2249;END=34043910;CHR2=chrX;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|0.92,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|0.94;VAF=0.97 GT 1/1 89 | -------------------------------------------------------------------------------- /minda/ensemble.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import Counter 3 | from datetime import datetime 4 | import pandas as pd 5 | import numpy as np 6 | import re 7 | 8 | 9 | def _add_columns(ensemble_df, vaf): 10 | # create a column of list of prefixed IDs for each locus group 11 | key_columns = ['locus_group_x','locus_group_y'] 12 | value_columns = ['ID_x', 'ID_y'] 13 | column_suffixes = ['x','y'] 14 | 15 | for i in range(len(key_columns)): 16 | locus_group = key_columns[i] 17 | id = value_columns[i] 18 | column_suffix = column_suffixes[i] 19 | 20 | keys = ensemble_df[f'{locus_group}'].to_list() 21 | values = ensemble_df[f'{id}'].to_list() 22 | minda_values = ensemble_df['Minda_ID'].to_list() 23 | caller_names = ensemble_df.caller_names.to_list() 24 | 25 | id_dict = {} 26 | for key, value in zip(keys, values): 27 | if key not in id_dict: 28 | id_dict[key] = [] 29 | id_dict[key].append(value) 30 | 31 | minda_id_dict = {} 32 | for key, value in zip(keys, minda_values): 33 | if key not in minda_id_dict: 34 | minda_id_dict[key] = [] 35 | minda_id_dict[key].append(value) 36 | 37 | ensemble_df[f'ID_list_{column_suffix}'] = ensemble_df[locus_group].map(id_dict) 38 | ensemble_df[f'Minda_ID_list_{column_suffix}'] = ensemble_df[locus_group].map(minda_id_dict) 39 | 40 | 41 | # create dict for SV type 42 | values = ensemble_df.SVTYPE.to_list() 43 | svtype_dict = {} 44 | for key, value in zip(keys, values): 45 | if key not in svtype_dict: 46 | svtype_dict[key] = [] 47 | svtype_dict[key].append(value) 48 | 49 | most_common_svtpye_dict = {k:Counter(v).most_common(1)[0][0] for (k,v) in svtype_dict.items()} 50 | ensemble_df['SVTYPE'] = ensemble_df['locus_group_y'].map(most_common_svtpye_dict) 51 | 52 | # create dict of vafs 53 | if vaf != None: 54 | values = ensemble_df.VAF.to_list() 55 | vaf_dict = {} 56 | for key, value in zip(keys, values): 57 | if key not in vaf_dict: 58 | vaf_dict[key] = [] 59 | vaf_dict[key].append(value) 60 | 61 | ensemble_df['VAFs'] = ensemble_df['locus_group_y'].map(vaf_dict) 62 | 63 | return ensemble_df 64 | 65 | 66 | def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, sample_name, args, multimatch): 67 | 68 | dfs_1 = [dfs_list[0] for dfs_list in decomposed_dfs_list] 69 | dfs_2 = [dfs_list[1] for dfs_list in decomposed_dfs_list] 70 | dfs_list = [dfs_1, dfs_2] 71 | 72 | # create stat dfs 73 | start_dfs_list = [] 74 | start_dfs = pd.concat(dfs_1).reset_index(drop=True) 75 | start_dfs = start_dfs[['#CHROM', 'POS', 'ID', 'Minda_ID', 'INFO', 'SVTYPE', 'SVLEN']].sort_values(['#CHROM', 'POS']) 76 | 77 | start_dfs['diff_x'] = 
start_dfs.groupby('#CHROM').POS.diff().fillna(9999) 78 | diffs = start_dfs['diff_x'].to_list() 79 | 80 | # group start loci 81 | loci = [] 82 | count = 1 83 | for diff in diffs: 84 | if diff >= tolerance: 85 | locus = count 86 | loci.append(locus) 87 | count += 1 88 | else: 89 | locus = count - 1 90 | loci.append(locus) 91 | 92 | 93 | start_dfs['locus_group_x'] = loci 94 | start_dfs['median'] = start_dfs.groupby('locus_group_x')['POS'].transform('median').astype('int') 95 | 96 | # create end dfs 97 | end_dfs = pd.concat(dfs_2).reset_index(drop=True) 98 | 99 | #ensemble_df = start_dfs.merge(end_dfs, on=['SVTYPE', 'SVLEN','Minda_ID']) 100 | ensemble_df = start_dfs.merge(end_dfs, on=['SVTYPE', 'Minda_ID']) 101 | ensemble_df[['#CHROM_x', 'POS_x', 'ID_x', 'Minda_ID', 'SVTYPE', 'SVLEN',\ 102 | 'diff_x', 'locus_group_x', 'median', '#CHROM_y', 'POS_y',\ 103 | 'ID_y', ]] 104 | ensemble_df = ensemble_df.sort_values(['locus_group_x','#CHROM_y', 'POS_y']) 105 | ensemble_df ['diff_y'] = ensemble_df.groupby(['locus_group_x','#CHROM_y']).POS_y.diff().abs().fillna(9999) 106 | diffs = ensemble_df['diff_y'].to_list() 107 | caller_names = ensemble_df['Minda_ID'].apply(lambda x: x.rsplit('_', 1)[0]).tolist() 108 | ensemble_df['caller_names']= caller_names 109 | ensemble_df[['#CHROM_x', 'POS_x', 'ID_x', 'Minda_ID', 'SVTYPE', 'SVLEN',\ 110 | 'diff_x', 'locus_group_x', 'median', '#CHROM_y', 'POS_y',\ 111 | 'ID_y','diff_y','caller_names' ]] 112 | 113 | # group end loci 114 | locus_callers = [] 115 | loci = [] 116 | count = 1 117 | for i in range(len(diffs)): 118 | diff = diffs[i] 119 | caller_name = caller_names[i] 120 | 121 | #create new end locus (sublocus 1) 122 | if diff >= tolerance: 123 | locus_callers.clear() 124 | locus_callers.append(caller_name) 125 | locus = str(count) + "_1" 126 | loci.append(locus) 127 | count += 1 128 | 129 | # add new sublocus (sublocus determined by how many calls the caller makes at a given locus) 130 | elif multimatch == False and diff < tolerance and caller_name in locus_callers: 131 | #elif diff < tolerance and caller_name in locus_callers: 132 | locus_callers.append(caller_name) 133 | sub_group = locus_callers.count(caller_name) 134 | locus = str(count-1) + "_" + str(sub_group) 135 | loci.append(locus) 136 | 137 | # add to existing sublocus 1 138 | else: 139 | locus = str(count - 1) + "_1" 140 | loci.append(locus) 141 | locus_callers.append(caller_name) 142 | 143 | # add locus_group_y, median POS, caller ID, Minda ID, SV type, VAF columns 144 | ensemble_df['locus_group_y'] = loci 145 | ensemble_df['median'] = ensemble_df.groupby('locus_group_y')['POS_y'].transform('median').astype('int') 146 | ensemble_df = _add_columns(ensemble_df, vaf) 147 | if vaf != None: 148 | ensemble_df['VAF'] = ensemble_df.groupby('locus_group_y')['VAF'].transform('median') 149 | else: 150 | ensemble_df['VAF'] = np.nan 151 | 152 | ensemble_df = ensemble_df.drop_duplicates(['locus_group_x', 'locus_group_y']).reset_index(drop=True) 153 | 154 | return ensemble_df 155 | 156 | 157 | def _get_ensemble_call_column(support_df, conditions): 158 | column_names = [] 159 | condition_count = 0 160 | condition_columns = [] 161 | query_list = [] 162 | for i in range(len(conditions)): 163 | 164 | if i % 2 == 0: 165 | operator = conditions[i][1] 166 | number = str(conditions[i][2]) 167 | 168 | nested_caller_columns = conditions[i][0] 169 | nested_type = type(nested_caller_columns[0]) 170 | 171 | condition = chr(ord('A') + condition_count) 172 | column_name = f'condition_{condition}' 173 | 174 | nested_columns_count = 
1 175 | sub_condition_columns = [] 176 | if nested_type == list: 177 | for j in range(len(nested_caller_columns)): 178 | caller_columns = nested_caller_columns[j] 179 | sub_column_name = f'condition_{nested_columns_count}_{condition}' 180 | support_df[f'{sub_column_name}'] = support_df[caller_columns].any(axis=1) 181 | nested_columns_count += 1 182 | sub_condition_columns.append(sub_column_name) 183 | support_df[f'{column_name}'] = support_df[sub_condition_columns].sum(axis=1) 184 | 185 | else: 186 | support_df[f'{column_name}'] = support_df[nested_caller_columns].sum(axis=1) 187 | condition_count += 1 188 | condition_columns.append(column_name) 189 | query_list.extend([column_name, operator, number]) 190 | else: 191 | query_list.extend(conditions[i]) 192 | 193 | query = ' '.join(query_list) 194 | mask = support_df.eval(query) 195 | #support_df['ensemble'] = mask 196 | support_df.insert(loc=12, column='ensemble', value=mask) 197 | return support_df 198 | 199 | def _replace_value(row): 200 | if row['ALT'] == '': 201 | return f"N]{row['#CHROM_y']}:{row['POS_y']}]" 202 | else: 203 | return row['ALT'] 204 | 205 | def _get_contigs(vcf_list): 206 | contig_dict = {} 207 | for vcf in vcf_list: 208 | with open(vcf, 'r') as file: 209 | for line in file: 210 | if not line.startswith("##"): 211 | break 212 | if line.startswith("##contig"): 213 | pattern = r'ID=([^,>]+),length=([^,>]+)' 214 | id_length_tuple = re.findall(pattern, line)[0] 215 | chr_id = id_length_tuple[0] 216 | length = int(id_length_tuple[1]) 217 | if chr_id not in contig_dict: 218 | contig_dict[chr_id] = length 219 | else: 220 | value = contig_dict[chr_id] 221 | max_length = max(value, length) 222 | if value != max_length: 223 | print(f'Contig ID {chr_id} has lengths {value} and {max_length}; {max_length} will be used in ensemble.vcf header.') 224 | contig_dict[chr_id] = max_length 225 | return contig_dict 226 | 227 | def _get_ensemble_vcf(vcf_list, support_df, out_dir, sample_name, args, vaf, version): 228 | vcf_df = support_df[support_df['ensemble'] == True].reset_index(drop=True).copy() 229 | vcf_df['ID'] = f'Minda_' + (vcf_df.index + 1).astype(str) 230 | vcf_df['REF'] = "N" 231 | vcf_df['ALT'] = ["<" + svtype +">" for svtype in vcf_df['SVTYPE']] 232 | vcf_df['ALT'] = vcf_df.apply(_replace_value, axis=1) 233 | vcf_df['QUAL'] = "." 
234 | vcf_df['FILTER'] = "PASS" 235 | 236 | if vaf != None: 237 | vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + \ 238 | ';SUPP_VEC=' + ','.join(map(str, supp_vec)) + ';VAF=' + str(vaf) \ 239 | for svlen, svtype, supp_vec, vaf in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['ID_list_y'], vcf_df['VAF'])] 240 | vcf_df = vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER','INFO']].rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS"}) 241 | else: 242 | vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + ';SUPP_VEC=' + ','.join(map(str, supp_vec)) \ 243 | for svlen, svtype, supp_vec in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['ID_list_y'])] 244 | vcf_df = vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER','INFO']].rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS"}) 245 | date = datetime.today().strftime('%Y-%m-%d') 246 | with open(f'{out_dir}/{sample_name}_minda_ensemble.vcf', 'w') as file: 247 | file.write(f'##fileformat=VCFv4.2\n##fileDate={date}\n##source=MindaV{version}\n') 248 | contig_dict = _get_contigs(vcf_list) 249 | for key, value in contig_dict.items(): 250 | file.write(f'##contig=\n') 251 | file.write('##ALT=\n##ALT=\n##ALT=\n##ALT=\n') 252 | file.write('##FILTER=\n') 253 | file.write('##INFO=\n##INFO=\n##INFO=\n') 254 | if vaf != None: 255 | file.write('##INFO=\n') 256 | command_str = " ".join(sys.argv) 257 | file.write(f"##cmd: {command_str}\n") 258 | vcf_df.to_csv(file, sep="\t", index=False) 259 | 260 | 261 | def get_support_df(vcf_list, decomposed_dfs_list, caller_names, tolerance, conditions, vaf, command, out_dir, sample_name, args, version, multimatch): 262 | ensemble_df = _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, sample_name, args, multimatch) 263 | 264 | minda_id_x_lists = ensemble_df.Minda_ID_list_x.to_list() 265 | minda_id_y_lists = ensemble_df.Minda_ID_list_y.to_list() 266 | 267 | # check that both start & end have same IDs 268 | for caller_name in caller_names: 269 | caller_column = [] 270 | for i in range(len(minda_id_x_lists)): 271 | minda_id_x_list = minda_id_x_lists[i] 272 | minda_id_y_list = minda_id_y_lists[i] 273 | intersect_list = list(set(minda_id_x_list).intersection(set(minda_id_y_list))) 274 | call_boolean = any(value.startswith(caller_name) for value in intersect_list) 275 | caller_column.append(call_boolean) 276 | ensemble_df[f'{caller_name}'] = caller_column 277 | 278 | column_names = ['#CHROM_x', 'POS_x', 'locus_group_x', 'ID_list_x', \ 279 | '#CHROM_y', 'POS_y', 'locus_group_y', 'ID_list_y', \ 280 | 'SVTYPE', 'SVLEN', 'VAF', 'Minda_ID_list_y'] + caller_names 281 | 282 | support_df = ensemble_df[column_names].rename(columns={"Minda_ID_list_y": "Minda_IDs"}).copy() 283 | #if command == "ensemble": 284 | support_df = _get_ensemble_call_column(support_df, conditions) 285 | 286 | # create ensemble vcf 287 | _get_ensemble_vcf(vcf_list, support_df, out_dir, sample_name, args, vaf, version) 288 | 289 | # create support csv 290 | support_ex_df = support_df 291 | support_df.to_csv(f'{out_dir}/{sample_name}_support.tsv', sep='\t', index=False) 292 | 293 | return support_df 294 | 295 | 296 | def add_vaf(row,df,caller_name): 297 | for item in row['Minda_IDs']: 298 | if item in df['Minda_ID'].values: 299 | return df[df['Minda_ID'] == item]['VAF'].values[0] 300 | return row[f'{caller_name}'] 301 | -------------------------------------------------------------------------------- /annotation/minda_stratify.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import json 6 | 7 | from intervaltree import Interval, IntervalTree 8 | from collections import defaultdict, namedtuple 9 | 10 | 11 | def parse_repeatmasker(filename): 12 | """ 13 | Parses repeatmakser 'out' file and returns index of annotated repeats 14 | """ 15 | chr_trees = defaultdict(IntervalTree) 16 | num_rec = 0 17 | 18 | for line in open(filename, "r"): 19 | fields = line.split() 20 | if len(fields) != 15: 21 | continue 22 | 23 | divergence, chrom, start, end, repeat_id, family = float(fields[1]), fields[4], int(fields[5]), int(fields[6]), fields[9], fields[10] 24 | if end - start < MIN_REPEAT: 25 | continue 26 | 27 | chr_trees[chrom][start:end] = (repeat_id, family, start) 28 | num_rec += 1 29 | print(num_rec) 30 | 31 | return chr_trees 32 | 33 | 34 | def get_bed_intervals(filename): 35 | """ 36 | Parses bed file with intervals and returns index 37 | """ 38 | chr_trees = defaultdict(IntervalTree) 39 | 40 | for line in open(filename, "r"): 41 | if line.startswith("#"): 42 | continue 43 | 44 | fields = line.strip().split() 45 | chrom, start, end = fields[0], int(fields[1]), int(fields[2]) 46 | chr_trees[chrom][start:end] = (start, end) 47 | 48 | return chr_trees 49 | 50 | 51 | def get_vcf_breakpoints(filename): 52 | """ 53 | Extracts breakpoint coordinates from a vcf 54 | """ 55 | chr_trees = defaultdict(IntervalTree) 56 | for line in open(filename, "r"): 57 | if line.startswith("#"): 58 | continue 59 | 60 | fields = line.strip().split() 61 | chr_1, pos_1, info = fields[0], int(fields[1]), fields[7] 62 | chr_2, pos_2 = None, None 63 | tags = info.split(";") 64 | for tag in tags: 65 | if tag.startswith("CHR2"): 66 | chr_2 = tag[5:] 67 | if tag.startswith("END"): 68 | pos_2 = int(tag[4:]) 69 | 70 | if chr_2 is None: 71 | chr_2 = chr_1 72 | chr_trees[chr_1][pos_1 : pos_1 + 1] = (chr_1, pos_1) 73 | if pos_2 is not None: 74 | chr_trees[chr_2][pos_2 : pos_2 + 1] = (chr_2, pos_2) 75 | 76 | return chr_trees 77 | 78 | 79 | MindaEntry = namedtuple("MindaEntry", ["minda_num", "minda_str", "chr_x", "pos_x", "list_x", "chr_y", "pos_y", 80 | "list_y", "sv_type", "sv_len", "vaf", "support", "original_line", "is_ensemble"]) 81 | def parse_minda_csv(filename, num_callsets): 82 | """ 83 | Parses Minda output file 84 | """ 85 | callset_list = None 86 | minda_num = 0 87 | minda_entries = {} 88 | 89 | TOOLS_BEGIN = 13 90 | 91 | for line in open(filename, "r"): 92 | if line.startswith("#"): 93 | callset_list = line.strip().split("\t")[TOOLS_BEGIN : TOOLS_BEGIN + num_callsets] 94 | print("Callsets:", callset_list) 95 | continue 96 | 97 | fields = line.strip().split("\t") 98 | 99 | support_dict = {} 100 | for (caller, supp) in zip(callset_list, fields[TOOLS_BEGIN : TOOLS_BEGIN + num_callsets]): 101 | support_dict[caller] = True if supp != "False" else False 102 | is_ensemble = (fields[12] == "True") 103 | 104 | #if not len(fields[10]): 105 | # print(line) 106 | # continue 107 | 108 | #VAF not always available, default to 1.0 109 | try: 110 | vaf = float(fields[10]) 111 | except ValueError: 112 | vaf = 1.0 113 | 114 | entry = MindaEntry(minda_num=minda_num, minda_str=fields[10], 115 | chr_x=fields[0], pos_x=int(fields[1]), list_x=fields[3], 116 | chr_y=fields[4], pos_y=int(fields[5]), list_y=fields[7], 117 | sv_type=fields[8], sv_len=int(fields[9]), vaf=vaf, 118 | support=support_dict, original_line=line.strip(), is_ensemble=is_ensemble) 119 | 
minda_entries[minda_num] = entry 120 | minda_num += 1 121 | 122 | return minda_entries 123 | 124 | 125 | def get_confident_calls(minda_records): 126 | """ 127 | Extracts confident calls based on given minimum support 128 | """ 129 | confident_calls = set() 130 | for rec in minda_records.values(): 131 | if rec.is_ensemble: 132 | confident_calls.add(rec.minda_num) 133 | 134 | """ 135 | callset_list = list(next(iter(minda_records.values())).support.keys()) 136 | for rec in minda_records.values(): 137 | support_tools = set(x for x in rec.support if rec.support[x]) 138 | against_tools = set(callset_list) - support_tools 139 | techs = set([x.split("_")[-1] for x in support_tools]) 140 | if len(support_tools) >= min_tools and len(techs) >= min_tech: 141 | confident_calls.add(rec.minda_num) 142 | """ 143 | 144 | return confident_calls 145 | 146 | 147 | def filter_calls(minda_records, min_vaf, min_sv_len, remove_ins): 148 | filtered_minda = {} 149 | for rec in minda_records.values(): 150 | if rec.vaf < min_vaf: 151 | continue 152 | #if rec.pos_y == 0: 153 | # continue 154 | if rec.chr_x == rec.chr_y and abs(rec.pos_y - rec.pos_x) < min_sv_len: 155 | continue 156 | if remove_ins and rec.sv_type == "INS": 157 | continue 158 | 159 | filtered_minda[rec.minda_num] = rec 160 | return filtered_minda 161 | 162 | 163 | def stratify_breakends(minda_records, confident_calls, annotation_dir, remove_insertions): 164 | """ 165 | Performs various types of stratification 166 | """ 167 | repeatmasker_file = os.path.join(annotation_dir, REPEAT_MASKER) 168 | segdup_file = os.path.join(annotation_dir, SEGDUPS_BED) 169 | vntr_file = os.path.join(annotation_dir, VNTR_BED) 170 | chr_sizes_file = os.path.join(annotation_dir, CHR_LEN_BED) 171 | 172 | index_repeatmasker = parse_repeatmasker(repeatmasker_file) 173 | index_segdup = get_bed_intervals(segdup_file) 174 | index_vntr = get_bed_intervals(vntr_file) 175 | #index_germline = get_vcf_breakpoints(germline_vcf) 176 | 177 | #chr sizes 178 | chr_sizes = defaultdict(int) 179 | for line in open(chr_sizes_file, "r"): 180 | fields = line.split() 181 | chr_sizes[fields[0]] = int(fields[1]) 182 | 183 | #cluster indexing 184 | index_ensemble = defaultdict(IntervalTree) 185 | for entry in minda_records.values(): 186 | index_ensemble[entry.chr_x][entry.pos_x : entry.pos_x + 1] = entry 187 | index_ensemble[entry.chr_y][entry.pos_y : entry.pos_y + 1] = entry 188 | 189 | strat_categories = ["hom_repeat", "segdup", "vntr", "low_vaf", "low_len", 190 | "bnd_dup", "bnd_chain"] 191 | if not remove_insertions: 192 | strat_categories = ["insertion"] + strat_categories 193 | strat_entries = defaultdict(set) 194 | 195 | def _get_intervals(entry, index, threshold): 196 | ovlp_1 = index[entry.chr_x][entry.pos_x - threshold : entry.pos_x + threshold] 197 | ovlp_2 = index[entry.chr_y][entry.pos_y - threshold : entry.pos_y + threshold] 198 | return [o[2] for o in ovlp_1], [o[2] for o in ovlp_2] 199 | 200 | def _support_tools(rec): 201 | return set(x for x in rec.support if rec.support[x]) 202 | 203 | def _coords_number(entries): 204 | clusters = [] 205 | for e in entries: 206 | for (c, p) in [(e.chr_x, e.pos_x), (e.chr_y, e.pos_y)]: 207 | match = False 208 | for cl in clusters: 209 | if cl[0] == c and abs(cl[1] - p) <= BND_AREA: 210 | match = True 211 | if not match: 212 | clusters.append((c, p)) 213 | return len(clusters) 214 | 215 | def _is_duplicate(e1, e2): 216 | match_x = e1.chr_x == e2.chr_x and abs(e1.pos_x - e2.pos_x) <= BND_AREA 217 | match_y = e1.chr_y == e2.chr_y and abs(e1.pos_y - 
e2.pos_y) <= BND_AREA 218 | cross_x = e1.chr_x == e2.chr_y and abs(e1.pos_x - e2.pos_y) <= BND_AREA 219 | cross_y = e1.chr_y == e2.chr_x and abs(e1.pos_y - e2.pos_x) <= BND_AREA 220 | return (match_x and match_y) or (cross_x and cross_y) 221 | 222 | for entry in minda_records.values(): 223 | #Homologous repeats at breakends 224 | ovlp_1, ovlp_2 = _get_intervals(entry, index_repeatmasker, REPEAT_AREA) 225 | for x in ovlp_1: 226 | for y in ovlp_2: 227 | if x[1] == y[1] and x[2] != y[2]: #same repeat family, but different repeat 228 | strat_entries[entry.minda_num].add("hom_repeat") 229 | 230 | #same segdup section 231 | segdup_1, segdup_2 = _get_intervals(entry, index_segdup, REPEAT_AREA) 232 | if len(set(s[0] for s in segdup_1) & set(s[0] for s in segdup_2)) > 0: 233 | #if len(segdup_1) > 0 and len(segdup_2) > 0: 234 | strat_entries[entry.minda_num].add("segdup") 235 | 236 | #same vntr section 237 | vntr_1, vntr_2 = _get_intervals(entry, index_vntr, REPEAT_AREA) 238 | if len(set(s[0] for s in vntr_1) & set(s[0] for s in vntr_2)) > 0: 239 | #if len(vntr_1) > 0 and len(vntr_2) > 0: 240 | strat_entries[entry.minda_num].add("vntr") 241 | 242 | #low-ish vaf 243 | if entry.vaf < LOW_VAF: 244 | strat_entries[entry.minda_num].add("low_vaf") 245 | 246 | if entry.chr_x == entry.chr_y and abs(entry.pos_y - entry.pos_x) < LOW_LEN: 247 | strat_entries[entry.minda_num].add("low_len") 248 | 249 | #is insertion 250 | if entry.sv_type == "INS": 251 | strat_entries[entry.minda_num].add("insertion") 252 | 253 | """ 254 | #near telomere 255 | if min(entry.pos_x, chr_sizes[entry.chr_x] - entry.pos_x) < TELOMERE_LEN or \ 256 | min(entry.pos_y, chr_sizes[entry.chr_y] - entry.pos_y) < TELOMERE_LEN: 257 | strat_entries[entry.minda_num].add("telomere") 258 | """ 259 | 260 | """ 261 | #near germline SV breakpoints 262 | germ_1, germ_2 = _get_intervals(entry, index_germline, BND_AREA) 263 | if len(germ_1) > 0 and len(germ_2) > 0: 264 | strat_entries[entry.minda_num].add("germline") 265 | 266 | """ 267 | 268 | """ 269 | #near multiple condifent breakpoints 270 | if len(set(o.minda_num for o in ens_bnds_1 if o.minda_num in confident_calls)) > 1 or \ 271 | len(set(o.minda_num for o in ens_bnds_2 if o.minda_num in confident_calls)) > 1: 272 | strat_entries[entry.minda_num].add("truth_cluster") 273 | """ 274 | 275 | ens_bnds_1, ens_bnds_2 = _get_intervals(entry, index_ensemble, BND_AREA) 276 | 277 | #duplication 278 | for r in ens_bnds_1: 279 | if r != entry and _is_duplicate(r, entry): 280 | strat_entries[entry.minda_num].add("bnd_dup") 281 | strat_entries[r.minda_num].add("bnd_dup") 282 | for r in ens_bnds_2: 283 | if r != entry and _is_duplicate(r, entry): 284 | strat_entries[entry.minda_num].add("bnd_dup") 285 | strat_entries[r.minda_num].add("bnd_dup") 286 | 287 | #3+ chain of breakends 288 | left_chain, right_chain = None, None 289 | extra_chain_entries = set() 290 | for r in ens_bnds_1: 291 | if r != entry and not _is_duplicate(r, entry): 292 | left_chain = r 293 | extra_chain_entries.add(r.minda_num) 294 | for r in ens_bnds_2: 295 | if r != entry and not _is_duplicate(r, entry): 296 | right_chain = r 297 | extra_chain_entries.add(r.minda_num) 298 | 299 | if None not in [left_chain, right_chain] and left_chain != right_chain: 300 | strat_entries[entry.minda_num].add("bnd_chain") 301 | for e in extra_chain_entries: 302 | strat_entries[e].add("bnd_chain") 303 | 304 | #print(entry.original_line + "\t" + ",".join(list(strat_entries[entry.minda_num]))) 305 | 306 | return strat_categories, strat_entries 307 | 308 | 309 | 
def compute_fp_fn(minda_records, confident_calls, strat_category, strat_entries, print_table): 310 | callset_list = list(next(iter(minda_records.values())).support.keys()) 311 | callset_list.sort(key=lambda x: x.split("_")[0]) 312 | tools_tp, tools_fp, tools_fn, tools_f1 = defaultdict(set), defaultdict(set), defaultdict(set), defaultdict(float) 313 | tools_recall, tools_precision = defaultdict(set), defaultdict(set) 314 | 315 | for rec in minda_records.values(): 316 | support_tools = set(x for x in rec.support if rec.support[x]) 317 | against_tools = set(callset_list) - support_tools 318 | 319 | if strat_category is not None: 320 | estrat = strat_entries[rec.minda_num] 321 | if len(estrat) == 0: 322 | estrat = set(["Unclassified"]) 323 | if strat_category not in estrat: 324 | continue 325 | #if strat_category == "Unclassified" and rec.minda_num in confident_calls: 326 | # print("FN", rec.chr_x, rec.pos_x, rec.chr_y, rec.pos_y, against_tools) 327 | 328 | if rec.minda_num in confident_calls: 329 | for tool in support_tools: 330 | tools_tp[tool].add(rec.minda_num) 331 | for tool in against_tools: 332 | tools_fn[tool].add(rec.minda_num) 333 | else: 334 | for tool in support_tools: 335 | tools_fp[tool].add(rec.minda_num) 336 | 337 | if print_table: 338 | print(f"\n=== Stratifying by: {strat_category} ===\n") 339 | print("#Tool\tTP\tFP\tFN\tprecision\trecall\tF1") 340 | 341 | for tool in callset_list: 342 | if len(tools_tp[tool]) > 0: 343 | precision = len(tools_tp[tool]) / (len(tools_tp[tool]) + len(tools_fp[tool])) 344 | recall = len(tools_tp[tool]) / (len(tools_tp[tool]) + len(tools_fn[tool])) 345 | f1_score = 2 * precision * recall / (precision + recall) 346 | else: 347 | precision, recall, f1_score = 0, 0, 0 348 | tp, fp, fn, = len(tools_tp[tool]), len(tools_fp[tool]), len(tools_fn[tool]) 349 | tools_f1[tool] = f1_score 350 | tools_recall[tool] = recall 351 | tools_precision[tool] = precision 352 | 353 | if print_table: 354 | if PRETTY_PRINT: 355 | print(f"{tool:20s}\t{tp}\t{fp}\t{fn}\t{precision:.4f}\t{recall:.4f}\t{f1_score:.4f}") 356 | else: 357 | print(f"{tool}\t{tp}\t{fp}\t{fn}\t{precision:.4f}\t{recall:.4f}\t{f1_score:.4f}") 358 | 359 | return tools_tp, tools_fp, tools_fn, tools_recall, tools_precision, tools_f1 360 | 361 | 362 | def summary_errors(minda_records, confident_calls, categories, strat_entries): 363 | callset_list = list(next(iter(minda_records.values())).support.keys()) 364 | callset_list.sort(key=lambda x: x.split("_")[0]) 365 | 366 | by_tool_fp = defaultdict(dict) 367 | by_tool_fn = defaultdict(dict) 368 | by_tool_tp = defaultdict(dict) 369 | by_tool_f1_score = defaultdict(dict) 370 | by_tool_recall = defaultdict(dict) 371 | by_tool_precision = defaultdict(dict) 372 | for cat in categories: 373 | tp, fp, fn, recall, precision, f1_score = \ 374 | compute_fp_fn(minda_records, confident_calls, cat, strat_entries, print_table=False) 375 | for tool in tp: 376 | by_tool_tp[tool][cat] = len(tp[tool]) 377 | by_tool_fp[tool][cat] = len(fp[tool]) 378 | by_tool_fn[tool][cat] = len(fn[tool]) 379 | by_tool_f1_score[tool][cat] = "{:.4f}".format(f1_score[tool]) 380 | by_tool_recall[tool][cat] = "{:.4f}".format(recall[tool]) 381 | by_tool_precision[tool][cat] = "{:.4f}".format(precision[tool]) 382 | 383 | def print_with(stats, title): 384 | print(f"\n\t == {title} == \n") 385 | #print("{:20s}\t".format("#Tool") + "\t".join(categories)) 386 | print("{}\t".format("#Tool") + "\t".join(categories)) 387 | for tool in callset_list: 388 | numbers = [str(stats[tool][cat]) for cat in 
categories] 389 | if PRETTY_PRINT: 390 | print(f"{tool:20s}\t" + "\t".join(numbers)) 391 | else: 392 | print(f"{tool}\t" + "\t".join(numbers)) 393 | 394 | print_with(by_tool_tp, "True positives") 395 | print_with(by_tool_fp, "False positives") 396 | print_with(by_tool_fn, "False negatives") 397 | print_with(by_tool_recall, "Recall") 398 | print_with(by_tool_precision, "Precision") 399 | print_with(by_tool_f1_score, "F1 scores") 400 | 401 | 402 | #annotation filenames 403 | REPEAT_MASKER = "repeatmasker.out" 404 | SEGDUPS_BED = "segdups.bed" 405 | VNTR_BED = "trf.bed" 406 | CHR_LEN_BED = "chr.fasta.fai" 407 | 408 | #stratification 409 | MIN_REPEAT = 100 410 | REPEAT_AREA = 5 411 | BND_AREA = 500 412 | LOW_VAF = 0.10 413 | LOW_LEN = 100 414 | #TELOMERE_LEN = 10000 415 | 416 | #evaluation 417 | #MIN_TECH = 2 418 | #MIN_TOOLS = 4 419 | MIN_VAF = 0.00 420 | MIN_SV_LEN = 50 421 | REMOVE_INS = False 422 | PRETTY_PRINT = True 423 | 424 | 425 | def minda_stratification(annotation_dir, minda_support_tsv, num_callsets): 426 | print(f"Params: min_vaf:{MIN_VAF} insertions_removed:{REMOVE_INS}") 427 | 428 | minda_records = parse_minda_csv(minda_support_tsv, num_callsets) 429 | minda_records = filter_calls(minda_records, MIN_VAF, MIN_SV_LEN, remove_ins=REMOVE_INS) 430 | confident_calls = get_confident_calls(minda_records) 431 | compute_fp_fn(minda_records, confident_calls, None, None, print_table=True) 432 | 433 | strat_categories, strat_calls = stratify_breakends(minda_records, confident_calls, annotation_dir, 434 | REMOVE_INS) 435 | categories_unk = strat_categories + ["Unclassified"] 436 | 437 | for category in categories_unk: 438 | compute_fp_fn(minda_records, confident_calls, category, strat_calls, print_table=True) 439 | summary_errors(minda_records, confident_calls, categories_unk, strat_calls) 440 | 441 | 442 | def main(): 443 | if len(sys.argv) != 4: 444 | print("Usage: minda_stratify.py annotation_dir minda_support_tsv num_callsets") 445 | return 1 446 | 447 | annotation_dir = sys.argv[1] 448 | #germline_vcf = sys.argv[2] 449 | minda_csv = sys.argv[2] 450 | num_callsets = int(sys.argv[3]) 451 | minda_stratification(annotation_dir, minda_csv, num_callsets) 452 | 453 | 454 | if __name__ == "__main__": 455 | main() 456 | -------------------------------------------------------------------------------- /minda/decompose.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | import pandas as pd 5 | import numpy as np 6 | import gzip 7 | from collections import Counter 8 | from pybedtools import BedTool 9 | 10 | logger = logging.getLogger() 11 | 12 | 13 | def _is_vcf_gz(vcf): 14 | with gzip.open(vcf, 'r') as file: 15 | try: 16 | file.read(1) 17 | return True 18 | except OSError: 19 | return False 20 | 21 | 22 | def get_caller_name(vcf): 23 | """ 24 | Extracts the name of the caller from vcf. 
25 | 26 | """ 27 | is_vcf_gz = _is_vcf_gz(vcf) 28 | if is_vcf_gz == False: 29 | with open(vcf, 'r') as file: 30 | caller_name = _caller_name(file) 31 | else: 32 | with gzip.open(vcf, 'rt') as file: 33 | caller_name = _caller_name(file) 34 | 35 | return caller_name 36 | 37 | 38 | def _caller_name(file): 39 | found_source = False 40 | for line in file: 41 | if line.startswith("##source"): 42 | source_line = line.strip() 43 | line_length = len(source_line) 44 | start = source_line.find("=") + 1 45 | if line_length > 50: 46 | stop = source_line.find(" ") 47 | if stop == -1: 48 | stop = 51 49 | else: 50 | stop = line_length 51 | 52 | caller_name = source_line[start:stop] 53 | found_source = True 54 | 55 | if not found_source: 56 | caller_name = "Unknown source" 57 | return caller_name 58 | 59 | 60 | 61 | def get_df(vcf): 62 | """ 63 | Create a df from vcf. 64 | 65 | """ 66 | is_vcf_gz = _is_vcf_gz(vcf) 67 | if is_vcf_gz == False: 68 | df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,4,6,7], header=None, dtype={'#CHROM': 'str', 'POS':'Int64'}) 69 | else: 70 | df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,4,6,7], header=None, compression='gzip', dtype={'#CHROM': 'str', 'POS':'Int64'}) 71 | df.columns = ['#CHROM', 'POS', 'ID', 'ALT', 'FILTER', 'INFO'] 72 | 73 | return df 74 | 75 | 76 | def get_intersected_df(vcf, bed): 77 | """ 78 | Create a df that only includes records that interesect intervals of the bed file. 79 | 80 | """ 81 | bed_to_bt = BedTool(bed) 82 | vcf_to_bt = BedTool(vcf) 83 | intersect_obj = vcf_to_bt.intersect(bed_to_bt, u=True) 84 | df = BedTool.to_dataframe(intersect_obj, header=None, usecols=[0,1,2,4,6,7], dtype={'#CHROM': 'str', 'POS':'int'}) 85 | df.columns = ['#CHROM', 'POS', 'ID', 'ALT', 'FILTER', 'INFO'] 86 | return df 87 | 88 | 89 | def _get_sorted_df(df): 90 | """ 91 | Sorts dataframe by #CHROM and POS 92 | 93 | """ 94 | # handles instances where chromosomes are only integers 95 | df["#CHROM"] = df["#CHROM"].astype(str) 96 | chrom_value = df.iloc[0,0] 97 | if chrom_value.startswith("chr"): 98 | chrom_set = set(df["#CHROM"].str.slice(start=3).to_list()) 99 | else: 100 | chrom_set = set(df["#CHROM"].to_list()) 101 | 102 | str_chrom_list = [] 103 | int_chrom_list = [] 104 | for chrom_str in chrom_set: 105 | try: 106 | chrom = int(chrom_str) 107 | int_chrom_list.append(chrom) 108 | except ValueError: 109 | chrom = chrom_str 110 | str_chrom_list.append(chrom) 111 | if chrom_value.startswith("chr"): 112 | chrom_sort = sorted(int_chrom_list) + sorted(str_chrom_list) 113 | chrom_sort = ['chr' + str(chrom_sort[i]) for i in range(len(chrom_sort))] 114 | else: 115 | chrom_sort = sorted(int_chrom_list) + sorted(str_chrom_list) 116 | chrom_sort = [str(chrom_sort[i]) for i in range(len(chrom_sort))] 117 | 118 | df = df.sort_values(by=['#CHROM', 'POS'], key=lambda x: x.map({v: i for i, v in enumerate(chrom_sort)})).reset_index(drop=True) 119 | 120 | return df 121 | 122 | 123 | def _get_alt_mate_index(df): 124 | 125 | #create mate df & create list of df values 126 | alt_df = df.ALT.str.extract(r'((chr)?\w+):(\d+)').rename(columns={0: "MATE_#CHROM", 1: "regex_noncap_group ", 2:"MATE_POS"}) 127 | alt_df.MATE_POS = alt_df.MATE_POS.astype(pd.Int64Dtype()) 128 | alt_df.drop(columns="regex_noncap_group ", inplace=True) 129 | mate_df = df[['#CHROM','POS']].merge(alt_df, left_index=True, right_index=True) 130 | mate_df_list = mate_df.values.tolist() 131 | 132 | # find the index of mate record 133 | mate_indices = [] 134 | for i in range(len(mate_df_list)): 135 | record = 
mate_df_list[i] 136 | chrom = record[0] 137 | pos = record[1] 138 | mate_chrom = record[2] 139 | mate_pos = record[3] 140 | 141 | matching_rows = mate_df[(mate_df['#CHROM'] == mate_chrom) & \ 142 | (mate_df['POS'] == mate_pos) & \ 143 | (mate_df['MATE_#CHROM'] == chrom) & \ 144 | (mate_df['MATE_POS'] == pos)] 145 | 146 | matching_row_indices = matching_rows.index.to_list() 147 | matching_row_indices = [index for index in matching_row_indices if index != i] 148 | if len(matching_row_indices) == 1: 149 | mate_index = matching_row_indices[0] 150 | else: 151 | mate_index = -1 152 | mate_indices.append(mate_index) 153 | 154 | df["MATE_INDEX"] = mate_indices 155 | 156 | # get mate results 157 | unique_indices_count = df['MATE_INDEX'].nunique() 158 | first_unique_index = df["MATE_INDEX"].value_counts().to_frame().index[0] 159 | unpaired_recrods_count = len(df[df["MATE_INDEX"] == -1]) 160 | paired_records_count = len(df[df["MATE_INDEX"] != -1]) 161 | 162 | logger.info(f"Number of unique indices: {unique_indices_count}") 163 | if unique_indices_count == len(df): 164 | logger.info(f"{paired_records_count} paired records found...") 165 | elif unique_indices_count == 1 and first_unique_index == -1: 166 | logger.info("No paired records found...") 167 | else: 168 | logger.info(f"{paired_records_count} paired records and {unpaired_recrods_count} unpaired records found...") 169 | 170 | return df 171 | 172 | 173 | def _get_paired_alt_dfs(alt_df): 174 | 175 | # check if BNDS are a single record or two 176 | mask = alt_df['MATE_INDEX'] == -1 177 | singleton_df = alt_df[mask] 178 | singleton_count = len(singleton_df) 179 | if singleton_count == alt_df.shape[0]: # ALT records are singletons 180 | logger.debug(f"(1) Number of singleton records: {singleton_count} {alt_df.shape[0]}") 181 | alt_df_1 = alt_df.copy() 182 | alt_df_2 = alt_df.copy() 183 | alt_df_2['#CHROM'] = alt_df_2.ALT.str.extract(r'(chr\w+|\w+):')[0].to_list() 184 | alt_df_2['POS'] = alt_df_2.ALT.str.extract(r':(\d+)')[0].astype(pd.Int64Dtype()).to_list() 185 | paired_alt_dfs = [alt_df_1, alt_df_2] 186 | logger.debug(f"(1) Number of alt/alt_1/alt_2 records: {alt_df.shape[0]} {alt_df_1.shape[0]} {alt_df_2.shape[0]}") 187 | logger.info(f"Number of paired records paired by ALT column: {alt_df_1.shape[0]} {alt_df_2.shape[0]}") 188 | logger.info(f"Number of unpaired records paired by MATE_ID: 0 0") 189 | 190 | # alt_df pairs based on mate index 191 | else: 192 | logger.debug(f"(2) Number of singleton/alt records: {singleton_count} {alt_df.shape[0]}") 193 | alt_df_1 = alt_df[(alt_df.index < alt_df.MATE_INDEX) & (alt_df.MATE_INDEX != -1)] 194 | alt_df_2 = alt_df[(alt_df.index > alt_df.MATE_INDEX) & (alt_df.MATE_INDEX != -1)] 195 | alt_df_2.index = alt_df_2["MATE_INDEX"].to_list() 196 | paired_alt_dfs = [alt_df_1, alt_df_2] 197 | 198 | mateless_alt_df = alt_df[alt_df.MATE_INDEX == -1] 199 | logger.debug(f"(2a) Number of alt/alt_1/alt_2/mateless records: {len(alt_df)} {len(alt_df_1)} {len(alt_df_2)} {len(mateless_alt_df)}") 200 | logger.info(f"Number of paired records paired by ALT column: {alt_df_1.shape[0]} {alt_df_2.shape[0]}") 201 | 202 | # alt_df pairs based on MATEID in INFO 203 | if mateless_alt_df.shape[0] > 0: 204 | 205 | mate_id_alt_df = _get_mate_id_df(mateless_alt_df) 206 | mate_id_alt_df_1 = mate_id_alt_df[(mate_id_alt_df.index < mate_id_alt_df.MATE_INDEX) & (mate_id_alt_df.MATE_INDEX != -1)] 207 | mate_id_alt_df_2 = mate_id_alt_df[(mate_id_alt_df.index > mate_id_alt_df.MATE_INDEX) & (mate_id_alt_df.MATE_INDEX != -1)] 208 | 
mate_id_alt_df_2.index = mate_id_alt_df_2["MATE_INDEX"].to_list() 209 | alt_to_info_df = mate_id_alt_df[mate_id_alt_df.MATE_INDEX == -1] # for SEVERUS INS 210 | logger.debug(f"(2b) Number of mate_id/mate_id/mate_id/alt_to_info records: {mate_id_alt_df.shape[0]} {mate_id_alt_df_1.shape[0]} {mate_id_alt_df_2.shape[0]} {alt_to_info_df.shape[0]}") 211 | #alt_df_1 = pd.concat([alt_df_1, mate_id_alt_df_1]) 212 | #alt_df_2 = pd.concat([alt_df_2, mate_id_alt_df_2]) 213 | non_empty_1 = [df for df in [alt_df_1, mate_id_alt_df_1] if not df.empty] 214 | alt_df_1 = pd.concat(non_empty_1).sort_index() 215 | non_empty_2 = [df for df in [alt_df_2, mate_id_alt_df_2] if not df.empty] 216 | alt_df_2 = pd.concat(non_empty_2).sort_index() 217 | 218 | paired_alt_dfs = [alt_df_1, alt_df_2] 219 | logger.debug(f"(2b) Total number of alt_1/alt_2 records: {alt_df_1.shape[0]} {alt_df_2.shape[0]}") 220 | if alt_to_info_df.shape[0] > 0: 221 | paired_alt_dfs.append(alt_to_info_df) 222 | logger.info(f"Number of unpaired records paired by MATE_ID: {len(mate_id_alt_df_1)} {len(mate_id_alt_df_2)}") 223 | else: 224 | logger.info(f"Number of unpaired records paired by MATE_ID: 0 0") 225 | return paired_alt_dfs 226 | 227 | 228 | def _get_mate_id_df(df): 229 | """ 230 | Finds the index of the mate ID listed in INFO in the ID column. If not found, index of -1 assigned. 231 | 232 | """ 233 | mate_id_pattern = r'MATEID=([^;]+)(?=;|$)' 234 | mate_id_list = df.INFO.str.extract(mate_id_pattern)[0].to_list() 235 | 236 | mate_indices = [] 237 | for i in range(len(mate_id_list)): 238 | mate_id = mate_id_list[i] 239 | matching_rows = df[(df['ID'] == mate_id)] 240 | matching_row_indices = matching_rows.index 241 | 242 | if len(matching_row_indices) == 1: 243 | mate_index = matching_row_indices[0] 244 | else: 245 | mate_index = -1 246 | mate_indices.append(mate_index) 247 | 248 | #df['MATE_INDEX'] = mate_indices 249 | df.loc[:, 'MATE_INDEX'] = mate_indices 250 | 251 | return df 252 | 253 | 254 | def _get_paired_info_dfs(info_df): 255 | 256 | mate_pos_list = info_df.INFO.str.extract(r'SVLEN=(-?\d+)')[0].astype(pd.Int64Dtype()).abs().to_list() 257 | info_df_1 = info_df.copy() 258 | info_df_2 = info_df.copy() 259 | info_df_2['POS'] = info_df_2['POS'] + mate_pos_list 260 | info_df_2['END'] = info_df_2.INFO.str.extract(r'END=(-?\d+)')[0].astype(pd.Int64Dtype()).abs().to_list() 261 | #info_df_2['POS'].fillna(info_df_2['END'], inplace=True) 262 | info_df_2.fillna({'POS':info_df_2['END']}, inplace=True) 263 | nan_indices = info_df_2[info_df_2['POS'].isna()].index 264 | info_df_1 =info_df_1.drop(index=nan_indices, errors='ignore') 265 | info_df_2 =info_df_2.drop(index=nan_indices, errors='ignore') 266 | dropped_singleton_count = info_df.shape[0] - info_df_1.shape[0] 267 | 268 | logger.info(f"Number of unpaired records paired by INFO column: {info_df_1.shape[0]} {info_df_2.shape[0]}") 269 | logger.info(f"Number of singleton records dropped: {dropped_singleton_count}") 270 | 271 | return info_df_1, info_df_2 272 | 273 | 274 | def _check_df_order(df_1, df_2): 275 | 276 | # row by row for start and end df, check that the order by sorting 277 | for i in range(len(df_1)): 278 | 279 | df_number = [0] 280 | row_1 = df_1.iloc[i].to_frame().T 281 | row_1['row_number'] = df_number 282 | 283 | df_number = [1] 284 | row_2 = df_2.iloc[i].to_frame().T 285 | row_2['row_number'] = df_number 286 | 287 | order_df = pd.concat([row_1, row_2]).reset_index(drop=True) 288 | sorted_order_df = _get_sorted_df(order_df) 289 | 290 | # if sort is out of order, what the chrom & 
pos values of the start & end dfs 291 | if order_df.equals(sorted_order_df) == False: 292 | df_1.at[i,'#CHROM'] = sorted_order_df.iloc[0]['#CHROM'] 293 | df_1.at[i, 'POS'] = sorted_order_df.iloc[0]['POS'] 294 | df_2.at[i,'#CHROM'] = sorted_order_df.iloc[1]['#CHROM'] 295 | df_2.at[i, 'POS'] = sorted_order_df.iloc[1]['POS'] 296 | 297 | return df_1, df_2 298 | 299 | 300 | def _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir): 301 | id_difference_list = sorted(list(id_set_difference)) 302 | path = f'{out_dir}/{sample_name}_removed_records.txt' 303 | file_check = os.path.isfile(path) 304 | with open(path,'a') as file: 305 | if file_check == False: 306 | file.write('REMOVED RECORDS\n') 307 | if written_count == 0: 308 | step = "***** " + caller_name + " *****" + "\n" + step 309 | file.write(f'\n{step}\n') 310 | file.write('\n'.join(id_difference_list)) 311 | file.write('\n') 312 | 313 | def get_decomposed_dfs(caller_name, df, filter, min_size, prefixed, vaf, sample_name, out_dir): 314 | """ 315 | Decomposes df records into start and end dfs. 316 | 317 | """ 318 | logger.info(f"DECOMPOSING {caller_name} RECORDS...") 319 | logger.info(f"Original number of records: {len(df)}") 320 | 321 | written_count = 0 322 | original_id_set = set(df.ID.to_list()) 323 | 324 | # filter_df 325 | if filter != None: 326 | df = df[df['FILTER'].isin(filter)] 327 | logger.info(f"Number of records after filtering by FILTER column: {len(df)}") 328 | 329 | # write removed ids to txt 330 | filter_id_set = set(df.ID.to_list()) 331 | id_set_difference = original_id_set.difference(filter_id_set) 332 | if len(list(id_set_difference)) > 0: 333 | step = "FILTER" 334 | _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir) 335 | written_count +=1 336 | 337 | # sort df 338 | df = _get_sorted_df(df) 339 | 340 | # change EVENTTYPE to SVTYPE (for GRIDSS/GRIPSS) 341 | # create SVTYPE column 342 | df.loc[:, 'INFO'] = df.INFO.str.replace('EVENTTYPE', 'SVTYPE') 343 | svtype_pattern = r'SVTYPE=([^;]+)(?=;|$)' 344 | svtype_column = df.INFO.str.extract(svtype_pattern)[0].to_list() 345 | df["SVTYPE"] = svtype_column 346 | 347 | # create SVLEN column 348 | # change SVINSLEN to SVLEN only if SVLEN not in info (for nanomonsv) 349 | df.loc[~df['INFO'].str.contains('SVLEN', na=False), 'INFO'] = df['INFO'].str.replace('SVINSLEN', 'SVLEN') 350 | 351 | # add VAF column 352 | # if vaf != None: 353 | # df['VAF'] = df.INFO.str.extract(r';VAF=([\d.]+)')[0].astype('float').to_list() 354 | # if df.VAF.isnull().all() == True: 355 | # sys.exit(f"No VAF values found in {caller_name} VCF. Run Minda without --vaf parameter or add VAF to INFO. ") 356 | if vaf == None: 357 | df['VAF'] = "*" 358 | else: 359 | df['VAF'] = df.INFO.str.extract(r';VAF=([\d.]+)')[0].astype('float').to_list() 360 | if df.VAF.isnull().all() == True: 361 | sys.exit(f"No VAF values found in {caller_name} VCF. Run Minda without --vaf parameter or add VAF to INFO. 
") 362 | 363 | # get indices of mate rows 364 | df = _get_alt_mate_index(df) 365 | 366 | # create paired ALT dfs 367 | alt_df = df[df['ALT'].str.contains(r'(?:chr)?\w+:\d+', na=False)].copy() 368 | 369 | #create paired INFO dfs 370 | info_df = df.drop(index=alt_df.index, errors='ignore') 371 | logger.debug(f"Number of INFO records: {info_df.shape[0]}") 372 | 373 | # get ALT paired dfs 374 | paired_alt_dfs = _get_paired_alt_dfs(alt_df) 375 | if len(paired_alt_dfs) == 3: 376 | alt_df_1 = paired_alt_dfs[0] 377 | alt_df_2 = paired_alt_dfs[1] 378 | alt_to_info_df = paired_alt_dfs[2] 379 | 380 | info_df = pd.concat([info_df, alt_to_info_df]) 381 | logger.debug(f"Total number of INFO records: {info_df.shape[0]}") 382 | else: 383 | alt_df_1 = paired_alt_dfs[0] 384 | alt_df_2 = paired_alt_dfs[1] 385 | 386 | 387 | info_df_1, info_df_2 = _get_paired_info_dfs(info_df) 388 | 389 | non_empty_1 = [df for df in [info_df_1, alt_df_1] if not df.empty] 390 | decomposed_df_1 = pd.concat(non_empty_1).sort_index() 391 | non_empty_2 = [df for df in [info_df_2, alt_df_2] if not df.empty] 392 | decomposed_df_2 = pd.concat(non_empty_2).sort_index() 393 | 394 | # write removed ids to txt 395 | singleton_id_set = set(decomposed_df_1.ID.to_list() + decomposed_df_2.ID.to_list()) 396 | id_set_difference = filter_id_set.difference(singleton_id_set) 397 | if len(id_set_difference) > 0: 398 | step = "SINGLETON" 399 | _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir) 400 | written_count += 1 401 | 402 | # check that start and end record are in correct df 403 | decomposed_df_1, decomposed_df_2 = _check_df_order(decomposed_df_1, decomposed_df_2) 404 | 405 | # write removed ids to txt 406 | order_id_set = set(decomposed_df_1.ID.to_list() + decomposed_df_2.ID.to_list()) 407 | id_set_difference = singleton_id_set.difference(order_id_set) 408 | step = "END/START ORDER" 409 | if len(id_set_difference) > 0: 410 | step = "END/START ORDER" 411 | _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir) 412 | written_count += 1 413 | 414 | decomposed_df_1['Minda_ID'] = f'{caller_name}_' + (decomposed_df_1.index + 1).astype(str) 415 | decomposed_df_2['Minda_ID'] = f'{caller_name}_' + (decomposed_df_2.index + 1).astype(str) 416 | logger.info(f"Number of decomposed records after pairing: {decomposed_df_1.shape[0]} {decomposed_df_2.shape[0]}") 417 | 418 | # create SVLEN column determined on start & end df (not all vcfs have SVLEN in INFO) 419 | decomposed_df_1['SVLEN'] = decomposed_df_1.apply(lambda row: -1 if row['#CHROM'] != decomposed_df_2.loc[row.name, '#CHROM'] else int(abs(row['POS'] - decomposed_df_2.loc[row.name, 'POS'])), axis=1) 420 | max_svlen = decomposed_df_1['SVLEN'].max() 421 | 422 | if min_size != None: 423 | decomposed_df_1 = decomposed_df_1[(decomposed_df_1['SVLEN'] >= min_size) | (decomposed_df_1['SVLEN'] == -1)] 424 | decomposed_df_2 = decomposed_df_2[decomposed_df_2.index.isin(decomposed_df_1.index)] 425 | logger.info(f"Number of records after size filtering: {len(decomposed_df_1)} {len(decomposed_df_2)}") 426 | 427 | # write removed ids to txt 428 | svlen_id_set = set(decomposed_df_1.ID.to_list() + decomposed_df_2.ID.to_list()) 429 | id_set_difference = order_id_set.difference(svlen_id_set) 430 | if len(id_set_difference) > 0: 431 | step = "SVLEN" 432 | _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir) 433 | written_count += 1 434 | 435 | # filter low VAFs such that if either the 
start or end VAF is too low, records from both dfs are removed 436 | if vaf != None: 437 | decomposed_df_1 = decomposed_df_1[decomposed_df_1['VAF'] >= vaf] 438 | decomposed_df_2 = decomposed_df_2[decomposed_df_2['VAF'] >= vaf] 439 | minda_ids_list = pd.merge(decomposed_df_1, decomposed_df_2, on='Minda_ID')['Minda_ID'].to_list() 440 | decomposed_df_1 = decomposed_df_1[decomposed_df_1['Minda_ID'].isin(minda_ids_list)] 441 | decomposed_df_2 = decomposed_df_2[decomposed_df_2['Minda_ID'].isin(minda_ids_list)] 442 | logger.info(f"Number of records after VAF filtering: {len(decomposed_df_1)} {len(decomposed_df_2)}") 443 | 444 | # write removed ids to txt 445 | vaf_id_set = set(decomposed_df_1.ID.to_list() + decomposed_df_2.ID.to_list()) 446 | id_set_difference = svlen_id_set.difference(vaf_id_set) 447 | step = "VAF" 448 | if len(id_set_difference) > 0: 449 | step = "VAF" 450 | _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir) 451 | written_count += 1 452 | 453 | logger.info(f"Total number of decomposed records: {decomposed_df_1.shape[0]} {decomposed_df_2.shape[0]}") 454 | 455 | if prefixed == True: 456 | prefix = caller_name.split('_', 1)[0] 457 | decomposed_df_1.ID = prefix + "_" + decomposed_df_1['ID'].astype(str) 458 | decomposed_df_2.ID = prefix + "_" + decomposed_df_2['ID'].astype(str) 459 | 460 | return decomposed_df_1, decomposed_df_2, max_svlen --------------------------------------------------------------------------------
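Illustrative usage sketch (not part of the repository): the snippet below shows one way the decompose helpers above could be driven directly from Python. The input VCF path, output directory, FILTER whitelist, and minimum SV size are hypothetical placeholders chosen for the example; in normal use Minda's command-line interface supplies these values from its parsed arguments.

import os
from minda.decompose import get_caller_name, get_df, get_decomposed_dfs

# Hypothetical inputs and outputs, for illustration only.
vcf = "caller_output.vcf"
out_dir = "minda_example_out"
sample_name = "example_sample"
os.makedirs(out_dir, exist_ok=True)

caller_name = get_caller_name(vcf)   # caller name parsed from the ##source header line
df = get_df(vcf)                     # VCF records as a dataframe (#CHROM, POS, ID, ALT, FILTER, INFO)

# Split each record into start and end breakpoint dataframes.
df_start, df_end, max_svlen = get_decomposed_dfs(
    caller_name, df,
    filter=["PASS"],          # FILTER values to keep (assumed whitelist)
    min_size=50,              # minimum SVLEN; inter-chromosomal records (SVLEN == -1) are kept
    prefixed=False,           # if True, original IDs get the caller-name prefix
    vaf=None,                 # no VAF threshold
    sample_name=sample_name,
    out_dir=out_dir)

print(caller_name, df_start.shape[0], df_end.shape[0], max_svlen)

With vaf=None the VAF column is filled with "*" and no VAF filtering is applied; passing a float threshold instead requires VAF values to be present in the INFO field of the input VCF.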