├── minda ├── __init__.py ├── __version__.py ├── truthset.py ├── stats.py ├── main.py ├── ensemble.py └── decompose.py ├── requirements.txt ├── .gitignore ├── environment.yml ├── scripts ├── add_vaf_severus.py └── add_vaf_new_colo829.py ├── minda.py ├── setup.py ├── annotation ├── README.md └── minda_stratify.py ├── LICENSE ├── README.md └── data └── colo829_benchmark_grch38_amended_vaf.vcf /minda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /minda/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.2" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | pandas 3 | numpy 4 | pybedtools -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | *.swp 3 | *.swo 4 | .DS_Store 5 | __pycache__ 6 | ._* 7 | *.pyc 8 | *.egg-info 9 | build 10 | dist 11 | minda/.ipynb_checkpoints 12 | minda/__pycache__ 13 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: minda 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python>=3.10 8 | - pandas>=2.1.1 9 | - numpy>=1.26.0 10 | - pybedtools>=0.9.1 11 | -------------------------------------------------------------------------------- /scripts/add_vaf_severus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import sys 4 | import pysam 5 | import statistics 6 | 7 | vcf_file = sys.argv[1] 8 | 9 | out_vcf = vcf_file.replace('.vcf' , '_vaf.vcf') 10 | vcf_in=pysam.VariantFile(vcf_file,"r") 11 | vcf_in.header.info.add("VAF",1,"Float","variant_allele_frequency") 12 | vcf_out = pysam.VariantFile(out_vcf, 'w', header=vcf_in.header) 13 | 14 | for record in vcf_in: 15 | sample_id = record.samples.keys()[0] 16 | record.info['VAF'] = record.samples[sample_id]['VAF'] 17 | vcf_out.write(record) 18 | vcf_out.close() -------------------------------------------------------------------------------- /minda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #(c) 2023 by Authors 4 | #This file is a part of Minda. 5 | #Released under the BSD license (see LICENSE file) 6 | 7 | """ 8 | This script sets up environment paths 9 | and invokes Minda without installation. 
10 | """ 11 | 12 | import os 13 | import sys 14 | 15 | def main(): 16 | #Setting executable paths 17 | minda_root = os.path.dirname(os.path.realpath(__file__)) 18 | sys.path.insert(0, minda_root) 19 | 20 | #Minda entry point 21 | from minda.main import main 22 | sys.exit(main()) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() -------------------------------------------------------------------------------- /scripts/add_vaf_new_colo829.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import sys 4 | import pysam 5 | import statistics 6 | 7 | vcf_file = sys.argv[1] 8 | 9 | out_vcf = vcf_file.replace('.vcf' , '_vaf.vcf') 10 | vcf_in=pysam.VariantFile(vcf_file,"r") 11 | vcf_in.header.info.add("VAF",1,"Float","variant_allele_frequency") 12 | vcf_out = pysam.VariantFile(out_vcf, 'w', header=vcf_in.header) 13 | 14 | for record in vcf_in: 15 | support = list(record.info.values()[4]) 16 | support = [sample.split("|") for sample in support] 17 | sample_vafs = [float(sample[2]) for sample in support] 18 | vaf = statistics.median(sample_vafs) 19 | record.info['VAF'] = vaf 20 | vcf_out.write(record) 21 | vcf_out.close() -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import shutil 5 | 6 | try: 7 | import setuptools 8 | except ImportError: 9 | sys.exit("setuptools package not found. " 10 | "Please use 'pip install setuptools' first") 11 | 12 | from setuptools import setup 13 | 14 | # Make sure we're running from the setup.py directory. 15 | script_dir = os.path.dirname(os.path.realpath(__file__)) 16 | if script_dir != os.getcwd(): 17 | os.chdir(script_dir) 18 | 19 | from minda.__version__ import __version__ 20 | 21 | 22 | setup(name='minda', 23 | version=__version__, 24 | description='A tool for somatic structural variant calling using long reads', 25 | url='https://github.com/KolmogorovLab/minda', 26 | author='Asher Bryant', 27 | author_email = 'asher.bryant@nih.gov', 28 | license='BSD-3-Clause', 29 | packages=['minda'], 30 | entry_points={'console_scripts': ['minda = minda.main:main']}, 31 | ) -------------------------------------------------------------------------------- /annotation/README.md: -------------------------------------------------------------------------------- 1 | Minda SV Annotation 2 | =================== 3 | 4 | This folder contain a set of scripts that annotates SV calls ensemble 5 | generated by Minda with various challenging scenarios. We currently annotate the following categories. 6 | More detailed description is availabe in the 7 | [Severus manuscript](https://www.medrxiv.org/content/10.1101/2024.03.22.24304756v1) 8 | 9 | * Insertions 10 | * SVs with breakpoints at matching homologous repeats 11 | * SVs inside segmental duplications 12 | * SVs inside VNTRs 13 | * SVs with low allelic frequency 14 | * Short SVs (<100bp) 15 | * Duplicated SVs 16 | * SV chains 17 | 18 | To run annotation, you'll need Minda output directory and a directory 19 | with annotation for your reference genome. 
Links to download existing 20 | annotations are below: 21 | 22 | ``` 23 | grch38 https://zenodo.org/records/11992284/files/annotation_grch38.tar.gz 24 | ``` 25 | 26 | For example, if you are using grch38: 27 | 28 | ``` 29 | wget https://zenodo.org/records/11992284/files/annotation_grch38.tar.gz 30 | tar -xvf annotation_grch38.tar.gz 31 | ./minda_stratify.py annotation_grch38 minda_support.tsv 11 32 | ``` 33 | 34 | where `minda_support.tsv` is a support file generated by Minda, and `11` is the number of callsets 35 | used to create the Minda ensemble. 36 | 37 | Currently, genome annotations include the following: 38 | * Chromosome lengths (produces using `samtools faidx`) 39 | * Common repeat annotations (produces using `RepeatMasker`) 40 | * Segmental duplications annotation 41 | * VNTR annotations (produced using [findTandemRepeats](https://github.com/PacificBiosciences/pbsv/tree/master/annotations) 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024, National Institutes of Health 2 | License: BSD-3-Clause 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the National Institutes of Health nor the 17 | names of its contributors may be used to endorse or promote products 18 | derived from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 21 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 24 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 25 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 26 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 27 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /minda/truthset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | def get_base_df(decomposed_dfs_list, tolerance, multimatch): 5 | dfs_1 = [dfs_list[0] for dfs_list in decomposed_dfs_list] 6 | dfs_2 = [dfs_list[1] for dfs_list in decomposed_dfs_list] 7 | 8 | # create collective comparison df 9 | start_dfs = pd.concat(dfs_1) 10 | end_dfs = pd.concat(dfs_2) 11 | 12 | # create base df 13 | base_1_df = dfs_1[-1] 14 | base_2_df = dfs_2[-1] 15 | 16 | # find which comparison start loci are within tolerance range of base start loci 17 | base_1_loci = list(zip(base_1_df['#CHROM'], base_1_df['POS'])) 18 | start_loci = list(zip(start_dfs['#CHROM'], start_dfs['POS'])) 19 | base_2_loci = list(zip(base_2_df['#CHROM'], base_2_df['POS'])) 20 | end_loci = list(zip(end_dfs['#CHROM'], end_dfs['POS'])) 21 | 22 | start_index_lists = [] 23 | for i in range(len(base_1_loci)): 24 | base_locus = base_1_loci[i] 25 | index_list = [] 26 | 27 | for j in range(len(start_loci)): # in order to get the correct index cannot use "for start_locus in start_loci" 28 | 29 | start_locus = start_loci[j] 30 | if base_locus[0] == start_locus[0]: 31 | 32 | distance = abs(base_locus[1] - start_locus[1]) 33 | if distance <= tolerance: 34 | start_index = j 35 | index_list.append(start_index) 36 | 37 | start_index_lists.append(index_list) 38 | if len(start_index_lists) != (i+1): # ensure each base record has a list even if no comp calls within tolerance range 39 | start_index_lists.append([]) 40 | 41 | # if start loci within tolerance range, check that end also is 42 | running_list = [] 43 | comp_minda_ids = start_dfs.Minda_ID.to_list() 44 | minda_id_lists = [] 45 | for i in range(len(start_index_lists)): 46 | index_list = start_index_lists[i] 47 | base_locus = base_2_loci[i] 48 | 49 | 50 | minda_id_list = [] 51 | for index in index_list: 52 | end_locus = end_loci[index] 53 | if base_locus[0] == end_locus[0]: 54 | #print(base_locus, end_locus) 55 | distance = abs(base_locus[1] - end_locus[1]) 56 | if distance <= tolerance: 57 | minda_id = comp_minda_ids[index] 58 | 59 | if multimatch == False: 60 | caller = minda_id.rsplit('_', 1)[0] 61 | if any(id.startswith(caller) for id in minda_id_list) == False and minda_id not in running_list: 62 | minda_id_list.append(minda_id) 63 | running_list.append(minda_id) 64 | else: 65 | minda_id_list.append(minda_id) 66 | minda_id_lists.append(minda_id_list) 67 | if len( minda_id_lists) != (i+1): # ensure each base record has a list even if no comp calls within tolerance range 68 | minda_id_lists.append([]) 69 | 70 | # merge start & end base dfs & create column of Minda IDs for calls within tolerance range 71 | base_df = base_1_df.merge(base_2_df, left_index=True, right_index=True) 72 | base_df["Minda_IDs"] = minda_id_lists 73 | 74 | return base_df 75 | 76 | 77 | def get_support_df(base_df, caller_names, vaf, out_dir, sample_name): 78 | 79 | minda_id_lists = base_df.Minda_IDs.to_list() 80 | # create call columns for each caller 81 | for caller_name in caller_names: 82 | caller_column = [] 83 | for minda_id_list in minda_id_lists: 84 | 85 | call_boolean = any(value.startswith(caller_name) for value in minda_id_list) 86 | caller_column.append(call_boolean) 87 | base_df[f'{caller_name}'] = caller_column 88 | 89 | # if vaf == None: 90 | # base_df['VAF_x'] = np.nan 91 | 92 | column_names = ['#CHROM_x', 'POS_x', 'ID_x', 'INFO_x', \ 93 | 
'#CHROM_y', 'POS_y', 'ID_y', 'INFO_y', \ 94 | 'SVTYPE_x', 'SVLEN', 'VAF_x', 'Minda_ID_x','Minda_IDs'] + [caller_names[-1]] + caller_names[:-1] 95 | 96 | support_df = base_df[column_names].rename(columns={'SVTYPE_x':'SVTYPE', 'VAF_x':'VAF', 'Minda_ID_x': 'Minda_ID'}).copy() 97 | 98 | support_df.to_csv(f'{out_dir}/{sample_name}_support.tsv', sep='\t', index=False) 99 | 100 | return support_df 101 | 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Minda 2 | ###### Note: This tool is under active devlopment. 3 | 4 | Minda is a tool for evaluating structural variant (SV) callers that 5 | * standardizes VCF records for compatibility with both germline and somatic SV callers, 6 | * benchmarks against a single VCF input file, or 7 | * benchmarks against an ensemble call set created from multiple VCF input files. 8 | 9 | ## Installation 10 | 11 | Clone the repository and install the dependencies via conda: 12 | 13 | ``` 14 | git clone https://github.com:KolmogorovLab/minda 15 | cd minda 16 | conda env create --name minda --file environment.yml 17 | conda activate minda 18 | ./minda.py 19 | ``` 20 | 21 | ## Quick Usage 22 | 23 | Benchmarking several vcfs against a truth set vcf: 24 | 25 | ``` 26 | ./minda.py truthset --base truthset.vcf --vcfs caller_1.vcf caller_2.vcf caller_3.vcf --out_dir minda_out 27 | ``` 28 | 29 | Creating an ensemble from several vcfs and benchmarking against ensemble calls: 30 | 31 | ``` 32 | ./minda.py ensemble --vcfs caller_1.vcf caller_2.vcf caller_3.vcf --out_dir minda_out 33 | ``` 34 | 35 | ## Inputs and Parameters 36 | 37 | ### Required 38 | 39 | #### Truthset 40 | 41 | ``` 42 | --out_dir path to out directory 43 | --base path of base VCF 44 | --tsv | --vcfs tsv file path 45 | -OR- 46 | vcf file path(s) 47 | ``` 48 | #### Ensemble 49 | ``` 50 | --out_dir path to out directory 51 | --tsv | --vcfs tsv file path 52 | -OR- 53 | vcf file path(s) 54 | --min_support | minimumn number of callers required to support an ensemble call 55 | --conditions -OR- 56 | specific conditions to support a call 57 | ``` 58 | 59 | ### Optional 60 | ``` 61 | --bed path to bed file for filtering records with BedTool intersect 62 | --filter filter records by FILTER column; default="['PASS']" 63 | --min_size filter records by SVLEN in INFO column 64 | --tolerance maximum allowable bp distance between base and caller breakpoint; default=500 65 | --sample_name name of sample 66 | --vaf filter out records below a given VAF treshold 67 | --multimatch allow more than one record from the same caller VCF to match a single truthset/ensemble record 68 | ``` 69 | ##### VCF Input 70 | Minda standardizes input VCFs by decomposing every SV into start and end records. Records are handled in one of two following ways: 71 | 1. For records having a CHROM:POS pattern in the `ALT` field, the `#CHROM` and `POS` fields are considered the start. Minda then searches for the end record matching the `ALT` field among other records. Alternatively, the `MATEID` from the `INFO` field may be used to find the end record. If no end record is found, the details from the `ALT` field are used to create one. 72 | 2. All other records Minda considers start records. The corresponding end records use the start `#CHROM` and `POS` is calculated by adding the start `POS` with absolute value of `SVLEN` or is extracted from the `END` integer in the `INFO` field. 
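To make these two cases concrete, the sketch below reduces a single record to its start and end breakpoints (a simplified illustration of the logic described above, not Minda's own code; the `breakpoints` helper and its arguments are hypothetical):

```
import re

def breakpoints(chrom, pos, alt, info):
    """Return ((start_chrom, start_pos), (end_chrom, end_pos)) for one record."""
    bnd = re.search(r'[\[\]]([^\[\]:]+):(\d+)[\[\]]', alt)
    if bnd:                              # case 1: breakend-style ALT, e.g. N[chr15:84141972[
        end_chrom, end_pos = bnd.group(1), int(bnd.group(2))
    else:                                # case 2: symbolic ALT such as <DEL>
        end_chrom = chrom
        end_pos = int(info['END']) if 'END' in info else pos + abs(int(info['SVLEN']))
    return (chrom, pos), (end_chrom, end_pos)

print(breakpoints('chr1', 207807889, '<DEL>', {'SVLEN': -33584}))   # (('chr1', 207807889), ('chr1', 207841473))
print(breakpoints('chr7', 151049571, 'N[chr15:84141972[', {}))      # (('chr7', 151049571), ('chr15', 84141972))
```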
73 | Minda has been tested on VCFs produced by the following callers: 74 | 75 | * Severus 76 | * SAVANA 77 | * nanomonsv 78 | * Sniffles2 79 | * cuteSV 80 | * SVIM 81 | * GRIPSS 82 | * manta 83 | * SvABA 84 | 85 | If you encounter issues with these or other VCF files, please [let us know](https://github.com/KolmogorovLab/minda/issues). 86 | 87 | ##### TSV Input 88 | The `--tsv` file has one required column and up to three columns in total. The columns should be as follows: 89 | 
90 |   1. VCF paths (required)
91 |   2. caller name
92 |   3. prefix
93 | 
94 | If a caller name is not provided, the name listed in the source field of the VCF will be used. If more than one VCF with the same caller name is provided, prefixes disambiguate ID and column names in Minda output files. If prefixes are not provided by the user, Minda automatically assigns a letter prefix in ascending alphabetical order (i.e. A, B, C, etc.). 95 | 96 | An example of TSV contents: 97 | ``` 98 | /path/to/severus_ONT.vcf Severus ONT 99 | /path/to/severus_PB.vcf Severus PB 100 | /path/to/manta.vcf manta ILL 101 | ``` 102 | ##### Specific Conditions 103 | The `--conditions` parameter enables specific user-defined conditions to be met for each ensemble call. Provide a list in double quotation marks that contains: 104 | 105 | 
106 |   1. a (nested) list of caller names, each name in single quotation marks, with prefixes if necessary
107 |   2. an operator in single quotation marks
108 |   3. a number
109 | 
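Internally, each caller gets a True/False column in Minda's support table and the condition is evaluated against those columns. The snippet below is a toy illustration of that idea only (not Minda's internal code; the three-record table and caller names are invented):

```
import pandas as pd

# one boolean column per caller, one row per candidate ensemble record
support = pd.DataFrame({'ONT_Severus': [True, True, False],
                        'PB_Severus':  [True, False, False],
                        'ILL_manta':   [False, True, True]})

condition = [['ONT_Severus', 'PB_Severus'], '>=', 2]   # callers, operator, number
count = support[condition[0]].sum(axis=1)              # supporting callers per record
ensemble = eval(f'count {condition[1]} {condition[2]}')
print(ensemble.tolist())                               # [True, False, False]
```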
110 | 111 | For example, from the TSV contents above, to require that an ensemble call be supported by both the ONT and PB callsets, when using `--tsv` input, specify: 112 | ``` 113 | "[[['ONT_Severus', 'PB_Severus'], '>=', 2]]" 114 | ``` 115 | OR when using `--vcfs` or `--tsv` input: 116 | ``` 117 | "[[caller_names[:2], '>=', 2]]" 118 | ``` 119 | 120 | To combine multiple conditions, add `'&'` or `'|'` between each condition. 121 | For example, to require at least one long-read call and one short-read call to agree, specify for `--tsv` input: 122 | ``` 123 | "[[['ONT_Severus', 'PB_Severus'], '>=', 1], '&', [['ILL_manta'], '==', 1]]" 124 | ``` 125 | OR for `--vcfs` or `--tsv` input: 126 | ``` 127 | "[[caller_names[:2], '>=', 1], '&', [caller_names[2:], '==', 1]]" 128 | ``` 129 | ##### VAF Filtering 130 | ###### Note: This requires preprocessing of the VCF files. See [scripts](scripts). 131 | To run Minda with the `--vaf` parameter, ensure the VCF files have a `VAF` value in the INFO field. 132 | 133 | ## Output Files 134 | Both `truthset` and `ensemble` output: 135 | * tp.tsv for each caller 136 | * fp.tsv for each caller 137 | * fn.tsv for each caller 138 | * support.tsv - lists which callers called which truthset/ensemble records 139 | * results.txt - for each caller, lists the overall precision, recall, and F1 scores, as well as the number of TP, FN, and FP calls, overall and by SVTYPE and SVLEN 140 | * removed_records.txt - list of caller IDs of records not evaluated after removing singletons and filtering by FILTER, SVLEN, and VAF 141 | 142 | `ensemble` also outputs: 143 | * ensemble.vcf 144 | 145 | License 146 | ------- 147 | 148 | Minda is distributed under a BSD license. See the [LICENSE](LICENSE) for details. 149 | 150 | Citation 151 | ------- 152 | Keskus, A.G., Bryant, A., Ahmad, T. et al. **Severus detects somatic structural variation and complex rearrangements in cancer genomes using long-read sequencing.** *Nature Biotechnology* (2025). https://doi.org/10.1038/s41587-025-02618-8 153 | 154 | Credits 155 | ------- 156 | 157 | Minda is being developed in the Kolmogorov Lab at the National Cancer Institute. 158 | 159 | Key contributors: 160 | 161 | * Asher Bryant 162 | * Ayse Keskus 163 | * Mikhail Kolmogorov 164 | 165 | --- 166 | ### Contact 167 | If you experience any problems or would like to make a suggestion, please submit an [issue](https://github.com/KolmogorovLab/minda/issues). 168 | To contact the developer directly, email asher.bryant@nih.gov. 
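As a quick sanity check on the output files listed above, the per-caller TP/FN/FP tables can be reloaded to recompute the summary metrics Minda reports (the paths, sample name, and caller name below are illustrative; Minda's own results remain authoritative):

```
import pandas as pd

def n_records(path):
    return len(pd.read_csv(path, sep='\t'))     # one row per record in the TSV

tp = n_records('minda_out/sample_Severus_tp.tsv')
fn = n_records('minda_out/sample_Severus_fn.tsv')
fp = n_records('minda_out/sample_Severus_fp.tsv')

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print(f'precision={precision:.3f} recall={recall:.3f} F1={f1:.3f}')
```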
169 | -------------------------------------------------------------------------------- /minda/stats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | import pandas as pd 5 | from collections import Counter 6 | 7 | def _get_tp_fn_fp(support_df, decomposed_dfs, caller_name, vaf, command): 8 | 9 | paired_df = decomposed_dfs[0].merge(decomposed_dfs[1], on='Minda_ID') 10 | 11 | if command == "ensemble": 12 | base_df = support_df[support_df['ensemble'] == True] 13 | else: 14 | base_df = support_df[support_df.iloc[:,13] == True] 15 | 16 | tp_ids = [id for ids in base_df["Minda_IDs"].to_list() for id in ids] 17 | 18 | # create tp, fn, fp dfs 19 | if command == "ensemble": 20 | fn_columns = ['#CHROM_x', 'POS_x', 'locus_group_x', 'ID_list_x', \ 21 | '#CHROM_y', 'POS_y', 'locus_group_y', 'ID_list_y', \ 22 | 'SVTYPE', 'SVLEN', 'VAF', 'Minda_IDs'] 23 | 24 | if command == "truthset": 25 | fn_columns = ['#CHROM_x', 'POS_x', 'ID_x', 'INFO_x', \ 26 | '#CHROM_y', 'POS_y', 'ID_y', 'INFO_y', \ 27 | 'SVTYPE', 'SVLEN', 'VAF', 'Minda_IDs'] 28 | 29 | fn_df = base_df[base_df[f'{caller_name}'] == False][fn_columns] 30 | tp_df = paired_df[paired_df['Minda_ID'].isin(tp_ids)] 31 | fp_df = paired_df[~paired_df['Minda_ID'].isin(tp_ids)] 32 | 33 | 34 | return tp_df, fn_df, fp_df, base_df, paired_df 35 | 36 | def _get_stats_df(tp_df, fn_df, fp_df, paired_df, base_df, caller_name, max_len, out_dir, sample_name, command, vaf, version): 37 | 38 | tp = tp_df.shape[0] 39 | fn = fn_df.shape[0] 40 | fp = fp_df.shape[0] 41 | 42 | # make tsv 43 | tp_df.to_csv(f'{out_dir}/{sample_name}_{caller_name}_tp.tsv', sep='\t', index=False) 44 | fn_df.to_csv(f'{out_dir}/{sample_name}_{caller_name}_fn.tsv', sep='\t', index=False) 45 | fp_df.to_csv(f'{out_dir}/{sample_name}_{caller_name}_fp.tsv', sep='\t', index=False) 46 | # dfs = [tp_df, fn_df, fp_df] 47 | # df_names = ["tp", "fn", "fp"] 48 | # date = datetime.today().strftime('%Y-%m-%d') 49 | # for i in range(len(dfs)): 50 | # df = dfs[i] 51 | # df_name = df_names[i] 52 | # with open(f'{out_dir}/{sample_name}_{caller_name}_{df_name}.vcf', 'w') as file: 53 | # file.write(f'##fileformat=VCFv4.2\n##fileDate={date}\n##source=MindaV{version}\n') 54 | # file.write('##ALT=\n##ALT=\n##ALT=\n##ALT=\n') 55 | # file.write('##FILTER=\n') 56 | # file.write('##INFO=\n##INFO=\n##INFO=\n') 57 | # if vaf != None: 58 | # file.write('##INFO=\n') 59 | # command_str = " ".join(sys.argv) 60 | # file.write(f"cmd: {command_str}\n") 61 | # df.to_csv(file, sep="\t", index=False) 62 | 63 | # caluluate stats 64 | if tp+fp == 0: 65 | sys.exit(f"{caller_name} has no TP or FP records. Please double check input files.") 66 | precision = tp/(tp+fp) 67 | if tp+fn == 0: 68 | sys.exit(f"{caller_name} has no TP or FN records. 
Please double check input files.") 69 | recall = tp/(tp+fn) 70 | f1 = (2*precision*recall)/(precision+recall) 71 | 72 | caller_len = len(paired_df) 73 | base_len = len(base_df) 74 | 75 | # overall df 76 | columns = ['True Positives', 'False Negatives', 'False Positives', 'Precision', 'Recall', 'F1 Score', 'Caller Records', 'Ensemble Records'] 77 | data = [[tp, fn, fp, precision, recall, f1, caller_len, base_len]] 78 | overall_df = pd.DataFrame(data, columns=columns, index=[caller_name]) 79 | 80 | # SV type dfs 81 | tp_type_df = tp_df['SVTYPE_y'].value_counts().to_frame(name=caller_name).rename_axis("SVTYPE").T.sort_index(axis=1) 82 | fn_type_df = fn_df['SVTYPE'].value_counts().to_frame(name=caller_name).T.sort_index(axis=1) 83 | fp_type_df = fp_df['SVTYPE_y'].value_counts().to_frame(name=caller_name).rename_axis("SVTYPE").T.sort_index(axis=1) 84 | 85 | 86 | # SV len dfs 87 | ranges = [ -1, 0, 50, 100, 1000, 10000]#, max_len] 88 | # ensure bins must increase monotonically 89 | ranges = [x for x in ranges if x < max_len] + [max_len] 90 | tp_len_df = tp_df['SVLEN'].value_counts(bins=ranges, sort=False).to_frame(name=caller_name).rename_axis("SVLEN").T 91 | fn_len_df = fn_df['SVLEN'].value_counts(bins=ranges, sort=False).to_frame(name=caller_name).T 92 | fp_len_df = fp_df['SVLEN'].value_counts(bins=ranges, sort=False).to_frame(name=caller_name).rename_axis("SVLEN").T 93 | 94 | return overall_df, tp_type_df, fn_type_df, fp_type_df, tp_len_df, fn_len_df, fp_len_df 95 | 96 | 97 | def get_results(decomposed_dfs_list, base_dfs, caller_names, out_dir, sample_name, max_len, tolerance, vaf, command, args, version): 98 | 99 | # tp, fn, fp dfs for each caller 100 | stats_dfs_list = [] 101 | for i in range(len(decomposed_dfs_list)): 102 | 103 | decomposed_dfs = decomposed_dfs_list[i] 104 | caller_name = caller_names[i] 105 | tp_df, fn_df, fp_df, base_df, paired_df = _get_tp_fn_fp(base_dfs, decomposed_dfs, caller_name, vaf, command) 106 | stats_dfs = _get_stats_df(tp_df, fn_df, fp_df, paired_df, base_df, caller_name, max_len, out_dir, sample_name, command, vaf, version) 107 | stats_dfs_list.append(stats_dfs) 108 | 109 | overall_results_df = pd.concat([df[0] for df in stats_dfs_list]) 110 | tp_type_results_df = pd.concat([df[1] for df in stats_dfs_list]).fillna(0).astype(int) 111 | fn_type_results_df = pd.concat([df[2] for df in stats_dfs_list]).fillna(0).astype(int) 112 | fp_type_results_df = pd.concat([df[3] for df in stats_dfs_list]).fillna(0).astype(int) 113 | tp_len_results_df = pd.concat([df[4] for df in stats_dfs_list]).fillna(0).astype(int) 114 | fn_len_results_df = pd.concat([df[5] for df in stats_dfs_list]).fillna(0).astype(int) 115 | fp_len_results_df = pd.concat([df[6] for df in stats_dfs_list]).fillna(0).astype(int) 116 | 117 | results_dfs = [overall_results_df, tp_type_results_df, fn_type_results_df, fp_type_results_df, \ 118 | tp_len_results_df, fn_len_results_df, fp_len_results_df] 119 | 120 | #headings = ['OVERALL\n\n', '\n\nSV TYPE RESULTS\nTrue Positives\n\n', 'False Negatives\n', 'False Positives\n',\ 121 | #'\n\nSV LENGTH RESULTS\nTrue Positives\n', 'False Negatives\n', 'False Positives\n'] 122 | #user_input = ", ".join([f"{key}={value}" for key, value in vars(args).items() if value is not None and key!= "func"]) 123 | # with open(f'{out_dir}/{sample_name}_minda_results.txt', 'w') as file: 124 | # file.write("MINDA ENSEMBLE RESULTS\n\n") 125 | # for i in range(len(results_dfs)): 126 | # heading = headings[i] 127 | # df = results_dfs[i] 128 | # file.write(headings[i]) 129 | # if 
df.isna().all().all(): 130 | # file.write("None" + '\n\n') 131 | # else: 132 | # file.write(df.to_string() + '\n\n') 133 | # file.write(f'##minda_args: {user_input}\n') 134 | 135 | file_names = ['overall', 'SV_type_TP', 'SV_type_FN', 'SV_type_FP',\ 136 | 'SV_length_TP', 'SV_length_FN', 'SV_length_FP'] 137 | 138 | if not os.path.isdir(args.out_dir + "/results"): 139 | os.makedirs(args.out_dir + "/results") 140 | 141 | for i in range(len(results_dfs)): 142 | file_name = file_names[i] 143 | df = results_dfs[i].copy() 144 | df.insert(0, "Caller", caller_names) 145 | df.to_csv(args.out_dir + f"/results/{file_name}.tsv", sep='\t', index=False) 146 | 147 | 148 | return overall_results_df, tp_type_results_df, fn_type_results_df, fp_type_results_df, tp_len_results_df, fn_len_results_df, fp_len_results_df, paired_df -------------------------------------------------------------------------------- /minda/main.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | import sys 3 | import os 4 | import argparse 5 | import logging 6 | import pandas as pd 7 | 8 | from minda.__version__ import __version__ 9 | from minda.decompose import get_caller_name, get_df, get_intersected_df, get_decomposed_dfs 10 | from minda.ensemble import get_support_df as get_ensemble_support_df 11 | from minda.truthset import get_support_df as get_truthset_support_df 12 | from minda.truthset import get_base_df 13 | from minda.stats import get_results 14 | 15 | logger = logging.getLogger() 16 | 17 | 18 | def _enable_logging(log_file, debug, overwrite): 19 | """ 20 | Turns on logging, sets debug levels and assigns a log file 21 | """ 22 | log_formatter = logging.Formatter("[%(asctime)s] %(name)s: %(levelname)s: " 23 | "%(message)s", "%Y-%m-%d %H:%M:%S") 24 | console_formatter = logging.Formatter("[%(asctime)s] %(levelname)s: " 25 | "%(message)s", "%Y-%m-%d %H:%M:%S") 26 | console_log = logging.StreamHandler() 27 | console_log.setFormatter(console_formatter) 28 | 29 | if overwrite: 30 | open(log_file, "w").close() 31 | file_handler = logging.FileHandler(log_file, mode="a") 32 | file_handler.setFormatter(log_formatter) 33 | 34 | if not debug: 35 | level = logging.INFO 36 | 37 | console_log.setLevel(level) 38 | file_handler.setLevel(level) 39 | 40 | logger.setLevel(logging.DEBUG) 41 | logger.addHandler(console_log) 42 | logger.addHandler(file_handler) 43 | 44 | 45 | def _version(): 46 | return __version__ 47 | 48 | def run(args): 49 | if not os.path.isdir(args.out_dir): 50 | os.makedirs(args.out_dir) 51 | 52 | log_file = os.path.join(args.out_dir, "minda.log") 53 | _enable_logging(log_file, debug=False, overwrite=True) 54 | 55 | version = _version() 56 | logger.info("cmd: %s", " ".join(sys.argv)) 57 | logger.info("MindaV" + version) 58 | 59 | if args.command == 'truthset': 60 | base = args.base 61 | 62 | # check whether input is tsv or list of vcfs 63 | if args.tsv != None: 64 | vcf_df = pd.read_csv(args.tsv, sep='\t', header=None) 65 | 66 | vcf_list = vcf_df.iloc[:,0].to_list() 67 | tsv_directory = os.path.abspath(args.tsv) 68 | vcf_list = [os.path.abspath(path) if not os.path.isabs(path) else path for path in vcf_list] 69 | 70 | column_count = vcf_df.shape[1] 71 | if column_count >= 2: 72 | caller_names = vcf_df.iloc[:,1].fillna("unknown").to_list() 73 | else: 74 | caller_names = [] 75 | 76 | if column_count == 3: 77 | prefixes = vcf_df.iloc[:,2].fillna("unk").to_list() 78 | caller_names = [prefixes[i] + "_" + caller_names[i] for i in range(len(caller_names))] 79 | 80 | 
else: 81 | column_count = 1 82 | vcf_list = args.vcfs 83 | caller_names = [] 84 | 85 | 86 | if args.command == "ensemble" and len(vcf_list) < 2: 87 | sys.exit("Provide a minimum of 2 VCF files.") 88 | elif args.command == "ensemble" and args.min_support != None and len(vcf_list) < args.min_support: 89 | sys.exit("Number of VCF files should be less than or equal minimum number of support.") 90 | elif len(vcf_list) < 1 and args.command == "truthset": 91 | sys.exit("Provide a minimum of 1 comparison VCF file.") 92 | 93 | if args.command == 'truthset': 94 | vcf_list.append(base) 95 | if len(caller_names) > 0: 96 | caller_names.append("base") 97 | 98 | if caller_names == []: 99 | for i in range(len(vcf_list)): 100 | vcf = vcf_list[i] 101 | caller_name = get_caller_name(vcf) 102 | caller_names.append(caller_name) 103 | 104 | if len(caller_names) > len(set(caller_names)): 105 | caller_names = [chr(ord('A') + i) + "_" + caller_names[i] for i in range(len(caller_names))] 106 | prefixed = True 107 | elif len(caller_names) == len(set(caller_names)) and column_count == 3: 108 | prefixed = True 109 | else: 110 | prefixed = False 111 | 112 | max_svlengths =[] 113 | decomposed_dfs_list = [] 114 | for i in range(len(vcf_list)): 115 | caller_name = caller_names[i] 116 | vcf = vcf_list[i] 117 | if args.bed == None: 118 | df = get_df(vcf) 119 | else: 120 | df = get_intersected_df(vcf, args.bed) 121 | decomposed_dfs = get_decomposed_dfs(caller_name, df, args.filter, args.min_size, prefixed, args.vaf, args.sample_name, args.out_dir) 122 | decomposed_dfs_list.append(decomposed_dfs[:2]) 123 | max_svlengths.append(decomposed_dfs[2]) 124 | 125 | max_len = max(max_svlengths) 126 | 127 | if args.command == 'ensemble' and args.conditions != None: 128 | conditions = eval(args.conditions) 129 | support_df = get_ensemble_support_df(vcf_list, decomposed_dfs_list, caller_names, args.tolerance, conditions, args.vaf, args.command, args.out_dir, args.sample_name, args, version, args.multimatch) 130 | results = get_results(decomposed_dfs_list, support_df, caller_names, args.out_dir, args.sample_name, max_len, args.tolerance, args.vaf, args.command, args, version) 131 | logger.info(f"\n{results[0]}") 132 | 133 | elif args.command == 'ensemble' and args.min_support != None: 134 | conditions = eval(f"[[caller_names,'>=', {args.min_support}]]") 135 | support_df = get_ensemble_support_df(vcf_list, decomposed_dfs_list, caller_names, args.tolerance, conditions, args.vaf, args.command, args.out_dir, args.sample_name, args, version, args.multimatch) 136 | results = get_results(decomposed_dfs_list, support_df, caller_names, args.out_dir, args.sample_name, max_len, args.tolerance, args.vaf, args.command, args, version) 137 | logger.info(f"\n{results[0]}") 138 | 139 | else: 140 | base_df = get_base_df(decomposed_dfs_list, args.tolerance, args.multimatch) 141 | support_df = get_truthset_support_df(base_df, caller_names, args.vaf, args.out_dir, args.sample_name) 142 | results = get_results(decomposed_dfs_list, support_df, caller_names, args.out_dir, args.sample_name, max_len, args.tolerance, args.vaf,args.command, args, version) 143 | logger.info(f"\n{results[0]}") 144 | 145 | 146 | def main(): 147 | parser=argparse.ArgumentParser(description="Minda - VCF evaluation tool for germline and somatic structural variant callers") 148 | subparser=parser.add_subparsers(dest="command") 149 | 150 | #defaults ------------------------------------------------ 151 | FILTER = ["PASS"] 152 | TOLERANCE = 500 153 | 154 | # TRUTHSET 
------------------------------------------------ 155 | truthset = subparser.add_parser("truthset", help='benchmark VCF(s) against a base VCF') 156 | 157 | # required arguements 158 | truthset.add_argument("--out_dir", help='path to out directory', dest="out_dir", type=str, required=True) 159 | truthset.add_argument("--base", help='path of base VCF', dest="base", type=str, required=True) 160 | 161 | # mutally exclusive arguments 162 | truthset_input = truthset.add_mutually_exclusive_group(required=True) 163 | truthset_input.add_argument('--tsv', action="store", dest="tsv", help="tsv file path") 164 | truthset_input.add_argument('--vcfs', action="store", dest="vcfs", nargs="+", help="vcf file path(s)") 165 | 166 | # # optional arguments 167 | truthset.add_argument("--bed", help=f'path to bed file for filtering records with BedTool intersect', dest="bed", type=str) 168 | truthset.add_argument("--filter", help=f'filter records by FILTER column; default="{FILTER}"', dest="filter", type=str, nargs="*", default=FILTER) 169 | truthset.add_argument("--min_size", help=f'filter records by SVSIZE in INFO column', dest="min_size", type=int) 170 | truthset.add_argument("--tolerance", help=f'maximum allowable bp distance between base and caller breakpoint; default={TOLERANCE}', dest="tolerance", type=int, default=TOLERANCE) 171 | truthset.add_argument("--sample_name", help=f'name of sample', dest="sample_name", type=str) 172 | truthset.add_argument("--vaf", help=f'filter out records below a given VAF treshold', dest="vaf", type=float) 173 | truthset.add_argument("--multimatch", help=f'allow more than one record from the same caller to match a single truthset record', dest="multimatch", action='store_true') 174 | 175 | # ENSEMBLE ------------------------------------------------ 176 | ensemble = subparser.add_parser("ensemble", help='create an ensemble call list from multiple VCF and, optionally, benchmark each VCF against') 177 | 178 | # required arguements 179 | ensemble.add_argument("--out_dir", help='path to out directory', dest="out_dir", type=str, required=True) 180 | 181 | # mutally exclusive arguments 182 | ensemble_input = ensemble.add_mutually_exclusive_group(required=True) 183 | ensemble_input.add_argument('--tsv', action="store", dest="tsv", help="tsv file path") 184 | ensemble_input.add_argument('--vcfs', action="store", dest="vcfs", nargs="+", help="vcf file path(s)") 185 | 186 | ensemble_support = ensemble.add_mutually_exclusive_group(required=True) 187 | ensemble_support.add_argument("--conditions", help=f'specific conditions to support a call', dest="conditions", type=str) 188 | ensemble_support.add_argument("--min_support", help=f'minimumn number of callers to support a call', dest="min_support", type=int) 189 | 190 | # optional arguments 191 | ensemble.add_argument("--bed", help=f'path to bed file for filtering records with BedTool intersect', dest="bed", type=str) 192 | ensemble.add_argument("--filter", help=f'filter records by FILTER column; default="{FILTER}"', dest="filter", type=str, nargs="*", default=FILTER) 193 | ensemble.add_argument("--min_size", help=f'filter records by SVSIZE in INFO column', dest="min_size", type=int) 194 | ensemble.add_argument("--tolerance", help=f'maximum allowable bp distance between base and caller breakpoint; default={TOLERANCE}', dest="tolerance", type=int, default=TOLERANCE) 195 | ensemble.add_argument("--sample_name", help=f'name of sample', dest="sample_name", type=str) 196 | ensemble.add_argument("--vaf", help=f'filter out records below a given 
VAF treshold', dest="vaf", type=float) 197 | ensemble.add_argument("--multimatch", help=f'allow more than one record from the same caller to match a single ensemble record', dest="multimatch", action='store_true') 198 | 199 | # ------------------------------------------------ 200 | args, remaining_args = parser.parse_known_args() 201 | parser.set_defaults(func=run) 202 | args=parser.parse_args() 203 | args.func(args) 204 | -------------------------------------------------------------------------------- /data/colo829_benchmark_grch38_amended_vaf.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##FILTER= 3 | ##contig= 4 | ##contig= 5 | ##contig= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | ##contig= 16 | ##contig= 17 | ##contig= 18 | ##contig= 19 | ##contig= 20 | ##contig= 21 | ##contig= 22 | ##contig= 23 | ##contig= 24 | ##contig= 25 | ##contig= 26 | ##contig= 27 | ##ALT= 28 | ##ALT= 29 | ##ALT= 30 | ##ALT= 31 | ##ALT= 32 | ##FORMAT= 33 | ##INFO= 34 | ##INFO= 35 | ##INFO= 36 | ##INFO= 37 | ##INFO= 38 | ##INFO= 39 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT COLO829SV 40 | chr1 207807889 COLO829_SOMATIC_SV01 N . PASS SVTYPE=DEL;SVLEN=33584;END=207841473;CHR2=chr1;SUPPORT=GSC_COLO829|0/1|0.23,ONT_COLO829|0/1|0.36,PBR_COLO829|0/1|0.27,VAI_COLO829|0/1|0.32;VAF=0.295 GT 0/1 41 | chr1 224458901 COLO829_SOMATIC_SV02 N . PASS SVTYPE=DUP;SVLEN=153518;END=224612419;CHR2=chr1;SUPPORT=GSC_COLO829|0/0|0.11,ONT_COLO829|0/1|0.36,PBR_COLO829|0/1|0.38,VAI_COLO829|0/0|0.21;VAF=0.285 GT 0/1 42 | chr1 224595103 COLO829_SOMATIC_SV03 N . PASS SVTYPE=DEL;SVLEN=3215;END=224598318;CHR2=chr1;SUPPORT=GSC_COLO829|0/1|0.29,ONT_COLO829|0/1|0.29,PBR_COLO829|0/1|0.41,VAI_COLO829|0/1|0.31;VAF=0.3 GT 0/1 43 | chr1 236097113 COLO829_SOMATIC_SV04 N . PASS SVTYPE=INS;SVLEN=11007;END=236097113;CHR2=chr1;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 44 | chr3 24523615 COLO829_SOMATIC_SV05 N . PASS SVTYPE=INV;SVLEN=1073;END=24524688;CHR2=chr3;SUPPORT=GSC_COLO829|0/1|0.39,ONT_COLO829|0/1|0.37,PBR_COLO829|0/1|0.40,VAI_COLO829|0/1|0.39;VAF=0.39 GT 0/1 45 | chr3 26622432 COLO829_SOMATIC_SV06 N . PASS SVTYPE=INV;SVLEN=577;END=26623009;CHR2=chr3;SUPPORT=GSC_COLO829|0/1|0.30,ONT_COLO829|0/1|0.32,PBR_COLO829|0/1|0.32,VAI_COLO829|0/1|0.33;VAF=0.32 GT 0/1 46 | chr3 60147027 COLO829_SOMATIC_SV07 N . PASS SVTYPE=DEL;SVLEN=71809;END=60218836;CHR2=chr3;SUPPORT=GSC_COLO829|0/1|0.26,ONT_COLO829|0/1|0.30,PBR_COLO829|0/1|0.39,VAI_COLO829|0/1|0.25;VAF=0.28 GT 0/1 47 | chr3 60886452 COLO829_SOMATIC_SV08 N . PASS SVTYPE=DEL;SVLEN=141091;END=61027543;CHR2=chr3;SUPPORT=GSC_COLO829|0/1|0.35,ONT_COLO829|0/1|0.29,PBR_COLO829|0/1|0.42,VAI_COLO829|0/1|0.28;VAF=0.32 GT 0/1 48 | chr4 65346239 COLO829_SOMATIC_SV09 N . PASS SVTYPE=DUP;SVLEN=80;END=65346319;CHR2=chr4;SUPPORT=GSC_COLO829|0/1|0.50,ONT_COLO829|0/1|0.48,PBR_COLO829|0/1|0.50,VAI_COLO829|0/1|0.56;VAF=0.5 GT 0/1 49 | chr4 187075168 COLO829_SOMATIC_SV10 N . PASS SVTYPE=DUP;SVLEN=60;END=187075228;CHR2=chr4;SUPPORT=GSC_COLO829|0/1|0.46,ONT_COLO829|0/1|0.42,PBR_COLO829|0/1|0.47,VAI_COLO829|0/1|0.37;VAF=0.44 GT 0/1 50 | chr5 28787890 COLO829_SOMATIC_SV11 N . PASS SVTYPE=DEL;SVLEN=175099;END=28962989;CHR2=chr5;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|0/1|0.74,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 51 | chr7 57403875 COLO829_SOMATIC_SV12 N . 
PASS SVTYPE=DEL;SVLEN=32528;END=57436403;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.65,ONT_COLO829|0/1|0.76,PBR_COLO829|1/1|0.87,VAI_COLO829|0/1|0.64;VAF=0.705 GT 0/1 52 | chr7 75595250 COLO829_SOMATIC_SV13 N . PASS SVTYPE=INS;SVLEN=54;END=75595250;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.34,ONT_COLO829|0/0|0.09,PBR_COLO829|0/1|0.55,VAI_COLO829|0/1|0.22;VAF=0.28 GT 0/1 53 | chr7 78352523 COLO829_SOMATIC_SV14 N . PASS SVTYPE=DEL;SVLEN=100409;END=78452932;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.36,ONT_COLO829|0/1|0.26,PBR_COLO829|0/1|0.30,VAI_COLO829|0/1|0.26;VAF=0.28 GT 0/1 54 | chr7 78560891 COLO829_SOMATIC_SV15 N . PASS SVTYPE=DEL;SVLEN=67480;END=78628371;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.31,ONT_COLO829|0/1|0.25,PBR_COLO829|0/1|0.50,VAI_COLO829|0/0|0.20;VAF=0.28 GT 0/1 55 | chr7 86215352 COLO829_SOMATIC_SV16 N . PASS SVTYPE=DUP;SVLEN=9032;END=86224384;CHR2=chr7;SUPPORT=GSC_COLO829|0/0|0.14,ONT_COLO829|0/0|0.11,PBR_COLO829|0/0|0.21,VAI_COLO829|0/0|0.14;VAF=0.14 GT 0/0 56 | chr7 104844620 COLO829_SOMATIC_SV17 N . PASS SVTYPE=DUP;SVLEN=127236;END=104971856;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.34,ONT_COLO829|0/1|0.39,PBR_COLO829|0/1|0.35,VAI_COLO829|0/1|0.36;VAF=0.355 GT 0/1 57 | chr7 110753277 COLO829_SOMATIC_SV18 N . PASS SVTYPE=DEL;SVLEN=1128;END=110754405;CHR2=chr7;SUPPORT=GSC_COLO829|0/0|0.19,ONT_COLO829|0/1|0.26,PBR_COLO829|0/1|0.24,VAI_COLO829|0/1|0.26;VAF=0.25 GT 0/1 58 | chr7 126106070 COLO829_SOMATIC_SV19 N . PASS SVTYPE=DEL;SVLEN=420777;END=126526847;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.55,ONT_COLO829|0/1|0.73,PBR_COLO829|0/1|0.70,VAI_COLO829|0/1|0.34;VAF=0.625 GT 0/1 59 | chr7 126458434 COLO829_SOMATIC_SV20 N . PASS SVTYPE=INV;SVLEN=68953;END=126527387;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.38,ONT_COLO829|0/1|0.38,PBR_COLO829|0/1|0.34,VAI_COLO829|0/1|0.35;VAF=0.365 GT 0/1 60 | chr7 144262134 COLO829_SOMATIC_SV21 N . PASS SVTYPE=DEL;SVLEN=129567;END=144391701;CHR2=chr7;SUPPORT=GSC_COLO829|0/1|0.27,ONT_COLO829|0/1|0.27,PBR_COLO829|0/1|0.55,VAI_COLO829|0/1|0.26;VAF=0.27 GT 0/1 61 | chr7 151049571 COLO829_SOMATIC_SV22 N N[chr15:84141972[ . PASS SVTYPE=BND;SVLEN=.;END=N[chr15:84141972[;CHR2=chr15;SUPPORT=GSC_COLO829|0/1|0.55,ONT_COLO829|0/1|0.63,PBR_COLO829|1/1|0.81,VAI_COLO829|0/1|0.61;VAF=0.62 GT 0/1 62 | chr7 158335312 COLO829_SOMATIC_SV23 N . PASS SVTYPE=DEL;SVLEN=46;END=158335358;CHR2=chr7;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|0/0|0.07,VAI_COLO829|1/1|0.82;VAF=0.91 GT 0/1 63 | chr9 28031838 COLO829_SOMATIC_SV24 N . PASS SVTYPE=INV;SVLEN=27305;END=28059143;CHR2=chr9;SUPPORT=GSC_COLO829|0/1|0.45,ONT_COLO829|0/1|0.50,PBR_COLO829|0/1|0.44,VAI_COLO829|0/1|0.42;VAF=0.445 GT 0/1 64 | chr9 28031865 COLO829_SOMATIC_SV25 N . PASS SVTYPE=INV;SVLEN=2604;END=28034469;CHR2=chr9;SUPPORT=GSC_COLO829|0/1|0.24,ONT_COLO829|0/1|0.33,PBR_COLO829|0/1|0.31,VAI_COLO829|0/1|0.34;VAF=0.32 GT 0/1 65 | chr9 28034301 COLO829_SOMATIC_SV26 N . PASS SVTYPE=DEL;SVLEN=123393;END=28157694;CHR2=chr9;SUPPORT=GSC_COLO829|0/1|0.50,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|0.81,VAI_COLO829|1/1|1.00;VAF=0.905 GT 1/1 66 | chr10 7090915 COLO829_SOMATIC_SV27 N N[chr19:17286830[ . PASS SVTYPE=BND;SVLEN=.;END=N[chr19:17286830[;CHR2=chr19;SUPPORT=GSC_COLO829|0/1|0.37,ONT_COLO829|0/1|0.33,PBR_COLO829|0/1|0.26,VAI_COLO829|0/1|0.25;VAF=0.295 GT 0/1 67 | chr10 7592410 COLO829_SOMATIC_SV28 N N]chr18:9868619] . PASS SVTYPE=BND;SVLEN=.;END=N]chr18:9868619];CHR2=chr18;SUPPORT=GSC_COLO829|0/0|0.09,ONT_COLO829|0/0|0.15,PBR_COLO829|0/1|0.25,VAI_COLO829|0/0|0.21;VAF=0.18 GT 0/0 68 | chr10 87940543 COLO829_SOMATIC_SV29 N . 
PASS SVTYPE=DEL;SVLEN=11829;END=87952372;CHR2=chr10;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|0.80,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 69 | chr11 81074560 COLO829_SOMATIC_SV30 N . PASS SVTYPE=DEL;SVLEN=308177;END=81382737;CHR2=chr11;SUPPORT=GSC_COLO829|0/1|0.53,ONT_COLO829|1/1|0.79,PBR_COLO829|0/1|0.58,VAI_COLO829|0/1|0.76;VAF=0.67 GT 0/1 70 | chr12 129287232 COLO829_SOMATIC_SV31 N . PASS SVTYPE=INS;SVLEN=274;END=129287232;CHR2=chr12;SUPPORT=GSC_COLO829|0/1|0.73,ONT_COLO829|0/1|0.62,PBR_COLO829|0/1|0.65,VAI_COLO829|0/1|0.73;VAF=0.69 GT 0/1 71 | chr14 34545584 COLO829_SOMATIC_SV32 N . PASS SVTYPE=INS;SVLEN=2501;END=34545584;CHR2=chr14;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|0.97,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 72 | chr14 72548005 COLO829_SOMATIC_SV33 N . PASS SVTYPE=INS;SVLEN=98;END=72548005;CHR2=chr14;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|0.96,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 73 | chr14 104093751 COLO829_SOMATIC_SV34 N . PASS SVTYPE=DEL;SVLEN=55;END=104093806;CHR2=chr14;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 74 | chr15 23586515 COLO829_SOMATIC_SV35 N . PASS SVTYPE=INV;SVLEN=161042;END=23747557;CHR2=chr15;SUPPORT=GSC_COLO829|0/1|0.30,ONT_COLO829|0/1|0.39,PBR_COLO829|0/1|0.36,VAI_COLO829|0/1|0.27;VAF=0.33 GT 0/1 75 | chr15 41329096 COLO829_SOMATIC_SV36 N . PASS SVTYPE=DUP;SVLEN=7212;END=41336308;CHR2=chr15;SUPPORT=GSC_COLO829|0/1|0.44,ONT_COLO829|0/0|0.11,PBR_COLO829|0/1|0.43,VAI_COLO829|0/0|0.16;VAF=0.295 GT 0/1 76 | chr16 58590641 COLO829_SOMATIC_SV37 N . PASS SVTYPE=DEL;SVLEN=38786;END=58629427;CHR2=chr16;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 77 | chr16 78894743 COLO829_SOMATIC_SV38 N . PASS SVTYPE=DEL;SVLEN=166361;END=79061104;CHR2=chr16;SUPPORT=GSC_COLO829|0/1|0.65,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|0.79,VAI_COLO829|1/1|1.00;VAF=0.895 GT 1/1 78 | chr18 68712224 COLO829_SOMATIC_SV39 N . PASS SVTYPE=DEL;SVLEN=3365;END=68715589;CHR2=chr18;SUPPORT=GSC_COLO829|0/0|0.21,ONT_COLO829|0/1|0.26,PBR_COLO829|0/0|0.21,VAI_COLO829|0/0|0.18;VAF=0.21 GT 0/0 79 | chr19 17286003 COLO829_SOMATIC_SV40 N N[chr10:7017548[ . PASS SVTYPE=BND;SVLEN=.;END=N[chr10:7017548[;CHR2=chr10;SUPPORT=GSC_COLO829|0/1|0.29,ONT_COLO829|0/1|0.32,PBR_COLO829|0/0|0.17,VAI_COLO829|0/1|0.24;VAF=0.265 GT 0/1 80 | chr20 13180081 COLO829_SOMATIC_SV41 N . PASS SVTYPE=DEL;SVLEN=3372;END=13183453;CHR2=chr20;SUPPORT=GSC_COLO829|0/1|0.49,ONT_COLO829|0/1|0.48,PBR_COLO829|0/1|0.45,VAI_COLO829|0/1|0.49;VAF=0.485 GT 0/1 81 | chr20 14982313 COLO829_SOMATIC_SV42 N . PASS SVTYPE=DEL;SVLEN=50989;END=15033302;CHR2=chr20;SUPPORT=GSC_COLO829|0/1|0.28,ONT_COLO829|0/1|0.27,PBR_COLO829|0/1|0.28,VAI_COLO829|0/1|0.33;VAF=0.28 GT 0/1 82 | chr20 15019977 COLO829_SOMATIC_SV43 N . PASS SVTYPE=DEL;SVLEN=13219;END=15033196;CHR2=chr20;SUPPORT=GSC_COLO829|0/1|0.37,ONT_COLO829|0/1|0.23,PBR_COLO829|0/1|0.42,VAI_COLO829|0/1|0.28;VAF=0.325 GT 0/1 83 | chr22 33363264 COLO829_SOMATIC_SV44 N . PASS SVTYPE=DEL;SVLEN=79149;END=33442413;CHR2=chr22;SUPPORT=GSC_COLO829|0/1|0.27,ONT_COLO829|0/1|0.25,PBR_COLO829|0/0|0.20,VAI_COLO829|0/1|0.38;VAF=0.26 GT 0/1 84 | chrX 31178837 COLO829_SOMATIC_SV45 N . PASS SVTYPE=DEL;SVLEN=19254;END=31198091;CHR2=chrX;SUPPORT=GSC_COLO829|0/1|0.58,ONT_COLO829|0/1|0.70,PBR_COLO829|0/1|0.61,VAI_COLO829|0/1|0.56;VAF=0.595 GT 0/1 85 | chrX 31283087 COLO829_SOMATIC_SV46 N . 
PASS SVTYPE=DEL;SVLEN=737546;END=32020633;CHR2=chrX;SUPPORT=GSC_COLO829|1/1|0.90,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 86 | chrX 32059739 COLO829_SOMATIC_SV47 N . PASS SVTYPE=DEL;SVLEN=215241;END=32274980;CHR2=chrX;SUPPORT=GSC_COLO829|1/1|0.81,ONT_COLO829|0/1|0.65,PBR_COLO829|1/1|1.00,VAI_COLO829|0/1|0.70;VAF=0.755 GT 0/1 87 | chrX 32080417 COLO829_SOMATIC_SV48 N . PASS SVTYPE=DEL;SVLEN=102717;END=32183134;CHR2=chrX;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|1.00,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|1.00;VAF=1 GT 1/1 88 | chrX 34041661 COLO829_SOMATIC_SV49 N . PASS SVTYPE=DEL;SVLEN=2249;END=34043910;CHR2=chrX;SUPPORT=GSC_COLO829|1/1|1.00,ONT_COLO829|1/1|0.92,PBR_COLO829|1/1|1.00,VAI_COLO829|1/1|0.94;VAF=0.97 GT 1/1 89 | -------------------------------------------------------------------------------- /minda/ensemble.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import Counter 3 | from datetime import datetime 4 | import pandas as pd 5 | import numpy as np 6 | import re 7 | 8 | 9 | def _add_columns(ensemble_df, vaf): 10 | # create a column of list of prefixed IDs for each locus group 11 | key_columns = ['locus_group_x','locus_group_y'] 12 | value_columns = ['ID_x', 'ID_y'] 13 | column_suffixes = ['x','y'] 14 | 15 | for i in range(len(key_columns)): 16 | locus_group = key_columns[i] 17 | id = value_columns[i] 18 | column_suffix = column_suffixes[i] 19 | 20 | keys = ensemble_df[f'{locus_group}'].to_list() 21 | values = ensemble_df[f'{id}'].to_list() 22 | minda_values = ensemble_df['Minda_ID'].to_list() 23 | caller_names = ensemble_df.caller_names.to_list() 24 | 25 | id_dict = {} 26 | for key, value in zip(keys, values): 27 | if key not in id_dict: 28 | id_dict[key] = [] 29 | id_dict[key].append(value) 30 | 31 | minda_id_dict = {} 32 | for key, value in zip(keys, minda_values): 33 | if key not in minda_id_dict: 34 | minda_id_dict[key] = [] 35 | minda_id_dict[key].append(value) 36 | 37 | ensemble_df[f'ID_list_{column_suffix}'] = ensemble_df[locus_group].map(id_dict) 38 | ensemble_df[f'Minda_ID_list_{column_suffix}'] = ensemble_df[locus_group].map(minda_id_dict) 39 | 40 | 41 | # create dict for SV type 42 | values = ensemble_df.SVTYPE.to_list() 43 | svtype_dict = {} 44 | for key, value in zip(keys, values): 45 | if key not in svtype_dict: 46 | svtype_dict[key] = [] 47 | svtype_dict[key].append(value) 48 | 49 | most_common_svtpye_dict = {k:Counter(v).most_common(1)[0][0] for (k,v) in svtype_dict.items()} 50 | ensemble_df['SVTYPE'] = ensemble_df['locus_group_y'].map(most_common_svtpye_dict) 51 | 52 | # create dict of vafs 53 | if vaf != None: 54 | values = ensemble_df.VAF.to_list() 55 | vaf_dict = {} 56 | for key, value in zip(keys, values): 57 | if key not in vaf_dict: 58 | vaf_dict[key] = [] 59 | vaf_dict[key].append(value) 60 | 61 | ensemble_df['VAFs'] = ensemble_df['locus_group_y'].map(vaf_dict) 62 | 63 | return ensemble_df 64 | 65 | 66 | def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, sample_name, args, multimatch): 67 | 68 | dfs_1 = [dfs_list[0] for dfs_list in decomposed_dfs_list] 69 | dfs_2 = [dfs_list[1] for dfs_list in decomposed_dfs_list] 70 | dfs_list = [dfs_1, dfs_2] 71 | 72 | # create stat dfs 73 | start_dfs_list = [] 74 | start_dfs = pd.concat(dfs_1).reset_index(drop=True) 75 | start_dfs = start_dfs[['#CHROM', 'POS', 'ID', 'Minda_ID', 'INFO', 'SVTYPE', 'SVLEN']].sort_values(['#CHROM', 'POS']) 76 | 77 | start_dfs['diff_x'] = 
start_dfs.groupby('#CHROM').POS.diff().fillna(9999) 78 | diffs = start_dfs['diff_x'].to_list() 79 | 80 | # group start loci 81 | loci = [] 82 | count = 1 83 | for diff in diffs: 84 | if diff >= tolerance: 85 | locus = count 86 | loci.append(locus) 87 | count += 1 88 | else: 89 | locus = count - 1 90 | loci.append(locus) 91 | 92 | 93 | start_dfs['locus_group_x'] = loci 94 | start_dfs['median'] = start_dfs.groupby('locus_group_x')['POS'].transform('median').astype('int') 95 | 96 | # create end dfs 97 | end_dfs = pd.concat(dfs_2).reset_index(drop=True) 98 | 99 | #ensemble_df = start_dfs.merge(end_dfs, on=['SVTYPE', 'SVLEN','Minda_ID']) 100 | ensemble_df = start_dfs.merge(end_dfs, on=['SVTYPE', 'Minda_ID']) 101 | ensemble_df[['#CHROM_x', 'POS_x', 'ID_x', 'Minda_ID', 'SVTYPE', 'SVLEN',\ 102 | 'diff_x', 'locus_group_x', 'median', '#CHROM_y', 'POS_y',\ 103 | 'ID_y', ]] 104 | ensemble_df = ensemble_df.sort_values(['locus_group_x','#CHROM_y', 'POS_y']) 105 | ensemble_df ['diff_y'] = ensemble_df.groupby(['locus_group_x','#CHROM_y']).POS_y.diff().abs().fillna(9999) 106 | diffs = ensemble_df['diff_y'].to_list() 107 | caller_names = ensemble_df['Minda_ID'].apply(lambda x: x.rsplit('_', 1)[0]).tolist() 108 | ensemble_df['caller_names']= caller_names 109 | ensemble_df[['#CHROM_x', 'POS_x', 'ID_x', 'Minda_ID', 'SVTYPE', 'SVLEN',\ 110 | 'diff_x', 'locus_group_x', 'median', '#CHROM_y', 'POS_y',\ 111 | 'ID_y','diff_y','caller_names' ]] 112 | 113 | # group end loci 114 | locus_callers = [] 115 | loci = [] 116 | count = 1 117 | for i in range(len(diffs)): 118 | diff = diffs[i] 119 | caller_name = caller_names[i] 120 | 121 | #create new end locus (sublocus 1) 122 | if diff >= tolerance: 123 | locus_callers.clear() 124 | locus_callers.append(caller_name) 125 | locus = str(count) + "_1" 126 | loci.append(locus) 127 | count += 1 128 | 129 | # add new sublocus (sublocus determined by how many calls the caller makes at a given locus) 130 | elif multimatch == False and diff < tolerance and caller_name in locus_callers: 131 | #elif diff < tolerance and caller_name in locus_callers: 132 | locus_callers.append(caller_name) 133 | sub_group = locus_callers.count(caller_name) 134 | locus = str(count-1) + "_" + str(sub_group) 135 | loci.append(locus) 136 | 137 | # add to existing sublocus 1 138 | else: 139 | locus = str(count - 1) + "_1" 140 | loci.append(locus) 141 | locus_callers.append(caller_name) 142 | 143 | # add locus_group_y, median POS, caller ID, Minda ID, SV type, VAF columns 144 | ensemble_df['locus_group_y'] = loci 145 | ensemble_df['median'] = ensemble_df.groupby('locus_group_y')['POS_y'].transform('median').astype('int') 146 | ensemble_df = _add_columns(ensemble_df, vaf) 147 | if vaf != None: 148 | ensemble_df['VAF'] = ensemble_df.groupby('locus_group_y')['VAF'].transform('median') 149 | else: 150 | ensemble_df['VAF'] = np.nan 151 | 152 | ensemble_df = ensemble_df.drop_duplicates(['locus_group_x', 'locus_group_y']).reset_index(drop=True) 153 | 154 | return ensemble_df 155 | 156 | 157 | def _get_ensemble_call_column(support_df, conditions): 158 | column_names = [] 159 | condition_count = 0 160 | condition_columns = [] 161 | query_list = [] 162 | for i in range(len(conditions)): 163 | 164 | if i % 2 == 0: 165 | operator = conditions[i][1] 166 | number = str(conditions[i][2]) 167 | 168 | nested_caller_columns = conditions[i][0] 169 | nested_type = type(nested_caller_columns[0]) 170 | 171 | condition = chr(ord('A') + condition_count) 172 | column_name = f'condition_{condition}' 173 | 174 | nested_columns_count = 
1 175 | sub_condition_columns = [] 176 | if nested_type == list: 177 | for j in range(len(nested_caller_columns)): 178 | caller_columns = nested_caller_columns[j] 179 | sub_column_name = f'condition_{nested_columns_count}_{condition}' 180 | support_df[f'{sub_column_name}'] = support_df[caller_columns].any(axis=1) 181 | nested_columns_count += 1 182 | sub_condition_columns.append(sub_column_name) 183 | support_df[f'{column_name}'] = support_df[sub_condition_columns].sum(axis=1) 184 | 185 | else: 186 | support_df[f'{column_name}'] = support_df[nested_caller_columns].sum(axis=1) 187 | condition_count += 1 188 | condition_columns.append(column_name) 189 | query_list.extend([column_name, operator, number]) 190 | else: 191 | query_list.extend(conditions[i]) 192 | 193 | query = ' '.join(query_list) 194 | mask = support_df.eval(query) 195 | #support_df['ensemble'] = mask 196 | support_df.insert(loc=12, column='ensemble', value=mask) 197 | return support_df 198 | 199 | def _replace_value(row): 200 | if row['ALT'] == '': 201 | return f"N]{row['#CHROM_y']}:{row['POS_y']}]" 202 | else: 203 | return row['ALT'] 204 | 205 | def _get_contigs(vcf_list): 206 | contig_dict = {} 207 | for vcf in vcf_list: 208 | with open(vcf, 'r') as file: 209 | for line in file: 210 | if not line.startswith("##"): 211 | break 212 | if line.startswith("##contig"): 213 | pattern = r'ID=([^,>]+),length=([^,>]+)' 214 | id_length_tuple = re.findall(pattern, line)[0] 215 | chr_id = id_length_tuple[0] 216 | length = int(id_length_tuple[1]) 217 | if chr_id not in contig_dict: 218 | contig_dict[chr_id] = length 219 | else: 220 | value = contig_dict[chr_id] 221 | max_length = max(value, length) 222 | if value != max_length: 223 | print(f'Contig ID {chr_id} has lengths {value} and {max_length}; {max_length} will be used in ensemble.vcf header.') 224 | contig_dict[chr_id] = max_length 225 | return contig_dict 226 | 227 | def _get_ensemble_vcf(vcf_list, support_df, out_dir, sample_name, args, vaf, version): 228 | vcf_df = support_df[support_df['ensemble'] == True].reset_index(drop=True).copy() 229 | vcf_df['ID'] = f'Minda_' + (vcf_df.index + 1).astype(str) 230 | vcf_df['REF'] = "N" 231 | vcf_df['ALT'] = ["<" + svtype +">" for svtype in vcf_df['SVTYPE']] 232 | vcf_df['ALT'] = vcf_df.apply(_replace_value, axis=1) 233 | vcf_df['QUAL'] = "." 
234 | vcf_df['FILTER'] = "PASS" 235 | 236 | if vaf != None: 237 | vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + \ 238 | ';SUPP_VEC=' + ','.join(map(str, supp_vec)) + ';VAF=' + str(vaf) \ 239 | for svlen, svtype, supp_vec, vaf in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['ID_list_y'], vcf_df['VAF'])] 240 | vcf_df = vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER','INFO']].rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS"}) 241 | else: 242 | vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + ';SUPP_VEC=' + ','.join(map(str, supp_vec)) \ 243 | for svlen, svtype, supp_vec in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['ID_list_y'])] 244 | vcf_df = vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER','INFO']].rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS"}) 245 | date = datetime.today().strftime('%Y-%m-%d') 246 | with open(f'{out_dir}/{sample_name}_minda_ensemble.vcf', 'w') as file: 247 | file.write(f'##fileformat=VCFv4.2\n##fileDate={date}\n##source=MindaV{version}\n') 248 | contig_dict = _get_contigs(vcf_list) 249 | for key, value in contig_dict.items(): 250 | file.write(f'##contig=\n') 251 | file.write('##ALT=\n##ALT=\n##ALT=\n##ALT=\n') 252 | file.write('##FILTER=\n') 253 | file.write('##INFO=\n##INFO=\n##INFO=\n') 254 | if vaf != None: 255 | file.write('##INFO=\n') 256 | command_str = " ".join(sys.argv) 257 | file.write(f"##cmd: {command_str}\n") 258 | vcf_df.to_csv(file, sep="\t", index=False) 259 | 260 | 261 | def get_support_df(vcf_list, decomposed_dfs_list, caller_names, tolerance, conditions, vaf, command, out_dir, sample_name, args, version, multimatch): 262 | ensemble_df = _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, sample_name, args, multimatch) 263 | 264 | minda_id_x_lists = ensemble_df.Minda_ID_list_x.to_list() 265 | minda_id_y_lists = ensemble_df.Minda_ID_list_y.to_list() 266 | 267 | # check that both start & end have same IDs 268 | for caller_name in caller_names: 269 | caller_column = [] 270 | for i in range(len(minda_id_x_lists)): 271 | minda_id_x_list = minda_id_x_lists[i] 272 | minda_id_y_list = minda_id_y_lists[i] 273 | intersect_list = list(set(minda_id_x_list).intersection(set(minda_id_y_list))) 274 | call_boolean = any(value.startswith(caller_name) for value in intersect_list) 275 | caller_column.append(call_boolean) 276 | ensemble_df[f'{caller_name}'] = caller_column 277 | 278 | column_names = ['#CHROM_x', 'POS_x', 'locus_group_x', 'ID_list_x', \ 279 | '#CHROM_y', 'POS_y', 'locus_group_y', 'ID_list_y', \ 280 | 'SVTYPE', 'SVLEN', 'VAF', 'Minda_ID_list_y'] + caller_names 281 | 282 | support_df = ensemble_df[column_names].rename(columns={"Minda_ID_list_y": "Minda_IDs"}).copy() 283 | #if command == "ensemble": 284 | support_df = _get_ensemble_call_column(support_df, conditions) 285 | 286 | # create ensemble vcf 287 | _get_ensemble_vcf(vcf_list, support_df, out_dir, sample_name, args, vaf, version) 288 | 289 | # create support csv 290 | support_ex_df = support_df 291 | support_df.to_csv(f'{out_dir}/{sample_name}_support.tsv', sep='\t', index=False) 292 | 293 | return support_df 294 | 295 | 296 | def add_vaf(row,df,caller_name): 297 | for item in row['Minda_IDs']: 298 | if item in df['Minda_ID'].values: 299 | return df[df['Minda_ID'] == item]['VAF'].values[0] 300 | return row[f'{caller_name}'] 301 | -------------------------------------------------------------------------------- /annotation/minda_stratify.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import json 6 | 7 | from intervaltree import Interval, IntervalTree 8 | from collections import defaultdict, namedtuple 9 | 10 | 11 | def parse_repeatmasker(filename): 12 | """ 13 | Parses repeatmakser 'out' file and returns index of annotated repeats 14 | """ 15 | chr_trees = defaultdict(IntervalTree) 16 | num_rec = 0 17 | 18 | for line in open(filename, "r"): 19 | fields = line.split() 20 | if len(fields) != 15: 21 | continue 22 | 23 | divergence, chrom, start, end, repeat_id, family = float(fields[1]), fields[4], int(fields[5]), int(fields[6]), fields[9], fields[10] 24 | if end - start < MIN_REPEAT: 25 | continue 26 | 27 | chr_trees[chrom][start:end] = (repeat_id, family, start) 28 | num_rec += 1 29 | print(num_rec) 30 | 31 | return chr_trees 32 | 33 | 34 | def get_bed_intervals(filename): 35 | """ 36 | Parses bed file with intervals and returns index 37 | """ 38 | chr_trees = defaultdict(IntervalTree) 39 | 40 | for line in open(filename, "r"): 41 | if line.startswith("#"): 42 | continue 43 | 44 | fields = line.strip().split() 45 | chrom, start, end = fields[0], int(fields[1]), int(fields[2]) 46 | chr_trees[chrom][start:end] = (start, end) 47 | 48 | return chr_trees 49 | 50 | 51 | def get_vcf_breakpoints(filename): 52 | """ 53 | Extracts breakpoint coordinates from a vcf 54 | """ 55 | chr_trees = defaultdict(IntervalTree) 56 | for line in open(filename, "r"): 57 | if line.startswith("#"): 58 | continue 59 | 60 | fields = line.strip().split() 61 | chr_1, pos_1, info = fields[0], int(fields[1]), fields[7] 62 | chr_2, pos_2 = None, None 63 | tags = info.split(";") 64 | for tag in tags: 65 | if tag.startswith("CHR2"): 66 | chr_2 = tag[5:] 67 | if tag.startswith("END"): 68 | pos_2 = int(tag[4:]) 69 | 70 | if chr_2 is None: 71 | chr_2 = chr_1 72 | chr_trees[chr_1][pos_1 : pos_1 + 1] = (chr_1, pos_1) 73 | if pos_2 is not None: 74 | chr_trees[chr_2][pos_2 : pos_2 + 1] = (chr_2, pos_2) 75 | 76 | return chr_trees 77 | 78 | 79 | MindaEntry = namedtuple("MindaEntry", ["minda_num", "minda_str", "chr_x", "pos_x", "list_x", "chr_y", "pos_y", 80 | "list_y", "sv_type", "sv_len", "vaf", "support", "original_line", "is_ensemble"]) 81 | def parse_minda_csv(filename, num_callsets): 82 | """ 83 | Parses Minda output file 84 | """ 85 | callset_list = None 86 | minda_num = 0 87 | minda_entries = {} 88 | 89 | TOOLS_BEGIN = 13 90 | 91 | for line in open(filename, "r"): 92 | if line.startswith("#"): 93 | callset_list = line.strip().split("\t")[TOOLS_BEGIN : TOOLS_BEGIN + num_callsets] 94 | print("Callsets:", callset_list) 95 | continue 96 | 97 | fields = line.strip().split("\t") 98 | 99 | support_dict = {} 100 | for (caller, supp) in zip(callset_list, fields[TOOLS_BEGIN : TOOLS_BEGIN + num_callsets]): 101 | support_dict[caller] = True if supp != "False" else False 102 | is_ensemble = (fields[12] == "True") 103 | 104 | #if not len(fields[10]): 105 | # print(line) 106 | # continue 107 | 108 | #VAF not always available, default to 1.0 109 | try: 110 | vaf = float(fields[10]) 111 | except ValueError: 112 | vaf = 1.0 113 | 114 | entry = MindaEntry(minda_num=minda_num, minda_str=fields[10], 115 | chr_x=fields[0], pos_x=int(fields[1]), list_x=fields[3], 116 | chr_y=fields[4], pos_y=int(fields[5]), list_y=fields[7], 117 | sv_type=fields[8], sv_len=int(fields[9]), vaf=vaf, 118 | support=support_dict, original_line=line.strip(), is_ensemble=is_ensemble) 119 | 
minda_entries[minda_num] = entry 120 | minda_num += 1 121 | 122 | return minda_entries 123 | 124 | 125 | def get_confident_calls(minda_records): 126 | """ 127 | Extracts confident calls based on given minimum support 128 | """ 129 | confident_calls = set() 130 | for rec in minda_records.values(): 131 | if rec.is_ensemble: 132 | confident_calls.add(rec.minda_num) 133 | 134 | """ 135 | callset_list = list(next(iter(minda_records.values())).support.keys()) 136 | for rec in minda_records.values(): 137 | support_tools = set(x for x in rec.support if rec.support[x]) 138 | against_tools = set(callset_list) - support_tools 139 | techs = set([x.split("_")[-1] for x in support_tools]) 140 | if len(support_tools) >= min_tools and len(techs) >= min_tech: 141 | confident_calls.add(rec.minda_num) 142 | """ 143 | 144 | return confident_calls 145 | 146 | 147 | def filter_calls(minda_records, min_vaf, min_sv_len, remove_ins): 148 | filtered_minda = {} 149 | for rec in minda_records.values(): 150 | if rec.vaf < min_vaf: 151 | continue 152 | #if rec.pos_y == 0: 153 | # continue 154 | if rec.chr_x == rec.chr_y and abs(rec.pos_y - rec.pos_x) < min_sv_len: 155 | continue 156 | if remove_ins and rec.sv_type == "INS": 157 | continue 158 | 159 | filtered_minda[rec.minda_num] = rec 160 | return filtered_minda 161 | 162 | 163 | def stratify_breakends(minda_records, confident_calls, annotation_dir, remove_insertions): 164 | """ 165 | Performs various types of stratification 166 | """ 167 | repeatmasker_file = os.path.join(annotation_dir, REPEAT_MASKER) 168 | segdup_file = os.path.join(annotation_dir, SEGDUPS_BED) 169 | vntr_file = os.path.join(annotation_dir, VNTR_BED) 170 | chr_sizes_file = os.path.join(annotation_dir, CHR_LEN_BED) 171 | 172 | index_repeatmasker = parse_repeatmasker(repeatmasker_file) 173 | index_segdup = get_bed_intervals(segdup_file) 174 | index_vntr = get_bed_intervals(vntr_file) 175 | #index_germline = get_vcf_breakpoints(germline_vcf) 176 | 177 | #chr sizes 178 | chr_sizes = defaultdict(int) 179 | for line in open(chr_sizes_file, "r"): 180 | fields = line.split() 181 | chr_sizes[fields[0]] = int(fields[1]) 182 | 183 | #cluster indexing 184 | index_ensemble = defaultdict(IntervalTree) 185 | for entry in minda_records.values(): 186 | index_ensemble[entry.chr_x][entry.pos_x : entry.pos_x + 1] = entry 187 | index_ensemble[entry.chr_y][entry.pos_y : entry.pos_y + 1] = entry 188 | 189 | strat_categories = ["hom_repeat", "segdup", "vntr", "low_vaf", "low_len", 190 | "bnd_dup", "bnd_chain"] 191 | if not remove_insertions: 192 | strat_categories = ["insertion"] + strat_categories 193 | strat_entries = defaultdict(set) 194 | 195 | def _get_intervals(entry, index, threshold): 196 | ovlp_1 = index[entry.chr_x][entry.pos_x - threshold : entry.pos_x + threshold] 197 | ovlp_2 = index[entry.chr_y][entry.pos_y - threshold : entry.pos_y + threshold] 198 | return [o[2] for o in ovlp_1], [o[2] for o in ovlp_2] 199 | 200 | def _support_tools(rec): 201 | return set(x for x in rec.support if rec.support[x]) 202 | 203 | def _coords_number(entries): 204 | clusters = [] 205 | for e in entries: 206 | for (c, p) in [(e.chr_x, e.pos_x), (e.chr_y, e.pos_y)]: 207 | match = False 208 | for cl in clusters: 209 | if cl[0] == c and abs(cl[1] - p) <= BND_AREA: 210 | match = True 211 | if not match: 212 | clusters.append((c, p)) 213 | return len(clusters) 214 | 215 | def _is_duplicate(e1, e2): 216 | match_x = e1.chr_x == e2.chr_x and abs(e1.pos_x - e2.pos_x) <= BND_AREA 217 | match_y = e1.chr_y == e2.chr_y and abs(e1.pos_y - 
e2.pos_y) <= BND_AREA 218 | cross_x = e1.chr_x == e2.chr_y and abs(e1.pos_x - e2.pos_y) <= BND_AREA 219 | cross_y = e1.chr_y == e2.chr_x and abs(e1.pos_y - e2.pos_x) <= BND_AREA 220 | return (match_x and match_y) or (cross_x and cross_y) 221 | 222 | for entry in minda_records.values(): 223 | #Homologous repeats at breakends 224 | ovlp_1, ovlp_2 = _get_intervals(entry, index_repeatmasker, REPEAT_AREA) 225 | for x in ovlp_1: 226 | for y in ovlp_2: 227 | if x[1] == y[1] and x[2] != y[2]: #same repeat family, but different repeat 228 | strat_entries[entry.minda_num].add("hom_repeat") 229 | 230 | #same segdup section 231 | segdup_1, segdup_2 = _get_intervals(entry, index_segdup, REPEAT_AREA) 232 | if len(set(s[0] for s in segdup_1) & set(s[0] for s in segdup_2)) > 0: 233 | #if len(segdup_1) > 0 and len(segdup_2) > 0: 234 | strat_entries[entry.minda_num].add("segdup") 235 | 236 | #same vntr section 237 | vntr_1, vntr_2 = _get_intervals(entry, index_vntr, REPEAT_AREA) 238 | if len(set(s[0] for s in vntr_1) & set(s[0] for s in vntr_2)) > 0: 239 | #if len(vntr_1) > 0 and len(vntr_2) > 0: 240 | strat_entries[entry.minda_num].add("vntr") 241 | 242 | #low-ish vaf 243 | if entry.vaf < LOW_VAF: 244 | strat_entries[entry.minda_num].add("low_vaf") 245 | 246 | if entry.chr_x == entry.chr_y and abs(entry.pos_y - entry.pos_x) < LOW_LEN: 247 | strat_entries[entry.minda_num].add("low_len") 248 | 249 | #is insertion 250 | if entry.sv_type == "INS": 251 | strat_entries[entry.minda_num].add("insertion") 252 | 253 | """ 254 | #near telomere 255 | if min(entry.pos_x, chr_sizes[entry.chr_x] - entry.pos_x) < TELOMERE_LEN or \ 256 | min(entry.pos_y, chr_sizes[entry.chr_y] - entry.pos_y) < TELOMERE_LEN: 257 | strat_entries[entry.minda_num].add("telomere") 258 | """ 259 | 260 | """ 261 | #near germline SV breakpoints 262 | germ_1, germ_2 = _get_intervals(entry, index_germline, BND_AREA) 263 | if len(germ_1) > 0 and len(germ_2) > 0: 264 | strat_entries[entry.minda_num].add("germline") 265 | 266 | """ 267 | 268 | """ 269 | #near multiple condifent breakpoints 270 | if len(set(o.minda_num for o in ens_bnds_1 if o.minda_num in confident_calls)) > 1 or \ 271 | len(set(o.minda_num for o in ens_bnds_2 if o.minda_num in confident_calls)) > 1: 272 | strat_entries[entry.minda_num].add("truth_cluster") 273 | """ 274 | 275 | ens_bnds_1, ens_bnds_2 = _get_intervals(entry, index_ensemble, BND_AREA) 276 | 277 | #duplication 278 | for r in ens_bnds_1: 279 | if r != entry and _is_duplicate(r, entry): 280 | strat_entries[entry.minda_num].add("bnd_dup") 281 | strat_entries[r.minda_num].add("bnd_dup") 282 | for r in ens_bnds_2: 283 | if r != entry and _is_duplicate(r, entry): 284 | strat_entries[entry.minda_num].add("bnd_dup") 285 | strat_entries[r.minda_num].add("bnd_dup") 286 | 287 | #3+ chain of breakends 288 | left_chain, right_chain = None, None 289 | extra_chain_entries = set() 290 | for r in ens_bnds_1: 291 | if r != entry and not _is_duplicate(r, entry): 292 | left_chain = r 293 | extra_chain_entries.add(r.minda_num) 294 | for r in ens_bnds_2: 295 | if r != entry and not _is_duplicate(r, entry): 296 | right_chain = r 297 | extra_chain_entries.add(r.minda_num) 298 | 299 | if None not in [left_chain, right_chain] and left_chain != right_chain: 300 | strat_entries[entry.minda_num].add("bnd_chain") 301 | for e in extra_chain_entries: 302 | strat_entries[e].add("bnd_chain") 303 | 304 | #print(entry.original_line + "\t" + ",".join(list(strat_entries[entry.minda_num]))) 305 | 306 | return strat_categories, strat_entries 307 | 308 | 309 | 
def compute_fp_fn(minda_records, confident_calls, strat_category, strat_entries, print_table): 310 | callset_list = list(next(iter(minda_records.values())).support.keys()) 311 | callset_list.sort(key=lambda x: x.split("_")[0]) 312 | tools_tp, tools_fp, tools_fn, tools_f1 = defaultdict(set), defaultdict(set), defaultdict(set), defaultdict(float) 313 | tools_recall, tools_precision = defaultdict(set), defaultdict(set) 314 | 315 | for rec in minda_records.values(): 316 | support_tools = set(x for x in rec.support if rec.support[x]) 317 | against_tools = set(callset_list) - support_tools 318 | 319 | if strat_category is not None: 320 | estrat = strat_entries[rec.minda_num] 321 | if len(estrat) == 0: 322 | estrat = set(["Unclassified"]) 323 | if strat_category not in estrat: 324 | continue 325 | #if strat_category == "Unclassified" and rec.minda_num in confident_calls: 326 | # print("FN", rec.chr_x, rec.pos_x, rec.chr_y, rec.pos_y, against_tools) 327 | 328 | if rec.minda_num in confident_calls: 329 | for tool in support_tools: 330 | tools_tp[tool].add(rec.minda_num) 331 | for tool in against_tools: 332 | tools_fn[tool].add(rec.minda_num) 333 | else: 334 | for tool in support_tools: 335 | tools_fp[tool].add(rec.minda_num) 336 | 337 | if print_table: 338 | print(f"\n=== Stratifying by: {strat_category} ===\n") 339 | print("#Tool\tTP\tFP\tFN\tprecision\trecall\tF1") 340 | 341 | for tool in callset_list: 342 | if len(tools_tp[tool]) > 0: 343 | precision = len(tools_tp[tool]) / (len(tools_tp[tool]) + len(tools_fp[tool])) 344 | recall = len(tools_tp[tool]) / (len(tools_tp[tool]) + len(tools_fn[tool])) 345 | f1_score = 2 * precision * recall / (precision + recall) 346 | else: 347 | precision, recall, f1_score = 0, 0, 0 348 | tp, fp, fn, = len(tools_tp[tool]), len(tools_fp[tool]), len(tools_fn[tool]) 349 | tools_f1[tool] = f1_score 350 | tools_recall[tool] = recall 351 | tools_precision[tool] = precision 352 | 353 | if print_table: 354 | if PRETTY_PRINT: 355 | print(f"{tool:20s}\t{tp}\t{fp}\t{fn}\t{precision:.4f}\t{recall:.4f}\t{f1_score:.4f}") 356 | else: 357 | print(f"{tool}\t{tp}\t{fp}\t{fn}\t{precision:.4f}\t{recall:.4f}\t{f1_score:.4f}") 358 | 359 | return tools_tp, tools_fp, tools_fn, tools_recall, tools_precision, tools_f1 360 | 361 | 362 | def summary_errors(minda_records, confident_calls, categories, strat_entries): 363 | callset_list = list(next(iter(minda_records.values())).support.keys()) 364 | callset_list.sort(key=lambda x: x.split("_")[0]) 365 | 366 | by_tool_fp = defaultdict(dict) 367 | by_tool_fn = defaultdict(dict) 368 | by_tool_tp = defaultdict(dict) 369 | by_tool_f1_score = defaultdict(dict) 370 | by_tool_recall = defaultdict(dict) 371 | by_tool_precision = defaultdict(dict) 372 | for cat in categories: 373 | tp, fp, fn, recall, precision, f1_score = \ 374 | compute_fp_fn(minda_records, confident_calls, cat, strat_entries, print_table=False) 375 | for tool in tp: 376 | by_tool_tp[tool][cat] = len(tp[tool]) 377 | by_tool_fp[tool][cat] = len(fp[tool]) 378 | by_tool_fn[tool][cat] = len(fn[tool]) 379 | by_tool_f1_score[tool][cat] = "{:.4f}".format(f1_score[tool]) 380 | by_tool_recall[tool][cat] = "{:.4f}".format(recall[tool]) 381 | by_tool_precision[tool][cat] = "{:.4f}".format(precision[tool]) 382 | 383 | def print_with(stats, title): 384 | print(f"\n\t == {title} == \n") 385 | #print("{:20s}\t".format("#Tool") + "\t".join(categories)) 386 | print("{}\t".format("#Tool") + "\t".join(categories)) 387 | for tool in callset_list: 388 | numbers = [str(stats[tool][cat]) for cat in 
categories] 389 | if PRETTY_PRINT: 390 | print(f"{tool:20s}\t" + "\t".join(numbers)) 391 | else: 392 | print(f"{tool}\t" + "\t".join(numbers)) 393 | 394 | print_with(by_tool_tp, "True positives") 395 | print_with(by_tool_fp, "False positives") 396 | print_with(by_tool_fn, "False negatives") 397 | print_with(by_tool_recall, "Recall") 398 | print_with(by_tool_precision, "Precision") 399 | print_with(by_tool_f1_score, "F1 scores") 400 | 401 | 402 | #annotation filenames 403 | REPEAT_MASKER = "repeatmasker.out" 404 | SEGDUPS_BED = "segdups.bed" 405 | VNTR_BED = "trf.bed" 406 | CHR_LEN_BED = "chr.fasta.fai" 407 | 408 | #stratification 409 | MIN_REPEAT = 100 410 | REPEAT_AREA = 5 411 | BND_AREA = 500 412 | LOW_VAF = 0.10 413 | LOW_LEN = 100 414 | #TELOMERE_LEN = 10000 415 | 416 | #evaluation 417 | #MIN_TECH = 2 418 | #MIN_TOOLS = 4 419 | MIN_VAF = 0.00 420 | MIN_SV_LEN = 50 421 | REMOVE_INS = False 422 | PRETTY_PRINT = True 423 | 424 | 425 | def minda_stratification(annotation_dir, minda_support_tsv, num_callsets): 426 | print(f"Params: min_vaf:{MIN_VAF} insertions_removed:{REMOVE_INS}") 427 | 428 | minda_records = parse_minda_csv(minda_support_tsv, num_callsets) 429 | minda_records = filter_calls(minda_records, MIN_VAF, MIN_SV_LEN, remove_ins=REMOVE_INS) 430 | confident_calls = get_confident_calls(minda_records) 431 | compute_fp_fn(minda_records, confident_calls, None, None, print_table=True) 432 | 433 | strat_categories, strat_calls = stratify_breakends(minda_records, confident_calls, annotation_dir, 434 | REMOVE_INS) 435 | categories_unk = strat_categories + ["Unclassified"] 436 | 437 | for category in categories_unk: 438 | compute_fp_fn(minda_records, confident_calls, category, strat_calls, print_table=True) 439 | summary_errors(minda_records, confident_calls, categories_unk, strat_calls) 440 | 441 | 442 | def main(): 443 | if len(sys.argv) != 4: 444 | print("Usage: minda_stratify.py annotation_dir minda_support_tsv num_callsets") 445 | return 1 446 | 447 | annotation_dir = sys.argv[1] 448 | #germline_vcf = sys.argv[2] 449 | minda_csv = sys.argv[2] 450 | num_callsets = int(sys.argv[3]) 451 | minda_stratification(annotation_dir, minda_csv, num_callsets) 452 | 453 | 454 | if __name__ == "__main__": 455 | main() 456 | -------------------------------------------------------------------------------- /minda/decompose.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | import pandas as pd 5 | import numpy as np 6 | import gzip 7 | from collections import Counter 8 | from pybedtools import BedTool 9 | 10 | logger = logging.getLogger() 11 | 12 | 13 | def _is_vcf_gz(vcf): 14 | with gzip.open(vcf, 'r') as file: 15 | try: 16 | file.read(1) 17 | return True 18 | except OSError: 19 | return False 20 | 21 | 22 | def get_caller_name(vcf): 23 | """ 24 | Extracts the name of the caller from vcf. 
25 | 26 | """ 27 | is_vcf_gz = _is_vcf_gz(vcf) 28 | if is_vcf_gz == False: 29 | with open(vcf, 'r') as file: 30 | caller_name = _caller_name(file) 31 | else: 32 | with gzip.open(vcf, 'rt') as file: 33 | caller_name = _caller_name(file) 34 | 35 | return caller_name 36 | 37 | 38 | def _caller_name(file): 39 | found_source = False 40 | for line in file: 41 | if line.startswith("##source"): 42 | source_line = line.strip() 43 | line_length = len(source_line) 44 | start = source_line.find("=") + 1 45 | if line_length > 50: 46 | stop = source_line.find(" ") 47 | if stop == -1: 48 | stop = 51 49 | else: 50 | stop = line_length 51 | 52 | caller_name = source_line[start:stop] 53 | found_source = True 54 | 55 | if not found_source: 56 | caller_name = "Unknown source" 57 | return caller_name 58 | 59 | 60 | 61 | def get_df(vcf): 62 | """ 63 | Create a df from vcf. 64 | 65 | """ 66 | is_vcf_gz = _is_vcf_gz(vcf) 67 | if is_vcf_gz == False: 68 | df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,4,6,7], header=None, dtype={'#CHROM': 'str', 'POS':'Int64'}) 69 | else: 70 | df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,4,6,7], header=None, compression='gzip', dtype={'#CHROM': 'str', 'POS':'Int64'}) 71 | df.columns = ['#CHROM', 'POS', 'ID', 'ALT', 'FILTER', 'INFO'] 72 | 73 | return df 74 | 75 | 76 | def get_intersected_df(vcf, bed): 77 | """ 78 | Create a df that only includes records that interesect intervals of the bed file. 79 | 80 | """ 81 | bed_to_bt = BedTool(bed) 82 | vcf_to_bt = BedTool(vcf) 83 | intersect_obj = vcf_to_bt.intersect(bed_to_bt, u=True) 84 | df = BedTool.to_dataframe(intersect_obj, header=None, usecols=[0,1,2,4,6,7], dtype={'#CHROM': 'str', 'POS':'int'}) 85 | df.columns = ['#CHROM', 'POS', 'ID', 'ALT', 'FILTER', 'INFO'] 86 | return df 87 | 88 | 89 | def _get_sorted_df(df): 90 | """ 91 | Sorts dataframe by #CHROM and POS 92 | 93 | """ 94 | # handles instances where chromosomes are only integers 95 | df["#CHROM"] = df["#CHROM"].astype(str) 96 | chrom_value = df.iloc[0,0] 97 | if chrom_value.startswith("chr"): 98 | chrom_set = set(df["#CHROM"].str.slice(start=3).to_list()) 99 | else: 100 | chrom_set = set(df["#CHROM"].to_list()) 101 | 102 | str_chrom_list = [] 103 | int_chrom_list = [] 104 | for chrom_str in chrom_set: 105 | try: 106 | chrom = int(chrom_str) 107 | int_chrom_list.append(chrom) 108 | except ValueError: 109 | chrom = chrom_str 110 | str_chrom_list.append(chrom) 111 | if chrom_value.startswith("chr"): 112 | chrom_sort = sorted(int_chrom_list) + sorted(str_chrom_list) 113 | chrom_sort = ['chr' + str(chrom_sort[i]) for i in range(len(chrom_sort))] 114 | else: 115 | chrom_sort = sorted(int_chrom_list) + sorted(str_chrom_list) 116 | chrom_sort = [str(chrom_sort[i]) for i in range(len(chrom_sort))] 117 | 118 | df = df.sort_values(by=['#CHROM', 'POS'], key=lambda x: x.map({v: i for i, v in enumerate(chrom_sort)})).reset_index(drop=True) 119 | 120 | return df 121 | 122 | 123 | def _get_alt_mate_index(df): 124 | 125 | #create mate df & create list of df values 126 | alt_df = df.ALT.str.extract(r'((chr)?\w+):(\d+)').rename(columns={0: "MATE_#CHROM", 1: "regex_noncap_group ", 2:"MATE_POS"}) 127 | alt_df.MATE_POS = alt_df.MATE_POS.astype(pd.Int64Dtype()) 128 | alt_df.drop(columns="regex_noncap_group ", inplace=True) 129 | mate_df = df[['#CHROM','POS']].merge(alt_df, left_index=True, right_index=True) 130 | mate_df_list = mate_df.values.tolist() 131 | 132 | # find the index of mate record 133 | mate_indices = [] 134 | for i in range(len(mate_df_list)): 135 | record = 
mate_df_list[i] 136 | chrom = record[0] 137 | pos = record[1] 138 | mate_chrom = record[2] 139 | mate_pos = record[3] 140 | 141 | matching_rows = mate_df[(mate_df['#CHROM'] == mate_chrom) & \ 142 | (mate_df['POS'] == mate_pos) & \ 143 | (mate_df['MATE_#CHROM'] == chrom) & \ 144 | (mate_df['MATE_POS'] == pos)] 145 | 146 | matching_row_indices = matching_rows.index.to_list() 147 | matching_row_indices = [index for index in matching_row_indices if index != i] 148 | if len(matching_row_indices) == 1: 149 | mate_index = matching_row_indices[0] 150 | else: 151 | mate_index = -1 152 | mate_indices.append(mate_index) 153 | 154 | df["MATE_INDEX"] = mate_indices 155 | 156 | # get mate results 157 | unique_indices_count = df['MATE_INDEX'].nunique() 158 | first_unique_index = df["MATE_INDEX"].value_counts().to_frame().index[0] 159 | unpaired_recrods_count = len(df[df["MATE_INDEX"] == -1]) 160 | paired_records_count = len(df[df["MATE_INDEX"] != -1]) 161 | 162 | logger.info(f"Number of unique indices: {unique_indices_count}") 163 | if unique_indices_count == len(df): 164 | logger.info(f"{paired_records_count} paired records found...") 165 | elif unique_indices_count == 1 and first_unique_index == -1: 166 | logger.info("No paired records found...") 167 | else: 168 | logger.info(f"{paired_records_count} paired records and {unpaired_recrods_count} unpaired records found...") 169 | 170 | return df 171 | 172 | 173 | def _get_paired_alt_dfs(alt_df): 174 | 175 | # check if BNDS are a single record or two 176 | mask = alt_df['MATE_INDEX'] == -1 177 | singleton_df = alt_df[mask] 178 | singleton_count = len(singleton_df) 179 | if singleton_count == alt_df.shape[0]: # ALT records are singletons 180 | logger.debug(f"(1) Number of singleton records: {singleton_count} {alt_df.shape[0]}") 181 | alt_df_1 = alt_df.copy() 182 | alt_df_2 = alt_df.copy() 183 | alt_df_2['#CHROM'] = alt_df_2.ALT.str.extract(r'(chr\w+|\w+):')[0].to_list() 184 | alt_df_2['POS'] = alt_df_2.ALT.str.extract(r':(\d+)')[0].astype(pd.Int64Dtype()).to_list() 185 | paired_alt_dfs = [alt_df_1, alt_df_2] 186 | logger.debug(f"(1) Number of alt/alt_1/alt_2 records: {alt_df.shape[0]} {alt_df_1.shape[0]} {alt_df_2.shape[0]}") 187 | logger.info(f"Number of paired records paired by ALT column: {alt_df_1.shape[0]} {alt_df_2.shape[0]}") 188 | logger.info(f"Number of unpaired records paired by MATE_ID: 0 0") 189 | 190 | # alt_df pairs based on mate index 191 | else: 192 | logger.debug(f"(2) Number of singleton/alt records: {singleton_count} {alt_df.shape[0]}") 193 | alt_df_1 = alt_df[(alt_df.index < alt_df.MATE_INDEX) & (alt_df.MATE_INDEX != -1)] 194 | alt_df_2 = alt_df[(alt_df.index > alt_df.MATE_INDEX) & (alt_df.MATE_INDEX != -1)] 195 | alt_df_2.index = alt_df_2["MATE_INDEX"].to_list() 196 | paired_alt_dfs = [alt_df_1, alt_df_2] 197 | 198 | mateless_alt_df = alt_df[alt_df.MATE_INDEX == -1] 199 | logger.debug(f"(2a) Number of alt/alt_1/alt_2/mateless records: {len(alt_df)} {len(alt_df_1)} {len(alt_df_2)} {len(mateless_alt_df)}") 200 | logger.info(f"Number of paired records paired by ALT column: {alt_df_1.shape[0]} {alt_df_2.shape[0]}") 201 | 202 | # alt_df pairs based on MATEID in INFO 203 | if mateless_alt_df.shape[0] > 0: 204 | 205 | mate_id_alt_df = _get_mate_id_df(mateless_alt_df) 206 | mate_id_alt_df_1 = mate_id_alt_df[(mate_id_alt_df.index < mate_id_alt_df.MATE_INDEX) & (mate_id_alt_df.MATE_INDEX != -1)] 207 | mate_id_alt_df_2 = mate_id_alt_df[(mate_id_alt_df.index > mate_id_alt_df.MATE_INDEX) & (mate_id_alt_df.MATE_INDEX != -1)] 208 | 
mate_id_alt_df_2.index = mate_id_alt_df_2["MATE_INDEX"].to_list() 209 | alt_to_info_df = mate_id_alt_df[mate_id_alt_df.MATE_INDEX == -1] # for SEVERUS INS 210 | logger.debug(f"(2b) Number of mate_id/mate_id/mate_id/alt_to_info records: {mate_id_alt_df.shape[0]} {mate_id_alt_df_1.shape[0]} {mate_id_alt_df_2.shape[0]} {alt_to_info_df.shape[0]}") 211 | #alt_df_1 = pd.concat([alt_df_1, mate_id_alt_df_1]) 212 | #alt_df_2 = pd.concat([alt_df_2, mate_id_alt_df_2]) 213 | non_empty_1 = [df for df in [alt_df_1, mate_id_alt_df_1] if not df.empty] 214 | alt_df_1 = pd.concat(non_empty_1).sort_index() 215 | non_empty_2 = [df for df in [alt_df_2, mate_id_alt_df_2] if not df.empty] 216 | alt_df_2 = pd.concat(non_empty_2).sort_index() 217 | 218 | paired_alt_dfs = [alt_df_1, alt_df_2] 219 | logger.debug(f"(2b) Total number of alt_1/alt_2 records: {alt_df_1.shape[0]} {alt_df_2.shape[0]}") 220 | if alt_to_info_df.shape[0] > 0: 221 | paired_alt_dfs.append(alt_to_info_df) 222 | logger.info(f"Number of unpaired records paired by MATE_ID: {len(mate_id_alt_df_1)} {len(mate_id_alt_df_2)}") 223 | else: 224 | logger.info(f"Number of unpaired records paired by MATE_ID: 0 0") 225 | return paired_alt_dfs 226 | 227 | 228 | def _get_mate_id_df(df): 229 | """ 230 | Finds the index of the mate ID listed in INFO in the ID column. If not found, index of -1 assigned. 231 | 232 | """ 233 | mate_id_pattern = r'MATEID=([^;]+)(?=;|$)' 234 | mate_id_list = df.INFO.str.extract(mate_id_pattern)[0].to_list() 235 | 236 | mate_indices = [] 237 | for i in range(len(mate_id_list)): 238 | mate_id = mate_id_list[i] 239 | matching_rows = df[(df['ID'] == mate_id)] 240 | matching_row_indices = matching_rows.index 241 | 242 | if len(matching_row_indices) == 1: 243 | mate_index = matching_row_indices[0] 244 | else: 245 | mate_index = -1 246 | mate_indices.append(mate_index) 247 | 248 | #df['MATE_INDEX'] = mate_indices 249 | df.loc[:, 'MATE_INDEX'] = mate_indices 250 | 251 | return df 252 | 253 | 254 | def _get_paired_info_dfs(info_df): 255 | 256 | mate_pos_list = info_df.INFO.str.extract(r'SVLEN=(-?\d+)')[0].astype(pd.Int64Dtype()).abs().to_list() 257 | info_df_1 = info_df.copy() 258 | info_df_2 = info_df.copy() 259 | info_df_2['POS'] = info_df_2['POS'] + mate_pos_list 260 | info_df_2['END'] = info_df_2.INFO.str.extract(r'END=(-?\d+)')[0].astype(pd.Int64Dtype()).abs().to_list() 261 | #info_df_2['POS'].fillna(info_df_2['END'], inplace=True) 262 | info_df_2.fillna({'POS':info_df_2['END']}, inplace=True) 263 | nan_indices = info_df_2[info_df_2['POS'].isna()].index 264 | info_df_1 =info_df_1.drop(index=nan_indices, errors='ignore') 265 | info_df_2 =info_df_2.drop(index=nan_indices, errors='ignore') 266 | dropped_singleton_count = info_df.shape[0] - info_df_1.shape[0] 267 | 268 | logger.info(f"Number of unpaired records paired by INFO column: {info_df_1.shape[0]} {info_df_2.shape[0]}") 269 | logger.info(f"Number of singleton records dropped: {dropped_singleton_count}") 270 | 271 | return info_df_1, info_df_2 272 | 273 | 274 | def _check_df_order(df_1, df_2): 275 | 276 | # row by row for start and end df, check that the order by sorting 277 | for i in range(len(df_1)): 278 | 279 | df_number = [0] 280 | row_1 = df_1.iloc[i].to_frame().T 281 | row_1['row_number'] = df_number 282 | 283 | df_number = [1] 284 | row_2 = df_2.iloc[i].to_frame().T 285 | row_2['row_number'] = df_number 286 | 287 | order_df = pd.concat([row_1, row_2]).reset_index(drop=True) 288 | sorted_order_df = _get_sorted_df(order_df) 289 | 290 | # if sort is out of order, what the chrom & 
pos values of the start & end dfs 291 | if order_df.equals(sorted_order_df) == False: 292 | df_1.at[i,'#CHROM'] = sorted_order_df.iloc[0]['#CHROM'] 293 | df_1.at[i, 'POS'] = sorted_order_df.iloc[0]['POS'] 294 | df_2.at[i,'#CHROM'] = sorted_order_df.iloc[1]['#CHROM'] 295 | df_2.at[i, 'POS'] = sorted_order_df.iloc[1]['POS'] 296 | 297 | return df_1, df_2 298 | 299 | 300 | def _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir): 301 | id_difference_list = sorted(list(id_set_difference)) 302 | path = f'{out_dir}/{sample_name}_removed_records.txt' 303 | file_check = os.path.isfile(path) 304 | with open(path,'a') as file: 305 | if file_check == False: 306 | file.write('REMOVED RECORDS\n') 307 | if written_count == 0: 308 | step = "***** " + caller_name + " *****" + "\n" + step 309 | file.write(f'\n{step}\n') 310 | file.write('\n'.join(id_difference_list)) 311 | file.write('\n') 312 | 313 | def get_decomposed_dfs(caller_name, df, filter, min_size, prefixed, vaf, sample_name, out_dir): 314 | """ 315 | Decomposes df records into start and end dfs. 316 | 317 | """ 318 | logger.info(f"DECOMPOSING {caller_name} RECORDS...") 319 | logger.info(f"Original number of records: {len(df)}") 320 | 321 | written_count = 0 322 | original_id_set = set(df.ID.to_list()) 323 | 324 | # filter_df 325 | if filter != None: 326 | df = df[df['FILTER'].isin(filter)] 327 | logger.info(f"Number of records after filtering by FILTER column: {len(df)}") 328 | 329 | # write removed ids to txt 330 | filter_id_set = set(df.ID.to_list()) 331 | id_set_difference = original_id_set.difference(filter_id_set) 332 | if len(list(id_set_difference)) > 0: 333 | step = "FILTER" 334 | _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir) 335 | written_count +=1 336 | 337 | # sort df 338 | df = _get_sorted_df(df) 339 | 340 | # change EVENTTYPE to SVTYPE (for GRIDSS/GRIPSS) 341 | # create SVTYPE column 342 | df.loc[:, 'INFO'] = df.INFO.str.replace('EVENTTYPE', 'SVTYPE') 343 | svtype_pattern = r'SVTYPE=([^;]+)(?=;|$)' 344 | svtype_column = df.INFO.str.extract(svtype_pattern)[0].to_list() 345 | df["SVTYPE"] = svtype_column 346 | 347 | # create SVLEN column 348 | # change SVINSLEN to SVLEN only if SVLEN not in info (for nanomonsv) 349 | df.loc[~df['INFO'].str.contains('SVLEN', na=False), 'INFO'] = df['INFO'].str.replace('SVINSLEN', 'SVLEN') 350 | 351 | # add VAF column 352 | # if vaf != None: 353 | # df['VAF'] = df.INFO.str.extract(r';VAF=([\d.]+)')[0].astype('float').to_list() 354 | # if df.VAF.isnull().all() == True: 355 | # sys.exit(f"No VAF values found in {caller_name} VCF. Run Minda without --vaf parameter or add VAF to INFO. ") 356 | if vaf == None: 357 | df['VAF'] = "*" 358 | else: 359 | df['VAF'] = df.INFO.str.extract(r';VAF=([\d.]+)')[0].astype('float').to_list() 360 | if df.VAF.isnull().all() == True: 361 | sys.exit(f"No VAF values found in {caller_name} VCF. Run Minda without --vaf parameter or add VAF to INFO. 
") 362 | 363 | # get indices of mate rows 364 | df = _get_alt_mate_index(df) 365 | 366 | # create paired ALT dfs 367 | alt_df = df[df['ALT'].str.contains(r'(?:chr)?\w+:\d+', na=False)].copy() 368 | 369 | #create paired INFO dfs 370 | info_df = df.drop(index=alt_df.index, errors='ignore') 371 | logger.debug(f"Number of INFO records: {info_df.shape[0]}") 372 | 373 | # get ALT paired dfs 374 | paired_alt_dfs = _get_paired_alt_dfs(alt_df) 375 | if len(paired_alt_dfs) == 3: 376 | alt_df_1 = paired_alt_dfs[0] 377 | alt_df_2 = paired_alt_dfs[1] 378 | alt_to_info_df = paired_alt_dfs[2] 379 | 380 | info_df = pd.concat([info_df, alt_to_info_df]) 381 | logger.debug(f"Total number of INFO records: {info_df.shape[0]}") 382 | else: 383 | alt_df_1 = paired_alt_dfs[0] 384 | alt_df_2 = paired_alt_dfs[1] 385 | 386 | 387 | info_df_1, info_df_2 = _get_paired_info_dfs(info_df) 388 | 389 | non_empty_1 = [df for df in [info_df_1, alt_df_1] if not df.empty] 390 | decomposed_df_1 = pd.concat(non_empty_1).sort_index() 391 | non_empty_2 = [df for df in [info_df_2, alt_df_2] if not df.empty] 392 | decomposed_df_2 = pd.concat(non_empty_2).sort_index() 393 | 394 | # write removed ids to txt 395 | singleton_id_set = set(decomposed_df_1.ID.to_list() + decomposed_df_2.ID.to_list()) 396 | id_set_difference = filter_id_set.difference(singleton_id_set) 397 | if len(id_set_difference) > 0: 398 | step = "SINGLETON" 399 | _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir) 400 | written_count += 1 401 | 402 | # check that start and end record are in correct df 403 | decomposed_df_1, decomposed_df_2 = _check_df_order(decomposed_df_1, decomposed_df_2) 404 | 405 | # write removed ids to txt 406 | order_id_set = set(decomposed_df_1.ID.to_list() + decomposed_df_2.ID.to_list()) 407 | id_set_difference = singleton_id_set.difference(order_id_set) 408 | step = "END/START ORDER" 409 | if len(id_set_difference) > 0: 410 | step = "END/START ORDER" 411 | _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir) 412 | written_count += 1 413 | 414 | decomposed_df_1['Minda_ID'] = f'{caller_name}_' + (decomposed_df_1.index + 1).astype(str) 415 | decomposed_df_2['Minda_ID'] = f'{caller_name}_' + (decomposed_df_2.index + 1).astype(str) 416 | logger.info(f"Number of decomposed records after pairing: {decomposed_df_1.shape[0]} {decomposed_df_2.shape[0]}") 417 | 418 | # create SVLEN column determined on start & end df (not all vcfs have SVLEN in INFO) 419 | decomposed_df_1['SVLEN'] = decomposed_df_1.apply(lambda row: -1 if row['#CHROM'] != decomposed_df_2.loc[row.name, '#CHROM'] else int(abs(row['POS'] - decomposed_df_2.loc[row.name, 'POS'])), axis=1) 420 | max_svlen = decomposed_df_1['SVLEN'].max() 421 | 422 | if min_size != None: 423 | decomposed_df_1 = decomposed_df_1[(decomposed_df_1['SVLEN'] >= min_size) | (decomposed_df_1['SVLEN'] == -1)] 424 | decomposed_df_2 = decomposed_df_2[decomposed_df_2.index.isin(decomposed_df_1.index)] 425 | logger.info(f"Number of records after size filtering: {len(decomposed_df_1)} {len(decomposed_df_2)}") 426 | 427 | # write removed ids to txt 428 | svlen_id_set = set(decomposed_df_1.ID.to_list() + decomposed_df_2.ID.to_list()) 429 | id_set_difference = order_id_set.difference(svlen_id_set) 430 | if len(id_set_difference) > 0: 431 | step = "SVLEN" 432 | _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir) 433 | written_count += 1 434 | 435 | # filter low VAFs such that if either the 
start or end VAF is too low, records from both dfs are removed 436 | if vaf != None: 437 | decomposed_df_1 = decomposed_df_1[decomposed_df_1['VAF'] >= vaf] 438 | decomposed_df_2 = decomposed_df_2[decomposed_df_2['VAF'] >= vaf] 439 | minda_ids_list = pd.merge(decomposed_df_1, decomposed_df_2, on='Minda_ID')['Minda_ID'].to_list() 440 | decomposed_df_1 = decomposed_df_1[decomposed_df_1['Minda_ID'].isin(minda_ids_list)] 441 | decomposed_df_2 = decomposed_df_2[decomposed_df_2['Minda_ID'].isin(minda_ids_list)] 442 | logger.info(f"Number of records after VAF filtering: {len(decomposed_df_1)} {len(decomposed_df_2)}") 443 | 444 | # write removed ids to txt 445 | vaf_id_set = set(decomposed_df_1.ID.to_list() + decomposed_df_2.ID.to_list()) 446 | id_set_difference = svlen_id_set.difference(vaf_id_set) 447 | step = "VAF" 448 | if len(id_set_difference) > 0: 449 | step = "VAF" 450 | _write_removed_records(id_set_difference, caller_name, step, written_count, sample_name, out_dir) 451 | written_count += 1 452 | 453 | logger.info(f"Total number of decomposed records: {decomposed_df_1.shape[0]} {decomposed_df_2.shape[0]}") 454 | 455 | if prefixed == True: 456 | prefix = caller_name.split('_', 1)[0] 457 | decomposed_df_1.ID = prefix + "_" + decomposed_df_1['ID'].astype(str) 458 | decomposed_df_2.ID = prefix + "_" + decomposed_df_2['ID'].astype(str) 459 | 460 | return decomposed_df_1, decomposed_df_2, max_svlen --------------------------------------------------------------------------------
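Illustrative usage sketch (not part of the repository): the snippet below shows one way the decompose helpers above could be driven directly from Python. The input VCF path, output directory, FILTER whitelist, and minimum SV size are hypothetical placeholders chosen for the example; in normal use Minda's command-line interface supplies these values from its parsed arguments.

import os
from minda.decompose import get_caller_name, get_df, get_decomposed_dfs

# Hypothetical inputs and outputs, for illustration only.
vcf = "caller_output.vcf"
out_dir = "minda_example_out"
sample_name = "example_sample"
os.makedirs(out_dir, exist_ok=True)

caller_name = get_caller_name(vcf)   # caller name parsed from the ##source header line
df = get_df(vcf)                     # VCF records as a dataframe (#CHROM, POS, ID, ALT, FILTER, INFO)

# Split each record into start and end breakpoint dataframes.
df_start, df_end, max_svlen = get_decomposed_dfs(
    caller_name, df,
    filter=["PASS"],          # FILTER values to keep (assumed whitelist)
    min_size=50,              # minimum SVLEN; inter-chromosomal records (SVLEN == -1) are kept
    prefixed=False,           # if True, original IDs get the caller-name prefix
    vaf=None,                 # no VAF threshold
    sample_name=sample_name,
    out_dir=out_dir)

print(caller_name, df_start.shape[0], df_end.shape[0], max_svlen)

With vaf=None the VAF column is filled with "*" and no VAF filtering is applied; passing a float threshold instead requires VAF values to be present in the INFO field of the input VCF.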