├── tests ├── __init__.py ├── capice │ ├── __init__.py │ ├── cli │ │ ├── __init__.py │ │ ├── test_args_handler_explain.py │ │ ├── test_args_handler_train.py │ │ └── test_args_handler_parent.py │ ├── vep │ │ ├── __init__.py │ │ ├── test_length.py │ │ ├── test_type.py │ │ ├── test_poly_phen.py │ │ ├── test_amino_acids.py │ │ ├── test_sift.py │ │ ├── test_cdna_position.py │ │ └── test_consequence.py │ ├── core │ │ ├── __init__.py │ │ ├── test_capice_exporter.py │ │ └── test_logger.py │ ├── utilities │ │ ├── __init__.py │ │ ├── test_predictor.py │ │ ├── test_custom_logging_filter.py │ │ ├── test_input_parser.py │ │ ├── test_load_file_postprocessor.py │ │ ├── test_class_suggestor.py │ │ ├── test_predict.py │ │ ├── test_file_postprocessor.py │ │ ├── test_dynamic_loader.py │ │ ├── test_column_utils.py │ │ └── test_input_processor.py │ ├── validators │ │ ├── __init__.py │ │ ├── test_model_validator.py │ │ ├── test_post_vep_processing_validator.py │ │ ├── test_property_type_validator.py │ │ ├── test_input_validator.py │ │ └── test_post_file_parse_validator.py │ ├── test__init__.py │ ├── test_resources.py │ ├── test_main_predict.py │ ├── test_main_explain.py │ ├── test_templates.py │ └── test_edge_cases_predict.py └── resources │ ├── input_processor │ └── filename.txt │ ├── dynamic_loader_test_no_files │ └── __init__.py │ ├── dynamic_loader_test_files_present │ ├── __init__.py │ ├── correct_file.py │ ├── incorrect_file.py │ ├── correct_vep_grch_file.py │ ├── correct_vep_grch_file_dupe_test.py │ ├── correct_overwrite_file.py │ └── correct_overwrite_file_dupe_test.py │ ├── input_parser │ └── input_parser.txt │ ├── breakends.vcf.gz │ ├── edge_cases.vcf.gz │ ├── breakends_vep.tsv.gz │ ├── features_test.json │ ├── xgb_booster_poc.ubj │ ├── edge_cases_vep.tsv.gz │ ├── symbolic_alleles.vcf.gz │ ├── symbolic_alleles_vep.tsv.gz │ ├── train_dataset_missing_column_pos.tsv.gz │ ├── train_dataset_missing_column_ref.tsv.gz │ ├── train_dataset_missing_column_polyphen.tsv.gz │ └── VEP104.json ├── src └── molgenis │ └── capice │ ├── cli │ ├── __init__.py │ ├── args_handler_explain.py │ ├── args_handler_predict.py │ ├── args_handler_train.py │ └── args_handler_parent.py │ ├── core │ ├── __init__.py │ ├── capice_exporter.py │ ├── args_handler.py │ ├── capice_manager.py │ └── logger.py │ ├── vep │ ├── __init__.py │ ├── cds_position.py │ ├── cdna_position.py │ ├── protein_position.py │ ├── length.py │ ├── sift.py │ ├── template_sift_polyphen.py │ ├── amino_acids.py │ ├── poly_phen.py │ ├── template.py │ ├── template_position.py │ ├── type.py │ └── consequence.py │ ├── validators │ ├── __init__.py │ ├── model_validator.py │ ├── property_type_validator.py │ ├── predict_validator.py │ ├── post_vep_processing_validator.py │ ├── input_validator.py │ ├── post_file_parse_validator.py │ └── version_validator.py │ ├── __init__.py │ ├── capice.py │ ├── utilities │ ├── custom_logging_filter.py │ ├── class_suggestor.py │ ├── predictor.py │ ├── input_parser.py │ ├── load_file_postprocessor.py │ ├── column_utils.py │ ├── enums.py │ ├── __init__.py │ ├── manual_vep_processor.py │ ├── input_processor.py │ └── dynamic_loader.py │ ├── main_predict.py │ ├── main_explain.py │ └── main_capice.py ├── resources ├── train_test.tsv.gz ├── predict_input.tsv.gz ├── train_input_raw.vcf.gz ├── predict_input_raw.vcf.gz ├── test_input.vcf └── train_features.json ├── .flake8 ├── scripts ├── tests │ ├── capice_input.vcf.zip │ └── test_convert_vep_vcf_to_tsv_capice.sh └── convert_vep_vcf_to_tsv_capice.sh ├── sonar-project.properties ├── .github ├── 
pull_request_template.md └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── setup.py ├── .gitignore ├── .travis.yml └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/vep/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/molgenis/capice/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/molgenis/capice/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/validators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/resources/input_processor/filename.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/molgenis/capice/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '5.1.2' 2 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_no_files/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/resources/input_parser/input_parser.txt: -------------------------------------------------------------------------------- 1 | this,is,a,header 2 | this,is,a,line -------------------------------------------------------------------------------- /resources/train_test.tsv.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/resources/train_test.tsv.gz -------------------------------------------------------------------------------- /resources/predict_input.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/resources/predict_input.tsv.gz -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | import-order-style = pep8 3 | max_line_length = 100 4 | application-import-names = molgenis,tests -------------------------------------------------------------------------------- /resources/train_input_raw.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/resources/train_input_raw.vcf.gz -------------------------------------------------------------------------------- /tests/resources/breakends.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/breakends.vcf.gz -------------------------------------------------------------------------------- /tests/resources/edge_cases.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/edge_cases.vcf.gz -------------------------------------------------------------------------------- /resources/predict_input_raw.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/resources/predict_input_raw.vcf.gz -------------------------------------------------------------------------------- /scripts/tests/capice_input.vcf.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/scripts/tests/capice_input.vcf.zip -------------------------------------------------------------------------------- /tests/resources/breakends_vep.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/breakends_vep.tsv.gz -------------------------------------------------------------------------------- /tests/resources/features_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "feature_1": null, 3 | "feature_foobarbaz": null, 4 | "feature_3": null 5 | } -------------------------------------------------------------------------------- /tests/resources/xgb_booster_poc.ubj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/xgb_booster_poc.ubj -------------------------------------------------------------------------------- /tests/resources/edge_cases_vep.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/edge_cases_vep.tsv.gz -------------------------------------------------------------------------------- /tests/resources/symbolic_alleles.vcf.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/symbolic_alleles.vcf.gz -------------------------------------------------------------------------------- /tests/resources/symbolic_alleles_vep.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/symbolic_alleles_vep.tsv.gz -------------------------------------------------------------------------------- /tests/resources/train_dataset_missing_column_pos.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/train_dataset_missing_column_pos.tsv.gz -------------------------------------------------------------------------------- /tests/resources/train_dataset_missing_column_ref.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/train_dataset_missing_column_ref.tsv.gz -------------------------------------------------------------------------------- /tests/resources/train_dataset_missing_column_polyphen.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/train_dataset_missing_column_polyphen.tsv.gz -------------------------------------------------------------------------------- /sonar-project.properties: -------------------------------------------------------------------------------- 1 | sonar.projectKey=molgenis_capice 2 | sonar.organization=molgenis 3 | 4 | sonar.sources = src/ 5 | sonar.tests = tests/ 6 | 7 | sonar.python.xunit.reportPath=results.xml 8 | sonar.python.coverage.reportPaths=coverage.xml 9 | 10 | sonar.python.version=3.10 -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/correct_file.py: -------------------------------------------------------------------------------- 1 | class CorrectFile: 2 | @property 3 | def name(self): 4 | return 'Correct' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @staticmethod 11 | def some_function(): 12 | return 'foo' 13 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/incorrect_file.py: -------------------------------------------------------------------------------- 1 | class InCorrectFile: 2 | @property 3 | def name(self): 4 | return 'Incorrect' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @staticmethod 11 | def other_function(): 12 | return 'foo' 13 | -------------------------------------------------------------------------------- /resources/test_input.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.0 2 | ##reference=GRCh38 3 | #CHROM POS ID REF ALT 4 | chr12 69747417 . C A 5 | chr17 41231346 . G T 6 | chr2 122288533 . C A 7 | chr11 118382645 . G T 8 | chr5 235382 . G A 9 | chr2 48026421 . T C 10 | chr5 90073785 . C T 11 | chr1 63114155 . T C 12 | chr2 179431764 . G A 13 | chr9 131250286 . 
G A 14 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## SOP 2 | 3 | ### Changed 4 | 5 | - 6 | 7 | ## Important notes 8 | 9 | - 10 | 11 | ### Before merge: 12 | - [ ] Functionality works & meets specs 13 | - [ ] No Travis issues 14 | - [ ] Code reviewed 15 | - [ ] Documentation was updated 16 | 17 | ### After merge: 18 | - [ ] Added feature/fix to draft release notes 19 | - [ ] Removed merged branches 20 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/cds_position.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.vep.template_position import TemplatePosition 2 | 3 | 4 | class CDSPosition(TemplatePosition): 5 | def __init__(self): 6 | super(CDSPosition, self).__init__( 7 | name='CDS_position', 8 | usable=True 9 | ) 10 | 11 | @property 12 | def columns(self): 13 | return ['CDSpos', 'relCDSpos'] 14 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/cdna_position.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.vep.template_position import TemplatePosition 2 | 3 | 4 | class CDNAPosition(TemplatePosition): 5 | def __init__(self): 6 | super(CDNAPosition, self).__init__( 7 | name='cDNA_position', 8 | usable=True 9 | ) 10 | 11 | @property 12 | def columns(self): 13 | return ['cDNApos', 'relcDNApos'] 14 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/protein_position.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.vep.template_position import TemplatePosition 2 | 3 | 4 | class ProteinPosition(TemplatePosition): 5 | def __init__(self): 6 | super(ProteinPosition, self).__init__( 7 | name='Protein_position', 8 | usable=True 9 | ) 10 | 11 | @property 12 | def columns(self): 13 | return ['protPos', 'relProtPos'] 14 | -------------------------------------------------------------------------------- /src/molgenis/capice/capice.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.core.args_handler import ArgsHandler 2 | 3 | 4 | def main(): 5 | """ 6 | CAPICE main. Runs the ArgsHandler, which in turn runs the args handler 7 | of each available module. For usage, print the help on 8 | the command line by using (python3) capice(.py) --help.
9 | """ 10 | argument_handler = ArgsHandler() 11 | argument_handler.create() 12 | argument_handler.handle() 13 | 14 | 15 | if __name__ == '__main__': 16 | main() 17 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/correct_vep_grch_file.py: -------------------------------------------------------------------------------- 1 | class CorrectVEPFile: 2 | @property 3 | def name(self): 4 | return 'Correct_VEP_GRCh' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @property 11 | def supported_vep_version(self): 12 | return 104.0 13 | 14 | @property 15 | def supported_grch_build(self): 16 | return 37 17 | 18 | @staticmethod 19 | def some_function(): 20 | return 'SomeVeryUniqueString' 21 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/correct_vep_grch_file_dupe_test.py: -------------------------------------------------------------------------------- 1 | class CorrectVEPFile: 2 | @property 3 | def name(self): 4 | return 'Correct_VEP_GRCh' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @property 11 | def supported_vep_version(self): 12 | return 104.0 13 | 14 | @property 15 | def supported_grch_build(self): 16 | return 37 17 | 18 | @staticmethod 19 | def some_function(): 20 | return 'SomeVeryUniqueString' 21 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/correct_overwrite_file.py: -------------------------------------------------------------------------------- 1 | class CorrectOverwriteFile: 2 | @property 3 | def name(self): 4 | return 'Correct_Overwrite' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @property 11 | def supported_vep_version(self): 12 | return None 13 | 14 | @property 15 | def supported_grch_build(self): 16 | return None 17 | 18 | @staticmethod 19 | def some_function(): 20 | return 'SomeVeryUniqueString_butdifferent' 21 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/correct_overwrite_file_dupe_test.py: -------------------------------------------------------------------------------- 1 | class CorrectOverwriteFile: 2 | @property 3 | def name(self): 4 | return 'Correct_Overwrite' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @property 11 | def supported_vep_version(self): 12 | return None 13 | 14 | @property 15 | def supported_grch_build(self): 16 | return None 17 | 18 | @staticmethod 19 | def some_function(): 20 | return 'SomeVeryUniqueString_butdifferent' 21 | -------------------------------------------------------------------------------- /tests/capice/cli/test_args_handler_explain.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from argparse import ArgumentParser 3 | 4 | from molgenis.capice.cli.args_handler_explain import ArgsHandlerExplain 5 | 6 | 7 | class TestArgsHandlerExplain(unittest.TestCase): 8 | def test_property_str_versions(self): 9 | args_handler = ArgsHandlerExplain(ArgumentParser()) 10 | self.assertEqual('.json, .ubj', args_handler._extension_str()) 11 | self.assertEqual('.tsv, .tsv.gz', args_handler._required_output_extensions_str()) 12 | 13 | 14 | if __name__ == '__main__': 15 | unittest.main() 16 | -------------------------------------------------------------------------------- 
/tests/capice/test__init__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from re import match 3 | 4 | from molgenis.capice.utilities.enums import Versioning 5 | from molgenis.capice import __version__ 6 | 7 | 8 | class TestVersion(unittest.TestCase): 9 | def test_version_formatting(self): 10 | """ 11 | Test that assures CAPICE is not given an invalid version number. 12 | """ 13 | if match(Versioning.VALIDATION_REGEX.value, __version__) is None: 14 | raise ValueError('CAPICE has invalid version format') 15 | 16 | 17 | if __name__ == '__main__': 18 | unittest.main() 19 | -------------------------------------------------------------------------------- /resources/train_features.json: -------------------------------------------------------------------------------- 1 | { 2 | "PolyPhen": null, 3 | "SIFT": null, 4 | "cDNA_position": null, 5 | "CDS_position": null, 6 | "Protein_position": null, 7 | "Amino_acids": null, 8 | "REF": null, 9 | "ALT": null, 10 | "Consequence": null, 11 | "SpliceAI_pred_DP_AG": null, 12 | "SpliceAI_pred_DP_AL": null, 13 | "SpliceAI_pred_DP_DG": null, 14 | "SpliceAI_pred_DP_DL": null, 15 | "SpliceAI_pred_DS_AG": null, 16 | "SpliceAI_pred_DS_AL": null, 17 | "SpliceAI_pred_DS_DG": null, 18 | "SpliceAI_pred_DS_DL": null, 19 | "Grantham": null, 20 | "phyloP": null 21 | } -------------------------------------------------------------------------------- /src/molgenis/capice/validators/model_validator.py: -------------------------------------------------------------------------------- 1 | class ModelValidator: 2 | @staticmethod 3 | def validate_has_required_attributes(model): 4 | """ 5 | Function to validate that the required attributes CAPICE_version, 6 | vep_features, processable_features and predict_proba are present. 7 | """ 8 | required_attributes = ['CAPICE_version', 'vep_features', 9 | 'processable_features', 'predict_proba'] 10 | for attribute in required_attributes: 11 | if attribute not in dir(model): 12 | raise AttributeError(f'Unable to locate attribute {attribute} in model file!') 13 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/custom_logging_filter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class CustomLoggingFilter(logging.Filter): 5 | """ 6 | Custom logging filter class to make sure that stdout only contains 7 | INFO or DEBUG calls. 8 | """ 9 | def __init__(self, custom_loglevels): 10 | """ 11 | :param custom_loglevels: str or iterable: the loglevels that should pass 12 | this logging filter. 13 | """ 14 | super(CustomLoggingFilter, self).__init__() 15 | self.custom_loglevels = custom_loglevels 16 | 17 | def filter(self, record) -> bool: 18 | return record.levelno in self.custom_loglevels 19 | -------------------------------------------------------------------------------- 
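CustomLoggingFilter compares record.levelno against the configured levels, so attaching it to a stdout StreamHandler is what keeps WARNING and above off stdout. A minimal wiring sketch, assuming only that the package is importable; this is not CAPICE's actual Logger setup, which lives in core/logger.py.

import logging
import sys

from molgenis.capice.utilities.custom_logging_filter import CustomLoggingFilter

stdout_handler = logging.StreamHandler(sys.stdout)
# Pass numeric levels: filter() checks 'record.levelno in custom_loglevels'.
stdout_handler.addFilter(CustomLoggingFilter([logging.DEBUG, logging.INFO]))

demo_log = logging.getLogger('capice_filter_demo')
demo_log.setLevel(logging.DEBUG)
demo_log.addHandler(stdout_handler)

demo_log.info('reaches stdout')           # levelno 20 passes the filter
demo_log.warning('filtered from stdout')  # levelno 30 does not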
/.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. -------------------------------------------------------------------------------- /tests/capice/vep/test_length.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.vep import length 6 | 7 | 8 | class TestLength(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | print('Setting up.') 12 | cls.length = length.Length() 13 | 14 | def test_process(self): 15 | dataframe = pd.DataFrame({ 16 | 'REF': ['ATAG', 'A', 'C', 'AC'], 17 | 'ALT': ['A', 'ATG', 'A', 'GT']}) 18 | observed = self.length.process(dataframe) 19 | expected = pd.DataFrame({ 20 | 'REF': ['ATAG', 'A', 'C', 'AC'], 21 | 'ALT': ['A', 'ATG', 'A', 'GT'], 22 | 'Length': [3, 2, 0, 0]}) 23 | pd.testing.assert_frame_equal(expected, observed) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/length.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.vep.template import Template 4 | from molgenis.capice.utilities.enums import Column 5 | 6 | 7 | class Length(Template): 8 | def __init__(self): 9 | super(Length, self).__init__( 10 | name=Column.ref.value, 11 | usable=True 12 | ) 13 | 14 | @property 15 | def columns(self): 16 | return ['Length'] 17 | 18 | def _process(self, dataframe: pd.DataFrame): 19 | dataframe = dataframe.join( 20 | pd.DataFrame( 21 | abs(dataframe[Column.ref.value].str.len() - dataframe[Column.alt.value].str.len()), 22 | columns=self.columns 23 | ) 24 | ) 25 | return dataframe 26 | 27 | @property 28 | def drop(self): 29 | return False 30 | -------------------------------------------------------------------------------- 
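Length derives its single output column from the absolute difference between REF and ALT allele lengths, and the Template base class (shown further below) falls back to NaN when the source column is entirely null. A short usage sketch, assuming Column.ref.value and Column.alt.value resolve to 'REF' and 'ALT' as the tests imply:

import pandas as pd

from molgenis.capice.vep.length import Length

# |len(REF) - len(ALT)|: the 3bp deletion scores 3, the SNV scores 0.
frame = pd.DataFrame({'REF': ['ATAG', 'C'], 'ALT': ['A', 'G']})
print(Length().process(frame)['Length'].tolist())  # [3, 0]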
16 | """ 17 | model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 18 | self.validator.validate_model_version(model.CAPICE_version) 19 | self.validator.validate_versions_compatible(__version__, model.CAPICE_version) 20 | 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/sift.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.vep.template_sift_polyphen import TemplateSiftPolyPhen 4 | 5 | 6 | class SIFT(TemplateSiftPolyPhen): 7 | def __init__(self): 8 | super(SIFT, self).__init__( 9 | name='SIFT', 10 | usable=True 11 | ) 12 | 13 | @property 14 | def columns(self): 15 | return ['SIFTcat', 'SIFTval'] 16 | 17 | def apply_label(self, dataframe: pd.DataFrame): 18 | """ 19 | Under the 0.05 should be deleterious, everything else should be tolerated (if not nan) 20 | """ 21 | dataframe.loc[ 22 | dataframe[dataframe[self.name].notnull()].index, self.columns[0]] = 'tolerated' 23 | dataframe.loc[ 24 | dataframe[dataframe[self.name] <= 0.05].index, self.columns[0]] = 'deleterious' 25 | return dataframe 26 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/template_sift_polyphen.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep.template import Template 7 | 8 | 9 | class TemplateSiftPolyPhen(Template): 10 | def __init__(self, name='Template', usable=False): 11 | super(TemplateSiftPolyPhen, self).__init__( 12 | name=name, 13 | usable=usable 14 | ) 15 | 16 | @property 17 | @abstractmethod 18 | def columns(self): 19 | return [None, None] 20 | 21 | @abstractmethod 22 | def apply_label(self, dataframe: pd.DataFrame): 23 | return dataframe 24 | 25 | def _process(self, dataframe: pd.DataFrame): 26 | dataframe[self.columns[1]] = dataframe[self.name] 27 | dataframe[self.columns[0]] = np.nan 28 | dataframe = self.apply_label(dataframe) 29 | return dataframe 30 | -------------------------------------------------------------------------------- /tests/capice/validators/test_model_validator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import xgboost as xgb 4 | 5 | from tests.capice.test_templates import ResourceFile, load_model 6 | from molgenis.capice.validators.model_validator import ModelValidator 7 | 8 | 9 | class TestModelValidator(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | cls.validator = ModelValidator() 13 | cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 14 | 15 | def test_model_required_attributes_correct(self): 16 | self.validator.validate_has_required_attributes(self.model) 17 | 18 | def test_model_missing_attribute(self): 19 | model = xgb.XGBClassifier() 20 | self.assertRaises( 21 | AttributeError, 22 | self.validator.validate_has_required_attributes, 23 | model 24 | ) 25 | 26 | 27 | if __name__ == '__main__': 28 | unittest.main() 29 | -------------------------------------------------------------------------------- /tests/capice/vep/test_type.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.vep import type 6 | 7 | 8 | class TestType(unittest.TestCase): 9 | 
@classmethod 10 | def setUpClass(cls): 11 | print('Setting up.') 12 | cls.type = type.Type() 13 | 14 | def test_process(self): 15 | input_data_frame = pd.DataFrame({'REF': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], 16 | 'ALT': ['G', 'GCC', 'GG', 'CG', 'G', 'C']}) 17 | actual_output = self.type.process(input_data_frame) 18 | expected_output = pd.DataFrame({ 19 | 'REF': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], 20 | 'ALT': ['G', 'GCC', 'GG', 'CG', 'G', 'C'], 21 | 'Type': ['SNV', 'DELINS', 'DELINS', 'INS', 'DELINS', 'DEL']}) 22 | pd.testing.assert_frame_equal(actual_output, expected_output) 23 | 24 | 25 | if __name__ == '__main__': 26 | unittest.main() 27 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_predictor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from molgenis.capice.utilities.predictor import Predictor 4 | from tests.capice.test_templates import set_up_impute_preprocess 5 | 6 | 7 | class TestPredictor(unittest.TestCase): 8 | @classmethod 9 | def setUpClass(cls): 10 | print('Setting up.') 11 | main, model = set_up_impute_preprocess() 12 | cls.predictor = Predictor(model) 13 | cls.dataset = main.categorical_process( 14 | main.process( 15 | main._load_file(), process_features=model.vep_features.keys() 16 | )[0], processing_features=model.processable_features 17 | )[0] 18 | 19 | def test_predict(self): 20 | observed = self.predictor.predict(self.dataset) 21 | self.assertGreater(observed['score'].sum(), 0) 22 | self.assertFalse(observed['score'].hasnans) 23 | 24 | 25 | if __name__ == '__main__': 26 | unittest.main() 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | ### System information 14 | - OS: [e.g. iOS] 15 | - Version: [e.g. 3.0.0] 16 | - Python version: [e.g. Python3.9.1] 17 | - Shell: [e.g. ZSH] 18 | 19 | ### How to Reproduce 20 | Steps to reproduce the behavior: 21 | 1. `cd` to dir [...] 22 | 2. Run the command `[...]` 23 | 3. See error. 24 | 25 | ### Expected behavior 26 | A clear and concise description of what you expected to happen. 27 | 28 | ### Logs 29 | If available, the generated logging information and/or error message (can also be attached as a file if very large). 30 | 31 | ### Screenshots 32 | If applicable, add screenshots to help explain your problem. 33 | 34 | ### Additional context 35 | Add any other context about the problem here. 36 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/class_suggestor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.utilities.enums import Column, OutputClasses 4 | 5 | 6 | class ClassSuggestor: 7 | def __init__(self): 8 | # Implement way to make thresholds datafile user definable 9 | pass 10 | 11 | @staticmethod 12 | def apply_suggestion(capice_predicted_dataset: pd.DataFrame): 13 | """ 14 | Method to apply the suggested output class to a capice prediction score 15 | keeping in mind the per gene thresholds. 16 | 17 | :param capice_predicted_dataset: pandas.DataFrame. 
18 | The input dataset that contains the "score" column and a gene name column. 19 | :return: pandas.DataFrame. 20 | Original input but with the column suggested_class, depending on the user provided 21 | thresholds. 22 | """ 23 | capice_predicted_dataset[Column.suggested_class.value] = OutputClasses.unknown.value 24 | return capice_predicted_dataset 25 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/amino_acids.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.vep.template import Template 4 | 5 | 6 | class AminoAcids(Template): 7 | def __init__(self): 8 | super(AminoAcids, self).__init__( 9 | name='Amino_acids', 10 | usable=True 11 | ) 12 | 13 | @property 14 | def columns(self): 15 | return ['oAA', 'nAA'] 16 | 17 | @property 18 | def oaa(self): 19 | return self.columns[0] 20 | 21 | @property 22 | def naa(self): 23 | return self.columns[1] 24 | 25 | def _process(self, dataframe: pd.DataFrame): 26 | if dataframe[self.name].str.contains('/', regex=False).any(): 27 | dataframe[self.columns] = dataframe[self.name].str.split('/', expand=True) 28 | dataframe[self.naa].fillna(dataframe[self.oaa], inplace=True) 29 | else: 30 | dataframe[self.oaa] = dataframe[self.name] 31 | dataframe[self.naa] = dataframe[self.oaa] 32 | return dataframe 33 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/poly_phen.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.vep.template_sift_polyphen import TemplateSiftPolyPhen 4 | 5 | 6 | class PolyPhen(TemplateSiftPolyPhen): 7 | def __init__(self): 8 | super(PolyPhen, self).__init__( 9 | name='PolyPhen', 10 | usable=True 11 | ) 12 | 13 | @property 14 | def columns(self): 15 | return ['PolyPhenCat', 'PolyPhenVal'] 16 | 17 | def apply_label(self, dataframe: pd.DataFrame): 18 | """ 19 | Scores at or below 0.445 are labelled benign, scores above 0.445 up to and including 20 | 0.908 possibly damaging, and scores above 0.908 probably damaging (if not NaN).
21 | """ 22 | dataframe.loc[dataframe[dataframe[self.name].notnull()].index, self.columns[0]] = 'benign' 23 | dataframe.loc[ 24 | dataframe[dataframe[self.name] > 0.445].index, self.columns[0]] = 'possibly_damaging' 25 | dataframe.loc[ 26 | dataframe[dataframe[self.name] > 0.908].index, self.columns[0]] = 'probably_damaging' 27 | return dataframe 28 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_custom_logging_filter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import unittest 3 | 4 | from molgenis.capice.utilities.custom_logging_filter import CustomLoggingFilter 5 | 6 | 7 | class TestCustomLoggingFilter(unittest.TestCase): 8 | 9 | @classmethod 10 | def setUp(cls): 11 | print('Setting up.') 12 | cls.custom_filter = CustomLoggingFilter({}) 13 | 14 | def test_filter_true(self): 15 | logger = logging.getLogger('simple_example') 16 | record = logger.makeRecord('simple_example', 3, '', 5, 'message', ('arg',), None) 17 | self.custom_filter.custom_loglevels = [3, 4] 18 | actual = self.custom_filter.filter(record) 19 | self.assertEqual(True, actual) 20 | 21 | def test_filter_false(self): 22 | logger = logging.getLogger('simple_example') 23 | record = logger.makeRecord('simple_example', 2, '', 5, 'message', ('arg',), None) 24 | self.custom_filter.custom_loglevels = [3, 4] 25 | actual = self.custom_filter.filter(record) 26 | self.assertEqual(False, actual) 27 | 28 | 29 | if __name__ == '__main__': 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/capice/vep/test_poly_phen.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep import poly_phen 7 | 8 | 9 | class TestType(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | print('Setting up.') 13 | cls.poly_phen = poly_phen.PolyPhen() 14 | 15 | def test_process(self): 16 | dataframe = pd.DataFrame( 17 | { 18 | 'PolyPhen': [0.445, 0.908, 0.999, np.nan] 19 | } 20 | ) 21 | expected = pd.concat( 22 | [ 23 | dataframe, 24 | pd.DataFrame( 25 | { 26 | 'PolyPhenCat': ['benign', 'possibly_damaging', 'probably_damaging', np.nan], 27 | 'PolyPhenVal': [0.445, 0.908, 0.999, np.nan] 28 | } 29 | ) 30 | ], axis=1 31 | ) 32 | observed = self.poly_phen.process(dataframe) 33 | pd.testing.assert_frame_equal(expected.sort_index(axis=1), observed.sort_index(axis=1)) 34 | 35 | 36 | if __name__ == '__main__': 37 | unittest.main() 38 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_input_parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import pandas as pd 5 | 6 | from molgenis.capice.utilities.input_parser import InputParser 7 | from tests.capice.test_templates import _project_root_directory 8 | 9 | 10 | class TestInputParser(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | print('Setting up.') 14 | cls.parser = InputParser() 15 | 16 | def test_parse(self): 17 | self.parser.set_separator(',') 18 | input_file = self.parser.parse( 19 | os.path.join( 20 | _project_root_directory, 21 | 'tests', 22 | 'resources', 23 | 'input_parser', 24 | 'input_parser.txt' 25 | ) 26 | ) 27 | expected_df = pd.DataFrame( 28 | { 29 | 'this': ['this'], 30 | 'is': ['is'], 31 | 'a': ['a'], 32 | 
'header': ['line'] 33 | } 34 | ) 35 | pd.testing.assert_frame_equal(input_file, expected_df) 36 | 37 | if __name__ == '__main__': 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /tests/capice/vep/test_amino_acids.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.vep import amino_acids 6 | 7 | 8 | class TestAminoAcids(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | print('Setting up.') 12 | cls.aa = amino_acids.AminoAcids() 13 | 14 | def test_process(self): 15 | dataframe = pd.DataFrame({'Amino_acids': ['A/G', 'R/C', 'G/C']}) 16 | observed = self.aa.process(dataframe) 17 | expected = pd.DataFrame({'Amino_acids': ['A/G', 'R/C', 'G/C'], 18 | 'oAA': ['A', 'R', 'G'], 19 | 'nAA': ['G', 'C', 'C']}) 20 | pd.testing.assert_frame_equal(expected, observed) 21 | 22 | def test_process_no_alt(self): 23 | dataframe = pd.DataFrame({'Amino_acids': ['A', 'R', 'G']}) 24 | observed = self.aa.process(dataframe) 25 | expected = pd.DataFrame({'Amino_acids': ['A', 'R', 'G'], 26 | 'oAA': ['A', 'R', 'G'], 27 | 'nAA': ['A', 'R', 'G']}) 28 | pd.testing.assert_frame_equal(expected, observed) 29 | 30 | 31 | if __name__ == '__main__': 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/predictor.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.core.logger import Logger 2 | from molgenis.capice.utilities.enums import Column 3 | 4 | 5 | class Predictor: 6 | """ 7 | Predictor class for CAPICE. Produces the final CAPICE score. 8 | """ 9 | 10 | def __init__(self, model): 11 | """ 12 | :param model: XGBClassifier, the custom model instance provided by 13 | the user. 14 | """ 15 | self.log = Logger().logger 16 | self.model = model 17 | self.log.info('Starting prediction.') 18 | 19 | def predict(self, dataset): 20 | """ 21 | Predicts the probability score of CAPICE over dataset. 22 | :param dataset: pandas.DataFrame, the fully imputed and processed 23 | user input dataset of VEP-like origin. 24 | :return: pandas.DataFrame, the input dataset with an added column 25 | containing the CAPICE score per variant. 26 | """ 27 | self.log.info('Predicting for %d samples.', dataset.shape[0]) 28 | dataset[Column.score.value] = self.model.predict_proba( 29 | dataset[self.model.get_booster().feature_names])[:, 1] 30 | self.log.info('Prediction successful.') 31 | return dataset 32 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/property_type_validator.py: -------------------------------------------------------------------------------- 1 | class PropertyTypeValidator: 2 | def validate_property(self, value: object, expected_type: type, include_none: bool = False): 3 | """ 4 | Validator method to raise a TypeError when a property is not set correctly. 5 | 6 | :param value: value to be checked 7 | :param expected_type: type the value should match 8 | :param include_none: whether None should be allowed 9 | """ 10 | if isinstance(value, bool): 11 | if type(value) != expected_type: 12 | self._check_none(expected_type, value, include_none) 13 | 14 | elif not isinstance(value, expected_type): 15 | self._check_none(expected_type, value, include_none) 16 | 17 | def _check_none(self, expected_type, value, include_none): 18 | if include_none: 19 | if value is not None: 20 | self._raise_type_error(expected_type, value) 21 | else: 22 | self._raise_type_error(expected_type, value) 23 | 24 | @staticmethod 25 | def _raise_type_error(expected_type, value): 26 | error_message = "Expected variable type %s but got %s" 27 | raise TypeError(error_message % (expected_type, type(value))) 28 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/predict_validator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import xgboost as xgb 3 | 4 | from molgenis.capice.core.logger import Logger 5 | 6 | 7 | class PredictValidator: 8 | def __init__(self): 9 | self.log = Logger().logger 10 | 11 | def validate_data_predict_ready(self, dataset: pd.DataFrame, model: xgb.XGBClassifier) ->\ 12 | None: 13 | """ 14 | Validates if dataset is predict ready according to the feature names in model 15 | 16 | Args: 17 | dataset: 18 | The dataset that is supposed to be predict ready. 19 | model: 20 | The custom CAPICE xgboost.XGBClassifier. 21 | Raises: 22 | KeyError: 23 | Raised when a required predict feature is missing from dataset. 24 | """ 25 | missing = [] 26 | for feature in model.get_booster().feature_names: # type: ignore 27 | if feature not in dataset.columns: 28 | missing.append(feature) 29 | if len(missing) > 0: 30 | error_message = 'Missing required predict column(s): %s' 31 | self.log.critical(error_message, ', '.join(missing)) 32 | raise KeyError(error_message % ', '.join(missing)) 33 | -------------------------------------------------------------------------------- /tests/capice/validators/test_post_vep_processing_validator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from tests.capice.test_templates import teardown, ResourceFile, load_model 6 | from molgenis.capice.validators.post_vep_processing_validator import PostVEPProcessingValidator 7 | 8 | 9 | class TestPostVEPProcessingValidator(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | print('Setting up.') 13 | cls.dataset = pd.DataFrame( 14 | { 15 | 'chr': [1, 2], 16 | 'pos': [100, 200], 17 | 'REF': ['A', 'A'], 18 | 'ALT': ['T', 'T'], 19 | 'feat1': ['foo', 'bar'] 20 | } 21 | ) 22 | cls.validator = PostVEPProcessingValidator() 23 | cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 24 | 25 | @classmethod 26 | def tearDownClass(cls) -> None: 27 | print('Tearing down.') 28 | teardown() 29 | 30 | def test_validate_features_present_incorrect(self): 31 | print('KeyError raised due to missing VEP processed feature') 32 | self.assertRaises( 33 | KeyError, 34 | self.validator.validate_features_present, 35 | self.dataset, 36 | self.model.vep_features.values() 37 | ) 38 | 39 | 40 | if __name__ == '__main__': 41 | unittest.main() 42 | -------------------------------------------------------------------------------- 
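PropertyTypeValidator deliberately branches on bool first: since bool subclasses int, isinstance(True, int) would otherwise let booleans slip through integer checks, so booleans require an exact type match. None only passes when include_none=True. A small demonstration of those branches, assuming the package is importable:

from molgenis.capice.validators.property_type_validator import PropertyTypeValidator

validator = PropertyTypeValidator()
validator.validate_property('CDS_position', str)           # passes silently
validator.validate_property(None, str, include_none=True)  # None explicitly allowed
try:
    validator.validate_property(True, int)  # exact-match rule rejects bool-as-int
except TypeError as error:
    print(error)  # Expected variable type <class 'int'> but got <class 'bool'>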
/src/molgenis/capice/utilities/input_parser.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.core.logger import Logger 4 | 5 | 6 | class InputParser: 7 | def __init__(self): 8 | self.log = Logger().logger 9 | self.sep = '\t' 10 | 11 | def set_separator(self, sep: str): 12 | """ 13 | Function to overwrite the default separator 'tab'. 14 | Currently has no real use, but might be needed 15 | in the future if the default separator in the VEP output changes and 16 | the separator has to be dynamically changed. 17 | 18 | :param sep: str, separator to be used in the pandas.read_csv call 19 | """ 20 | self.sep = sep 21 | 22 | def parse(self, input_file_path: str): 23 | """ 24 | Method to start the parsing of additional information from the input 25 | file. 26 | 27 | :param input_file_path: str, path to the input file 28 | """ 29 | if self.sep == '\t': 30 | used_sep = 'Tab' 31 | else: 32 | used_sep = self.sep 33 | self.log.info('Reading VEP file from: %s using separator: %s', input_file_path, used_sep) 34 | input_file = pd.read_csv(input_file_path, sep=self.sep, na_values='.', low_memory=False) 35 | message = 'Input file at %s loaded with %s samples.' 36 | self.log.info(message, input_file_path, input_file.shape[0]) 37 | return input_file 38 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/post_vep_processing_validator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.core.logger import Logger 4 | from molgenis.capice.utilities import check_if_in_list 5 | 6 | 7 | class PostVEPProcessingValidator: 8 | def __init__(self): 9 | self.log = Logger().logger 10 | 11 | def validate_features_present(self, datafile: pd.DataFrame, vep_features: list[list[str]]) -> \ 12 | None: 13 | """ 14 | Validator to check that all features that should be present after the 15 | ManualVEPProcessor are indeed present. 16 | Args: 17 | datafile: 18 | Pandas Dataframe over which the feature presence validation should happen. 19 | vep_features: 20 | List of lists of expected output ManualVEPProcessing features as saved in the 21 | model.vep_features.values() 22 | Raises: 23 | KeyError: 24 | Raises KeyError when output VEP feature is not present within datafile. 25 | """ 26 | features_not_present = check_if_in_list(vep_features, datafile.columns) 27 | if len(features_not_present) > 0: 28 | error_message = 'Detected required feature(s) %s not ' \ 29 | 'present within VEP processed input file!'
30 | self.log.critical(error_message, ', '.join(features_not_present)) 31 | raise KeyError(error_message % ', '.join(features_not_present)) 32 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_load_file_postprocessor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor 6 | 7 | 8 | class TestLoadFilePostProcessor(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | print('Setting up.') 12 | df = pd.DataFrame( 13 | { 14 | 'CHROM': [1], 15 | 'POS': [123], 16 | 'REF': ['A'], 17 | 'ALT': ['G'], 18 | 'Gene': [123], 19 | 'SYMBOL_SOURCE': ['hgnc'], 20 | 'Feature': ['NM1.123'], 21 | 'SYMBOL': ['ACDC'], 22 | 'INTRON': [5], 23 | 'EXON': [11], 24 | } 25 | ) 26 | cls.processor = LoadFilePostProcessor(df) 27 | 28 | def test_process(self): 29 | observed = self.processor.process() 30 | expected = pd.DataFrame( 31 | { 32 | 'chr': [1], 33 | 'pos': [123], 34 | 'REF': ['A'], 35 | 'ALT': ['G'], 36 | 'gene_id': [123], 37 | 'id_source': ['hgnc'], 38 | 'feature': ['NM1.123'], 39 | 'gene_name': ['ACDC'], 40 | 'Intron': [5], 41 | 'Exon': [11] 42 | } 43 | ) 44 | pd.testing.assert_frame_equal(expected, observed) 45 | 46 | 47 | if __name__ == '__main__': 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/load_file_postprocessor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.core.logger import Logger 4 | from molgenis.capice.utilities.enums import InputColumn 5 | 6 | 7 | class LoadFilePostProcessor: 8 | def __init__(self, dataset: pd.DataFrame): 9 | self.log = Logger().logger 10 | self.dataset = dataset 11 | 12 | def process(self): 13 | """ 14 | Function to start the LoadFilePostProcessor, which renames certain 15 | columns of the input file to their internal names, 16 | like CHROM to chr. 17 | 18 | Returns 19 | ------- 20 | dataset : pandas.DataFrame 21 | Processed dataset with renamed columns. 22 | """ 23 | self.log.info('LoadFilePostProcessor starting.') 24 | self._col_renamer() 25 | self.log.info('LoadFilePostProcessor successful.') 26 | return self.dataset 27 | 28 | def _col_renamer(self): 29 | """ 30 | Function to rename input columns (e.g. Gene, Feature, SYMBOL, INTRON and EXON) to 31 | their internal names (gene_id, feature, gene_name, Intron and Exon).
32 | """ 33 | to_rename = {} 34 | for column in InputColumn: 35 | if column.col_input_name in self.dataset.columns: 36 | to_rename[column.col_input_name] = column.col_name 37 | self.log.debug(f'Converting the following column names: {to_rename}') 38 | self.dataset.rename(columns=to_rename, inplace=True) 39 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_class_suggestor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.utilities.enums import Column, OutputClasses 6 | from molgenis.capice.utilities.class_suggestor import ClassSuggestor 7 | 8 | 9 | class TestClassSuggestor(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | cls.dataset = pd.DataFrame( 13 | { 14 | Column.gene_name.value: ['foo', 'TTN', 'COL7A1', 'MEFV', 'bar'], 15 | Column.score.value: [0.9234, 0.2134, 0.0012, 0.4563, 0.7854] 16 | } 17 | ) 18 | 19 | def test_apply_suggestion(self): 20 | suggestor = ClassSuggestor() 21 | copy_dataset = self.dataset.copy(deep=True) 22 | observed = suggestor.apply_suggestion(copy_dataset) 23 | expected = pd.concat( 24 | [ 25 | self.dataset, 26 | pd.DataFrame( 27 | { 28 | Column.suggested_class.value: [ 29 | OutputClasses.unknown.value, 30 | OutputClasses.unknown.value, 31 | OutputClasses.unknown.value, 32 | OutputClasses.unknown.value, 33 | OutputClasses.unknown.value 34 | ] 35 | } 36 | ) 37 | ], axis=1 38 | ) 39 | pd.testing.assert_frame_equal(observed, expected) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/template.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.validators.property_type_validator import PropertyTypeValidator 7 | 8 | 9 | class Template(metaclass=ABCMeta): 10 | def __init__(self, name, usable): 11 | self.property_checker = PropertyTypeValidator() 12 | self.name = name 13 | self.usable = usable 14 | 15 | @property 16 | def name(self): 17 | return self._name 18 | 19 | @name.setter 20 | def name(self, value='Template'): 21 | self.property_checker.validate_property(value=value, expected_type=str) 22 | self._name = value 23 | 24 | @property 25 | @abstractmethod 26 | def columns(self): 27 | return [] 28 | 29 | @property 30 | def usable(self): 31 | return self._usable 32 | 33 | @usable.setter 34 | def usable(self, value=False): 35 | self.property_checker.validate_property(value=value, expected_type=bool) 36 | self._usable = value 37 | 38 | @property 39 | def drop(self): 40 | return True 41 | 42 | @staticmethod 43 | def _fillna(): 44 | return np.nan 45 | 46 | def process(self, dataframe: pd.DataFrame): 47 | if dataframe[self.name].isnull().all(): 48 | dataframe[self.columns] = self._fillna() 49 | return dataframe 50 | else: 51 | return self._process(dataframe) 52 | 53 | @abstractmethod 54 | def _process(self, dataframe: pd.DataFrame): 55 | return dataframe 56 | -------------------------------------------------------------------------------- /tests/capice/vep/test_sift.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep.sift import SIFT 7 | 8 | 9 | class 
TestSift(unittest.TestCase): 10 | def test_sift(self): 11 | dataset = pd.DataFrame( 12 | { 13 | 'SIFT': [np.nan, 0.002, 0.05, 0.9] 14 | } 15 | ) 16 | expected = pd.concat( 17 | [ 18 | dataset, 19 | pd.DataFrame( 20 | { 21 | 'SIFTval': [np.nan, 0.002, 0.05, 0.9], 22 | 'SIFTcat': [np.nan, 'deleterious', 'deleterious', 'tolerated'] 23 | } 24 | ) 25 | ], axis=1 26 | ) 27 | observed = SIFT().process(dataset) 28 | pd.testing.assert_frame_equal(observed.sort_index(axis=1), expected.sort_index(axis=1)) 29 | 30 | def test_sift_full_nan(self): 31 | list_of_nans = [np.nan, np.nan, np.nan] 32 | dataset = pd.DataFrame( 33 | { 34 | 'SIFT': list_of_nans 35 | } 36 | ) 37 | expected = pd.concat( 38 | [ 39 | dataset, 40 | pd.DataFrame( 41 | { 42 | 'SIFTval': list_of_nans, 43 | 'SIFTcat': list_of_nans 44 | } 45 | ) 46 | ], axis=1 47 | ) 48 | observed = SIFT().process(dataset) 49 | pd.testing.assert_frame_equal(observed.sort_index(axis=1), expected.sort_index(axis=1)) 50 | 51 | 52 | if __name__ == '__main__': 53 | unittest.main() 54 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/column_utils.py: -------------------------------------------------------------------------------- 1 | class ColumnUtils: 2 | """ 3 | Utility class for columns. 4 | """ 5 | 6 | def __init__(self): 7 | self.specified_columns = set() 8 | 9 | def get_specified_columns(self): 10 | """ 11 | Getter for specified columns 12 | :return: list of specified columns 13 | """ 14 | return self.specified_columns 15 | 16 | def set_specified_columns(self, specified_columns): 17 | """ 18 | Setter for specified columns 19 | :param specified_columns: list 20 | """ 21 | self.specified_columns = set(specified_columns) 22 | 23 | def add_to_specified_columns(self, columns): 24 | """ 25 | Adds column(s) to the set of specified columns. 
26 | :param columns: string/int/float or list/tuple/set 27 | """ 28 | if type(columns) in [str, int, float]: 29 | columns = [columns] 30 | for column in columns: 31 | self.specified_columns.add(column) 32 | 33 | def column_in_specified_columns(self, column): 34 | """ 35 | Checks whether column is in specified columns 36 | :param column: string 37 | :return: boolean 38 | """ 39 | return column in self.specified_columns 40 | 41 | def get_missing_diff_with(self, columns): 42 | """ 43 | Get the list of columns missing from the presented columns, 44 | compared to the specified columns 45 | :param columns: list of columns 46 | :return: list 47 | list of columns that are in the specified columns (specified_columns), 48 | but not in the presented ones (columns) 49 | """ 50 | return list(self.specified_columns - set(columns)) 51 | -------------------------------------------------------------------------------- /tests/capice/test_main_predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import pandas as pd 5 | 6 | from molgenis.capice.main_predict import CapicePredict 7 | from tests.capice.test_templates import set_up_manager_and_out, teardown, _project_root_directory, \ 8 | ResourceFile, load_model 9 | 10 | 11 | class TestMainNonTrain(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls): 14 | print('Setting up.') 15 | manager, cls.output_dir = set_up_manager_and_out() 16 | manager.output_filename = os.path.join(cls.output_dir, 'test_output.tsv') 17 | 18 | cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 19 | 20 | @classmethod 21 | def tearDownClass(cls): 22 | print('Performing teardown.') 23 | teardown() 24 | 25 | def setUp(self): 26 | print('Performing test:') 27 | 28 | def test_integration_main_nontrain(self): 29 | print('Main no-train (integration)') 30 | infile = os.path.join(_project_root_directory, 'resources', 'predict_input.tsv.gz') 31 | predict = CapicePredict(input_path=infile, model=self.model, output_path=self.output_dir, 32 | output_given=True, force=False) 33 | predict.run() 34 | prediction_output = pd.read_csv(os.path.join(self.output_dir, 'test_output.tsv'), sep='\t') 35 | self.assertEqual(prediction_output.shape, (4, 11)) 36 | self.assertListEqual( 37 | list(prediction_output.columns), 38 | [ 39 | 'chr', 'pos', 'ref', 'alt', 'gene_name', 'gene_id', 'id_source', 'feature', 40 | 'feature_type', 'score', 'suggested_class' 41 | ] 42 | ) 43 | 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /tests/capice/cli/test_args_handler_train.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | from argparse import ArgumentParser 4 | from io import StringIO 5 | 6 | from molgenis.capice.cli.args_handler_train import ArgsHandlerTrain 7 | 8 | 9 | class TestArgsHandlerTrain(unittest.TestCase): 10 | 11 | def setUp(self): 12 | parser = ArgumentParser( 13 | description="CAPICE test" 14 | ) 15 | self.aht = ArgsHandlerTrain(parser) 16 | 17 | @patch('sys.stderr', new_callable=StringIO) 18 | def test_validate_n_threads(self, stderr): 19 | with self.assertRaises(SystemExit): 20 | self.aht.validate_n_threads(0) 21 | self.assertIn('The amount of threads has to be at least 1!', stderr.getvalue()) 22 | 23 | @patch('sys.stderr', new_callable=StringIO) 24 | def test_validate_test_split_0(self, stderr): 25 | with
self.assertRaises(SystemExit): 26 | self.aht.validate_test_split(0) 27 | self.assertIn('Test split must be a float between 0 and 1', stderr.getvalue()) 28 | 29 | @patch('sys.stderr', new_callable=StringIO) 30 | def test_validate_test_split_1(self, stderr): 31 | with self.assertRaises(SystemExit): 32 | self.aht.validate_test_split(1) 33 | self.assertIn('Test split must be a float between 0 and 1', stderr.getvalue()) 34 | 35 | def test_property_str_versions(self): 36 | args_handler = ArgsHandlerTrain(ArgumentParser()) 37 | self.assertEqual('.tsv, .tsv.gz', args_handler._extension_str()) 38 | self.assertEqual('.json', args_handler._features_extension_str()) 39 | self.assertEqual('.json, .ubj', args_handler._required_output_extensions_str()) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/template_position.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep.template import Template 7 | 8 | 9 | class TemplatePosition(Template): 10 | def __init__(self, name='Template', usable=False): 11 | super(TemplatePosition, self).__init__( 12 | name=name, 13 | usable=usable 14 | ) 15 | 16 | @property 17 | @abstractmethod 18 | def columns(self): 19 | return [None, None] 20 | 21 | @property 22 | def pos_col(self): 23 | return self.columns[0] 24 | 25 | def _process(self, dataframe: pd.DataFrame): 26 | if self.name in dataframe.select_dtypes(include='O'): 27 | if dataframe[self.name].str.split('/', expand=True).shape[1] > 1: 28 | dataframe[self.columns] = dataframe[self.name].str.split('/', expand=True) 29 | else: 30 | dataframe[self.pos_col] = dataframe[self.name] 31 | dataframe[self.columns[1]] = np.nan 32 | dataframe[self.pos_col] = dataframe[self.pos_col].str.replace('?-', '', regex=False) 33 | dataframe[self.pos_col] = dataframe[self.pos_col].str.replace('-?', '', regex=False) 34 | dataframe[self.pos_col] = dataframe[self.pos_col].str.split('-', expand=True)[0] 35 | 36 | for column in self.columns: 37 | dataframe.loc[dataframe[dataframe[column] == ''].index, column] = np.nan 38 | dataframe[column] = dataframe[column].astype(float) 39 | else: 40 | dataframe[self.pos_col] = dataframe[self.name] 41 | for col in self.columns: 42 | if col not in dataframe.columns: 43 | dataframe[col] = np.nan 44 | return dataframe 45 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_predict.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from molgenis.capice.utilities.enums import Column 4 | from tests.capice.test_templates import set_up_impute_preprocess, teardown 5 | 6 | 7 | class TestPredict(unittest.TestCase): 8 | @classmethod 9 | def setUpClass(cls): 10 | print('Setting up.') 11 | cls.main, cls.model = set_up_impute_preprocess() 12 | 13 | @classmethod 14 | def tearDownClass(cls): 15 | print('Tearing down.') 16 | teardown() 17 | 18 | def setUp(self): 19 | print('Testing case:') 20 | 21 | def test_unit_prediction(self): 22 | """ 23 | Unit test for the prediction part of CAPICE. 
24 | """ 25 | print('Prediction (unit)') 26 | self.main.predict( 27 | self.main.categorical_process( 28 | self.main.process( 29 | self.main._load_file(), process_features=self.model.vep_features.keys() 30 | )[0], processing_features=self.model.processable_features 31 | )[0] 32 | ) 33 | 34 | def test_component_prediction(self): 35 | """ 36 | Component test for prediction to see if the combined score of all is 37 | greater than 0. 38 | """ 39 | print('Prediction (component)') 40 | prediction = self.main.predict( 41 | self.main.categorical_process( 42 | self.main.process( 43 | self.main._load_file(), process_features=self.model.vep_features.keys() 44 | )[0], processing_features=self.model.processable_features 45 | )[0] 46 | ) 47 | # Combined sum of the prediction score should be higher than 0 48 | self.assertGreater(prediction[Column.score.value].sum(), 0) 49 | 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from setuptools import setup, find_namespace_packages 4 | from src.molgenis.capice import __version__ 5 | 6 | with open('README.md', 'r', encoding='utf-8') as fh: 7 | long_description = fh.read() 8 | 9 | setup( 10 | name='capice', 11 | version=__version__, 12 | packages=find_namespace_packages('src', exclude=['tests', 'scripts']), 13 | package_dir={"": "src"}, 14 | url='https://capice.molgeniscloud.org/', 15 | license='LGPL-3.0', 16 | author='Shuang Li, Robert Sietsma and Molgenis', 17 | author_email='support@molgenis.org', 18 | description='Consequence Agnostic Pathogenicity Interpretation of ' 19 | 'Clinical Exoma variations. State of the art machine learning ' 20 | 'to predict SNVs and InDels pathogenicity.', 21 | long_description=long_description, 22 | long_description_content_type='text/markdown', 23 | classifiers=[ 24 | 'Development Status :: 4 - Beta', 25 | 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)', 26 | 'Programming Language :: Python :: 3.10' 27 | ], 28 | python_requires='>=3.10', 29 | install_requires=[ 30 | 'numpy==1.26.4', 31 | 'pandas==1.5.3', 32 | 'scipy==1.14.1', 33 | 'scikit-learn==1.5.2', 34 | 'xgboost==1.7.6' 35 | ], 36 | extras_require={ 37 | 'test': [ 38 | 'pytest', # pytest 39 | 'coverage', # coverage run -m pytest --junitxml=results.xml && coverage html 40 | 'mypy', # mypy --ignore-missing-imports src/ 41 | 'flake8', # flake8 src/ tests/ 42 | 'flake8-import-order' 43 | ] 44 | }, 45 | entry_points={ 46 | 'console_scripts': [ 47 | 'capice = molgenis.capice.capice:main' 48 | ] 49 | } 50 | 51 | ) 52 | -------------------------------------------------------------------------------- /tests/capice/vep/test_cdna_position.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep import cdna_position 7 | 8 | 9 | class TestType(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | print('Setting up.') 13 | cls.cdna_pos = cdna_position.CDNAPosition() 14 | 15 | def test_process(self): 16 | dataframe = pd.DataFrame({'cDNA_position': ['305/702', '60/550', '?-/123', '-?/456']}) 17 | observed = self.cdna_pos.process(dataframe) 18 | expected = pd.DataFrame({'cDNA_position': ['305/702', '60/550', '?-/123', '-?/456'], 19 | 'cDNApos': [305.00000, 60.00000, np.nan, np.nan], 20 | 'relcDNApos': 
/tests/capice/vep/test_cdna_position.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep import cdna_position 7 | 8 | 9 | class TestCDNAPosition(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | print('Setting up.') 13 | cls.cdna_pos = cdna_position.CDNAPosition() 14 | 15 | def test_process(self): 16 | dataframe = pd.DataFrame({'cDNA_position': ['305/702', '60/550', '?-/123', '-?/456']}) 17 | observed = self.cdna_pos.process(dataframe) 18 | expected = pd.DataFrame({'cDNA_position': ['305/702', '60/550', '?-/123', '-?/456'], 19 | 'cDNApos': [305.00000, 60.00000, np.nan, np.nan], 20 | 'relcDNApos': [702.00000, 550.0000, 123.00000, 456.00000]}) 21 | pd.testing.assert_frame_equal(expected, observed) 22 | 23 | def test_corner_case(self): 24 | dataframe = pd.DataFrame( 25 | { 26 | 'cDNA_position': ['483-486', '162-163'] 27 | } 28 | ) 29 | observed = self.cdna_pos.process(dataframe) 30 | expected = pd.DataFrame({'cDNA_position': ['483-486', '162-163'], 31 | 'cDNApos': [483.00000, 162.00000], 32 | 'relcDNApos': [np.nan, np.nan]}) 33 | pd.testing.assert_frame_equal(expected, observed) 34 | 35 | def test_process_nan(self): 36 | dataframe = pd.DataFrame({'cDNA_position': [np.nan, np.nan]}) 37 | observed = self.cdna_pos.process(dataframe) 38 | expected = pd.DataFrame({'cDNA_position': [np.nan, np.nan], 39 | 'cDNApos': [np.nan, np.nan], 40 | 'relcDNApos': [np.nan, np.nan]}) 41 | pd.testing.assert_frame_equal(expected, observed) 42 | 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/type.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.utilities.enums import Column 4 | from molgenis.capice.vep.template import Template 5 | 6 | 7 | class Type(Template): 8 | def __init__(self): 9 | super(Type, self).__init__( 10 | name=Column.ref.value, 11 | usable=True 12 | ) 13 | 14 | @property 15 | def columns(self): 16 | return ['Type'] 17 | 18 | @staticmethod 19 | def _ensure_column_value_is_one(column): 20 | return column.str.len() == 1 21 | 22 | def _process(self, dataframe: pd.DataFrame): 23 | """ 24 | Process variants to annotate their types. 25 | :param dataframe: a dataframe with as columns at least a ref and an alt 26 | :return: a dataframe with an added types column 27 | 28 | if len(ref) == 1 and len(alt) == 1: 29 | type = 'SNV' 30 | elif ref[0] == alt and len(alt) == 1: 31 | type = 'DEL' 32 | elif alt[0] == ref and len(ref) == 1: 33 | type = 'INS' 34 | else: 35 | type = 'DELINS' 36 | """ 37 | alt_column = dataframe[Column.alt.value] 38 | ref_column = dataframe[Column.ref.value] 39 | 40 | alt_column_value_is_1 = self._ensure_column_value_is_one(alt_column) 41 | ref_column_value_is_1 = self._ensure_column_value_is_one(ref_column) 42 | 43 | first_ref_nuc = ref_column.str.get(0) 44 | first_alt_nuc = alt_column.str.get(0) 45 | 46 | dataframe[self.columns] = 'DELINS' 47 | dataframe.loc[ 48 | dataframe[ref_column_value_is_1 & alt_column_value_is_1].index, self.columns] = 'SNV' 49 | dataframe.loc[ 50 | dataframe[ 51 | (first_ref_nuc == alt_column) & alt_column_value_is_1].index, self.columns] = 'DEL' 52 | dataframe.loc[ 53 | dataframe[ 54 | (first_alt_nuc == ref_column) & ref_column_value_is_1].index, self.columns] = 'INS' 55 | return dataframe 56 | 57 | @property 58 | def drop(self): 59 | return False 60 | --------------------------------------------------------------------------------
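The docstring of Type._process spells out the classification rule that the vectorised pandas code then applies; the same logic on single ref/alt pairs, for illustration (a standalone sketch mirroring the docstring, not the implementation above):

def sketch_variant_type(ref: str, alt: str) -> str:
    # Order follows the docstring in Type._process: SNV, then DEL, then INS, else DELINS.
    if len(ref) == 1 and len(alt) == 1:
        return 'SNV'
    if ref[0] == alt and len(alt) == 1:
        return 'DEL'
    if alt[0] == ref and len(ref) == 1:
        return 'INS'
    return 'DELINS'


for ref, alt in [('A', 'T'), ('GAT', 'G'), ('G', 'GAT'), ('AT', 'GC')]:
    print(ref, alt, sketch_variant_type(ref, alt))
# A T SNV / GAT G DEL / G GAT INS / AT GC DELINS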
/src/molgenis/capice/cli/args_handler_explain.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.main_explain import CapiceExplain 2 | from molgenis.capice.core.capice_manager import CapiceManager 3 | from molgenis.capice.cli.args_handler_parent import ArgsHandlerParent 4 | from molgenis.capice.validators.model_validator import ModelValidator 5 | 6 | 7 | class ArgsHandlerExplain(ArgsHandlerParent): 8 | """ 9 | Handler for the CAPICE submodule Explain 10 | """ 11 | 12 | def __init__(self, parser): 13 | super(ArgsHandlerExplain, self).__init__(parser=parser) 14 | 15 | @property 16 | def _extension(self): 17 | return '.json', '.ubj' 18 | 19 | @property 20 | def _required_output_extensions(self): 21 | return '.tsv', '.tsv.gz' 22 | 23 | @property 24 | def _empty_output_extension(self): 25 | return self._required_output_extensions[1] 26 | 27 | def create(self): 28 | self.parser.add_argument( 29 | '-i', 30 | '--input', 31 | action='append', 32 | type=str, 33 | required=True, 34 | help=f'path to trained model ({self._extension_str()}) (required)' 35 | ) 36 | self.parser.add_argument( 37 | '-o', 38 | '--output', 39 | action='append', 40 | type=str, 41 | help=f'path to directory or file ({self._required_output_extensions_str()}) for ' 42 | f'exporting explain output (optional)' 43 | ) 44 | self.parser.add_argument( 45 | '-f', 46 | '--force', 47 | action='store_true', 48 | help='overwrites output if it already exists' 49 | ) 50 | 51 | def _handle_module_specific_args(self, input_path, output_path, output_filename, output_given, 52 | args): 53 | model = self.load_model(input_path) 54 | validator = ModelValidator() 55 | validator.validate_has_required_attributes(model) 56 | CapiceManager().output_filename = output_filename 57 | CapiceExplain(model, output_path, output_given, self.force).run() 58 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/input_validator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | from pathlib import Path 4 | 5 | 6 | class InputValidator: 7 | """ 8 | Validator for the CLI arguments 9 | """ 10 | @staticmethod 11 | def validate_input_path(input_path: os.PathLike, extension: tuple[str, ...]): 12 | """ 13 | Function to validate if there is a file at the input location 14 | :param input_path: full path to input file 15 | :param extension: tuple of allowed extensions that the input file should end with 16 | """ 17 | if not os.path.exists(input_path): 18 | raise FileNotFoundError(f'{input_path} does not exist!') 19 | if not str(input_path).endswith(extension): 20 | raise IOError(f'{input_path} does not match required extension: ' 21 | f'{", ".join(extension)}') 22 | 23 | @staticmethod 24 | def validate_output_path(output_path): 25 | """ 26 | Function to validate if the output directory exists and, 27 | if not, make it. 28 | :param output_path: path to output folder 29 | """ 30 | # If the output directory is not present and 31 | # the parent directory is also not writeable, throw OSError 32 | if not os.path.isdir(output_path) and not os.access(Path(output_path).parent, os.W_OK): 33 | raise OSError('New output directory cannot be made in a read/execute only directory!') 34 | # If the output directory is present but not writable, throw OSError 35 | elif os.path.isdir(output_path) and not os.access(output_path, os.W_OK): 36 | raise OSError('Output directory is not writable!') 37 | # If the output directory is not yet present, 38 | # but passed the check that it is in a writable parent directory, 39 | # only warn 40 | elif not os.path.isdir(output_path): 41 | warnings.warn("Output directory does not exist, creating.") 42 | os.makedirs(output_path) 43 | # No else is required, since the else would be to place the output file 44 | # in a writeable output directory that is already present. 45 | --------------------------------------------------------------------------------
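How these checks behave at a call site, in short (the paths are invented for the illustration):

from molgenis.capice.validators.input_validator import InputValidator

validator = InputValidator()
# Passes silently for an existing path with an allowed extension; raises
# FileNotFoundError for a missing path and IOError for a mismatched extension.
validator.validate_input_path('/data/example.tsv.gz', extension=('.tsv', '.tsv.gz'))
# Raises OSError for unwritable locations; merely warns and creates the
# directory when it is absent but its parent is writable.
validator.validate_output_path('/data/capice_output')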
/tests/capice/utilities/test_file_postprocessor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from tests.capice.test_templates import teardown, set_up_manager_and_out 6 | from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor 7 | 8 | 9 | class TestFilePostProcessor(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | print('Setting up.') 13 | set_up_manager_and_out() 14 | 15 | @classmethod 16 | def tearDownClass(cls) -> None: 17 | print('Tearing down.') 18 | teardown() 19 | 20 | def setUp(self) -> None: 21 | print('Testing case:') 22 | 23 | def test_load_file_pre_processor(self): 24 | """ 25 | Test to see if the post file loading processor outputs according to 26 | expectation. Note: chromosome stays an integer until the imputer, 27 | which is why it is not marked as a string. 28 | """ 29 | print('Load file preprocessor.') 30 | data = pd.DataFrame( 31 | { 32 | "CHROM": [1, 2, 3], 33 | "POS": [100, 200, 300], 34 | "REF": ['A', 'T', 'G'], 35 | "ALT": ['T', 'G', 'A'], 36 | "SYMBOL_SOURCE": ['foo', 'foo', 'bar'], 37 | "Feature": ['bar', 'bar', 'buz'], 38 | "SYMBOL": ['g1', 'g2', 'g3'], 39 | "INTRON": [1, 0, 0], 40 | "EXON": [0, 1, 1] 41 | } 42 | ) 43 | 44 | expected_output = pd.DataFrame( 45 | { 46 | "chr": [1, 2, 3], 47 | "pos": [100, 200, 300], 48 | "REF": ['A', 'T', 'G'], 49 | "ALT": ['T', 'G', 'A'], 50 | "id_source": ['foo', 'foo', 'bar'], 51 | "feature": ['bar', 'bar', 'buz'], 52 | "gene_name": ['g1', 'g2', 'g3'], 53 | "Intron": [1, 0, 0], 54 | "Exon": [0, 1, 1] 55 | } 56 | ) 57 | 58 | processor = LoadFilePostProcessor(dataset=data) 59 | observed_output = processor.process() 60 | pd.testing.assert_frame_equal(expected_output, observed_output) 61 | 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | --------------------------------------------------------------------------------
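The test above fixes the renaming contract of LoadFilePostProcessor; the same effect can be sketched with a plain pandas rename (the mapping is copied from the expectations above, not from the processor's source):

import pandas as pd

renames = {'CHROM': 'chr', 'POS': 'pos', 'SYMBOL_SOURCE': 'id_source', 'Feature': 'feature',
           'SYMBOL': 'gene_name', 'INTRON': 'Intron', 'EXON': 'Exon'}
data = pd.DataFrame({'CHROM': [1], 'POS': [100], 'REF': ['A'], 'ALT': ['T'],
                     'SYMBOL_SOURCE': ['foo'], 'Feature': ['bar'], 'SYMBOL': ['g1'],
                     'INTRON': [1], 'EXON': [0]})
print(data.rename(columns=renames).columns.tolist())
# ['chr', 'pos', 'REF', 'ALT', 'id_source', 'feature', 'gene_name', 'Intron', 'Exon']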
37 | """ 38 | chr_pos_ref_alt = 'chr_pos_ref_alt' 39 | chr = 'chr' 40 | pos = 'pos' 41 | ref = 'REF' 42 | alt = 'ALT' 43 | gene_name = 'gene_name' 44 | gene_id = 'gene_id' 45 | id_source = 'id_source' 46 | feature = 'feature' 47 | feature_type = 'feature_type' 48 | score = 'score' 49 | suggested_class = 'suggested_class' 50 | other = 'other_CAPICE_value' 51 | 52 | 53 | class OutputClasses(Enum): 54 | """ 55 | Enums to use for the suggested output classes. 56 | """ 57 | unknown = 'VUS' 58 | # Variables already defined for future implementation 59 | tolerated = '' 60 | likely_tolerated = '' 61 | likely_damaging = '' 62 | damaging = '' 63 | 64 | 65 | class UniqueSeparator(Enum): 66 | """ 67 | Enum specific to creating a specific separator for the preservation of the chr pos ref alt 68 | columns. 69 | """ 70 | unique_separator = '_VeryUniqueCAPICESeparator_' 71 | 72 | 73 | class Versioning(Enum): 74 | VALIDATION_REGEX = (r'^(?P\d+)\.(?P\d+)\.(?P\d+)' 75 | r'(-?(?Pa|b|rc[0-9]+))?$') 76 | -------------------------------------------------------------------------------- /tests/capice/test_main_explain.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | import pandas as pd 5 | 6 | from tests.capice.test_templates import _project_root_directory, ResourceFile, load_model 7 | from molgenis.capice.core.capice_manager import CapiceManager 8 | from molgenis.capice.main_explain import CapiceExplain 9 | 10 | 11 | class TestCapiceExplain(unittest.TestCase): 12 | output_path = os.path.join(_project_root_directory, 'testing_output') 13 | output_filename = 'test_output.csv.gz' 14 | full_output_path = os.path.join(output_path, output_filename) 15 | 16 | @classmethod 17 | def setUpClass(cls) -> None: 18 | cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 19 | if not os.path.isdir(cls.output_path): 20 | os.makedirs(cls.output_path) 21 | CapiceManager().output_filename = cls.output_filename 22 | 23 | @classmethod 24 | def tearDownClass(cls) -> None: 25 | if os.path.isfile(cls.full_output_path): 26 | os.remove(cls.full_output_path) 27 | if os.path.isdir(cls.output_path): 28 | os.rmdir(cls.output_path) 29 | 30 | def test_capice_explain(self): 31 | explainer = CapiceExplain( 32 | model=self.model, 33 | output_path=self.output_path, 34 | output_given=True, 35 | force=False 36 | ) 37 | explainer.run() 38 | feature_importances = self.model.get_booster().get_score(importance_type='gain') 39 | observed = pd.read_csv(self.full_output_path, sep='\t') 40 | expected = pd.DataFrame( 41 | data=[ 42 | feature_importances.keys(), 43 | feature_importances.values() 44 | ], index=['feature', 'gain'] 45 | ).T.sort_values(by='gain', ascending=False).reset_index(drop=True) 46 | expected['gain'] = expected['gain'].astype(float) 47 | expected['total_gain'] = expected['feature'].map(self.model.get_booster().get_score( 48 | importance_type='total_gain')) 49 | expected['weight'] = expected['feature'].map(self.model.get_booster().get_score( 50 | importance_type='weight')) 51 | expected['cover'] = expected['feature'].map(self.model.get_booster().get_score( 52 | importance_type='cover')) 53 | expected['total_cover'] = expected['feature'].map(self.model.get_booster().get_score( 54 | importance_type='total_cover')) 55 | pd.testing.assert_frame_equal(observed, expected) 56 | 57 | 58 | if __name__ == '__main__': 59 | unittest.main() 60 | -------------------------------------------------------------------------------- /src/molgenis/capice/main_predict.py: 
/src/molgenis/capice/main_predict.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.main_capice import Main 2 | from molgenis.capice.utilities.predictor import Predictor 3 | from molgenis.capice.utilities.class_suggestor import ClassSuggestor 4 | from molgenis.capice.validators.predict_validator import PredictValidator 5 | from molgenis.capice.validators.post_vep_processing_validator import PostVEPProcessingValidator 6 | 7 | 8 | class CapicePredict(Main): 9 | """ 10 | Predict class of CAPICE to call the different modules to impute, 11 | process and eventually predict a score over a CAPICE annotated file. 12 | """ 13 | 14 | def __init__(self, input_path, model, output_path, output_given, force): 15 | super().__init__( 16 | input_path, 17 | output_path, 18 | output_given, 19 | force 20 | ) 21 | 22 | # Model. 23 | self.model = model 24 | 25 | def run(self): 26 | """ 27 | Function to run CAPICE in prediction mode. 28 | """ 29 | capice_data = self._load_file() 30 | capice_data = self.process( 31 | loaded_data=capice_data, 32 | process_features=list(self.model.vep_features.keys()) 33 | )[0] 34 | PostVEPProcessingValidator().validate_features_present( 35 | capice_data, self.model.vep_features.values() 36 | ) 37 | capice_data = self.categorical_process( 38 | loaded_data=capice_data, 39 | processing_features=self.model.processable_features, 40 | train_features=None 41 | )[0] 42 | capice_data = self.predict(loaded_data=capice_data) 43 | capice_data = self.apply_suggested_class(predicted_data=capice_data) 44 | self._export(dataset=capice_data, output=self.output) 45 | 46 | def predict(self, loaded_data): 47 | """ 48 | Function to call the correct model to predict CAPICE scores 49 | :return: pandas DataFrame 50 | """ 51 | validator = PredictValidator() 52 | validator.validate_data_predict_ready(loaded_data, self.model) 53 | predictor = Predictor(self.model) 54 | capice_data = predictor.predict(loaded_data) 55 | return capice_data 56 | 57 | @staticmethod 58 | def apply_suggested_class(predicted_data): 59 | """ 60 | Method to call the ClassSuggestor 61 | :return: pandas DataFrame 62 | """ 63 | suggestor = ClassSuggestor() 64 | capice_data = suggestor.apply_suggestion(predicted_data) 65 | return capice_data 66 | -------------------------------------------------------------------------------- /tests/resources/VEP104.json: -------------------------------------------------------------------------------- 1 | { 2 | "Ref": null, 3 | "Alt": null, 4 | "Consequence": null, 5 | "GC": null, 6 | "CpG": null, 7 | "motifECount": null, 8 | "motifEScoreChng": null, 9 | "motifEHIPos": null, 10 | "oAA": null, 11 | "nAA": null, 12 | "cDNApos": null, 13 | "relcDNApos": null, 14 | "CDSpos": null, 15 | "relCDSpos": null, 16 | "protPos": null, 17 | "relProtPos": null, 18 | "Domain": null, 19 | "Dst2Splice": null, 20 | "Dst2SplType": null, 21 | "minDistTSS": null, 22 | "minDistTSE": null, 23 | "SIFTcat": null, 24 | "SIFTval": null, 25 | "PolyPhenCat": null, 26 | "PolyPhenVal": null, 27 | "priPhCons": null, 28 | "mamPhCons": null, 29 | "verPhCons": null, 30 | "priPhyloP": null, 31 | "mamPhyloP": null, 32 | "verPhyloP": null, 33 | "bStatistic": null, 34 | "targetScan": null, 35 | "mirSVR-Score": null, 36 | "mirSVR-E": null, 37 | "mirSVR-Aln": null, 38 | "cHmmTssA": null, 39 | "cHmmTssAFlnk": null, 40 | "cHmmTxFlnk": null, 41 | "cHmmTx": null, 42 | "cHmmTxWk": null, 43 | "cHmmEnhG": null, 44 | "cHmmEnh": null, 45 | "cHmmZnfRpts": null, 46 | "cHmmHet": null, 47 | "cHmmTssBiv": null, 48 |
"cHmmBivFlnk": null, 49 | "cHmmEnhBiv": null, 50 | "cHmmReprPC": null, 51 | "cHmmReprPCWk": null, 52 | "cHmmQuies": null, 53 | "GerpRS": null, 54 | "GerpRSpval": null, 55 | "GerpN": null, 56 | "GerpS": null, 57 | "TFBS": null, 58 | "TFBSPeaks": null, 59 | "TFBSPeaksMax": null, 60 | "tOverlapMotifs": null, 61 | "motifDist": null, 62 | "Segway": null, 63 | "EncH3K27Ac": null, 64 | "EncH3K4Me1": null, 65 | "EncH3K4Me3": null, 66 | "EncExp": null, 67 | "EncNucleo": null, 68 | "EncOCC": null, 69 | "EncOCCombPVal": null, 70 | "EncOCDNasePVal": null, 71 | "EncOCFairePVal": null, 72 | "EncOCpolIIPVal": null, 73 | "EncOCctcfPVal": null, 74 | "EncOCmycPVal": null, 75 | "EncOCDNaseSig": null, 76 | "EncOCFaireSig": null, 77 | "EncOCpolIISig": null, 78 | "EncOCctcfSig": null, 79 | "EncOCmycSig": null, 80 | "Grantham": null, 81 | "Dist2Mutation": null, 82 | "Freq100bp": null, 83 | "Rare100bp": null, 84 | "Sngl100bp": null, 85 | "Freq1000bp": null, 86 | "Rare1000bp": null, 87 | "Sngl1000bp": null, 88 | "Freq10000bp": null, 89 | "Rare10000bp": null, 90 | "Sngl10000bp": null, 91 | "dbscSNV-ada_score": null, 92 | "dbscSNV-rf_score": null, 93 | "Type": null, 94 | "Length": null 95 | } -------------------------------------------------------------------------------- /tests/capice/validators/test_property_type_validator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from molgenis.capice.validators.property_type_validator import PropertyTypeValidator 4 | 5 | 6 | class TestPropertyTypeValidator(unittest.TestCase): 7 | @classmethod 8 | def setUpClass(cls) -> None: 9 | print('Setting up.') 10 | cls.property_validator = PropertyTypeValidator() 11 | 12 | def setUp(self) -> None: 13 | print('Testing case:') 14 | 15 | def test_property_validator_correct(self): 16 | print('Property validator correct (not None)') 17 | value = 1.1 18 | expected_type = float 19 | self.property_validator.validate_property(value, expected_type) 20 | 21 | def test_property_validator_correct_with_none(self): 22 | print('Property validator including None') 23 | value = None 24 | expected_type = float 25 | self.property_validator.validate_property( 26 | value, 27 | expected_type, 28 | include_none=True 29 | ) 30 | 31 | def test_property_validator_incorrect(self): 32 | print('Property validator incorrect (without none)') 33 | value = 1 34 | expected_type = float 35 | self.assertRaises( 36 | TypeError, 37 | self.property_validator.validate_property, 38 | value, 39 | expected_type 40 | ) 41 | 42 | def test_property_validator_incorrect_with_none(self): 43 | print('Property validator incorrect including None') 44 | value = None 45 | expected_type = float 46 | self.assertRaises( 47 | TypeError, 48 | self.property_validator.validate_property, 49 | value, 50 | expected_type 51 | ) 52 | 53 | def test_property_validator_int_bool(self): 54 | print('Property validator with expected int and value is False') 55 | value = False 56 | expected_type = int 57 | self.assertRaises( 58 | TypeError, 59 | self.property_validator.validate_property, 60 | value, 61 | expected_type 62 | ) 63 | 64 | def test_property_validator_int_bool_include_none(self): 65 | print('Property validator with expected int, value is False and ' 66 | 'include_none is True') 67 | value = False 68 | expected_type = int 69 | self.assertRaises( 70 | TypeError, 71 | self.property_validator.validate_property, 72 | value, 73 | expected_type, 74 | True 75 | ) 76 | 77 | 78 | if __name__ == '__main__': 79 | unittest.main() 80 | 
-------------------------------------------------------------------------------- /src/molgenis/capice/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import functools 3 | import warnings 4 | from pathlib import Path 5 | from collections.abc import Iterable 6 | 7 | 8 | def get_project_root_dir(): 9 | """ 10 | Function to get the project root directory 11 | :return: Path instance 12 | """ 13 | # This file lives in capice/utilities, so two parents up resolves to the capice package root. 14 | return Path(__file__).parent.parent 15 | 16 | 17 | def deprecated(func): 18 | @functools.wraps(func) 19 | def new_func(*args, **kwargs): 20 | warnings.simplefilter('always', DeprecationWarning) 21 | warnings.warn('Call to deprecated function {}.'.format(func.__name__), 22 | category=DeprecationWarning, 23 | stacklevel=2) 24 | warnings.simplefilter('default', DeprecationWarning) 25 | return func(*args, **kwargs) 26 | 27 | return new_func 28 | 29 | 30 | def check_if_in_list(list_of_lists: list[list[object]], to_check_list: Iterable): 31 | """ 32 | Checks, for every item within the inner lists of list_of_lists, whether that item 33 | (int, str, float, etc.) occurs in to_check_list. Items that do not occur are added 34 | to the returned list; items that do occur are skipped. 35 | 36 | Args: 37 | list_of_lists: 38 | List containing lists of values (object). 39 | Each value is independently checked against to_check_list; values absent 40 | from it end up in the returned list. 41 | to_check_list: 42 | Iterable over which the individual items of the list_of_lists should be checked. 43 | 44 | Returns: 45 | list: 46 | A single list containing all individual items of list_of_lists that did not occur in 47 | to_check_list. 48 | 49 | """ 50 | return_list = [] 51 | for items in list_of_lists: 52 | for item in items: 53 | if item not in to_check_list: 54 | return_list.append(item) 55 | return return_list 56 | 57 | 58 | def check_file_exist(file_path: os.PathLike[str], force: bool): 59 | """ 60 | Method to check if a file exists; raises FileExistsError when it does and force is False. 61 | No error is raised when force is True, and likewise none is raised when the file 62 | does not exist. 63 | 64 | Args: 65 | file_path: 66 | Full absolute output path, including the output filename and extension. 67 | force: 68 | Command Line Argument of the "force" argument.
69 | 70 | """ 71 | if os.path.exists(file_path) and not force: 72 | raise FileExistsError("Output file already exists!") 73 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_dynamic_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from molgenis.capice.utilities.dynamic_loader import DynamicLoader 5 | from tests.capice.test_templates import set_up_manager_and_out, _project_root_directory 6 | 7 | 8 | class TestDynamicLoader(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls) -> None: 11 | print('Setting up.') 12 | cls.manager, output = set_up_manager_and_out() 13 | cls.correct_resources = os.path.join( 14 | _project_root_directory, 15 | 'tests', 16 | 'resources', 17 | 'dynamic_loader_test_files_present' 18 | ) 19 | cls.incorrect_resources = os.path.join( 20 | _project_root_directory, 21 | 'tests', 22 | 'resources', 23 | 'dynamic_loader_test_no_files' 24 | ) 25 | cls.required_attributes = ['name', 'some_function'] 26 | 27 | @classmethod 28 | def tearDownClass(cls) -> None: 29 | print('Tearing down.') 30 | 31 | def setUp(self) -> None: 32 | print('Testing case:') 33 | 34 | def test_no_directory_given_raise(self): 35 | print('Raise OSError: no directory given') 36 | self.assertRaises( 37 | OSError, 38 | DynamicLoader, 39 | self.required_attributes, 40 | os.path.join(_project_root_directory, 'some_random_directory') 41 | ) 42 | 43 | def test_manual_annotator_loader_correct(self): 44 | print('Loading correct manual annotator') 45 | loader = DynamicLoader( 46 | required_attributes=self.required_attributes, 47 | path=self.correct_resources 48 | ) 49 | loaded_modules = loader.load_manual_annotators() 50 | names = [] 51 | for module in loaded_modules: 52 | names.append(module.name) 53 | self.assertTrue('Correct' in names) 54 | 55 | def test_manual_annotator_loader_raise(self): 56 | print('Loading raise manual annotator no module found in correct directory') 57 | loader = DynamicLoader( 58 | required_attributes=['name', 'unrelated_function'], 59 | path=self.correct_resources 60 | ) 61 | self.assertRaises(FileNotFoundError, loader.load_manual_annotators) 62 | 63 | def test_manual_annotator_loader_raise_no_module_found(self): 64 | print('Loading raise manual annotator no module found in wrong directory') 65 | loader = DynamicLoader( 66 | required_attributes=self.required_attributes, 67 | path=self.incorrect_resources 68 | ) 69 | self.assertRaises(FileNotFoundError, loader.load_manual_annotators) 70 | 71 | 72 | if __name__ == '__main__': 73 | unittest.main() 74 | --------------------------------------------------------------------------------
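From required_attributes=['name', 'some_function'] above, a resource file only qualifies for loading when it exposes both attributes; a minimal qualifying module might look like this (an illustrative sketch, not the actual correct_file.py shipped in tests/resources):

# Hypothetical contents of a manual annotator module accepted by DynamicLoader.
class Correct:
    @property
    def name(self):
        return 'Correct'

    def some_function(self):
        pass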
/tests/capice/test_templates.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | from pathlib import Path 4 | 5 | from molgenis.capice.cli.args_handler_parent import ArgsHandlerParent 6 | from molgenis.capice.core.capice_manager import CapiceManager 7 | from molgenis.capice.core.logger import Logger 8 | from molgenis.capice.main_predict import CapicePredict 9 | 10 | _project_root_directory = Path(__file__).absolute().parent.parent.parent 11 | _project_resources = os.path.join(_project_root_directory, 'resources') 12 | _project_test_resources = os.path.join(_project_root_directory, 'tests', 'resources') 13 | 14 | 15 | def set_up_manager_and_out(): 16 | """ 17 | Function to set up the CapiceManager and testing output location 18 | :return: manager instance, output_directory 19 | """ 20 | manager = CapiceManager() 21 | manager.critical_logging_only = True 22 | root_dir = _project_root_directory 23 | output_directory = os.path.join(root_dir, '.test_output') 24 | if not os.path.exists(output_directory): 25 | os.makedirs(output_directory) 26 | return manager, output_directory 27 | 28 | 29 | def teardown(): 30 | """ 31 | Function to remove any and all files from the '.test_output' folder and 32 | remove the folder itself too. 33 | """ 34 | test_folder = os.path.join(_project_root_directory, '.test_output') 35 | if os.path.isdir(test_folder): 36 | if len(os.listdir(test_folder)) > 0: 37 | for file in os.listdir(test_folder): 38 | os.remove(os.path.join(test_folder, file)) 39 | os.rmdir(test_folder) 40 | Logger.instance = None 41 | CapiceManager.instance = None 42 | 43 | 44 | def set_up_predict(): 45 | return CapicePredict( 46 | input_path=None, 47 | model=None, 48 | output_path=None, 49 | output_given=False, 50 | force=False 51 | ) 52 | 53 | 54 | def set_up_impute_preprocess(): 55 | set_up_manager_and_out() 56 | main = set_up_predict() 57 | main.infile = os.path.join(_project_root_directory, 'resources', 'predict_input.tsv.gz') 58 | model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 59 | main.model = model 60 | return main, model 61 | 62 | 63 | def load_model(file_path): 64 | return ArgsHandlerParent.load_model(file_path) 65 | 66 | 67 | class ResourceFile(Enum): 68 | """ 69 | Enum storing paths to test resource files for easy access. 70 | """ 71 | PREDICT_INPUT_TSV_GZ = os.path.join(_project_resources, 'predict_input.tsv.gz') 72 | XGB_BOOSTER_POC_JSON = os.path.join(_project_test_resources, 'xgb_booster_poc.json') 73 | 74 | 75 | class FakeResourceFile(Enum): 76 | PREDICT_INPUT_TSV_GZ = os.path.join(_project_test_resources, 77 | 'non_existing_predict_input.tsv.gz') 78 | -------------------------------------------------------------------------------- /tests/capice/validators/test_input_validator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from molgenis.capice.validators.input_validator import InputValidator 5 | from tests.capice.test_templates import _project_root_directory, ResourceFile, FakeResourceFile 6 | 7 | 8 | class TestInputValidator(unittest.TestCase): 9 | new_directory_name = '.another_test_output_directory' 10 | new_directory = os.path.join(_project_root_directory, new_directory_name) 11 | 12 | @classmethod 13 | def setUpClass(cls): 14 | print('Setting up.') 15 | cls.input_validator = InputValidator() 16 | 17 | @classmethod 18 | def tearDownClass(cls): 19 | print('Tearing down.') 20 | if os.path.isdir(cls.new_directory): 21 | os.rmdir(cls.new_directory) 22 | 23 | def setUp(self): 24 | print('Testing case:') 25 | 26 | def test_create_output_path(self): 27 | print('Creating output location') 28 | with self.assertWarns(Warning): 29 | self.input_validator.validate_output_path(self.new_directory) 30 | self.assertTrue( 31 | self.new_directory_name in os.listdir(_project_root_directory) 32 | ) 33 | 34 | def test_input_single_extension(self): 35 | allowed_extensions = ('.tsv.gz',) 36 | self.input_validator.validate_input_path(ResourceFile.PREDICT_INPUT_TSV_GZ.value, 37 | extension=allowed_extensions) 38 | 39 | def test_input_multiple_extensions(self): 40 | allowed_extensions = ('.tsv', '.tsv.gz') 41 | self.input_validator.validate_input_path(ResourceFile.PREDICT_INPUT_TSV_GZ.value, 42 | extension=allowed_extensions) 43 | 44 | def
test_input_multiple_extensions_invalid(self): 45 | allowed_extensions = ('.tsv', '.tsv.gz') 46 | with self.assertRaises(IOError) as e: 47 | self.input_validator.validate_input_path(ResourceFile.XGB_BOOSTER_POC_JSON.value, 48 | extension=allowed_extensions) 49 | 50 | self.assertEqual(f'{ResourceFile.XGB_BOOSTER_POC_JSON.value} does not match required ' 51 | f'extension: .tsv, .tsv.gz', 52 | str(e.exception)) 53 | 54 | def test_input_non_existing(self): 55 | allowed_extensions = ('.tsv', '.tsv.gz') 56 | with self.assertRaises(FileNotFoundError) as e: 57 | self.input_validator.validate_input_path(FakeResourceFile.PREDICT_INPUT_TSV_GZ.value, 58 | extension=allowed_extensions) 59 | 60 | self.assertEqual(f'{FakeResourceFile.PREDICT_INPUT_TSV_GZ.value} does not exist!', 61 | str(e.exception)) 62 | 63 | 64 | if __name__ == '__main__': 65 | unittest.main() 66 | -------------------------------------------------------------------------------- /tests/capice/test_edge_cases_predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import pandas as pd 5 | 6 | from tests.capice.test_templates import set_up_manager_and_out, teardown, set_up_predict, \ 7 | _project_root_directory, ResourceFile, load_model 8 | 9 | 10 | class TestEdgeCases(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls) -> None: 13 | cls.manager, cls.output = set_up_manager_and_out() 14 | cls.edge_cases = os.path.join( 15 | _project_root_directory, 16 | 'tests', 17 | 'resources', 18 | 'edge_cases_vep.tsv.gz' 19 | ) 20 | cls.breakpoints = os.path.join( 21 | _project_root_directory, 22 | 'tests', 23 | 'resources', 24 | 'breakends_vep.tsv.gz' 25 | ) 26 | cls.symbolic = os.path.join( 27 | _project_root_directory, 28 | 'tests', 29 | 'resources', 30 | 'symbolic_alleles_vep.tsv.gz' 31 | ) 32 | cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 33 | cls.main = set_up_predict() 34 | 35 | @classmethod 36 | def tearDownClass(cls) -> None: 37 | teardown() 38 | 39 | def setUp(self) -> None: 40 | self.main = set_up_predict() 41 | self.main.output = self.output 42 | self.main.model = self.model 43 | print('Testing case:') 44 | 45 | def get_observed_results(self): 46 | return pd.read_csv(os.path.join(self.output, self.manager.output_filename), sep='\t') 47 | 48 | def test_edge_cases(self): 49 | print('Edge cases') 50 | self.main.infile = self.edge_cases 51 | self.manager.output_filename = 'edge_cases_vep_capice.tsv.gz' 52 | self.main.run() 53 | observed_output = self.get_observed_results() 54 | self.assertGreater(observed_output['score'].sum(), 0) 55 | self.assertFalse(observed_output['score'].hasnans) 56 | 57 | def test_symbolic_alleles(self): 58 | print('Symbolic alleles') 59 | self.main.infile = self.symbolic 60 | self.manager.output_filename = 'symbolic_alleles_vep_capice.tsv.gz' 61 | self.main.run() 62 | observed_output = self.get_observed_results() 63 | self.assertGreater(observed_output['score'].sum(), 0) 64 | self.assertFalse(observed_output['score'].hasnans) 65 | 66 | def test_breakpoints(self): 67 | print('Breakpoints') 68 | self.main.infile = self.breakpoints 69 | self.manager.output_filename = 'breakends_vep_capice.tsv.gz' 70 | self.main.run() 71 | observed_output = self.get_observed_results() 72 | self.assertGreater(observed_output['score'].sum(), 0) 73 | self.assertFalse(observed_output['score'].hasnans) 74 | 75 | 76 | if __name__ == '__main__': 77 | unittest.main() 78 | 
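All three edge-case tests end with the same two sanity checks on the exported scores; standalone they amount to (the file name is one of the outputs written above):

import pandas as pd

observed = pd.read_csv('edge_cases_vep_capice.tsv.gz', sep='\t')
assert observed['score'].sum() > 0  # at least some non-zero signal
assert not observed['score'].hasnans  # every variant received a score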
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | results.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # IDE specific files 142 | .idea/ 143 | /capice.iml -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | os: linux 2 | dist: jammy 3 | language: java 4 | python: 5 | - '3.10' 6 | jdk: openjdk21 7 | cache: 8 | directories: 9 | - '$HOME/.cache/pip' 10 | - '$HOME/.sonar/cache' 11 | branches: 12 | only: 13 | - main 14 | - /^v\d+\.\d+(\.\d+)?(-\S*)?$/ 15 | before_install: 16 | - pip install -U pip 17 | install: 18 | - pip install -e '.[test]' 19 | script: 20 | - coverage run -m pytest --junitxml=results.xml 21 | - coverage xml 22 | - sonar-scanner 23 | addons: 24 | sonarcloud: 25 | organization: molgenis 26 | token: 27 | secure: qFkmx02PjcBy6nCpf05evyhQBcwOqq3BHycZbmyYkKE9AS9AbYH7eiGboTOvO5N45d3UoZaUud22JK+rD4mKTm9fMzhxQLrexww+EUOCs0EYRDaQkBtl9HcgnDWnHtRl55151agb8VJ2l9AFMC6vlfJjs+nYGeFimhoat0s4NxWlhQV/fEdmmlJCj9k37Z1We0SSoSuw8A+kJ5S3Lzf+7mhm2o/BF1rvjipW9m66e6uOyNmm9n2FMi9em06OdQRDoeLIXjgqvF2SvphcOwXfiL3TFJR4R3wjajRqgGlRmCcdt/HSMe5eTjcNvHiE3HmeAKSLoItixSqxBxzLrW/lBziWPp6E4GqRpyePhprUzJDIeB4nPg28CHS5mbqtsEiJFkGKwcaDWd+jY/KHIhN/5ECMGGQrBsiB0v7ENwasa46pMzsZ9m9b1oZzMO+dKxBl9N7BGmcjZp37nRv7n7zo5SGn1NMSs+w+8C542mAuVT0TCemKf1178auK2zeHguUxt+/GHw67lGyegZC17a9DAVN4IBCU+hkrFSwhfk1VqXulVM/b+TDpHggOaRLTNR5u45j+ibNTASTvTHwVDlbaGOOrcBSjoMY8yf+/777g3KzQFofXyyaiWRcVQAg1pl8XJUVNSJkULxCZlxy9CXc0dmnwRpui6fGtHgNpn/hwXvk= 28 | deploy: 29 | - provider: pypi 30 | username: __token__ 31 | password: 32 | secure: "f7iUE2wNOtqgDbB798eD9LNANJoU2VHmpnQwYqAJu/JLl7/JaMrPO2/n399i58HTXN5+7VxVo+zRHSCSoZ3R0sQI9m3rd9fN4hLtUEHDdwUW92ZoaMGoRCzj1qCslWSYb/H1yePp2hHzLPPJO7mPJMpP/ZCsTletBWl3BfeiaubXqASmJCBHCZ05ITCb5IY7w6LQWsEwZnN8QH3CPQL+T15P9xEyw5O+sP75MRxls8RHetzt41+3/cqvO/ZZpKydTdi7Whq6FKxGbDAk1CFP5I0g5CLaVxLXN2AJKuBouXs2r9J/+SZhItxtgzuU5Jdz527larMnWeKKGxVOwCA/7Zw/H1LGJ2tcDLI6MGrSjVnNd+M+/HyiXY+RmJw1zgAs9ZOH7M7hIQZ68Ld8wZ+e/OwrlzYJoB23RnpgtRFHrLH0GW2zKyONUdoS5IzOZyVUFwB/hHVQ2dEQSAkSwH5aa3tdCl9CbkF/VLoxqjPN6cRZgKF/UPK/Hvf+zVVF6upuzK91ETu1WXQ4d8M474OURvvNjHx+ZyGIprdpHAA6UFFNJVE0eySmJTxVM8Wdoqs9iUsUhOqXdjOhAweHfM9N6y2zgEnoOIKRlEfzY5WxU1oeDEHLcuX+Ll1l0aaxT3c6BRfpiKbfWE8ZJaUyaZCMlPocqorAxQFqgwCYWP3RYsM=" 33 | on: 34 | tags: true 35 | - provider: releases 36 | token: 37 | secure: 
vPXI5z2Wf5W6SLdCN96NWfsGz9dIpkp3tMmclgQls+mAmAdPxTlAqJCbDb8CoLetXcCx7U4SonWMShJokyqZPYd4KtoPK5lGZJqO3MV+pBMXHa9bO8nqoM4yC0Q+AukMoWTCzblZD/gSXbqcg/PODjplmBs9Doy3s8c81qx5H2L+rPsZPtm6GtCThgytW/bIOocJB6GilPVJfJoizHK0SHVKkZiuH0uCa5USVbM/HsWLTdJ5qItqLeU5TshgOg7o/4NMY6NrzbYL829Vcp7vqTuxqE45RG4jKckRm36pPZVZx4dlKQlqXOuxYAfkDPAdJy9+SSWAsqaaPj+alyhLii+0YTLMOoELPDcVSuNYqonS/7WZJ7HBVuQTtiFT5MU0fIQSvqptnBXCiOLUH5mNgL3FaBwwGDuVzbXmuKN4eSBflB1IWsgHftMFdhJ1NG+eS49zo5TJ3qaZBYtY+6rqUJt056ZUQE+9lqQSJDtHb0uVepc2QR7OvNxkYXaBjIU1wYpieT7dCCbo9+wnrtFYof+Ux3yiC/dDbhl2xXhJcsSKpbv1wwHBFmOjVTgvGuwGGCxYU2TVn0GdMf/ec7HPDLOxNTnhKgkqZfct4Id0BvKU40tPnS7KDlCCOjE6o1qyE3vMnjeqFw5gqPbUqVhe0ZF+ZsqH8B7Ga4VFDmVjAmY= 38 | file: "bogus.file" 39 | prerelease: true 40 | on: 41 | tags: true 42 | edge: true # opt in to dpl v2 43 | -------------------------------------------------------------------------------- /src/molgenis/capice/core/capice_exporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.core.logger import Logger 6 | from molgenis.capice.core.capice_manager import CapiceManager 7 | from molgenis.capice.utilities import check_file_exist 8 | from molgenis.capice.utilities.enums import Column, UniqueSeparator 9 | 10 | 11 | class CapiceExporter: 12 | """ 13 | Class specifically exporting files 14 | """ 15 | 16 | def __init__(self, file_path, output_given, force): 17 | self.log = Logger().logger 18 | self.capice_filename = CapiceManager().output_filename 19 | self.file_path = file_path 20 | self.output_given = output_given 21 | self.force = force 22 | self.export_cols = [ 23 | Column.chr.value, 24 | Column.pos.value, 25 | Column.ref.value.lower(), 26 | Column.alt.value.lower(), 27 | Column.gene_name.value, 28 | Column.gene_id.value, 29 | Column.id_source.value, 30 | Column.feature.value, 31 | Column.feature_type.value, 32 | Column.score.value, 33 | Column.suggested_class.value 34 | ] 35 | 36 | def export_capice_prediction(self, datafile: pd.DataFrame): 37 | """ 38 | Function specific to export the dataset created for the prediction 39 | pathway. 
40 | :param datafile: prediction pandas DataFrame 41 | """ 42 | export_path = os.path.join(self.file_path, self.capice_filename) 43 | datafile = self._post_process_split_cols(datafile) 44 | datafile = self._post_process_set_correct_dtypes(datafile) 45 | check_file_exist(export_path, self.force) 46 | datafile[self.export_cols].to_csv(export_path, sep='\t', index=False) 47 | if not self.output_given: 48 | print(f'Successfully exported CAPICE datafile to: {export_path}') 49 | 50 | @staticmethod 51 | def _post_process_split_cols(datafile: pd.DataFrame): 52 | datafile[ 53 | [Column.chr.value, Column.pos.value, Column.ref.value.lower(), Column.alt.value.lower()] 54 | ] = datafile[Column.chr_pos_ref_alt.value].str.split( 55 | UniqueSeparator.unique_separator.value, expand=True) 56 | return datafile 57 | 58 | @staticmethod 59 | def _post_process_set_correct_dtypes(datafile: pd.DataFrame): 60 | datafile[Column.gene_id.value] = pd.Series(datafile[Column.gene_id.value], dtype='Int64') 61 | return datafile 62 | 63 | def export_capice_model(self, model): 64 | """ 65 | Function specific to export a newly created CAPICE model 66 | :param model: XGBClassifier instance 67 | """ 68 | export_path = os.path.join(self.file_path, self.capice_filename) 69 | check_file_exist(export_path, self.force) 70 | model.save_model(export_path) 71 | if not self.output_given: 72 | print(f'Successfully exported CAPICE model to: {export_path}') 73 | --------------------------------------------------------------------------------
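The split in _post_process_split_cols relies on the collision-proof separator defined in the enums; in isolation (values invented for the demonstration):

import pandas as pd

sep = '_VeryUniqueCAPICESeparator_'
frame = pd.DataFrame({'chr_pos_ref_alt': [sep.join(['1', '12345', 'A', 'T'])]})
frame[['chr', 'pos', 'ref', 'alt']] = frame['chr_pos_ref_alt'].str.split(sep, expand=True)
print(frame[['chr', 'pos', 'ref', 'alt']].iloc[0].tolist())  # ['1', '12345', 'A', 'T']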
48 | """ 49 | self._add_arguments() 50 | subparsers = self.parser.add_subparsers() 51 | predictor = ArgsHandlerPredict(subparsers.add_parser('predict')) 52 | predictor.create() 53 | predictor.handle() 54 | trainer = ArgsHandlerTrain(subparsers.add_parser('train')) 55 | trainer.create() 56 | trainer.handle() 57 | explainer = ArgsHandlerExplain(subparsers.add_parser('explain')) 58 | explainer.create() 59 | explainer.handle() 60 | 61 | def _add_arguments(self): 62 | self.parser.add_argument( 63 | '-v', 64 | '--verbose', 65 | action='count', 66 | default=0, 67 | help='verbose mode. multiple -v options increase the verbosity') 68 | 69 | self.parser.add_argument( 70 | '--version', 71 | action='version', 72 | version=f'%(prog)s {self.version}' 73 | ) 74 | 75 | def _handle_args(self, args): 76 | level = None 77 | if args.verbose == 1: 78 | level = logging.INFO 79 | elif args.verbose >= 2: 80 | level = logging.DEBUG 81 | self.manager.loglevel = level 82 | -------------------------------------------------------------------------------- /tests/capice/cli/test_args_handler_parent.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from molgenis.capice.cli.args_handler_parent import ArgsHandlerParent 4 | 5 | 6 | class TestArgsHandlerPredict(unittest.TestCase): 7 | @classmethod 8 | def setUpClass(cls) -> None: 9 | cls.cli_args = '-z/--zz' 10 | 11 | def test__single_argument_retriever_single_none(self): 12 | test_input = None 13 | expected_output = None 14 | actual_output = ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 15 | has_default=False) 16 | self.assertEqual(actual_output, expected_output) 17 | 18 | def test__single_argument_retriever_empty_list(self): 19 | test_input = [] 20 | with self.assertRaises(ValueError) as context: 21 | ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 22 | has_default=False) 23 | msg = 'Empty list is given. Should be None or list with elements.' 24 | self.assertEqual(str(context.exception), msg) 25 | 26 | def test__single_argument_retriever_one_item(self): 27 | test_input = ['aa'] 28 | expected_output = 'aa' 29 | actual_output = ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 30 | has_default=False) 31 | self.assertEqual(actual_output, expected_output) 32 | 33 | def test__single_argument_retriever_default_only(self): 34 | test_input = ['aa'] 35 | expected_output = 'aa' 36 | actual_output = ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 37 | has_default=True) 38 | self.assertEqual(actual_output, expected_output) 39 | 40 | def test__single_argument_retriever_two_items_no_default(self): 41 | test_input = ['aa', 'bb'] 42 | with self.assertRaises(IOError) as context: 43 | ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 44 | has_default=False) 45 | msg = 'Argument -z/--zz is only allowed once.' 
46 | self.assertEqual(str(context.exception), msg) 47 | 48 | def test__single_argument_retriever_default_with_one_item(self): 49 | test_input = ['aa', 'bb'] 50 | expected_output = 'bb' 51 | actual_output = ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 52 | has_default=True) 53 | self.assertEqual(actual_output, expected_output) 54 | 55 | def test__single_argument_retriever_default_with_two_items(self): 56 | test_input = ['aa', 'bb', 'cc'] 57 | with self.assertRaises(IOError) as context: 58 | ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 59 | has_default=True) 60 | msg = 'Argument -z/--zz is only allowed once.' 61 | self.assertEqual(str(context.exception), msg) 62 | 63 | 64 | if __name__ == '__main__': 65 | unittest.main() 66 | -------------------------------------------------------------------------------- /scripts/tests/test_convert_vep_vcf_to_tsv_capice.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Base paths (to current dir/script). 4 | readonly CURRENT_PATH=$(pwd) 5 | BASE_PATH=$(realpath "$0") && readonly BASE_PATH=${BASE_PATH%/*} 6 | 7 | # Variable that stores whether a single test failed. 8 | any_test_failed=false 9 | 10 | main() { 11 | # Preparations. 12 | cd ${BASE_PATH} 13 | local -r input_vcf='../../CAPICE_example/capice_input.vcf.gz' 14 | local -r expected_output='../../CAPICE_example/CAPICE_input.tsv.gz' 15 | local -r actual_output='test_output.tsv.gz' # cleanup within each test! 16 | gunzip -k ${input_vcf} # keeps original gzip 17 | gunzip -k ${expected_output} # keeps original gzip 18 | 19 | # Run tests. 20 | testValidTextInput 21 | testValidGzipInput 22 | testEmptyInputParameter 23 | testNoOutputParameter 24 | testInvalidInputFileExtension 25 | testInvalidInputFilePath 26 | 27 | # Cleanup. 28 | rm ${input_vcf%.gz} 29 | rm ${expected_output%.gz} 30 | 31 | # Returns exitcode based on whether tests failed. 32 | if [[ "${any_test_failed}" == true ]] 33 | then 34 | exit 1 35 | fi 36 | } 37 | 38 | # $1: the generated exitcode 39 | # $2: the name of the test 40 | validateIfFailed() { 41 | if [[ $1 != 1 ]] 42 | then 43 | echo "$2: has exitcode 0, but expected 1" 44 | any_test_failed=true 45 | else 46 | echo "$2: done" 47 | fi 48 | 49 | rmSilent ${actual_output} 50 | } 51 | 52 | # $1: the generated exitcode 53 | # $2: the name of the test 54 | validateOutputFile() { 55 | if [[ $1 != 0 ]] 56 | then 57 | echo "$2: has exitcode 1, but expected 0" 58 | any_test_failed=true 59 | else 60 | gunzip ${actual_output} 61 | local checksum_expected=$(shasum -a 256 ${expected_output%.gz} | cut -d ' ' -f1) 62 | shasum -a 256 -c <<< "${checksum_expected%.gz} ${actual_output%.gz}" 63 | if [[ $? == 1 ]] 64 | then 65 | any_test_failed=true 66 | fi 67 | fi 68 | 69 | rmSilent ${actual_output%.gz} 70 | } 71 | 72 | rmSilent() { 73 | rm "$1" 2> /dev/null 74 | } 75 | 76 | testValidTextInput() { 77 | bash ../convert_vep_vcf_to_tsv_capice.sh -i ${input_vcf%.gz} -o ${actual_output} &> /dev/null 78 | validateOutputFile "$?" 'testValidTextInput' 79 | } 80 | 81 | testValidGzipInput() { 82 | bash ../convert_vep_vcf_to_tsv_capice.sh -i ${input_vcf} -o ${actual_output} &> /dev/null 83 | validateOutputFile "$?" 'testValidGzipInput' 84 | } 85 | 86 | testEmptyInputParameter() { 87 | bash ../convert_vep_vcf_to_tsv_capice.sh -i "" -o ${actual_output} &> /dev/null 88 | validateIfFailed "$?" 
'testEmptyInputParameter' 89 | } 90 | 91 | testNoOutputParameter() { 92 | bash ../convert_vep_vcf_to_tsv_capice.sh -i ${input_vcf} &> /dev/null 93 | validateIfFailed "$?" 'testNoOutputParameter' 94 | } 95 | 96 | testInvalidInputFileExtension() { 97 | bash ../convert_vep_vcf_to_tsv_capice.sh -i './capice_input.vcf.zip' -o ${actual_output} &> /dev/null 98 | validateIfFailed "$?" 'testInvalidInputFileExtension' 99 | } 100 | 101 | testInvalidInputFilePath() { 102 | bash ../convert_vep_vcf_to_tsv_capice.sh -i './non_existing_dir/capice_input.vcf.gz' -o ${actual_output} &> /dev/null 103 | validateIfFailed "$?" 'testInvalidInputFilePath' 104 | } 105 | 106 | main -------------------------------------------------------------------------------- /src/molgenis/capice/vep/consequence.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from molgenis.capice.core.logger import Logger 5 | from molgenis.capice.vep.template import Template 6 | 7 | 8 | class Consequence(Template): 9 | def __init__(self): 10 | super(Consequence, self).__init__( 11 | name='Consequence', 12 | usable=True 13 | ) 14 | self.log = Logger().logger 15 | 16 | @property 17 | def drop(self): 18 | return True 19 | 20 | @property 21 | def columns(self): 22 | return ['is_regulatory_region_variant', 23 | 'is_regulatory_region_ablation', 24 | 'is_regulatory_region_amplification', 25 | 'is_missense_variant', 26 | 'is_intron_variant', 27 | 'is_upstream_gene_variant', 28 | 'is_downstream_gene_variant', 29 | 'is_synonymous_variant', 30 | 'is_TF_binding_site_variant', 31 | 'is_splice_donor_variant', 32 | 'is_coding_sequence_variant', 33 | 'is_splice_region_variant', 34 | 'is_stop_gained', 35 | 'is_splice_acceptor_variant', 36 | 'is_frameshift_variant', 37 | 'is_3_prime_UTR_variant', 38 | 'is_inframe_insertion', 39 | 'is_inframe_deletion', 40 | 'is_5_prime_UTR_variant', 41 | 'is_start_lost', 42 | 'is_non_coding_transcript_exon_variant', 43 | 'is_non_coding_transcript_variant', 44 | 'is_TFBS_ablation', 45 | 'is_TFBS_amplification', 46 | 'is_protein_altering_variant', 47 | 'is_stop_lost', 48 | 'is_stop_retained_variant', 49 | 'is_transcript_ablation', 50 | 'is_intergenic_variant', 51 | 'is_start_retained_variant', 52 | 'is_transcript_amplification', 53 | 'is_incomplete_terminal_codon_variant', 54 | 'is_mature_miRNA_variant', 55 | 'is_NMD_transcript_variant', 56 | 'is_feature_elongation', 57 | 'is_feature_truncation', 58 | 'is_splice_donor_5th_base_variant', 59 | 'is_splice_donor_region_variant', 60 | 'is_splice_polypyrimidine_tract_variant' 61 | ] 62 | 63 | @staticmethod 64 | def _fillna(): 65 | return 0 66 | 67 | def _process(self, dataframe: pd.DataFrame): 68 | splitted_consequence = dataframe[self.name].str.split('&', expand=True) 69 | raw_consequences = [] 70 | for consequence in self.columns: 71 | current_consequence = consequence.split('is_')[1] 72 | dataframe[consequence] = np.where( 73 | np.isin(splitted_consequence, current_consequence).any(axis=1), 1, 0 74 | ) 75 | raw_consequences.append(current_consequence) 76 | 77 | self._validate_consequences(splitted_consequence, raw_consequences) 78 | return dataframe 79 | 80 | def _validate_consequences(self, consequences: pd.DataFrame, supported_consequences: list): 81 | unique_consequences = pd.Series(pd.unique(consequences.values.ravel('K'))).dropna() 82 | for consequence in unique_consequences: 83 | if consequence not in supported_consequences: 84 | self.log.warning('Supplied VEP consequence: %s is not
supported in the ' 85 | 'Consequence processor!', consequence) 86 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/post_file_parse_validator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.core.logger import Logger 4 | from molgenis.capice.utilities.enums import InputColumn 5 | from molgenis.capice.utilities.column_utils import ColumnUtils 6 | 7 | 8 | class PostFileParseValidator: 9 | MINIMUM_REQUIRED_COLUMNS = { 10 | InputColumn.chr, 11 | InputColumn.pos, 12 | InputColumn.ref, 13 | InputColumn.alt, 14 | InputColumn.gene_name, 15 | InputColumn.gene_id, 16 | InputColumn.gene_name_source, 17 | InputColumn.feature, 18 | InputColumn.feature_type 19 | } 20 | 21 | def __init__(self): 22 | self.log = Logger().logger 23 | 24 | def validate_n_columns(self, dataset): 25 | """ 26 | Validator to make sure that the number of loaded columns is at least equal to 27 | MINIMUM_REQUIRED_COLUMNS. Does NOT check for the names of these columns! 28 | """ 29 | if isinstance(dataset, pd.Series) or dataset.shape[1] < len(self.MINIMUM_REQUIRED_COLUMNS): 30 | error_message = 'Loaded dataset does NOT have enough features! ' \ 31 | 'Is there a header present that does not start ' \ 32 | 'with ##?' 33 | self.log.critical(error_message) 34 | raise KeyError(error_message) 35 | 36 | def validate_variants_present(self, dataset): 37 | """ 38 | Validator to make sure that there is at least one variant present. 39 | """ 40 | if dataset.shape[0] == 0: 41 | error_message = 'Loaded dataset does not contain variants!' 42 | self.log.critical(error_message) 43 | raise ValueError(error_message) 44 | 45 | def validate_minimally_required_columns( 46 | self, dataset, additional_required_features: list | None = None 47 | ): 48 | """ 49 | Validator for both predict and train to check that the minimally 50 | required columns (chr, pos, ref, alt and the gene/feature columns) are 51 | present, along with any additional required columns. 52 | """ 53 | column_utils = ColumnUtils() 54 | column_utils.set_specified_columns( 55 | {x.col_name for x in PostFileParseValidator.MINIMUM_REQUIRED_COLUMNS} 56 | ) 57 | if additional_required_features is not None: 58 | column_utils.add_to_specified_columns(additional_required_features) 59 | columns_not_present = column_utils.get_missing_diff_with(dataset.columns) 60 | if len(columns_not_present) > 0: 61 | error_message = 'Detected required column %s not present within input dataset!' 62 | if len(columns_not_present) > 1: 63 | error_message = 'Detected required columns %s not present within input dataset!' 64 | self.log.critical(error_message, ', '.join(columns_not_present)) 65 | raise KeyError(error_message % ', '.join(columns_not_present)) 66 | 67 | def validate_chrom_pos(self, dataset): 68 | """ 69 | Function to check if all values of the columns Chr and Pos are present. 70 | """ 71 | if dataset[InputColumn.chr.col_name].isnull().values.any(): 72 | error_message = 'Detected gap in Chromosome column! Please supply a valid dataset.' 73 | self.log.critical(error_message) 74 | raise ValueError(error_message) 75 | if dataset[InputColumn.pos.col_name].isnull().values.any(): 76 | error_message = 'Detected gap in Position column! Please supply a valid dataset.'
77 | self.log.critical(error_message) 78 | raise ValueError(error_message) 79 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/manual_vep_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.core.logger import Logger 6 | from molgenis.capice.utilities import get_project_root_dir 7 | from molgenis.capice.utilities.dynamic_loader import DynamicLoader 8 | 9 | 10 | class ManualVEPProcessor: 11 | """ 12 | Class ManualVEPProcessor, to process the (unusable) VEP-like features into 13 | features that are more usable. 14 | """ 15 | 16 | def __init__(self): 17 | self.log = Logger().logger 18 | self.feature_processing_tracker = {} 19 | 20 | def process(self, dataset: pd.DataFrame, process_features: list[str]) -> pd.DataFrame: 21 | """ 22 | Callable method for the ManualVEPProcessor to start processing. 23 | Loads all the VEP processors dynamically from /src/molgenis/capice/vep. 24 | 25 | Args: 26 | dataset: The input dataset over which the VEP features should be processed. 27 | process_features: A collection of all input features that should be used in either 28 | training or predicting over which VEP processing should happen. 29 | 30 | Returns: 31 | pandas.DataFrame: The input dataset, with the manual VEP features processed 32 | 33 | """ 34 | self.log.info('Starting manual VEP feature processing.') 35 | vep_annotators = self._load_vep_processors() 36 | dropping_columns = [] 37 | n_feats_processed = 0 38 | for processor in vep_annotators: 39 | if ( 40 | processor.name in dataset.columns and 41 | processor.name in process_features and 42 | processor.usable 43 | ): 44 | self.log.debug('Processing: %s', processor.name) 45 | self._add_feature_tracking(processor.name, processor.columns) 46 | dataset = processor.process(dataset) 47 | if processor.drop and processor.name not in dropping_columns: 48 | dropping_columns.append(processor.name) 49 | n_feats_processed += 1 50 | else: 51 | self.log.warning('Could not use processor %s on input dataset!', processor.name) 52 | self.log.debug('Property drop was set True for columns: %s', ', '.join(dropping_columns)) 53 | dataset.drop(columns=dropping_columns, inplace=True) 54 | self.log.info('Processing successful.') 55 | self.log.debug('Processed %d features.', n_feats_processed) 56 | return dataset 57 | 58 | def _add_feature_tracking(self, processor_name: str, processor_features: list[str]): 59 | if processor_name not in self.feature_processing_tracker.keys(): 60 | self.feature_processing_tracker[processor_name] = processor_features 61 | else: 62 | self.feature_processing_tracker[processor_name].extend(processor_features) 63 | 64 | def get_feature_processes(self) -> dict[str, list[str]]: 65 | """ 66 | Getter for the dictionary containing all the processed features and their output features.
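        For illustration (hypothetical values): after a dataset containing a
        VEP Consequence column has been processed, this tracker could hold
        {'Consequence': ['is_missense_variant', 'is_intron_variant', ...]},
        i.e. each input feature mapped to the output features it produced.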
67 | 68 | Returns: 69 | dict: 70 | Input VEP processing features (key) and their output features (values) 71 | """ 72 | return self.feature_processing_tracker 73 | 74 | def _load_vep_processors(self): 75 | location = os.path.join(get_project_root_dir(), 'vep') 76 | self.log.debug('Loading modules at %s', location) 77 | loader = DynamicLoader(required_attributes=['name', 'process'], path=location) 78 | loaded_modules = loader.load_manual_annotators() 79 | self.log.debug('Loaded %d modules.', len(loaded_modules)) 80 | return loaded_modules 81 | -------------------------------------------------------------------------------- /tests/capice/validators/test_post_file_parse_validator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from tests.capice.test_templates import teardown 6 | from molgenis.capice.validators.post_file_parse_validator import PostFileParseValidator 7 | 8 | 9 | class TestPostFileParseValidator(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | print('Setting up.') 13 | cls.dataset = pd.DataFrame( 14 | { 15 | 'chr': [1, 2], 16 | 'pos': [100, 200], 17 | 'REF': ['A', 'A'], 18 | 'ALT': ['T', 'T'], 19 | 'gene_name': ['UBA1', 'TFE3'], 20 | 'gene_id': [7317, 7030], 21 | 'id_source': ['EntrezGene', 'EntrezGene'], 22 | 'feature': ['NM_003334.4', 'NM_006521.6'], 23 | 'feature_type': ['Transcript', 'Transcript'], 24 | 'feat1': ['foo', 'bar'] 25 | } 26 | ) 27 | cls.validator = PostFileParseValidator() 28 | 29 | @classmethod 30 | def tearDownClass(cls) -> None: 31 | print('Tearing down.') 32 | teardown() 33 | 34 | def test_validation_correct_n_columns(self): 35 | print('Correct validation n_columns') 36 | self.validator.validate_n_columns(self.dataset) 37 | 38 | def test_validation_incorrect_n_columns(self): 39 | print('KeyError raised in n_columns due to too few columns (incorrectly loaded)') 40 | incorrectly_loaded_dataset = self.dataset[self.dataset.columns].astype(str).agg( 41 | '_'.join, axis=1 42 | ) 43 | self.assertRaises( 44 | KeyError, 45 | self.validator.validate_n_columns, 46 | incorrectly_loaded_dataset 47 | ) 48 | 49 | def test_no_variants_present(self): 50 | print('ValueError raised in validate_variants_present') 51 | dataset = pd.DataFrame(columns=self.dataset.columns) 52 | self.assertRaises( 53 | ValueError, 54 | self.validator.validate_variants_present, 55 | dataset 56 | ) 57 | 58 | def test_validation_correct_required_columns(self): 59 | print('Correct validation required_columns') 60 | self.validator.validate_minimally_required_columns( 61 | self.dataset, 62 | additional_required_features=['feat1'] 63 | ) 64 | 65 | def test_validation_incorrect_required_columns_preset_required(self): 66 | print('KeyError raised due to missing ref column') 67 | self.assertRaises( 68 | KeyError, 69 | self.validator.validate_minimally_required_columns, 70 | self.dataset.drop(columns='REF'), 71 | additional_required_features='feat1' 72 | ) 73 | 74 | def test_validation_incorrect_required_columns(self): 75 | print('KeyError raised due to missing feat2 column') 76 | self.assertRaises( 77 | KeyError, 78 | self.validator.validate_minimally_required_columns, 79 | self.dataset, 80 | additional_required_features=('feat1', 'feat2') 81 | ) 82 | 83 | def test_validation_correct_chrom_pos(self): 84 | print('Correct validation chrom_pos not empty') 85 | self.validator.validate_chrom_pos(self.dataset) 86 | 87 | def test_validation_incorrect_chrom_pos(self): 88 | print('ValueError raised 
due to gap in pos column') 89 | incorrect_dataset = self.dataset.copy(deep=True) 90 | incorrect_dataset.iloc[1, 1] = None 91 | self.assertRaises( 92 | ValueError, 93 | self.validator.validate_chrom_pos, 94 | incorrect_dataset 95 | ) 96 | 97 | 98 | if __name__ == '__main__': 99 | unittest.main() 100 | -------------------------------------------------------------------------------- /src/molgenis/capice/cli/args_handler_predict.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice import __version__ 2 | from molgenis.capice.main_predict import CapicePredict 3 | from molgenis.capice.core.capice_manager import CapiceManager 4 | from molgenis.capice.cli.args_handler_parent import ArgsHandlerParent 5 | from molgenis.capice.validators.model_validator import ModelValidator 6 | from molgenis.capice.validators.version_validator import VersionValidator 7 | 8 | 9 | class ArgsHandlerPredict(ArgsHandlerParent): 10 | """ 11 | Child class ArgsHandlerPredict, specific to the predict part of CAPICE 12 | """ 13 | 14 | def __init__(self, parser): 15 | super(ArgsHandlerPredict, self).__init__(parser=parser) 16 | 17 | @property 18 | def _extension(self): 19 | return '.tsv', '.tsv.gz' 20 | 21 | @property 22 | def _model_extension(self) -> tuple[str]: 23 | # Ignored because the number of values in the tuple does not matter. 24 | return '.json', '.ubj' # type: ignore 25 | 26 | def _model_extension_str(self) -> str: 27 | return self._join_extensions(self._model_extension) 28 | 29 | @property 30 | def _required_output_extensions(self): 31 | return '.tsv', '.tsv.gz' 32 | 33 | @property 34 | def _empty_output_extension(self): 35 | return self._required_output_extensions[1] 36 | 37 | def create(self): 38 | self.parser.add_argument( 39 | '-i', 40 | '--input', 41 | action='append', 42 | type=str, 43 | required=True, 44 | help=f'path to annotated variants file ({self._extension_str()}) (required)' 45 | ) 46 | self.parser.add_argument( 47 | '-m', 48 | '--model', 49 | action='append', 50 | type=str, 51 | required=True, 52 | help=f'path to trained model ({self._model_extension_str()}) (required)' 53 | ) 54 | self.parser.add_argument( 55 | '-o', 56 | '--output', 57 | action='append', 58 | type=str, 59 | help=f'path to directory or file ({self._required_output_extensions_str()}) ' 60 | f'for exporting prediction output (optional)' 61 | ) 62 | self.parser.add_argument( 63 | '-f', 64 | '--force', 65 | action='store_true', 66 | help='overwrites output if it already exists' 67 | ) 68 | 69 | def _handle_module_specific_args(self, input_path, output_path, output_filename, output_given, 70 | args): 71 | model_path = self._retrieve_argument_from_list(args.model, '-m/--model') 72 | model = self.validate_model(model_path) 73 | CapiceManager().output_filename = output_filename 74 | CapicePredict(input_path, model, output_path, output_given, self.force).run() 75 | 76 | def validate_model(self, model_path): 77 | """ 78 | Function to validate if the given model location is indeed a valid 79 | model file and matches the current CAPICE version.
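        Illustrative note (derived from the VersionValidator further below):
        versions are considered compatible when the major versions match,
        e.g. framework 5.1.2 with model 5.0.0 passes, while a 4.x model is
        rejected; for pre-releases the minor, patch and pre-release parts
        must match as well.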
80 | :param model_path: str, path-like, path to the model 81 | :return: model, xgb.XGBClassifier class 82 | """ 83 | try: 84 | self.input_validator.validate_input_path(model_path, extension=self._model_extension) 85 | except FileNotFoundError as cm: 86 | self.parser.error(str(cm)) 87 | model = self.load_model(model_path) 88 | model_validator = ModelValidator() 89 | model_validator.validate_has_required_attributes(model) 90 | version_validator = VersionValidator() 91 | try: 92 | version_validator.validate_model_version(model.CAPICE_version) 93 | version_validator.validate_versions_compatible(__version__, model.CAPICE_version) 94 | except ValueError as cm: 95 | self.parser.error(str(cm)) 96 | return model 97 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/version_validator.py: -------------------------------------------------------------------------------- 1 | import re 2 | from re import match 3 | 4 | from molgenis.capice.utilities.enums import Versioning 5 | 6 | 7 | class VersionValidator: 8 | def __init__(self): 9 | self.regex = Versioning.VALIDATION_REGEX.value 10 | 11 | def validate_capice_version(self, capice_version: str): 12 | """ 13 | Validates if the CAPICE version adheres to the CAPICE versioning standards, which originate from 14 | the Semantic versioning standards. 15 | 16 | For instance: 17 | 3.0.0 18 | 3.1.0 19 | 3.1.1 20 | 3.1.1-rc1 21 | 3.1.1rc1 22 | Are all valid versions. 23 | 24 | Parameters 25 | ---------- 26 | capice_version : str 27 | The version of the CAPICE framework. 28 | 29 | Raises 30 | ------ 31 | ValueError 32 | Raised when the CAPICE framework version does not adhere to the versioning standards. 33 | """ 34 | if match(self.regex, capice_version) is None: 35 | raise ValueError(f'CAPICE version does not adhere to correct format: {capice_version}') 36 | 37 | def validate_model_version(self, model_version: str): 38 | """ 39 | Validates if the model version adheres to the CAPICE versioning standards, which originate from 40 | the Semantic versioning standards. 41 | 42 | For instance: 43 | 3.0.0 44 | 3.1.0 45 | 3.1.1 46 | 3.1.1-rc1 47 | 3.1.1rc1 48 | Are all valid versions. 49 | 50 | Parameters 51 | ---------- 52 | model_version : str 53 | The version of the CAPICE model. 54 | 55 | Raises 56 | ------ 57 | ValueError 58 | Raised when the model version does not adhere to the versioning standards. 59 | """ 60 | if match(self.regex, model_version) is None: 61 | raise ValueError(f'Model version does not adhere to correct format: {model_version}') 62 | 63 | def validate_versions_compatible(self, capice_version: str, model_version: str): 64 | """ 65 | Validates if the model version and the CAPICE framework versions are compatible with each 66 | other. 67 | 68 | Parameters 69 | ---------- 70 | capice_version : str 71 | The version of the CAPICE framework. 72 | 73 | 74 | model_version : str 75 | The version of the CAPICE model. 76 | 77 | Raises 78 | ------ 79 | ValueError 80 | Raised when the model and framework versions are not compatible. 81 | """ 82 | # All mypy ignores here are because attributes are not found. 83 | capice = match(self.regex, capice_version) 84 | model = match(self.regex, model_version) 85 | if capice.group('major') != model.group('major'): # type: ignore 86 | raise ValueError( 87 | f'CAPICE major version {capice.string} ' # type: ignore 88 | f'does not match with the model ' 89 | f'{model.string}!'
# type: ignore 90 | ) 91 | 92 | if capice.group('prerelease') or model.group('prerelease'): # type: ignore 93 | self._validate_prerelease(capice, model) # type: ignore 94 | 95 | @staticmethod 96 | def _validate_prerelease(capice_version: re.Match, 97 | model_version: re.Match): 98 | matches = ['minor', 'patch', 'prerelease'] 99 | for m in matches: 100 | if capice_version.group(m) != model_version.group(m): 101 | raise ValueError( 102 | f'CAPICE {m} version {capice_version.string} does not match the model {m} ' 103 | f'version {model_version.string} (should match for pre-releases)!' 104 | ) 105 | -------------------------------------------------------------------------------- /src/molgenis/capice/core/capice_manager.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.validators.property_type_validator import PropertyTypeValidator 2 | 3 | 4 | class CapiceManager: 5 | """ 6 | Global CAPICE manager, to keep track of variables used throughout 7 | the entirety of CAPICE. 8 | """ 9 | 10 | class __CapiceManager: 11 | def __init__(self): 12 | self.property_checker = PropertyTypeValidator() 13 | self.loglevel = None 14 | self.critical_logging_only = False 15 | self.output_filename = '' 16 | 17 | @property 18 | def loglevel(self): 19 | return self._loglevel 20 | 21 | @loglevel.setter 22 | def loglevel(self, value): 23 | self.property_checker.validate_property( 24 | value=value, expected_type=int, include_none=True) 25 | self._loglevel = value 26 | 27 | @property 28 | def critical_logging_only(self): 29 | return self._critical_logging_only 30 | 31 | @critical_logging_only.setter 32 | def critical_logging_only(self, value): 33 | self.property_checker.validate_property(value=value, expected_type=bool) 34 | self._critical_logging_only = value 35 | 36 | @property 37 | def output_filename(self): 38 | return self._output_filename 39 | 40 | @output_filename.setter 41 | def output_filename(self, value): 42 | self.property_checker.validate_property(value=value, expected_type=str) 43 | self._output_filename = value 44 | 45 | instance = None 46 | 47 | @property 48 | def loglevel(self): 49 | """ 50 | Getter for setter loglevel 51 | 52 | :return: None or int 53 | """ 54 | return self._loglevel 55 | 56 | @loglevel.setter 57 | def loglevel(self, value): 58 | """ 59 | Singleton property loglevel, to set the loglevel in int that will be 60 | used in the session of CAPICE. 61 | 62 | Raises TypeError if not supplied with int or None 63 | 64 | :param value: int or None 65 | """ 66 | pass 67 | 68 | @property 69 | def critical_logging_only(self): 70 | """ 71 | Getter for setter critical_logging_only 72 | 73 | :return: boolean 74 | """ 75 | return self._critical_logging_only 76 | 77 | @critical_logging_only.setter 78 | def critical_logging_only(self, value): 79 | """ 80 | Singleton property critical_logging_only, 81 | to tell the logger to only log CRITICAL loglevel events to file / 82 | STDout and STDerr. 83 | Raises TypeError if not supplied with a boolean. 84 | 85 | :param value: boolean 86 | """ 87 | pass 88 | 89 | @property 90 | def output_filename(self): 91 | """ 92 | Getter for setter output_filename 93 | 94 | :return: path-like 95 | """ 96 | return self._output_filename 97 | 98 | @output_filename.setter 99 | def output_filename(self, value): 100 | """ 101 | Singleton property output_filename, 102 | to set the output file name that CAPICE prediction will produce. 
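        Illustrative usage (hypothetical filename): because CapiceManager
        always returns the same instance, setting
        CapiceManager().output_filename = 'example_capice.tsv.gz' in one
        module makes that value visible to every other module that reads
        the property.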
103 | 104 | :param value: path-like 105 | """ 106 | pass 107 | 108 | def __new__(cls): 109 | """ 110 | Class method to set CapiceManager instance 111 | :return: instance 112 | """ 113 | if not CapiceManager.instance: 114 | CapiceManager.instance = CapiceManager.__CapiceManager() 115 | return CapiceManager.instance 116 | 117 | def __init__(self): 118 | """ 119 | __init__ method to set instance to CapiceManager.__CapiceManager() 120 | """ 121 | if not CapiceManager.instance: 122 | CapiceManager.instance = CapiceManager.__CapiceManager() 123 | 124 | def __getattr__(self, name): 125 | """ 126 | Method to return the value of the named attribute of name 127 | :param name: str 128 | :return: str 129 | """ 130 | return getattr(self.instance, name) 131 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/input_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | from molgenis.capice.utilities import check_file_exist 5 | 6 | 7 | class InputProcessor: 8 | def __init__(self, input_path, output_path, force, default_extension): 9 | """ 10 | InputProcessor checks the input directory, output directory 11 | (being either call_dir if output_path is None or output_path) and 12 | the force flag to build the output directory and output filename. 13 | :param input_path: str, path-like 14 | :param output_path: str, path-like (if missing: supply None) 15 | :param force: bool, force flag present or not 16 | :param default_extension: str, the default extension the output file should get in case 17 | output is missing from CLI 18 | 19 | Use getter get_output_filename() to get the output filename after 20 | initialization and get_output_directory() to get the output directory. 21 | (output directory is not yet checked for writability and existence) 22 | 23 | Note: when only the input_path is provided or only a file directory is 24 | supplied, InputProcessor will strip the extension from the input path 25 | and use the input path file name as reference for the output 26 | filename. (so input with example.tsv.gz will become example). 27 | Extension has to be manually added within the argument parser. 28 | """ 29 | self.call_dir = str(Path('.').absolute()) 30 | self.input_path = input_path 31 | self.output_path = output_path 32 | self.output_given = False 33 | self.force = force 34 | self.default_extension = default_extension 35 | self.output_directory = '' 36 | self.output_filename = '' 37 | self._handle_input_output_directories() 38 | 39 | def _handle_input_output_directories(self): 40 | """ 41 | Function to validate the input location, output location and filename to 42 | tell the exporter where to place what file.
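        For illustration (hypothetical paths): with input
        '/data/example.tsv.gz' and no output given, the exporter is pointed
        at the calling directory with filename 'example_capice' plus the
        default extension; with output '/out/result.tsv.gz', both the
        directory and the filename are taken from the output argument.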
43 | """ 44 | if self.output_path is None: 45 | filename = self.get_filename_from_path(self.input_path) 46 | self._set_output_path(self.call_dir, filename) 47 | else: 48 | # Check if it is a path or else just a filename 49 | if len(os.path.dirname(self.output_path)) > 0 or self.output_path == '.': 50 | # Then I know it's an output filepath + possibly name 51 | if os.path.splitext(self.output_path)[1] != '': 52 | # Then I know it is a full path + filename 53 | self._set_output_path(os.path.dirname(self.output_path), 54 | os.path.basename(self.output_path)) 55 | self.output_given = True 56 | else: 57 | # Then I know it's a full path 58 | filename = self.get_filename_from_path(self.input_path) 59 | self._set_output_path(self.output_path, filename) 60 | else: 61 | # Then I know it's an output filename 62 | self._set_output_path(self.call_dir, self.output_path) 63 | 64 | self._check_force() 65 | 66 | def _check_force(self): 67 | full_output_path = os.path.join(self.output_directory, self.output_filename) 68 | check_file_exist(full_output_path, self.force) 69 | 70 | def _set_output_path(self, directory, filename): 71 | self.output_directory = directory 72 | self.output_filename = filename 73 | 74 | def get_filename_from_path(self, path): 75 | """ 76 | Function to get the filename of a file from a given input 77 | path or input filename. 78 | :param path: string 79 | :return: filename (string) 80 | """ 81 | no_path = os.path.basename(path) 82 | splitted_path = no_path.split('.') 83 | filename = splitted_path[0] 84 | return f'{filename}_capice{self.default_extension}' 85 | 86 | def get_output_filename(self): 87 | return self.output_filename 88 | 89 | def get_output_directory(self): 90 | return self.output_directory 91 | 92 | def get_output_given(self): 93 | return self.output_given 94 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_column_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.utilities import column_utils 6 | 7 | 8 | class TestUtilities(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | print('Setting up') 12 | 13 | def setUp(self): 14 | print('\nTesting case:') 15 | self.column_utils = column_utils.ColumnUtils() 16 | self.column_utils.specified_columns = set(['a', 'b', 'c']) 17 | 18 | def test_set_specified_columns(self): 19 | """ 20 | Test set_specified_columns 21 | Should set specified_columns of class 22 | """ 23 | print('Setting specified columns') 24 | columns = ['x', 'y', 'z'] 25 | self.column_utils.set_specified_columns(columns) 26 | self.assertEqual(set(columns), self.column_utils.get_specified_columns()) 27 | 28 | def test_add_to_specified_columns_single(self): 29 | """ 30 | Test add_to_specified_columns 31 | Should add string to specified_columns of class 32 | """ 33 | print('Adding to specified columns') 34 | self.column_utils.add_to_specified_columns('da') 35 | self.assertEqual(set(['a', 'b', 'c', 'da']), self.column_utils.get_specified_columns()) 36 | 37 | def test_add_to_specified_columns_multiple(self): 38 | """ 39 | Test add_to_specified_columns 40 | Should merge list of columns with specified_columns of class 41 | """ 42 | print('Adding to specified columns') 43 | self.column_utils.add_to_specified_columns(['d', 'd', 'e']) 44 | self.assertEqual(set(['a', 'b', 'c', 'd', 'e']), self.column_utils.get_specified_columns()) 45 | 46 | def test_column_in_specified_columns(self): 
47 | """ 48 | Test column_in_specified_columns 49 | Should return true because column is in specified_columns 50 | """ 51 | print('Column is in specified columns') 52 | column = 'a' 53 | self.assertTrue(self.column_utils.column_in_specified_columns(column)) 54 | 55 | def test_column_not_in_specified_columns(self): 56 | """ 57 | Test column_in_specified_columns 58 | Should return false because column is not in specified_columns 59 | """ 60 | print('Column not is in specified columns') 61 | column = 'x' 62 | self.assertFalse(self.column_utils.column_in_specified_columns(column)) 63 | 64 | def test_add_to_specified_columns_set(self): 65 | """ 66 | Test add_to_specified_columns 67 | Should merge set of columns with specified_columns of class 68 | """ 69 | print('Adding to specified columns') 70 | self.column_utils.add_to_specified_columns({'d', 'e'}) 71 | self.assertEqual({'a', 'b', 'c', 'd', 'e'}, self.column_utils.get_specified_columns()) 72 | 73 | def test_get_missing_diff_with_list(self): 74 | """ 75 | Test get_missing_diff_with with type list 76 | Should return "c" as missing column. 77 | """ 78 | print('Get missings of diff with columns as list') 79 | columns = ['a', 'b', 'x', 'y'] 80 | missing = self.column_utils.get_missing_diff_with(columns) 81 | self.assertEqual(['c'], missing) 82 | 83 | def test_get_missing_diff_with_dict_keys(self): 84 | """ 85 | Test get_missing_diff_with with type dict keys 86 | Should return "c" as missing column. 87 | """ 88 | print('Get missings of diff with columns of type dict keys') 89 | data = {'a': 0, 'b': 1, 'x': 3, 'y': 4} 90 | missing = self.column_utils.get_missing_diff_with(data.keys()) 91 | self.assertEqual(['c'], missing) 92 | 93 | def test_get_missing_diff_with_pd_df(self): 94 | """ 95 | Test get_missing_diff_with with type pandas dataframe 96 | Should return "cd" as missing column. 97 | """ 98 | print('Get missings of diff with columns of type pandas dataframe') 99 | data = {'ab': 0, 'bc': 1, 'x': 3, 'y': 4} 100 | d = {'ab': [1, 2], 'bc': [3, 4], 'cd': [6, 8]} 101 | df = pd.DataFrame(data=d) 102 | self.column_utils.set_specified_columns(df.columns) 103 | missing = self.column_utils.get_missing_diff_with(data.keys()) 104 | self.assertEqual(['cd'], missing) 105 | 106 | 107 | if __name__ == '__main__': 108 | unittest.main() 109 | -------------------------------------------------------------------------------- /src/molgenis/capice/cli/args_handler_train.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.main_train import CapiceTrain 2 | from molgenis.capice.core.capice_manager import CapiceManager 3 | from molgenis.capice.cli.args_handler_parent import ArgsHandlerParent 4 | 5 | 6 | class ArgsHandlerTrain(ArgsHandlerParent): 7 | """ 8 | Command-line argument handler for train sub-command. 9 | Parses, validates and executes function. 
10 | """ 11 | 12 | def __init__(self, parser): 13 | super(ArgsHandlerTrain, self).__init__(parser=parser) 14 | self.split_default = 0.2 15 | self.n_threads_default = 1 16 | 17 | @property 18 | def _extension(self): 19 | return '.tsv', '.tsv.gz' 20 | 21 | @property 22 | def _features_extension(self) -> tuple[str]: 23 | return '.json', 24 | 25 | def _features_extension_str(self) -> str: 26 | return self._join_extensions(self._features_extension) 27 | 28 | @property 29 | def _required_output_extensions(self): 30 | return '.json', '.ubj' 31 | 32 | @property 33 | def _empty_output_extension(self): 34 | return self._required_output_extensions[1] 35 | 36 | def create(self): 37 | self.parser.add_argument( 38 | '-i', 39 | '--input', 40 | action='append', 41 | type=str, 42 | required=True, 43 | help=f'path to annotated variants file ({self._extension_str()}) (required)' 44 | ) 45 | self.parser.add_argument( 46 | '-e', 47 | '--features', 48 | action='append', 49 | type=str, 50 | required=True, 51 | help=f'path to the features file ({self._features_extension_str()}) (required)' 52 | ) 53 | self.parser.add_argument( 54 | '-s', 55 | '--split', 56 | action='append', 57 | default=[self.split_default], 58 | type=float, 59 | help=f'proportion of the input data to include in the test split (default: ' 60 | f'{self.split_default}) (optional)' 61 | ) 62 | self.parser.add_argument( 63 | '-o', 64 | '--output', 65 | action='append', 66 | type=str, 67 | help=f'path to directory or file ({self._required_output_extensions_str()}) for ' 68 | f'exporting model (optional)' 69 | ) 70 | self.parser.add_argument( 71 | '-f', 72 | '--force', 73 | action='store_true', 74 | help='overwrites output if it already exists' 75 | ) 76 | self.parser.add_argument( 77 | '-t', 78 | '--threads', 79 | action='append', 80 | default=[self.n_threads_default], 81 | type=int, 82 | help=f'The amount of threads that can be used by XGBoost to parallel train (default: ' 83 | f'{self.n_threads_default})' 84 | ) 85 | 86 | def _handle_module_specific_args(self, input_path, output_path, output_filename, output_given, 87 | args): 88 | features = self._retrieve_argument_from_list(args.features, '-e/--features') 89 | self.input_validator.validate_input_path(features, extension=self._features_extension) 90 | 91 | test_split = self._retrieve_argument_from_list(args.split, '-s/--split', has_default=True) 92 | self.validate_test_split(test_split) 93 | 94 | n_threads = self._retrieve_argument_from_list(args.threads, '-t/--threads', 95 | has_default=True) 96 | self.validate_n_threads(n_threads) 97 | 98 | CapiceManager().output_filename = output_filename 99 | CapiceTrain( 100 | input_path, 101 | features, 102 | test_split, 103 | output_path, 104 | output_given, 105 | self.force, 106 | n_threads 107 | ).run() 108 | 109 | def validate_n_threads(self, n_threads): 110 | """ 111 | Function to validate that the amount of threads is at least 1. 
112 | """ 113 | if n_threads < 1: 114 | self.parser.error('The amount of threads has to be at least 1!') 115 | 116 | def validate_test_split(self, test_split): 117 | """ 118 | Validator for test split to make sure it lies between 0 and 1 119 | (since the CLA is already set to type float, I do not have to validate 120 | it here too) 121 | """ 122 | if test_split <= 0 or test_split >= 1: 123 | self.parser.error('Test split must be a float between 0 and 1') 124 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/dynamic_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | from importlib import util 3 | 4 | from molgenis.capice.core.logger import Logger 5 | 6 | 7 | class DynamicLoader: 8 | def __init__(self, required_attributes: list, path): 9 | """ 10 | Dynamic Loader for both the imputer and preprocessor 11 | 12 | :param required_attributes: list, list containing all the required 13 | attritubes the loaded modules have to have. 14 | :param path: Path-like, path to the potential modules. 15 | 16 | Use `load_impute_preprocess_modules()` to load the modules required for 17 | the imputer and preprocessor. Use `load_manual_annotators()` to load 18 | the manual VEP annotation processors. 19 | """ 20 | self.log = Logger().logger 21 | self.path = path 22 | self._check_dir_exists() 23 | self.required_attributes = required_attributes 24 | self.modules: dict[str, object] = {} 25 | 26 | def load_manual_annotators(self): 27 | """ 28 | Load the VEP annotation modules within path. 29 | 30 | :return: list, list containing all the usable VEP modules within path. 31 | 32 | :raises: FileNotFoundError, if no VEP annotation module is found within 33 | path. 34 | """ 35 | self._load_modules() 36 | # Since the manual annotator doesn't require VEP version, GRCh build or 37 | # overwrite, this loading is done. 38 | return self.modules.values() 39 | 40 | def _load_modules(self, required_attributes=None): 41 | self._check_dir_exists() 42 | if required_attributes: 43 | set_required = required_attributes 44 | else: 45 | set_required = self.required_attributes 46 | modules = self._load_modules_from_path(self.path) 47 | self._check_n_modules(modules) 48 | imported_modules = self._import(modules) 49 | for path, module in imported_modules.items(): 50 | if all(item in dir(module) for item in set_required): 51 | self.modules[path] = module 52 | self._check_n_modules(self.modules) 53 | self.log.info('Successfully loaded %s modules.', len(self.modules)) 54 | 55 | def _check_dir_exists(self): 56 | if not os.path.exists(self.path): 57 | error_message = "%s is not a path!" 58 | self.log.critical(error_message, self.path) 59 | raise OSError(error_message % self.path) 60 | 61 | def _check_n_modules(self, modules_dict): 62 | if len(modules_dict) < 1: 63 | self._raise_no_module_found_error() 64 | 65 | def _raise_no_module_found_error(self): 66 | error_message = "No usable modules are found within %s!" 
67 | self.log.critical(error_message, self.path) 68 | raise FileNotFoundError(error_message % self.path) 69 | 70 | @staticmethod 71 | def _load_modules_from_path(path): 72 | """ 73 | Function to dynamically load in modules in the given path 74 | :param path: path to the modules 75 | :return: list 76 | """ 77 | modules = [] 78 | for module in os.listdir(path): 79 | module = os.path.join(path, module) 80 | if (module.endswith('.py') and not module.endswith('__.py') 81 | and not module.endswith('abstract.py')): 82 | modules.append(module) 83 | return modules 84 | 85 | def _import(self, usable_modules: list[str]) -> dict[str, object]: 86 | """ 87 | Function to dynamically load in the modules using the 88 | import_module library. 89 | :param usable_modules: list of absolute paths to potential modules 90 | :return: list of usable modules 91 | """ 92 | # For some reason, mypy wants this line to be Typed instead of the method. 93 | return_modules: dict[str, object] = {} 94 | for module in usable_modules: 95 | name = os.path.basename(module).split('.py')[0] 96 | spec = util.spec_from_file_location(name=name, location=module) 97 | loaded_module = self._process_spec(spec) 98 | if loaded_module and module not in return_modules.keys(): 99 | return_modules[module] = loaded_module 100 | return return_modules 101 | 102 | @staticmethod 103 | def _process_spec(spec): 104 | return_spec = None 105 | loaded_spec = util.module_from_spec(spec) 106 | spec.loader.exec_module(loaded_spec) 107 | for attribute in dir(loaded_spec): 108 | if not attribute.startswith('Template') and not attribute.startswith('__'): 109 | get_attribute = getattr(loaded_spec, attribute) 110 | if ('name' in dir(get_attribute) and 'usable' in dir(get_attribute) 111 | and get_attribute().usable is True): 112 | return_spec = get_attribute() 113 | return return_spec 114 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_input_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from pathlib import Path 4 | 5 | from tests.capice.test_templates import _project_root_directory 6 | from molgenis.capice.utilities.input_processor import InputProcessor 7 | 8 | 9 | class TestInputProcessor(unittest.TestCase): 10 | 11 | __FILE__ = 'file_capice.txt' 12 | __GZIPFILE__ = 'file_capice.txt.gz' 13 | 14 | def setUp(self): 15 | print('Setting up.') 16 | output = os.path.join( 17 | _project_root_directory, 18 | 'tests', 19 | 'resources', 20 | 'input_processor', 21 | 'filename.txt' 22 | ) 23 | self.processor = InputProcessor('/test/input/file.txt', output, True, '.txt') 24 | 25 | def tearDown(self) -> None: 26 | potential_file = os.path.join( 27 | _project_root_directory, 28 | 'tests', 29 | 'resources', 30 | self.__FILE__ 31 | ) 32 | if os.path.isfile(potential_file): 33 | os.remove(potential_file) 34 | second_potential_file = os.path.join( 35 | _project_root_directory, 36 | 'tests', 37 | 'resources', 38 | self.__GZIPFILE__ 39 | ) 40 | if os.path.isfile(second_potential_file): 41 | os.remove(second_potential_file) 42 | 43 | def test__set_output_path(self): 44 | output_dir = '/test/input/dir' 45 | filename = 'filename.txt' 46 | self.processor._set_output_path(output_dir, filename) 47 | self.assertEqual(self.processor.get_output_directory(), output_dir) 48 | self.assertEqual(self.processor.get_output_filename(), filename) 49 | 50 | def test_get_filename_from_path(self): 51 | path = '/test/input/dir/filename.txt' 52 | actual = 
self.processor.get_filename_from_path(path) 53 | self.assertEqual(actual, 'filename_capice.txt') 54 | 55 | def test__check_force(self): 56 | self.processor.force = False 57 | self.assertRaises(FileExistsError, self.processor._check_force) 58 | 59 | def test___handle_input_output_directories_case1(self): 60 | self.processor.output_path = None 61 | self.processor._handle_input_output_directories() 62 | self.assertEqual(str(Path('.').absolute()), self.processor.get_output_directory()) 63 | self.assertEqual(self.__FILE__, self.processor.get_output_filename()) 64 | 65 | def test___handle_input_output_directories_case2(self): 66 | self.processor.output_path = '' 67 | self.processor._handle_input_output_directories() 68 | self.assertEqual(str(Path('.').absolute()), self.processor.get_output_directory()) 69 | self.assertEqual('', self.processor.get_output_filename()) 70 | 71 | def test___handle_input_output_directories_case3(self): 72 | self.processor.output_path = '/something' 73 | self.processor._handle_input_output_directories() 74 | self.assertEqual('/something', self.processor.get_output_directory()) 75 | self.assertEqual(self.__FILE__, self.processor.get_output_filename()) 76 | 77 | def test___handle_input_output_directories_case4(self): 78 | self.processor.output_path = '/directory/file.txt' 79 | self.processor._handle_input_output_directories() 80 | self.assertEqual('/directory', self.processor.get_output_directory()) 81 | self.assertEqual('file.txt', self.processor.get_output_filename()) 82 | 83 | def test___handle_input_output_directories_case5(self): 84 | self.processor.output_path = '.' 85 | self.processor._handle_input_output_directories() 86 | self.assertEqual('.', self.processor.get_output_directory()) 87 | self.assertEqual(self.__FILE__, self.processor.get_output_filename()) 88 | 89 | def test___handle_input_output_directories_case6(self): 90 | self.processor.output_path = './file.txt' 91 | self.processor._handle_input_output_directories() 92 | self.assertEqual('.', self.processor.get_output_directory()) 93 | self.assertEqual('file.txt', self.processor.get_output_filename()) 94 | 95 | def test_force_false_output_missing_output_exists(self): 96 | # This test mimics what happens when output is left empty from the CLI 97 | # and the output file + _capice + default_extension already exists 98 | with open( 99 | os.path.join( 100 | _project_root_directory, 101 | 'tests', 102 | 'resources', 103 | self.__FILE__ 104 | ), 'wt' 105 | ) as some_file: 106 | some_file.write('SomeString') 107 | self.processor.force = False 108 | self.assertRaises( 109 | FileExistsError, 110 | self.processor._handle_input_output_directories 111 | ) 112 | 113 | 114 | if __name__ == '__main__': 115 | unittest.main() 116 | -------------------------------------------------------------------------------- /src/molgenis/capice/main_explain.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | import xgboost as xgb 5 | 6 | from molgenis.capice.main_capice import Main 7 | from molgenis.capice.core.logger import Logger 8 | from molgenis.capice.utilities import check_file_exist 9 | from molgenis.capice.core.capice_manager import CapiceManager 10 | 11 | 12 | class CapiceExplain(Main): 13 | def __init__(self, model, output_path, output_given, force): 14 | super().__init__( 15 | input_path=None, 16 | output_path=output_path, 17 | output_given=output_given, 18 | force=force 19 | ) 20 | self.model = model 21 | self.output = output_path 22 | self.log = 
Logger().logger 23 | 24 | def run(self): 25 | gain_importances = self._extract_features_importances_gain(self.model) 26 | total_gain_importances = self._extract_features_importances_total_gain(self.model) 27 | weight_importances = self._extract_features_importances_weight(self.model) 28 | cover_importances = self._extract_features_importances_cover(self.model) 29 | total_cover_importances = self._extract_features_importances_total_cover(self.model) 30 | importances = self._convert_importances_to_dataframe(gain_importances, 31 | total_gain_importances, 32 | weight_importances, 33 | cover_importances, 34 | total_cover_importances) 35 | self._order_importances(importances) 36 | self._export(importances, self.output) 37 | 38 | def _extract_features_importances_gain(self, model: xgb.XGBClassifier): 39 | self.log.info('Extracting gain from model.') 40 | feature_importances = model.get_booster().get_score(importance_type='gain') 41 | self.log.debug('Extracted %d gain features from model.', len(feature_importances.keys())) 42 | return feature_importances 43 | 44 | def _extract_features_importances_total_gain(self, model: xgb.XGBClassifier): 45 | self.log.info('Extracting total gain from model.') 46 | feature_importances = model.get_booster().get_score(importance_type='total_gain') 47 | self.log.debug('Extracted %d total_gain features from model.', 48 | len(feature_importances.keys())) 49 | return feature_importances 50 | 51 | def _extract_features_importances_weight(self, model: xgb.XGBClassifier): 52 | self.log.info('Extracting weight from model.') 53 | feature_importances = model.get_booster().get_score(importance_type='weight') 54 | self.log.debug('Extracted %d weight features from model.', 55 | len(feature_importances.keys())) 56 | return feature_importances 57 | 58 | def _extract_features_importances_cover(self, model: xgb.XGBClassifier): 59 | self.log.info('Extracting cover from model.') 60 | feature_importances = model.get_booster().get_score(importance_type='cover') 61 | self.log.debug('Extracted %d cover features from model.', 62 | len(feature_importances.keys())) 63 | return feature_importances 64 | 65 | def _extract_features_importances_total_cover(self, model: xgb.XGBClassifier): 66 | self.log.info('Extracting total cover from model.') 67 | feature_importances = model.get_booster().get_score(importance_type='total_cover') 68 | self.log.debug('Extracted %d total_cover features from model.', 69 | len(feature_importances.keys())) 70 | return feature_importances 71 | 72 | def _convert_importances_to_dataframe(self, gain: dict, total_gain: dict, weight: dict, 73 | cover: dict, total_cover: dict): 74 | self.log.info('Converting importances to dataframe.') 75 | feature_importances = pd.DataFrame(data=[gain.keys(), gain.values()], 76 | index=['feature', 'gain']).T 77 | feature_importances['total_gain'] = feature_importances['feature'].map(total_gain) 78 | feature_importances['weight'] = feature_importances['feature'].map(weight) 79 | feature_importances['cover'] = feature_importances['feature'].map(cover) 80 | feature_importances['total_cover'] = feature_importances['feature'].map(total_cover) 81 | self.log.debug('Converted %d features into the dataframe', feature_importances.shape[0]) 82 | self.log.debug('Converted all %d importance types into the dataframe', 83 | feature_importances.shape[1]) 84 | return feature_importances 85 | 86 | def _order_importances(self, importances: pd.DataFrame): 87 | self.log.info('Ordering feature importances.') 88 | importances.sort_values(by='gain', 
ascending=False, inplace=True) 89 | 90 | def _export(self, dataset, output): 91 | output_path = os.path.join(output, CapiceManager().output_filename) 92 | check_file_exist(output_path, self.force) 93 | dataset.to_csv(output_path, index=False, sep='\t') 94 | if not self.output_given: 95 | print(f'Successfully exported explain to: {output_path}') 96 | -------------------------------------------------------------------------------- /tests/capice/core/test_capice_exporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from molgenis.capice.utilities.enums import Column 8 | from molgenis.capice.core.capice_exporter import CapiceExporter 9 | from tests.capice.test_templates import set_up_manager_and_out, teardown 10 | 11 | 12 | class TestCapiceExporter(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | print('Setting up.') 16 | cls.prediction_output_dataframe = pd.DataFrame( 17 | { 18 | Column.chr_pos_ref_alt.value: [ 19 | '1_VeryUniqueCAPICESeparator_100' 20 | '_VeryUniqueCAPICESeparator_A_VeryUniqueCAPICESeparator_C', 21 | '2_VeryUniqueCAPICESeparator_200' 22 | '_VeryUniqueCAPICESeparator_T_VeryUniqueCAPICESeparator_G' 23 | ], 24 | Column.gene_name.value: ['foo', 'bar'], 25 | Column.gene_id.value: [1000, 2000], 26 | Column.id_source.value: ['foo', 'bar'], 27 | Column.feature.value: ['TRANS_01', 'TRANS_02'], 28 | Column.feature_type.value: ['Transcript', 'RegulatoryFeature'], 29 | Column.score.value: [0.01, 0.998], 30 | Column.suggested_class.value: ['VUS', 'VUS'] 31 | } 32 | ) 33 | cls.expected_prediction_output_dataframe = pd.DataFrame( 34 | { 35 | 'chr': ['1', '2'], 36 | 'pos': [100, 200], 37 | 'ref': ['A', 'T'], 38 | 'alt': ['C', 'G'], 39 | 'gene_name': ['foo', 'bar'], 40 | 'gene_id': [1000, 2000], 41 | 'id_source': ['foo', 'bar'], 42 | 'feature': ['TRANS_01', 'TRANS_02'], 43 | 'feature_type': ['Transcript', 'RegulatoryFeature'], 44 | 'score': [0.01, 0.998], 45 | 'suggested_class': ['VUS', 'VUS'] 46 | } 47 | ) 48 | cls.export_dataset = pd.DataFrame( 49 | { 50 | 'chr': [1, 2], 51 | 'pos': [100, 200], 52 | 'ref': ['A', 'A'], 53 | 'alt': ['C', 'G'], 54 | 'feature_1': [0.001, 0.2], 55 | 'feature_2': [0.02, 5.5] 56 | } 57 | ) 58 | 59 | @classmethod 60 | def tearDownClass(cls): 61 | print('Tearing down.') 62 | teardown() 63 | 64 | def setUp(self): 65 | print('Testing case:') 66 | manager, self.output_path = set_up_manager_and_out() 67 | self.exporter = CapiceExporter(file_path=self.output_path, output_given=True, force=False) 68 | 69 | def test_prediction_output(self): 70 | print('Prediction output') 71 | filename = 'test_output.tsv' 72 | filename_path = os.path.join(self.output_path, filename) 73 | self.exporter.capice_filename = filename 74 | self.exporter.export_capice_prediction(datafile=self.prediction_output_dataframe) 75 | self.assertTrue(os.path.isfile(filename_path)) 76 | exported_data = pd.read_csv(filename_path, sep='\t') 77 | exported_data[Column.chr.value] = exported_data[Column.chr.value].astype(str) 78 | pd.testing.assert_frame_equal(exported_data, self.expected_prediction_output_dataframe) 79 | 80 | def test_exporter_force_pass(self): 81 | """ 82 | Since force is dealt with at the very start of CAPICE and raises an 83 | error if the output file is already present unless the force flag is 84 | True, this test just makes sure that the overwritten file is correct. 
85 | """ 86 | print('Filename generator (with force=True)') 87 | present_file = 'already_present_file.tsv' 88 | present_file_path = os.path.join(self.output_path, present_file) 89 | with open(present_file_path, 'wt') as present_file_conn: 90 | present_file_conn.write('This file is already present') 91 | self.exporter.capice_filename = present_file 92 | self.exporter.force = True 93 | self.exporter.export_capice_prediction(datafile=self.prediction_output_dataframe) 94 | forced_file = pd.read_csv(present_file_path, sep='\t') 95 | forced_file[Column.chr.value] = forced_file[Column.chr.value].astype(str) 96 | pd.testing.assert_frame_equal(forced_file, self.expected_prediction_output_dataframe) 97 | 98 | def test_post_process_set_correct_dtypes(self): 99 | print('Test post process set correct dtypes') 100 | some_data = pd.DataFrame( 101 | { 102 | 'foo': [1, 2, 3], 103 | Column.gene_id.value: [1, np.nan, 3] 104 | } 105 | ) 106 | expected_output = some_data.copy(deep=True) 107 | expected_output[Column.gene_id.value] = pd.Series( 108 | expected_output[Column.gene_id.value], dtype='Int64' 109 | ) 110 | out_data = self.exporter._post_process_set_correct_dtypes(some_data) 111 | pd.testing.assert_frame_equal( 112 | out_data.sort_index(axis=1), 113 | expected_output.sort_index(axis=1) 114 | ) 115 | 116 | 117 | if __name__ == '__main__': 118 | unittest.main() 119 | -------------------------------------------------------------------------------- /scripts/convert_vep_vcf_to_tsv_capice.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stops script if any error occurs. 4 | set -e 5 | 6 | # Defines error echo. 7 | errcho() { echo "$@" 1>&2; } 8 | 9 | # Usage. 10 | readonly USAGE="VEP VCF output to CAPICE TSV converter 11 | Usage: 12 | convert_vep_to_tsv_capice.sh -p -i -o [-t] [-f] 13 | -p required: The path to the BCFTools image. (available at: https://download.molgeniscloud.org/downloads/vip/images/bcftools-1.14.sif) 14 | -i required: The VEP output VCF. 15 | -o required: The directory and output filename for the CAPICE .tsv.gz. 16 | -f optional: enable force. 17 | -t optional: enable train. Adds the ID column to the output. 18 | 19 | Please note that this script expects apptainer binds to be set correctly by the system administrator. 20 | Additional apptainer binds can be set by setting the environment variable APPTAINER_BIND. 21 | If using SLURM, please export this environment variable to the sbatch instance too. 22 | 23 | Example: 24 | bash convert_vep_vcf_to_tsv_capice.sh -p /path/to/bcftools.sif -i vep_out.vcf.gz -o capice_in.tsv.gz 25 | 26 | Requirements: 27 | - Apptainer (although Singularity should work too, please change the script and adjust apptainer to singularity) 28 | - BCFTools image. (available at: https://download.molgeniscloud.org/downloads/vip/images/bcftools-1.14.sif) 29 | " 30 | 31 | # Global variables 32 | FORCE=false 33 | TRAIN=false 34 | 35 | 36 | main() { 37 | digestCommandLine "$@" 38 | processFile 39 | } 40 | 41 | digestCommandLine() { 42 | while getopts p:i:o:hft flag 43 | do 44 | case "${flag}" in 45 | p) bcftools_path=${OPTARG};; 46 | i) input=${OPTARG};; 47 | o) output=${OPTARG};; 48 | h) 49 | echo "${USAGE}" 50 | exit;; 51 | t) 52 | TRAIN=true;; 53 | f) 54 | FORCE=true;; 55 | \?) 
56 | errcho "Error: invalid option" 57 | echo "${USAGE}" 58 | exit 1;; 59 | esac 60 | done 61 | 62 | if [[ ${TRAIN} == true ]] 63 | then 64 | HEADER="CHROM\tPOS\tID\tREF\tALT\t" 65 | FORMAT="%CHROM\t%POS\t%ID\t%REF\t%ALT\t%CSQ\n" 66 | else 67 | HEADER="CHROM\tPOS\tREF\tALT\t" 68 | FORMAT="%CHROM\t%POS\t%REF\t%ALT\t%CSQ\n" 69 | fi 70 | 71 | validateCommandLine 72 | } 73 | 74 | validateCommandLine() { 75 | local valid_command_line=true 76 | 77 | # Validate if BCFTools image is set & not empty 78 | if [ -z "${bcftools_path}" ] 79 | then 80 | valid_command_line=false 81 | errcho "BCFTools image not set/empty" 82 | else 83 | if [ ! -f "${bcftools_path}" ] 84 | then 85 | valid_command_line=false 86 | errcho "BCFTools image does not exist" 87 | fi 88 | fi 89 | 90 | # Validate if input is set & not empty. 91 | if [ -z "${input}" ] 92 | then 93 | valid_command_line=false 94 | errcho "input file not set/empty" 95 | else 96 | # Validate if input file exists. 97 | if [ ! -f "${input}" ] 98 | then 99 | valid_command_line=false 100 | errcho "input file does not exist" 101 | else 102 | # Validate allowed input filetype. 103 | case $(file --mime-type -b "${input}") in 104 | text/plain);; 105 | application/*gzip);; 106 | *) 107 | valid_command_line=false 108 | errcho "input file has invalid type (plain text/gzip allowed)";; 109 | esac 110 | fi 111 | fi 112 | 113 | # Validate if output is set & not empty. 114 | if [ -z "${output}" ] 115 | then 116 | valid_command_line=false 117 | errcho "output file not set/empty" 118 | else 119 | # Validates proper output filename. 120 | if [[ "${output}" != *.tsv.gz ]] 121 | then 122 | valid_command_line=false 123 | errcho "output filename must end with '.tsv.gz'" 124 | else 125 | # Validates if the output file doesn't already exist. 126 | if [ -f "${output}" ] 127 | then 128 | if [[ ${FORCE} == true ]] 129 | then 130 | echo "output file exists, enforcing output" 131 | rm "${output}" 132 | else 133 | errcho "output file exists and force flag is not called" 134 | valid_command_line=false 135 | fi 136 | fi 137 | fi 138 | fi 139 | 140 | # If the command line arguments are invalid, exits with code 1. 141 | if [[ "${valid_command_line}" == false ]]; then errcho "Exiting."; exit 1; fi 142 | } 143 | 144 | processFile() { 145 | local output="${output%.gz}" 146 | 147 | local args=() 148 | args+=("exec") 149 | args+=("${bcftools_path}") 150 | args+=("bcftools") 151 | args+=("+split-vep") 152 | 153 | # Header 154 | 155 | echo "Obtaining header" 156 | 157 | header_args=("${args[@]}") 158 | header_args+=("-l" "${input}") 159 | 160 | present_features=$(apptainer "${header_args[@]}" | cut -f 2 | tr "\n" "\t" | sed "s/\t$//") 161 | 162 | echo -e "${HEADER}$present_features" > ${output} 163 | 164 | # VEP VCF file content 165 | 166 | echo "Obtaining VCF content" 167 | 168 | file_args=("${args[@]}") 169 | file_args+=("-d") 170 | file_args+=("-f" "${FORMAT}") 171 | file_args+=("-A" "tab") 172 | file_args+=("${input}") 173 | 174 | apptainer "${file_args[@]}" >> ${output} 175 | 176 | echo "BCFTools finished." 177 | 178 | echo "Gzipping output file." 179 | 180 | gzip "${output}" 181 | 182 | echo "Done."
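    # Note (comment added for clarity): bcftools +split-vep is used twice
    # above: '-l' lists the CSQ subfields to build the header, and '-f'
    # together with '-d' writes one tab-separated row per consequence
    # (duplicating records with multiple transcripts), so the rows match
    # the header order before the file is gzipped.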
183 | }
184 |
185 | main "$@"
186 |
--------------------------------------------------------------------------------
/tests/capice/core/test_logger.py:
--------------------------------------------------------------------------------
1 | import io
2 | import sys
3 | import logging
4 | import unittest
5 |
6 | from molgenis.capice.core.logger import Logger
7 | from tests.capice.test_templates import teardown
8 | from molgenis.capice.core.capice_manager import CapiceManager
9 |
10 |
11 | class TestLogger(unittest.TestCase):
12 |     @classmethod
13 |     def setUpClass(cls):
14 |         print('Setting up.')
15 |         cls.manager = CapiceManager()
16 |         cls.manager.critical_logging_only = False
17 |         cls.not_present_string = 'Not present string'
18 |
19 |     @classmethod
20 |     def tearDownClass(cls):
21 |         print('Tearing down.')
22 |         teardown()
23 |
24 |     def capture_stdout_call(self):
25 |         old_stdout = sys.stdout
26 |         listener = io.StringIO()
27 |         sys.stdout = listener
28 |         log = Logger().logger
29 |         log.info('SomeString')
30 |         log.debug('SomeString')
31 |         out = listener.getvalue()
32 |         sys.stdout = old_stdout
33 |         self.assertGreater(len(out), 0)
34 |         return out
35 |
36 |     def capture_stderr_call(self):
37 |         old_stderr = sys.stderr
38 |         listener = io.StringIO()
39 |         sys.stderr = listener
40 |         log = Logger().logger
41 |         log.critical('SomeString')
42 |         log.error('SomeString')
43 |         out = listener.getvalue()
44 |         sys.stderr = old_stderr
45 |         self.assertGreater(len(out), 0)
46 |         return out
47 |
48 |     def setUp(self):
49 |         print('Testing case:')
50 |
51 |     def tearDown(self) -> None:
52 |         print('Resetting arguments.')
53 |         Logger.instance = None
54 |         self.manager.critical_logging_only = False
55 |         self.manager.loglevel = None
56 |         print('Arguments reset.')
57 |
58 |     def test_isenabled_false_debug(self):
59 |         print('isEnabledFor(logging.DEBUG) is False')
60 |         self.manager.loglevel = 20
61 |         log = Logger().logger
62 |         self.assertFalse(log.isEnabledFor(logging.DEBUG))
63 |
64 |     def test_isenabled_true_debug(self):
65 |         print('isEnabledFor(logging.DEBUG) is True')
66 |         self.manager.loglevel = 10
67 |         log = Logger().logger
68 |         self.assertTrue(log.isEnabledFor(logging.DEBUG))
69 |
70 |     def test_isenabled_false_warning(self):
71 |         print('isEnabledFor(logging.WARNING) is False')
72 |         self.manager.critical_logging_only = True
73 |         log = Logger().logger
74 |         self.assertFalse(log.isEnabledFor(logging.WARNING))
75 |
76 |     def test_isenabled_true_warning(self):
77 |         print('isEnabledFor(logging.WARNING) is True')
78 |         log = Logger().logger
79 |         self.assertTrue(log.isEnabledFor(logging.WARNING))
80 |         self.assertFalse(log.isEnabledFor(logging.INFO))
81 |
82 |     def test_set_multiple_loglevels(self):
83 |         print('isEnabledFor(logging.DEBUG) is False with '
84 |               'CapiceManager().critical_logging_only set to True')
85 |         self.manager.critical_logging_only = True
86 |         self.manager.loglevel = 10
87 |         log = Logger().logger
88 |         self.assertFalse(log.isEnabledFor(logging.DEBUG))
89 |
90 |     def test_loglevel_nonverbose(self):
91 |         """
92 |         Testing INFO messages just became a lot harder since the logger is set
93 |         to logging.NOTSET by default, with its StreamHandlers taking care of
94 |         the messages themselves, especially the stdout StreamHandler.
95 | """ 96 | print('Loglevel info') 97 | self.manager.loglevel = 20 98 | out = self.capture_stdout_call() 99 | self.assertIn('INFO', out) 100 | self.assertNotIn('DEBUG', out) 101 | 102 | def test_loglevel_verbose(self): 103 | print('Loglevel verbose') 104 | self.manager.loglevel = 10 105 | out = self.capture_stdout_call() 106 | self.assertIn('INFO', out) 107 | self.assertIn('DEBUG', out) 108 | 109 | def test_loglevel_critical_logging_only(self): 110 | print('Critical logging only') 111 | self.manager.critical_logging_only = True 112 | out = self.capture_stderr_call() 113 | self.assertIn('CRITICAL', out) 114 | self.assertNotIn('ERROR', out) 115 | 116 | def test_stderr(self): 117 | print('Levels INFO and DEBUG not present in stderr') 118 | self.manager.loglevel = 10 119 | 120 | old_stderr = sys.stderr 121 | listener = io.StringIO() 122 | sys.stderr = listener 123 | 124 | log = Logger().logger 125 | log.info(self.not_present_string) 126 | log.debug(self.not_present_string) 127 | 128 | out = listener.getvalue() 129 | sys.stderr = old_stderr 130 | self.assertNotIn(self.not_present_string, out) 131 | 132 | def test_stdout(self): 133 | print('Levels WARNING, ERROR and CRITICAL not present in stdout') 134 | old_stdout = sys.stdout 135 | listener = io.StringIO() 136 | sys.stdout = listener 137 | 138 | log = Logger().logger 139 | log.warning(self.not_present_string) 140 | log.error(self.not_present_string) 141 | log.critical(self.not_present_string) 142 | 143 | out = listener.getvalue() 144 | sys.stdout = old_stdout 145 | 146 | self.assertNotIn(self.not_present_string, out) 147 | 148 | def test_logger_class(self): 149 | print('Logger class') 150 | self.assertEqual(str(Logger().logger.__class__), "") 151 | 152 | 153 | if __name__ == '__main__': 154 | unittest.main() 155 | -------------------------------------------------------------------------------- /src/molgenis/capice/main_capice.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | 4 | import pandas as pd 5 | 6 | from molgenis.capice.core.logger import Logger 7 | from molgenis.capice.core.capice_manager import CapiceManager 8 | from molgenis.capice.utilities.input_parser import InputParser 9 | from molgenis.capice.core.capice_exporter import CapiceExporter 10 | from molgenis.capice.utilities.manual_vep_processor import ManualVEPProcessor 11 | from molgenis.capice.utilities.categorical_processor import CategoricalProcessor 12 | from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor 13 | from molgenis.capice.validators.post_file_parse_validator import PostFileParseValidator 14 | 15 | 16 | class Main(ABC): 17 | """ 18 | Main class of CAPICE that contains methods to help the different modes to 19 | function. 20 | """ 21 | 22 | def __init__(self, input_path, output_path, output_given, force): 23 | # Assumes CapiceManager has been initialized & filled. 24 | self.manager = CapiceManager() 25 | self.log = Logger().logger 26 | 27 | self.log.info('Initiating selected mode.') 28 | 29 | # Input file. 30 | self.infile = input_path 31 | self.log.debug('Input argument -i / --input confirmed: %s', self.infile) 32 | 33 | # Output file. 
34 |         self.output = output_path
35 |         self.log.debug('Output directory -o / --output confirmed: %s', self.output)
36 |         self.output_given = output_given
37 |
38 |         self.force = force
39 |         self.log.debug('Force output if exists: %s', self.force)
40 |
41 |     @abstractmethod
42 |     def run(self):
43 |         pass
44 |
45 |     def _load_file(self, additional_required_features: list | None = None):
46 |         """
47 |         Function to load the input TSV file into Main.
48 |         :return: pandas DataFrame
49 |         """
50 |         input_parser = InputParser()
51 |         input_file = input_parser.parse(input_file_path=self.infile)
52 |         post_load_processor = LoadFilePostProcessor(dataset=input_file)
53 |         input_file = post_load_processor.process()
54 |         validator = PostFileParseValidator()
55 |         # Individual calls to the validator for error readability
56 |         validator.validate_variants_present(input_file)
57 |         validator.validate_chrom_pos(input_file)
58 |         validator.validate_n_columns(input_file)
59 |         validator.validate_minimally_required_columns(
60 |             input_file,
61 |             additional_required_features=additional_required_features
62 |         )
63 |         return input_file
64 |
65 |     @staticmethod
66 |     def process(loaded_data: pd.DataFrame, process_features: list[str]) -> tuple[
67 |         pd.DataFrame, dict[str, list[str]]
68 |     ]:
69 |         # Returns might look funky, but Google pydoc does not support multiple return statements.
70 |         """
71 |         Function to call the ManualVEPProcessor over loaded_data using the supplied
72 |         process_features list.
73 |
74 |         Args:
75 |             loaded_data:
76 |                 The pandas dataframe over which the VEP features should be processed.
77 |
78 |             process_features:
79 |                 List containing either all input features, possibly containing VEP features (in
80 |                 the case of train) or already all input features that can be VEP processed (in
81 |                 case of predict).
82 |
83 |         Returns:
84 |             tuple:
85 |                 Tuple [0] containing: The output dataframe containing all VEP processed features
86 |                 according to process_features. Depending on the property "drop", the source
87 |                 feature from process_features is dropped from the columns of the output dataframe.
88 |                 Tuple [1] containing: The output dictionary containing the VEP feature (key)
89 |                 and the derivative features that originate from said VEP feature (value).
90 |                 The property "drop" is of no influence here.
91 |         """
92 |         processor = ManualVEPProcessor()
93 |         processed_data = processor.process(loaded_data, process_features)
94 |         processed_features = processor.get_feature_processes()
95 |         # No validation, since that is specific to predict.
96 |         # Also predict doesn't technically need processed_features, but within predict the first
97 |         # argument in the tuple can just be indexed.
98 |         # Still returning both is relevant, in case we want to validate the processed_features in
99 |         # the future for predict.
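        # Illustrative example (hypothetical feature names): for an input with a
        # 'SIFT' column, processed_data could gain derivative columns such as
        # 'SIFTcat' and 'SIFTval' (with 'SIFT' itself dropped when that processor's
        # "drop" property is True), and processed_features would then contain
        # {'SIFT': ['SIFTcat', 'SIFTval']}.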
100 |         return processed_data, processed_features
101 |
102 |     @staticmethod
103 |     def categorical_process(loaded_data: pd.DataFrame,
104 |                             processing_features: dict[str, list[str]] | None = None,
105 |                             train_features: list | None = None):
106 |         processor = CategoricalProcessor()
107 |         capice_data, processed_features = processor.process(
108 |             loaded_data,
109 |             processable_features=train_features,
110 |             predetermined_features=processing_features
111 |         )
112 |         return capice_data, processed_features
113 |
114 |     def _export(self, dataset: pd.DataFrame, output: os.PathLike):
115 |         """
116 |         Function to prepare the data to be exported
117 |         """
118 |         CapiceExporter(
119 |             file_path=output,
120 |             output_given=self.output_given,
121 |             force=self.force
122 |         ).export_capice_prediction(datafile=dataset)
--------------------------------------------------------------------------------
/src/molgenis/capice/core/logger.py:
--------------------------------------------------------------------------------
1 | """
2 | File: logger.py
3 | Created: 2019/10/11
4 | Last Changed:
5 | Author(s): M.Vochteloo and R. J. Sietsma
6 |
7 | Copyright 2019 M. Vochteloo and R. J. Sietsma
8 |
9 | Licensed under the Apache License, Version 2.0 (the "License");
10 | you may not use this file except in compliance with the License.
11 | You may obtain a copy of the License at
12 |
13 | https://www.apache.org/licenses/LICENSE-2.0
14 |
15 | Unless required by applicable law or agreed to in writing, software
16 | distributed under the License is distributed on an "AS IS" BASIS,
17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | See the License for the specific language governing permissions and
19 | limitations under the License.
20 | """
21 |
22 | import sys
23 | import logging
24 |
25 | from molgenis.capice.core.capice_manager import CapiceManager
26 | from molgenis.capice.utilities.custom_logging_filter import CustomLoggingFilter
27 |
28 |
29 | class Logger:
30 |     """
31 |     Singleton logger class developed by both:
32 |     - Martijn Vochteloo
33 |     - Robert Jarik Sietsma.
34 |     Facilitates the python logging library
35 |     """
36 |
37 |     class __Logger:
38 |         def __init__(self):
39 |             self.global_settings = CapiceManager()
40 |             self.stdout = False
41 |             self.stdout_filter = []
42 |             self.stderr_loglevel = 50
43 |             self.min_loglevel = 50
44 |             self.set_stderr_loglevel()
45 |             # self.logger is always None at this point; load_logger() initialises it.
46 |             self.logger = None
47 |             self.load_logger()
48 |
49 |         def set_stderr_loglevel(self):
50 |             """
51 |             Function to set the log level at which messages are printed or
52 |             logged. For more information, see:
53 |             https://docs.python.org/3/library/logging.html#logging-levels
54 |             Sets self.stderr_loglevel and self.min_loglevel in place.
55 |             """
56 |             if not self.global_settings.critical_logging_only:
57 |                 self.stderr_loglevel = 30
58 |                 self.min_loglevel = 30
59 |             if self.global_settings.loglevel and self.stderr_loglevel < 50:
60 |                 self.stdout = True
61 |                 self._set_stdout_filter()
62 |
63 |         def _set_stdout_filter(self):
64 |             """
65 |             Required because otherwise WARNING, ERROR and CRITICAL messages
66 |             would also be printed to sys.stdout.
67 |             """
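            # Maps the configured loglevel to the record levels allowed through
            # to stdout: 10 (verbose) passes INFO and DEBUG, 20 passes INFO only.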
67 | """ 68 | logging_info = [logging.INFO] 69 | logging_debug = logging_info + [logging.DEBUG] 70 | dict_of_levels = {10: logging_debug, 20: logging_info} 71 | self.stdout_filter = dict_of_levels[self.global_settings.loglevel] 72 | self.min_loglevel = self.global_settings.loglevel 73 | 74 | def load_logger(self): 75 | """ 76 | Function to set up the logger instance with the stdout and stderr 77 | StreamHandlers (stdout assuming verbose flag is called) and the 78 | formatter. 79 | """ 80 | # Making a root logger to make sure the level is set correctly. 81 | logger = logging.getLogger() 82 | # Now renaming it to CAPICE. 83 | logger.name = 'CAPICE' 84 | 85 | # Capture warnings 86 | logging.captureWarnings(True) 87 | 88 | formatter = logging.Formatter( 89 | "%(asctime)s " 90 | "%(levelname)8s: " 91 | "%(message)s", 92 | datefmt='%Y-%m-%d %H:%M:%S' 93 | ) 94 | 95 | # Setting the log level to debug, but with an applied filter 96 | logger.setLevel(self.min_loglevel) 97 | 98 | # sys.stdout (if critical logging only isn't called and one of 99 | # the verbose flags is called. 100 | if self.stdout: 101 | stdout_handler = logging.StreamHandler(sys.stdout) 102 | stdout_handler.setLevel(self.global_settings.loglevel) 103 | stdout_handler.setFormatter(formatter) 104 | # Filter out warning, error and critical messages. 105 | stdout_handler.addFilter(CustomLoggingFilter(self.stdout_filter)) 106 | logger.addHandler(stdout_handler) 107 | 108 | # sys.stderr 109 | stderr_handler = logging.StreamHandler(sys.stderr) 110 | stderr_handler.setLevel(self.stderr_loglevel) 111 | stderr_handler.setFormatter(formatter) 112 | logger.addHandler(stderr_handler) 113 | self.logger = logger 114 | 115 | @property 116 | def logger(self): 117 | """ 118 | Property to get the logger instance. 119 | 120 | :return: logging.Logger 121 | """ 122 | return self._logger 123 | 124 | @logger.setter 125 | def logger(self, value): 126 | """ 127 | Setter for the logger instance. 128 | 129 | :param value: 130 | :return: 131 | """ 132 | self._logger = value 133 | 134 | @property 135 | def logger(self): 136 | """ 137 | Property to get the logger instance. 
134 |     instance = None
135 |
136 |     def __new__(cls):
137 |         """
138 |         Class method to set Logger instance
139 |         :return: instance
140 |         """
141 |         if not Logger.instance:
142 |             Logger.instance = Logger.__Logger()
143 |         return Logger.instance
144 |
145 |     def __init__(self):
146 |         """
147 |         __init__ method to set instance to Logger.__Logger()
148 |         """
149 |         if not Logger.instance:
150 |             Logger.instance = Logger.__Logger()
151 |
152 |     def __getattr__(self, name):
153 |         """
154 |         Method to return the named attribute of the singleton instance.
155 |         :param name: str
156 |         :return: the attribute of Logger.instance
157 |         """
158 |         return getattr(self.instance, name)
--------------------------------------------------------------------------------
/tests/capice/vep/test_consequence.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from molgenis.capice.vep.consequence import Consequence
7 |
8 |
9 | class TestConsequence(unittest.TestCase):
10 |     def setUp(self) -> None:
11 |         self.data = pd.DataFrame(
12 |             {
13 |                 'Consequence': ['transcript_ablation&stop_lost', 'start_lost', np.nan]
14 |             }
15 |         )
16 |         self.expected_data = pd.DataFrame(
17 |             {
18 |                 'is_regulatory_region_variant': {0: 0, 1: 0, 2: 0},
19 |                 'is_regulatory_region_ablation': {0: 0, 1: 0, 2: 0},
20 |                 'is_regulatory_region_amplification': {0: 0, 1: 0, 2: 0},
21 |                 'is_missense_variant': {0: 0, 1: 0, 2: 0},
22 |                 'is_intron_variant': {0: 0, 1: 0, 2: 0},
23 |                 'is_upstream_gene_variant': {0: 0, 1: 0, 2: 0},
24 |                 'is_downstream_gene_variant': {0: 0, 1: 0, 2: 0},
25 |                 'is_synonymous_variant': {0: 0, 1: 0, 2: 0},
26 |                 'is_TF_binding_site_variant': {0: 0, 1: 0, 2: 0},
27 |                 'is_splice_donor_variant': {0: 0, 1: 0, 2: 0},
28 |                 'is_coding_sequence_variant': {0: 0, 1: 0, 2: 0},
29 |                 'is_splice_region_variant': {0: 0, 1: 0, 2: 0},
30 |                 'is_stop_gained': {0: 0, 1: 0, 2: 0},
31 |                 'is_splice_acceptor_variant': {0: 0, 1: 0, 2: 0},
32 |                 'is_frameshift_variant': {0: 0, 1: 0, 2: 0},
33 |                 'is_3_prime_UTR_variant': {0: 0, 1: 0, 2: 0},
34 |                 'is_inframe_insertion': {0: 0, 1: 0, 2: 0},
35 |                 'is_inframe_deletion': {0: 0, 1: 0, 2: 0},
36 |                 'is_5_prime_UTR_variant': {0: 0, 1: 0, 2: 0},
37 |                 'is_start_lost': {0: 0, 1: 1, 2: 0},
38 |                 'is_non_coding_transcript_exon_variant': {0: 0, 1: 0, 2: 0},
39 |                 'is_non_coding_transcript_variant': {0: 0, 1: 0, 2: 0},
40 |                 'is_TFBS_ablation': {0: 0, 1: 0, 2: 0},
41 |                 'is_TFBS_amplification': {0: 0, 1: 0, 2: 0},
42 |                 'is_protein_altering_variant': {0: 0, 1: 0, 2: 0},
43 |                 'is_stop_lost': {0: 1, 1: 0, 2: 0},
44 |                 'is_stop_retained_variant': {0: 0, 1: 0, 2: 0},
45 |                 'is_transcript_ablation': {0: 1, 1: 0, 2: 0},
46 |                 'is_intergenic_variant': {0: 0, 1: 0, 2: 0},
47 |                 'is_start_retained_variant': {0: 0, 1: 0, 2: 0},
48 |                 'is_transcript_amplification': {0: 0, 1: 0, 2: 0},
49 |                 'is_incomplete_terminal_codon_variant': {0: 0, 1: 0, 2: 0},
50 |                 'is_mature_miRNA_variant': {0: 0, 1: 0, 2: 0},
51 |                 'is_NMD_transcript_variant': {0: 0, 1: 0, 2: 0},
52 |                 'is_feature_elongation': {0: 0, 1: 0, 2: 0},
53 |                 'is_feature_truncation': {0: 0, 1: 0, 2: 0},
54 |                 'is_splice_donor_5th_base_variant': {0: 0, 1: 0, 2: 0},
55 |                 'is_splice_donor_region_variant': {0: 0, 1: 0, 2: 0},
56 |                 'is_splice_polypyrimidine_tract_variant': {0: 0, 1: 0, 2: 0}
57 |             }
58 |
59 |         )
60 |
61 |     def test_consequence(self):
62 |         data_copy = self.data.copy(deep=True)
63 |         observed = Consequence().process(self.data)
64 |         # If the numpy.array dtype is not given, the type will be determined as the
65 |         # minimum type required to hold the objects in the sequence; this minimal
66 |         # type is system dependent.
67 |         expected = pd.concat(
68 |             [
69 |                 data_copy,
70 |                 self.expected_data
71 |             ], axis=1
72 |         )
73 |         pd.testing.assert_frame_equal(observed.sort_index(axis=1), expected.sort_index(
74 |             axis=1), check_dtype=False)
75 |
76 |     def test_non_coding(self):
77 |         data = pd.DataFrame({
78 |             'variants': ['variant_1', 'variant_2', 'variant_3'],
79 |             'Consequence': [np.nan, np.nan, np.nan]
80 |         })
81 |         columns = data.columns
82 |         expected_altered = self.expected_data.copy(deep=True)
83 |         # Easier to locate the ones in self.expected_data than to hardcode a new one
84 |         expected_altered.loc[1, 'is_start_lost'] = 0
85 |         expected_altered.loc[0, 'is_stop_lost'] = 0
86 |         expected_altered.loc[0, 'is_transcript_ablation'] = 0
87 |         expected = pd.concat([data, expected_altered], axis=1)
88 |         observed = Consequence().process(data)
89 |         self.assertFalse(observed[observed.columns.difference(columns)].isnull().values.any())
90 |         pd.testing.assert_frame_equal(
91 |             observed.sort_index(axis=1),
92 |             expected.sort_index(axis=1)
93 |         )
94 |
95 |     def test_consequence_warning(self):
96 |         """
97 |         Tests that a warning is raised when a consequence is encountered that is
98 |         not supported by the processor.
99 |         """
100 |         dataframe = pd.DataFrame(
101 |             {
102 |                 'Consequence': ['transcript_ablation&stop_lost', 'start_lost', 'fake_consequence']
103 |             }
104 |         )
105 |         dataframe_copy = dataframe.copy(deep=True)
106 |         with self.assertLogs() as captured:
107 |             observed = Consequence().process(dataframe)
108 |         expected = pd.concat(
109 |             [
110 |                 dataframe_copy,
111 |                 self.expected_data
112 |             ], axis=1
113 |         )
114 |
115 |         pd.testing.assert_frame_equal(observed.sort_index(axis=1), expected.sort_index(
116 |             axis=1), check_dtype=False)
117 |         self.assertEqual('Supplied VEP consequence: fake_consequence is not supported in the '
118 |                          'Consequence processor!', captured.records[0].getMessage())
119 |
120 |
121 | if __name__ == '__main__':
122 |     unittest.main()
123 |
--------------------------------------------------------------------------------
/src/molgenis/capice/cli/args_handler_parent.py:
--------------------------------------------------------------------------------
1 | import os
2 | from abc import ABCMeta, abstractmethod
3 |
4 | import xgboost as xgb
5 |
6 | from molgenis.capice import __version__
7 | from molgenis.capice.utilities.input_processor import InputProcessor
8 | from molgenis.capice.validators.input_validator import InputValidator
9 | from molgenis.capice.validators.version_validator import VersionValidator
10 |
11 |
12 | class ArgsHandlerParent(metaclass=ABCMeta):
13 |     """
14 |     Parent class of all module specific argument parsers / handlers.
15 |     """
16 |
17 |     def __init__(self, parser):
18 |         self.parser = parser
19 |         self.input_validator = InputValidator()
20 |         self.force = False
21 |
22 |     @property
23 |     @abstractmethod
24 |     def _extension(self) -> tuple[str, ...]:
25 |         """
26 |         Property to define what extension(s) are allowed for an input file for
27 |         each module parser.
28 |         """
29 |         pass
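    # Illustrative override (hypothetical extensions): a concrete subclass could
    # implement the abstract property along these lines:
    #   @property
    #   def _extension(self) -> tuple[str, ...]:
    #       return '.tsv', '.tsv.gz'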
28 | """ 29 | pass 30 | 31 | def _extension_str(self) -> str: 32 | """ 33 | String representation of `_extension()` 34 | """ 35 | return self._join_extensions(self._extension) 36 | 37 | @property 38 | @abstractmethod 39 | def _required_output_extensions(self) -> tuple[str]: 40 | """ 41 | Property to define what the output file extensions are allowed for each 42 | module parser. 43 | """ 44 | pass 45 | 46 | def _required_output_extensions_str(self) -> str: 47 | """ 48 | String representation of `_required_output_extensions()` 49 | """ 50 | return self._join_extensions(self._required_output_extensions) 51 | 52 | @property 53 | @abstractmethod 54 | def _empty_output_extension(self) -> str: 55 | """ 56 | Property to define what extension an output file should get if no 57 | output file extension was given. 58 | 59 | Preferably, use: self._required_output_extensions[] 60 | """ 61 | pass 62 | 63 | @abstractmethod 64 | def create(self): 65 | """ 66 | Method to define what parser options should be available for the module. 67 | Use self.parser.add_argument() to add an argument to the subparser. 68 | """ 69 | pass 70 | 71 | def handle(self): 72 | """ 73 | Superclass handler to set the arguments set in create(). Also calls the 74 | parser to proceed with parsing the module specific arguments, validate 75 | them and run the CAPICE code. 76 | """ 77 | self.parser.set_defaults(func=self._handle_args) 78 | 79 | def _handle_args(self, args): 80 | """ 81 | Superclass handle args to parse and validate the input and output 82 | arguments. Also parses the output filename. 83 | """ 84 | version_validator = VersionValidator() 85 | try: 86 | version_validator.validate_capice_version(__version__) 87 | except ValueError as cm: 88 | self.parser.error(str(cm)) 89 | input_path = self._retrieve_argument_from_list(args.input, '-i/--input') 90 | try: 91 | self.input_validator.validate_input_path(input_path, extension=self._extension) 92 | except FileNotFoundError as cm: 93 | self.parser.error(str(cm)) 94 | output_path = self._retrieve_argument_from_list(args.output, '-o/--output') 95 | self.force = args.force 96 | try: 97 | processor = InputProcessor( 98 | input_path=input_path, 99 | output_path=output_path, 100 | force=self.force, 101 | default_extension=self._empty_output_extension 102 | ) 103 | except FileExistsError as cm: 104 | self.parser.error(str(cm)) 105 | output_filename = processor.get_output_filename() 106 | output_filename = self._handle_output_filename(output_filename) 107 | output_given = processor.get_output_given() 108 | output_path = processor.get_output_directory() 109 | try: 110 | self.input_validator.validate_output_path(output_path) 111 | except OSError as cm: 112 | self.parser.error(str(cm)) 113 | self._handle_module_specific_args(input_path, output_path, output_filename, output_given, 114 | args) 115 | 116 | def _retrieve_argument_from_list(self, 117 | arg: list | None, 118 | arg_name: str, 119 | has_default: bool = False) -> None | str: 120 | try: 121 | return self._single_argument_retriever(arg, arg_name, has_default) 122 | except IOError as e: 123 | self.parser.error(e) 124 | return None 125 | 126 | @staticmethod 127 | def _single_argument_retriever(arg: list | None, 128 | arg_name: str, 129 | has_default: bool) -> None | str: 130 | """ 131 | Retrieves the user-argument from a list. 
132 |         the argument once (combined with `action='append'` for argument parsing), resulting in a
133 |         list of length:
134 |         - 0 (no arguments given & no default value)
135 |         - 1 (1 argument given or default_value is present)
136 |         - 2 (1 argument given and default value present)
137 |
138 |         If `has_default`==True, the first list item is assumed to be the default one (set through
139 |         `default=[]`), with any extra items in the list being user input.
140 |
141 |         Args:
142 |             arg: List of arguments (or None if no arguments were generated and no defaults were
143 |                  present either)
144 |             arg_name: The name of the user-argument to which `arg` belongs
145 |             has_default: whether a default arg is present in the given arg list
146 |         Returns:
147 |             None (if args is None) or a single item from the given list.
148 |         Raises:
149 |             ValueError: If empty list is given (=programming error)
150 |             IOError: If list contains more items than expected (>2 if has_default, else >1).
151 |
152 |         """
153 |         # None is simply returned.
154 |         if arg is None:
155 |             return arg
156 |
157 |         arg_len = len(arg)
158 |
159 |         # Empty list indicates programming bug.
160 |         if arg_len == 0:
161 |             raise ValueError('Empty list is given. Should be None or list with elements.')
162 |
163 |         # Retrieve value to be used for CLI argument.
164 |         if arg_len > 2 or (arg_len > 1 and not has_default):
165 |             raise IOError(f'Argument {arg_name} is only allowed once.')
166 |         else:
167 |             return arg[arg_len - 1]
168 |
169 |     @abstractmethod
170 |     def _handle_module_specific_args(self, input_path, output_path, output_filename, output_given,
171 |                                      args):
172 |         """
173 |         Method to be filled in by the module specific parsers. Should perform
174 |         additional validation over args specific to the parser. Should then call
175 |         the module to continue execution.
176 |         """
177 |         pass
178 |
179 |     def _handle_output_filename(self, output_filename: str):
180 |         """
181 |         Method to validate that an output filename complies with the
182 |         required output extension.
183 |         """
184 |         if '.' in output_filename and not output_filename.endswith(
185 |                 self._required_output_extensions):
186 |             self.parser.error(
187 |                 f'Output file extension is incorrect. Expected output extension: '
188 |                 f'{self._required_output_extensions}'
189 |             )
190 |         else:
191 |             return output_filename
192 |
193 |     @staticmethod
194 |     def load_model(model_path: os.PathLike) -> xgb.XGBClassifier:
195 |         model = xgb.XGBClassifier()
196 |         model.load_model(model_path)
197 |         return model
198 |
199 |     @staticmethod
200 |     def _join_extensions(extensions: tuple[str, ...]) -> str:
201 |         return ', '.join(extensions)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. 
If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | --------------------------------------------------------------------------------