├── tests ├── __init__.py ├── capice │ ├── __init__.py │ ├── cli │ │ ├── __init__.py │ │ ├── test_args_handler_explain.py │ │ ├── test_args_handler_train.py │ │ └── test_args_handler_parent.py │ ├── vep │ │ ├── __init__.py │ │ ├── test_length.py │ │ ├── test_type.py │ │ ├── test_poly_phen.py │ │ ├── test_amino_acids.py │ │ ├── test_sift.py │ │ ├── test_cdna_position.py │ │ └── test_consequence.py │ ├── core │ │ ├── __init__.py │ │ ├── test_capice_exporter.py │ │ └── test_logger.py │ ├── utilities │ │ ├── __init__.py │ │ ├── test_predictor.py │ │ ├── test_custom_logging_filter.py │ │ ├── test_input_parser.py │ │ ├── test_load_file_postprocessor.py │ │ ├── test_class_suggestor.py │ │ ├── test_predict.py │ │ ├── test_file_postprocessor.py │ │ ├── test_dynamic_loader.py │ │ ├── test_column_utils.py │ │ └── test_input_processor.py │ ├── validators │ │ ├── __init__.py │ │ ├── test_model_validator.py │ │ ├── test_post_vep_processing_validator.py │ │ ├── test_property_type_validator.py │ │ ├── test_input_validator.py │ │ └── test_post_file_parse_validator.py │ ├── test__init__.py │ ├── test_resources.py │ ├── test_main_predict.py │ ├── test_main_explain.py │ ├── test_templates.py │ └── test_edge_cases_predict.py └── resources │ ├── input_processor │ └── filename.txt │ ├── dynamic_loader_test_no_files │ └── __init__.py │ ├── dynamic_loader_test_files_present │ ├── __init__.py │ ├── correct_file.py │ ├── incorrect_file.py │ ├── correct_vep_grch_file.py │ ├── correct_vep_grch_file_dupe_test.py │ ├── correct_overwrite_file.py │ └── correct_overwrite_file_dupe_test.py │ ├── input_parser │ └── input_parser.txt │ ├── breakends.vcf.gz │ ├── edge_cases.vcf.gz │ ├── breakends_vep.tsv.gz │ ├── features_test.json │ ├── xgb_booster_poc.ubj │ ├── edge_cases_vep.tsv.gz │ ├── symbolic_alleles.vcf.gz │ ├── symbolic_alleles_vep.tsv.gz │ ├── train_dataset_missing_column_pos.tsv.gz │ ├── train_dataset_missing_column_ref.tsv.gz │ ├── train_dataset_missing_column_polyphen.tsv.gz │ └── VEP104.json ├── src └── molgenis │ └── capice │ ├── cli │ ├── __init__.py │ ├── args_handler_explain.py │ ├── args_handler_predict.py │ ├── args_handler_train.py │ └── args_handler_parent.py │ ├── core │ ├── __init__.py │ ├── capice_exporter.py │ ├── args_handler.py │ ├── capice_manager.py │ └── logger.py │ ├── vep │ ├── __init__.py │ ├── cds_position.py │ ├── cdna_position.py │ ├── protein_position.py │ ├── length.py │ ├── sift.py │ ├── template_sift_polyphen.py │ ├── amino_acids.py │ ├── poly_phen.py │ ├── template.py │ ├── template_position.py │ ├── type.py │ └── consequence.py │ ├── validators │ ├── __init__.py │ ├── model_validator.py │ ├── property_type_validator.py │ ├── predict_validator.py │ ├── post_vep_processing_validator.py │ ├── input_validator.py │ ├── post_file_parse_validator.py │ └── version_validator.py │ ├── __init__.py │ ├── capice.py │ ├── utilities │ ├── custom_logging_filter.py │ ├── class_suggestor.py │ ├── predictor.py │ ├── input_parser.py │ ├── load_file_postprocessor.py │ ├── column_utils.py │ ├── enums.py │ ├── __init__.py │ ├── manual_vep_processor.py │ ├── input_processor.py │ └── dynamic_loader.py │ ├── main_predict.py │ ├── main_explain.py │ └── main_capice.py ├── resources ├── train_test.tsv.gz ├── predict_input.tsv.gz ├── train_input_raw.vcf.gz ├── predict_input_raw.vcf.gz ├── test_input.vcf └── train_features.json ├── .flake8 ├── scripts ├── tests │ ├── capice_input.vcf.zip │ └── test_convert_vep_vcf_to_tsv_capice.sh └── convert_vep_vcf_to_tsv_capice.sh ├── sonar-project.properties ├── .github ├── 
pull_request_template.md └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── setup.py ├── .gitignore ├── .travis.yml └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/vep/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/molgenis/capice/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/molgenis/capice/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/capice/validators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/resources/input_processor/filename.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/molgenis/capice/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '5.1.2' 2 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_no_files/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/resources/input_parser/input_parser.txt: -------------------------------------------------------------------------------- 1 | this,is,a,header 2 | this,is,a,line -------------------------------------------------------------------------------- /resources/train_test.tsv.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/resources/train_test.tsv.gz -------------------------------------------------------------------------------- /resources/predict_input.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/resources/predict_input.tsv.gz -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | import-order-style = pep8 3 | max_line_length = 100 4 | application-import-names = molgenis,tests -------------------------------------------------------------------------------- /resources/train_input_raw.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/resources/train_input_raw.vcf.gz -------------------------------------------------------------------------------- /tests/resources/breakends.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/breakends.vcf.gz -------------------------------------------------------------------------------- /tests/resources/edge_cases.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/edge_cases.vcf.gz -------------------------------------------------------------------------------- /resources/predict_input_raw.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/resources/predict_input_raw.vcf.gz -------------------------------------------------------------------------------- /scripts/tests/capice_input.vcf.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/scripts/tests/capice_input.vcf.zip -------------------------------------------------------------------------------- /tests/resources/breakends_vep.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/breakends_vep.tsv.gz -------------------------------------------------------------------------------- /tests/resources/features_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "feature_1": null, 3 | "feature_foobarbaz": null, 4 | "feature_3": null 5 | } -------------------------------------------------------------------------------- /tests/resources/xgb_booster_poc.ubj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/xgb_booster_poc.ubj -------------------------------------------------------------------------------- /tests/resources/edge_cases_vep.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/edge_cases_vep.tsv.gz -------------------------------------------------------------------------------- /tests/resources/symbolic_alleles.vcf.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/symbolic_alleles.vcf.gz -------------------------------------------------------------------------------- /tests/resources/symbolic_alleles_vep.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/symbolic_alleles_vep.tsv.gz -------------------------------------------------------------------------------- /tests/resources/train_dataset_missing_column_pos.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/train_dataset_missing_column_pos.tsv.gz -------------------------------------------------------------------------------- /tests/resources/train_dataset_missing_column_ref.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/train_dataset_missing_column_ref.tsv.gz -------------------------------------------------------------------------------- /tests/resources/train_dataset_missing_column_polyphen.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/molgenis/capice/HEAD/tests/resources/train_dataset_missing_column_polyphen.tsv.gz -------------------------------------------------------------------------------- /sonar-project.properties: -------------------------------------------------------------------------------- 1 | sonar.projectKey=molgenis_capice 2 | sonar.organization=molgenis 3 | 4 | sonar.sources = src/ 5 | sonar.tests = tests/ 6 | 7 | sonar.python.xunit.reportPath=results.xml 8 | sonar.python.coverage.reportPaths=coverage.xml 9 | 10 | sonar.python.version=3.10 -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/correct_file.py: -------------------------------------------------------------------------------- 1 | class CorrectFile: 2 | @property 3 | def name(self): 4 | return 'Correct' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @staticmethod 11 | def some_function(): 12 | return 'foo' 13 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/incorrect_file.py: -------------------------------------------------------------------------------- 1 | class InCorrectFile: 2 | @property 3 | def name(self): 4 | return 'Incorrect' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @staticmethod 11 | def other_function(): 12 | return 'foo' 13 | -------------------------------------------------------------------------------- /resources/test_input.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.0 2 | ##reference=GRCh38 3 | #CHROM POS ID REF ALT 4 | chr12 69747417 . C A 5 | chr17 41231346 . G T 6 | chr2 122288533 . C A 7 | chr11 118382645 . G T 8 | chr5 235382 . G A 9 | chr2 48026421 . T C 10 | chr5 90073785 . C T 11 | chr1 63114155 . T C 12 | chr2 179431764 . G A 13 | chr9 131250286 . 
G A 14 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## SOP 2 | 3 | ### Changed 4 | 5 | - 6 | 7 | ## Important notes 8 | 9 | - 10 | 11 | ### Before merge: 12 | - [ ] Functionality works & meets specs 13 | - [ ] No Travis issues 14 | - [ ] Code reviewed 15 | - [ ] Documentation was updated 16 | 17 | ### After merge: 18 | - [ ] Added feature/fix to draft release notes 19 | - [ ] Removed merged branches 20 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/cds_position.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.vep.template_position import TemplatePosition 2 | 3 | 4 | class CDSPosition(TemplatePosition): 5 | def __init__(self): 6 | super(CDSPosition, self).__init__( 7 | name='CDS_position', 8 | usable=True 9 | ) 10 | 11 | @property 12 | def columns(self): 13 | return ['CDSpos', 'relCDSpos'] 14 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/cdna_position.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.vep.template_position import TemplatePosition 2 | 3 | 4 | class CDNAPosition(TemplatePosition): 5 | def __init__(self): 6 | super(CDNAPosition, self).__init__( 7 | name='cDNA_position', 8 | usable=True 9 | ) 10 | 11 | @property 12 | def columns(self): 13 | return ['cDNApos', 'relcDNApos'] 14 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/protein_position.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.vep.template_position import TemplatePosition 2 | 3 | 4 | class ProteinPosition(TemplatePosition): 5 | def __init__(self): 6 | super(ProteinPosition, self).__init__( 7 | name='Protein_position', 8 | usable=True 9 | ) 10 | 11 | @property 12 | def columns(self): 13 | return ['protPos', 'relProtPos'] 14 | -------------------------------------------------------------------------------- /src/molgenis/capice/capice.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.core.args_handler import ArgsHandler 2 | 3 | 4 | def main(): 5 | """ 6 | CAPICE main. Runs the ArgsHandler, which in turn runs the args handler 7 | of each available module. For usage, print the help on 8 | the command line by using (python3) capice(.py) --help.
9 | """ 10 | argument_handler = ArgsHandler() 11 | argument_handler.create() 12 | argument_handler.handle() 13 | 14 | 15 | if __name__ == '__main__': 16 | main() 17 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/correct_vep_grch_file.py: -------------------------------------------------------------------------------- 1 | class CorrectVEPFile: 2 | @property 3 | def name(self): 4 | return 'Correct_VEP_GRCh' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @property 11 | def supported_vep_version(self): 12 | return 104.0 13 | 14 | @property 15 | def supported_grch_build(self): 16 | return 37 17 | 18 | @staticmethod 19 | def some_function(): 20 | return 'SomeVeryUniqueString' 21 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/correct_vep_grch_file_dupe_test.py: -------------------------------------------------------------------------------- 1 | class CorrectVEPFile: 2 | @property 3 | def name(self): 4 | return 'Correct_VEP_GRCh' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @property 11 | def supported_vep_version(self): 12 | return 104.0 13 | 14 | @property 15 | def supported_grch_build(self): 16 | return 37 17 | 18 | @staticmethod 19 | def some_function(): 20 | return 'SomeVeryUniqueString' 21 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/correct_overwrite_file.py: -------------------------------------------------------------------------------- 1 | class CorrectOverwriteFile: 2 | @property 3 | def name(self): 4 | return 'Correct_Overwrite' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @property 11 | def supported_vep_version(self): 12 | return None 13 | 14 | @property 15 | def supported_grch_build(self): 16 | return None 17 | 18 | @staticmethod 19 | def some_function(): 20 | return 'SomeVeryUniqueString_butdifferent' 21 | -------------------------------------------------------------------------------- /tests/resources/dynamic_loader_test_files_present/correct_overwrite_file_dupe_test.py: -------------------------------------------------------------------------------- 1 | class CorrectOverwriteFile: 2 | @property 3 | def name(self): 4 | return 'Correct_Overwrite' 5 | 6 | @property 7 | def usable(self): 8 | return True 9 | 10 | @property 11 | def supported_vep_version(self): 12 | return None 13 | 14 | @property 15 | def supported_grch_build(self): 16 | return None 17 | 18 | @staticmethod 19 | def some_function(): 20 | return 'SomeVeryUniqueString_butdifferent' 21 | -------------------------------------------------------------------------------- /tests/capice/cli/test_args_handler_explain.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from argparse import ArgumentParser 3 | 4 | from molgenis.capice.cli.args_handler_explain import ArgsHandlerExplain 5 | 6 | 7 | class TestArgsHandlerExplain(unittest.TestCase): 8 | def test_property_str_versions(self): 9 | args_handler = ArgsHandlerExplain(ArgumentParser()) 10 | self.assertEqual('.json, .ubj', args_handler._extension_str()) 11 | self.assertEqual('.tsv, .tsv.gz', args_handler._required_output_extensions_str()) 12 | 13 | 14 | if __name__ == '__main__': 15 | unittest.main() 16 | -------------------------------------------------------------------------------- 
/tests/capice/test__init__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from re import match 3 | 4 | from molgenis.capice.utilities.enums import Versioning 5 | from molgenis.capice import __version__ 6 | 7 | 8 | class TestVersion(unittest.TestCase): 9 | def test_version_formatting(self): 10 | """ 11 | Test that assures CAPICE is not given an invalid version number. 12 | """ 13 | if match(Versioning.VALIDATION_REGEX.value, __version__) is None: 14 | raise ValueError('CAPICE has invalid version format') 15 | 16 | 17 | if __name__ == '__main__': 18 | unittest.main() 19 | -------------------------------------------------------------------------------- /resources/train_features.json: -------------------------------------------------------------------------------- 1 | { 2 | "PolyPhen": null, 3 | "SIFT": null, 4 | "cDNA_position": null, 5 | "CDS_position": null, 6 | "Protein_position": null, 7 | "Amino_acids": null, 8 | "REF": null, 9 | "ALT": null, 10 | "Consequence": null, 11 | "SpliceAI_pred_DP_AG": null, 12 | "SpliceAI_pred_DP_AL": null, 13 | "SpliceAI_pred_DP_DG": null, 14 | "SpliceAI_pred_DP_DL": null, 15 | "SpliceAI_pred_DS_AG": null, 16 | "SpliceAI_pred_DS_AL": null, 17 | "SpliceAI_pred_DS_DG": null, 18 | "SpliceAI_pred_DS_DL": null, 19 | "Grantham": null, 20 | "phyloP": null 21 | } -------------------------------------------------------------------------------- /src/molgenis/capice/validators/model_validator.py: -------------------------------------------------------------------------------- 1 | class ModelValidator: 2 | @staticmethod 3 | def validate_has_required_attributes(model): 4 | """ 5 | Function to validate that the required attributes CAPICE_version, 6 | vep_features, processable_features and predict_proba are present. 7 | """ 8 | required_attributes = ['CAPICE_version', 'vep_features', 9 | 'processable_features', 'predict_proba'] 10 | for attribute in required_attributes: 11 | if attribute not in dir(model): 12 | raise AttributeError(f'Unable to locate attribute {attribute} in model file!') 13 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/custom_logging_filter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class CustomLoggingFilter(logging.Filter): 5 | """ 6 | Custom logging filter class to make sure that stdout only contains 7 | INFO or DEBUG calls. 8 | """ 9 | def __init__(self, custom_loglevels): 10 | """ 11 | :param custom_loglevels: str or iterable: the loglevels that should pass 12 | this logging filter. 13 | """ 14 | super(CustomLoggingFilter, self).__init__() 15 | self.custom_loglevels = custom_loglevels 16 | 17 | def filter(self, record) -> bool: 18 | return record.levelno in self.custom_loglevels 19 | -------------------------------------------------------------------------------- 
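CustomLoggingFilter compares record.levelno against the configured levels, so attaching it to a stdout StreamHandler is what keeps WARNING and above off stdout. A minimal wiring sketch, assuming only that the package is importable; this is not CAPICE's actual Logger setup, which lives in core/logger.py.

import logging
import sys

from molgenis.capice.utilities.custom_logging_filter import CustomLoggingFilter

stdout_handler = logging.StreamHandler(sys.stdout)
# Pass numeric levels: filter() checks 'record.levelno in custom_loglevels'.
stdout_handler.addFilter(CustomLoggingFilter([logging.DEBUG, logging.INFO]))

demo_log = logging.getLogger('capice_filter_demo')
demo_log.setLevel(logging.DEBUG)
demo_log.addHandler(stdout_handler)

demo_log.info('reaches stdout')           # levelno 20 passes the filter
demo_log.warning('filtered from stdout')  # levelno 30 does not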
/.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. -------------------------------------------------------------------------------- /tests/capice/vep/test_length.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.vep import length 6 | 7 | 8 | class TestLength(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | print('Setting up.') 12 | cls.length = length.Length() 13 | 14 | def test_process(self): 15 | dataframe = pd.DataFrame({ 16 | 'REF': ['ATAG', 'A', 'C', 'AC'], 17 | 'ALT': ['A', 'ATG', 'A', 'GT']}) 18 | observed = self.length.process(dataframe) 19 | expected = pd.DataFrame({ 20 | 'REF': ['ATAG', 'A', 'C', 'AC'], 21 | 'ALT': ['A', 'ATG', 'A', 'GT'], 22 | 'Length': [3, 2, 0, 0]}) 23 | pd.testing.assert_frame_equal(expected, observed) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/length.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.vep.template import Template 4 | from molgenis.capice.utilities.enums import Column 5 | 6 | 7 | class Length(Template): 8 | def __init__(self): 9 | super(Length, self).__init__( 10 | name=Column.ref.value, 11 | usable=True 12 | ) 13 | 14 | @property 15 | def columns(self): 16 | return ['Length'] 17 | 18 | def _process(self, dataframe: pd.DataFrame): 19 | dataframe = dataframe.join( 20 | pd.DataFrame( 21 | abs(dataframe[Column.ref.value].str.len() - dataframe[Column.alt.value].str.len()), 22 | columns=self.columns 23 | ) 24 | ) 25 | return dataframe 26 | 27 | @property 28 | def drop(self): 29 | return False 30 | -------------------------------------------------------------------------------- 
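Length derives its single output column from the absolute difference between REF and ALT allele lengths, and the Template base class (shown further below) falls back to NaN when the source column is entirely null. A short usage sketch, assuming Column.ref.value and Column.alt.value resolve to 'REF' and 'ALT' as the tests imply:

import pandas as pd

from molgenis.capice.vep.length import Length

# |len(REF) - len(ALT)|: the 3bp deletion scores 3, the SNV scores 0.
frame = pd.DataFrame({'REF': ['ATAG', 'C'], 'ALT': ['A', 'G']})
print(Length().process(frame)['Length'].tolist())  # [3, 0]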
16 | """ 17 | model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 18 | self.validator.validate_model_version(model.CAPICE_version) 19 | self.validator.validate_versions_compatible(__version__, model.CAPICE_version) 20 | 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/sift.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.vep.template_sift_polyphen import TemplateSiftPolyPhen 4 | 5 | 6 | class SIFT(TemplateSiftPolyPhen): 7 | def __init__(self): 8 | super(SIFT, self).__init__( 9 | name='SIFT', 10 | usable=True 11 | ) 12 | 13 | @property 14 | def columns(self): 15 | return ['SIFTcat', 'SIFTval'] 16 | 17 | def apply_label(self, dataframe: pd.DataFrame): 18 | """ 19 | Under the 0.05 should be deleterious, everything else should be tolerated (if not nan) 20 | """ 21 | dataframe.loc[ 22 | dataframe[dataframe[self.name].notnull()].index, self.columns[0]] = 'tolerated' 23 | dataframe.loc[ 24 | dataframe[dataframe[self.name] <= 0.05].index, self.columns[0]] = 'deleterious' 25 | return dataframe 26 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/template_sift_polyphen.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep.template import Template 7 | 8 | 9 | class TemplateSiftPolyPhen(Template): 10 | def __init__(self, name='Template', usable=False): 11 | super(TemplateSiftPolyPhen, self).__init__( 12 | name=name, 13 | usable=usable 14 | ) 15 | 16 | @property 17 | @abstractmethod 18 | def columns(self): 19 | return [None, None] 20 | 21 | @abstractmethod 22 | def apply_label(self, dataframe: pd.DataFrame): 23 | return dataframe 24 | 25 | def _process(self, dataframe: pd.DataFrame): 26 | dataframe[self.columns[1]] = dataframe[self.name] 27 | dataframe[self.columns[0]] = np.nan 28 | dataframe = self.apply_label(dataframe) 29 | return dataframe 30 | -------------------------------------------------------------------------------- /tests/capice/validators/test_model_validator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import xgboost as xgb 4 | 5 | from tests.capice.test_templates import ResourceFile, load_model 6 | from molgenis.capice.validators.model_validator import ModelValidator 7 | 8 | 9 | class TestModelValidator(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | cls.validator = ModelValidator() 13 | cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 14 | 15 | def test_model_required_attributes_correct(self): 16 | self.validator.validate_has_required_attributes(self.model) 17 | 18 | def test_model_missing_attribute(self): 19 | model = xgb.XGBClassifier() 20 | self.assertRaises( 21 | AttributeError, 22 | self.validator.validate_has_required_attributes, 23 | model 24 | ) 25 | 26 | 27 | if __name__ == '__main__': 28 | unittest.main() 29 | -------------------------------------------------------------------------------- /tests/capice/vep/test_type.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.vep import type 6 | 7 | 8 | class TestType(unittest.TestCase): 9 | 
@classmethod 10 | def setUpClass(cls): 11 | print('Setting up.') 12 | cls.type = type.Type() 13 | 14 | def test_process(self): 15 | input_data_frame = pd.DataFrame({'REF': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], 16 | 'ALT': ['G', 'GCC', 'GG', 'CG', 'G', 'C']}) 17 | actual_output = self.type.process(input_data_frame) 18 | expected_output = pd.DataFrame({ 19 | 'REF': ['C', 'CA', 'CA', 'C', 'CA', 'CA'], 20 | 'ALT': ['G', 'GCC', 'GG', 'CG', 'G', 'C'], 21 | 'Type': ['SNV', 'DELINS', 'DELINS', 'INS', 'DELINS', 'DEL']}) 22 | pd.testing.assert_frame_equal(actual_output, expected_output) 23 | 24 | 25 | if __name__ == '__main__': 26 | unittest.main() 27 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_predictor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from molgenis.capice.utilities.predictor import Predictor 4 | from tests.capice.test_templates import set_up_impute_preprocess 5 | 6 | 7 | class TestPredictor(unittest.TestCase): 8 | @classmethod 9 | def setUpClass(cls): 10 | print('Setting up.') 11 | main, model = set_up_impute_preprocess() 12 | cls.predictor = Predictor(model) 13 | cls.dataset = main.categorical_process( 14 | main.process( 15 | main._load_file(), process_features=model.vep_features.keys() 16 | )[0], processing_features=model.processable_features 17 | )[0] 18 | 19 | def test_predict(self): 20 | observed = self.predictor.predict(self.dataset) 21 | self.assertGreater(observed['score'].sum(), 0) 22 | self.assertFalse(observed['score'].hasnans) 23 | 24 | 25 | if __name__ == '__main__': 26 | unittest.main() 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | ### System information 14 | - OS: [e.g. iOS] 15 | - Version: [e.g. 3.0.0] 16 | - Python version: [e.g. Python3.9.1] 17 | - Shell: [e.g. ZSH] 18 | 19 | ### How to Reproduce 20 | Steps to reproduce the behavior: 21 | 1. `cd` to dir [...] 22 | 2. Run the command `[...]` 23 | 3. See error. 24 | 25 | ### Expected behavior 26 | A clear and concise description of what you expected to happen. 27 | 28 | ### Logs 29 | If available, the generated logging information and/or error message (can also be attached as a file if very large). 30 | 31 | ### Screenshots 32 | If applicable, add screenshots to help explain your problem. 33 | 34 | ### Additional context 35 | Add any other context about the problem here. 36 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/class_suggestor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.utilities.enums import Column, OutputClasses 4 | 5 | 6 | class ClassSuggestor: 7 | def __init__(self): 8 | # Implement way to make thresholds datafile user definable 9 | pass 10 | 11 | @staticmethod 12 | def apply_suggestion(capice_predicted_dataset: pd.DataFrame): 13 | """ 14 | Method to apply the suggested output class to a capice prediction score 15 | keeping in mind the per gene thresholds. 16 | 17 | :param capice_predicted_dataset: pandas.DataFrame. 
18 | The input dataset that contains the "score" column and a gene name column. 19 | :return: pandas.DataFrame. 20 | Original input but with the column suggested_class, depending on the user provided 21 | thresholds. 22 | """ 23 | capice_predicted_dataset[Column.suggested_class.value] = OutputClasses.unknown.value 24 | return capice_predicted_dataset 25 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/amino_acids.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.vep.template import Template 4 | 5 | 6 | class AminoAcids(Template): 7 | def __init__(self): 8 | super(AminoAcids, self).__init__( 9 | name='Amino_acids', 10 | usable=True 11 | ) 12 | 13 | @property 14 | def columns(self): 15 | return ['oAA', 'nAA'] 16 | 17 | @property 18 | def oaa(self): 19 | return self.columns[0] 20 | 21 | @property 22 | def naa(self): 23 | return self.columns[1] 24 | 25 | def _process(self, dataframe: pd.DataFrame): 26 | if dataframe[self.name].str.contains('/', regex=False).any(): 27 | dataframe[self.columns] = dataframe[self.name].str.split('/', expand=True) 28 | dataframe[self.naa].fillna(dataframe[self.oaa], inplace=True) 29 | else: 30 | dataframe[self.oaa] = dataframe[self.name] 31 | dataframe[self.naa] = dataframe[self.oaa] 32 | return dataframe 33 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/poly_phen.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.vep.template_sift_polyphen import TemplateSiftPolyPhen 4 | 5 | 6 | class PolyPhen(TemplateSiftPolyPhen): 7 | def __init__(self): 8 | super(PolyPhen, self).__init__( 9 | name='PolyPhen', 10 | usable=True 11 | ) 12 | 13 | @property 14 | def columns(self): 15 | return ['PolyPhenCat', 'PolyPhenVal'] 16 | 17 | def apply_label(self, dataframe: pd.DataFrame): 18 | """ 19 | Scores at or below 0.445 are labelled benign, scores above 0.445 up to and including 20 | 0.908 possibly damaging, and scores above 0.908 probably damaging (if not NaN).
21 | """ 22 | dataframe.loc[dataframe[dataframe[self.name].notnull()].index, self.columns[0]] = 'benign' 23 | dataframe.loc[ 24 | dataframe[dataframe[self.name] > 0.445].index, self.columns[0]] = 'possibly_damaging' 25 | dataframe.loc[ 26 | dataframe[dataframe[self.name] > 0.908].index, self.columns[0]] = 'probably_damaging' 27 | return dataframe 28 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_custom_logging_filter.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import unittest 3 | 4 | from molgenis.capice.utilities.custom_logging_filter import CustomLoggingFilter 5 | 6 | 7 | class TestCustomLoggingFilter(unittest.TestCase): 8 | 9 | @classmethod 10 | def setUp(cls): 11 | print('Setting up.') 12 | cls.custom_filter = CustomLoggingFilter({}) 13 | 14 | def test_filter_true(self): 15 | logger = logging.getLogger('simple_example') 16 | record = logger.makeRecord('simple_example', 3, '', 5, 'message', ('arg',), None) 17 | self.custom_filter.custom_loglevels = [3, 4] 18 | actual = self.custom_filter.filter(record) 19 | self.assertEqual(True, actual) 20 | 21 | def test_filter_false(self): 22 | logger = logging.getLogger('simple_example') 23 | record = logger.makeRecord('simple_example', 2, '', 5, 'message', ('arg',), None) 24 | self.custom_filter.custom_loglevels = [3, 4] 25 | actual = self.custom_filter.filter(record) 26 | self.assertEqual(False, actual) 27 | 28 | 29 | if __name__ == '__main__': 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/capice/vep/test_poly_phen.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep import poly_phen 7 | 8 | 9 | class TestType(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | print('Setting up.') 13 | cls.poly_phen = poly_phen.PolyPhen() 14 | 15 | def test_process(self): 16 | dataframe = pd.DataFrame( 17 | { 18 | 'PolyPhen': [0.445, 0.908, 0.999, np.nan] 19 | } 20 | ) 21 | expected = pd.concat( 22 | [ 23 | dataframe, 24 | pd.DataFrame( 25 | { 26 | 'PolyPhenCat': ['benign', 'possibly_damaging', 'probably_damaging', np.nan], 27 | 'PolyPhenVal': [0.445, 0.908, 0.999, np.nan] 28 | } 29 | ) 30 | ], axis=1 31 | ) 32 | observed = self.poly_phen.process(dataframe) 33 | pd.testing.assert_frame_equal(expected.sort_index(axis=1), observed.sort_index(axis=1)) 34 | 35 | 36 | if __name__ == '__main__': 37 | unittest.main() 38 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_input_parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import pandas as pd 5 | 6 | from molgenis.capice.utilities.input_parser import InputParser 7 | from tests.capice.test_templates import _project_root_directory 8 | 9 | 10 | class TestInputParser(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | print('Setting up.') 14 | cls.parser = InputParser() 15 | 16 | def test_parse(self): 17 | self.parser.set_separator(',') 18 | input_file = self.parser.parse( 19 | os.path.join( 20 | _project_root_directory, 21 | 'tests', 22 | 'resources', 23 | 'input_parser', 24 | 'input_parser.txt' 25 | ) 26 | ) 27 | expected_df = pd.DataFrame( 28 | { 29 | 'this': ['this'], 30 | 'is': ['is'], 31 | 'a': ['a'], 32 | 
'header': ['line'] 33 | } 34 | ) 35 | pd.testing.assert_frame_equal(input_file, expected_df) 36 | 37 | if __name__ == '__main__': 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /tests/capice/vep/test_amino_acids.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.vep import amino_acids 6 | 7 | 8 | class TestAminoAcids(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | print('Setting up.') 12 | cls.aa = amino_acids.AminoAcids() 13 | 14 | def test_process(self): 15 | dataframe = pd.DataFrame({'Amino_acids': ['A/G', 'R/C', 'G/C']}) 16 | observed = self.aa.process(dataframe) 17 | expected = pd.DataFrame({'Amino_acids': ['A/G', 'R/C', 'G/C'], 18 | 'oAA': ['A', 'R', 'G'], 19 | 'nAA': ['G', 'C', 'C']}) 20 | pd.testing.assert_frame_equal(expected, observed) 21 | 22 | def test_process_no_alt(self): 23 | dataframe = pd.DataFrame({'Amino_acids': ['A', 'R', 'G']}) 24 | observed = self.aa.process(dataframe) 25 | expected = pd.DataFrame({'Amino_acids': ['A', 'R', 'G'], 26 | 'oAA': ['A', 'R', 'G'], 27 | 'nAA': ['A', 'R', 'G']}) 28 | pd.testing.assert_frame_equal(expected, observed) 29 | 30 | 31 | if __name__ == '__main__': 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/predictor.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.core.logger import Logger 2 | from molgenis.capice.utilities.enums import Column 3 | 4 | 5 | class Predictor: 6 | """ 7 | Predictor class for CAPICE. Produces the final CAPICE score. 8 | """ 9 | 10 | def __init__(self, model): 11 | """ 12 | :param model: XGBClassifier, the custom model instance provided by 13 | the user. 14 | """ 15 | self.log = Logger().logger 16 | self.model = model 17 | self.log.info('Starting prediction.') 18 | 19 | def predict(self, dataset): 20 | """ 21 | Predicts the probability score of CAPICE over dataset. 22 | :param dataset: pandas.DataFrame, the fully imputed and processed 23 | user input dataset of VEP-like origin. 24 | :return: pandas.DataFrame, the input dataset with an added column 25 | containing the CAPICE score per variant. 26 | """ 27 | self.log.info('Predicting for %d samples.', dataset.shape[0]) 28 | dataset[Column.score.value] = self.model.predict_proba( 29 | dataset[self.model.get_booster().feature_names])[:, 1] 30 | self.log.info('Prediction successful.') 31 | return dataset 32 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/property_type_validator.py: -------------------------------------------------------------------------------- 1 | class PropertyTypeValidator: 2 | def validate_property(self, value: object, expected_type: type, include_none: bool = False): 3 | """ 4 | Validator method to raise a TypeError when a property is not set correctly. 5 | 6 | :param value: value to be checked 7 | :param expected_type: type the value should match 8 | :param include_none: whether None should be allowed 9 | """ 10 | if isinstance(value, bool): 11 | if type(value) != expected_type: 12 | self._check_none(expected_type, value, include_none) 13 | 14 | elif not isinstance(value, expected_type): 15 | self._check_none(expected_type, value, include_none) 16 | 17 | def _check_none(self, expected_type, value, include_none): 18 | if include_none: 19 | if value is not None: 20 | self._raise_type_error(expected_type, value) 21 | else: 22 | self._raise_type_error(expected_type, value) 23 | 24 | @staticmethod 25 | def _raise_type_error(expected_type, value): 26 | error_message = "Expected variable type %s but got %s" 27 | raise TypeError(error_message % (expected_type, type(value))) 28 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/predict_validator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import xgboost as xgb 3 | 4 | from molgenis.capice.core.logger import Logger 5 | 6 | 7 | class PredictValidator: 8 | def __init__(self): 9 | self.log = Logger().logger 10 | 11 | def validate_data_predict_ready(self, dataset: pd.DataFrame, model: xgb.XGBClassifier) ->\ 12 | None: 13 | """ 14 | Validates if dataset is predict ready according to the feature names in model 15 | 16 | Args: 17 | dataset: 18 | The dataset that is supposed to be predict ready. 19 | model: 20 | The custom CAPICE xgboost.XGBClassifier. 21 | Raises: 22 | KeyError: 23 | Raised when a required predict feature is missing from dataset. 24 | """ 25 | missing = [] 26 | for feature in model.get_booster().feature_names: # type: ignore 27 | if feature not in dataset.columns: 28 | missing.append(feature) 29 | if len(missing) > 0: 30 | error_message = 'Missing required predict column(s): %s' 31 | self.log.critical(error_message, ', '.join(missing)) 32 | raise KeyError(error_message % ', '.join(missing)) 33 | -------------------------------------------------------------------------------- /tests/capice/validators/test_post_vep_processing_validator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from tests.capice.test_templates import teardown, ResourceFile, load_model 6 | from molgenis.capice.validators.post_vep_processing_validator import PostVEPProcessingValidator 7 | 8 | 9 | class TestPostVEPProcessingValidator(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | print('Setting up.') 13 | cls.dataset = pd.DataFrame( 14 | { 15 | 'chr': [1, 2], 16 | 'pos': [100, 200], 17 | 'REF': ['A', 'A'], 18 | 'ALT': ['T', 'T'], 19 | 'feat1': ['foo', 'bar'] 20 | } 21 | ) 22 | cls.validator = PostVEPProcessingValidator() 23 | cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 24 | 25 | @classmethod 26 | def tearDownClass(cls) -> None: 27 | print('Tearing down.') 28 | teardown() 29 | 30 | def test_validate_features_present_incorrect(self): 31 | print('KeyError raised due to missing VEP processed feature') 32 | self.assertRaises( 33 | KeyError, 34 | self.validator.validate_features_present, 35 | self.dataset, 36 | self.model.vep_features.values() 37 | ) 38 | 39 | 40 | if __name__ == '__main__': 41 | unittest.main() 42 | -------------------------------------------------------------------------------- 
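PropertyTypeValidator deliberately branches on bool first: since bool subclasses int, isinstance(True, int) would otherwise let booleans slip through integer checks, so booleans require an exact type match. None only passes when include_none=True. A small demonstration of those branches, assuming the package is importable:

from molgenis.capice.validators.property_type_validator import PropertyTypeValidator

validator = PropertyTypeValidator()
validator.validate_property('CDS_position', str)           # passes silently
validator.validate_property(None, str, include_none=True)  # None explicitly allowed
try:
    validator.validate_property(True, int)  # exact-match rule rejects bool-as-int
except TypeError as error:
    print(error)  # Expected variable type <class 'int'> but got <class 'bool'>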
/src/molgenis/capice/utilities/input_parser.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.core.logger import Logger 4 | 5 | 6 | class InputParser: 7 | def __init__(self): 8 | self.log = Logger().logger 9 | self.sep = '\t' 10 | 11 | def set_separator(self, sep: str): 12 | """ 13 | Function to overwrite the default separator 'tab'. 14 | Currently has no real use, but might be needed 15 | in the future if the default separator in the VEP output changes and 16 | the separator has to be dynamically changed. 17 | 18 | :param sep: str, separator to be used in the pandas.read_csv call 19 | """ 20 | self.sep = sep 21 | 22 | def parse(self, input_file_path: str): 23 | """ 24 | Method to start the parsing of additional information from the input 25 | file. 26 | 27 | :param input_file_path: str, path to the input file 28 | """ 29 | if self.sep == '\t': 30 | used_sep = 'Tab' 31 | else: 32 | used_sep = self.sep 33 | self.log.info('Reading VEP file from: %s using separator: %s', input_file_path, used_sep) 34 | input_file = pd.read_csv(input_file_path, sep=self.sep, na_values='.', low_memory=False) 35 | message = 'Input file at %s loaded with %s samples.' 36 | self.log.info(message, input_file_path, input_file.shape[0]) 37 | return input_file 38 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/post_vep_processing_validator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.core.logger import Logger 4 | from molgenis.capice.utilities import check_if_in_list 5 | 6 | 7 | class PostVEPProcessingValidator: 8 | def __init__(self): 9 | self.log = Logger().logger 10 | 11 | def validate_features_present(self, datafile: pd.DataFrame, vep_features: list[list[str]]) -> \ 12 | None: 13 | """ 14 | Validator to check that all features that should be present after the 15 | ManualVEPProcessor are indeed present. 16 | Args: 17 | datafile: 18 | Pandas Dataframe over which the feature presence validation should happen. 19 | vep_features: 20 | List of lists of expected output ManualVEPProcessing features as saved in the 21 | model.vep_features.values() 22 | Raises: 23 | KeyError: 24 | Raises KeyError when output VEP feature is not present within datafile. 25 | """ 26 | features_not_present = check_if_in_list(vep_features, datafile.columns) 27 | if len(features_not_present) > 0: 28 | error_message = 'Detected required feature(s) %s not ' \ 29 | 'present within VEP processed input file!'
30 | self.log.critical(error_message, ', '.join(features_not_present)) 31 | raise KeyError(error_message % ', '.join(features_not_present)) 32 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_load_file_postprocessor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor 6 | 7 | 8 | class TestLoadFilePostProcessor(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | print('Setting up.') 12 | df = pd.DataFrame( 13 | { 14 | 'CHROM': [1], 15 | 'POS': [123], 16 | 'REF': ['A'], 17 | 'ALT': ['G'], 18 | 'Gene': [123], 19 | 'SYMBOL_SOURCE': ['hgnc'], 20 | 'Feature': ['NM1.123'], 21 | 'SYMBOL': ['ACDC'], 22 | 'INTRON': [5], 23 | 'EXON': [11], 24 | } 25 | ) 26 | cls.processor = LoadFilePostProcessor(df) 27 | 28 | def test_process(self): 29 | observed = self.processor.process() 30 | expected = pd.DataFrame( 31 | { 32 | 'chr': [1], 33 | 'pos': [123], 34 | 'REF': ['A'], 35 | 'ALT': ['G'], 36 | 'gene_id': [123], 37 | 'id_source': ['hgnc'], 38 | 'feature': ['NM1.123'], 39 | 'gene_name': ['ACDC'], 40 | 'Intron': [5], 41 | 'Exon': [11] 42 | } 43 | ) 44 | pd.testing.assert_frame_equal(expected, observed) 45 | 46 | 47 | if __name__ == '__main__': 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/load_file_postprocessor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.core.logger import Logger 4 | from molgenis.capice.utilities.enums import InputColumn 5 | 6 | 7 | class LoadFilePostProcessor: 8 | def __init__(self, dataset: pd.DataFrame): 9 | self.log = Logger().logger 10 | self.dataset = dataset 11 | 12 | def process(self): 13 | """ 14 | Function to start the LoadFilePostProcessor, which renames certain 15 | columns of the input file to their internal names, 16 | like CHROM to chr. 17 | 18 | Returns 19 | ------- 20 | dataset : pandas.DataFrame 21 | Processed dataset with renamed columns. 22 | """ 23 | self.log.info('LoadFilePostProcessor starting.') 24 | self._col_renamer() 25 | self.log.info('LoadFilePostProcessor successful.') 26 | return self.dataset 27 | 28 | def _col_renamer(self): 29 | """ 30 | Function to rename input columns (e.g. Gene, Feature, SYMBOL, INTRON and EXON) to 31 | their internal names (gene_id, feature, gene_name, Intron and Exon).
32 | """ 33 | to_rename = {} 34 | for column in InputColumn: 35 | if column.col_input_name in self.dataset.columns: 36 | to_rename[column.col_input_name] = column.col_name 37 | self.log.debug(f'Converting the following column names: {to_rename}') 38 | self.dataset.rename(columns=to_rename, inplace=True) 39 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_class_suggestor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.utilities.enums import Column, OutputClasses 6 | from molgenis.capice.utilities.class_suggestor import ClassSuggestor 7 | 8 | 9 | class TestClassSuggestor(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | cls.dataset = pd.DataFrame( 13 | { 14 | Column.gene_name.value: ['foo', 'TTN', 'COL7A1', 'MEFV', 'bar'], 15 | Column.score.value: [0.9234, 0.2134, 0.0012, 0.4563, 0.7854] 16 | } 17 | ) 18 | 19 | def test_apply_suggestion(self): 20 | suggestor = ClassSuggestor() 21 | copy_dataset = self.dataset.copy(deep=True) 22 | observed = suggestor.apply_suggestion(copy_dataset) 23 | expected = pd.concat( 24 | [ 25 | self.dataset, 26 | pd.DataFrame( 27 | { 28 | Column.suggested_class.value: [ 29 | OutputClasses.unknown.value, 30 | OutputClasses.unknown.value, 31 | OutputClasses.unknown.value, 32 | OutputClasses.unknown.value, 33 | OutputClasses.unknown.value 34 | ] 35 | } 36 | ) 37 | ], axis=1 38 | ) 39 | pd.testing.assert_frame_equal(observed, expected) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/template.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.validators.property_type_validator import PropertyTypeValidator 7 | 8 | 9 | class Template(metaclass=ABCMeta): 10 | def __init__(self, name, usable): 11 | self.property_checker = PropertyTypeValidator() 12 | self.name = name 13 | self.usable = usable 14 | 15 | @property 16 | def name(self): 17 | return self._name 18 | 19 | @name.setter 20 | def name(self, value='Template'): 21 | self.property_checker.validate_property(value=value, expected_type=str) 22 | self._name = value 23 | 24 | @property 25 | @abstractmethod 26 | def columns(self): 27 | return [] 28 | 29 | @property 30 | def usable(self): 31 | return self._usable 32 | 33 | @usable.setter 34 | def usable(self, value=False): 35 | self.property_checker.validate_property(value=value, expected_type=bool) 36 | self._usable = value 37 | 38 | @property 39 | def drop(self): 40 | return True 41 | 42 | @staticmethod 43 | def _fillna(): 44 | return np.nan 45 | 46 | def process(self, dataframe: pd.DataFrame): 47 | if dataframe[self.name].isnull().all(): 48 | dataframe[self.columns] = self._fillna() 49 | return dataframe 50 | else: 51 | return self._process(dataframe) 52 | 53 | @abstractmethod 54 | def _process(self, dataframe: pd.DataFrame): 55 | return dataframe 56 | -------------------------------------------------------------------------------- /tests/capice/vep/test_sift.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep.sift import SIFT 7 | 8 | 9 | class 
TestSift(unittest.TestCase): 10 | def test_sift(self): 11 | dataset = pd.DataFrame( 12 | { 13 | 'SIFT': [np.nan, 0.002, 0.05, 0.9] 14 | } 15 | ) 16 | expected = pd.concat( 17 | [ 18 | dataset, 19 | pd.DataFrame( 20 | { 21 | 'SIFTval': [np.nan, 0.002, 0.05, 0.9], 22 | 'SIFTcat': [np.nan, 'deleterious', 'deleterious', 'tolerated'] 23 | } 24 | ) 25 | ], axis=1 26 | ) 27 | observed = SIFT().process(dataset) 28 | pd.testing.assert_frame_equal(observed.sort_index(axis=1), expected.sort_index(axis=1)) 29 | 30 | def test_sift_full_nan(self): 31 | list_of_nans = [np.nan, np.nan, np.nan] 32 | dataset = pd.DataFrame( 33 | { 34 | 'SIFT': list_of_nans 35 | } 36 | ) 37 | expected = pd.concat( 38 | [ 39 | dataset, 40 | pd.DataFrame( 41 | { 42 | 'SIFTval': list_of_nans, 43 | 'SIFTcat': list_of_nans 44 | } 45 | ) 46 | ], axis=1 47 | ) 48 | observed = SIFT().process(dataset) 49 | pd.testing.assert_frame_equal(observed.sort_index(axis=1), expected.sort_index(axis=1)) 50 | 51 | 52 | if __name__ == '__main__': 53 | unittest.main() 54 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/column_utils.py: -------------------------------------------------------------------------------- 1 | class ColumnUtils: 2 | """ 3 | Utility class for columns. 4 | """ 5 | 6 | def __init__(self): 7 | self.specified_columns = set() 8 | 9 | def get_specified_columns(self): 10 | """ 11 | Getter for specified columns 12 | :return: list of specified columns 13 | """ 14 | return self.specified_columns 15 | 16 | def set_specified_columns(self, specified_columns): 17 | """ 18 | Setter for specified columns 19 | :param specified_columns: list 20 | """ 21 | self.specified_columns = set(specified_columns) 22 | 23 | def add_to_specified_columns(self, columns): 24 | """ 25 | Adds column(s) to the set of specified columns. 
26 | :param columns: string/int/float or list/tuple/set 27 | """ 28 | if type(columns) in [str, int, float]: 29 | columns = [columns] 30 | for column in columns: 31 | self.specified_columns.add(column) 32 | 33 | def column_in_specified_columns(self, column): 34 | """ 35 | Checks whether column is in specified columns 36 | :param column: string 37 | :return: boolean 38 | """ 39 | return column in self.specified_columns 40 | 41 | def get_missing_diff_with(self, columns): 42 | """ 43 | Get the list of columns missing from the presented columns, 44 | compared to the specified columns 45 | :param columns: list of columns 46 | :return: list 47 | list of columns that are in the specified columns (specified_columns), 48 | but not in the presented ones (columns) 49 | """ 50 | return list(self.specified_columns - set(columns)) 51 | -------------------------------------------------------------------------------- /tests/capice/test_main_predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import pandas as pd 5 | 6 | from molgenis.capice.main_predict import CapicePredict 7 | from tests.capice.test_templates import set_up_manager_and_out, teardown, _project_root_directory, \ 8 | ResourceFile, load_model 9 | 10 | 11 | class TestMainNonTrain(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls): 14 | print('Setting up.') 15 | manager, cls.output_dir = set_up_manager_and_out() 16 | manager.output_filename = os.path.join(cls.output_dir, 'test_output.tsv') 17 | 18 | cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 19 | 20 | @classmethod 21 | def tearDownClass(cls): 22 | print('Performing teardown.') 23 | teardown() 24 | 25 | def setUp(self): 26 | print('Performing test:') 27 | 28 | def test_integration_main_nontrain(self): 29 | print('Main no-train (integration)') 30 | infile = os.path.join(_project_root_directory, 'resources', 'predict_input.tsv.gz') 31 | predict = CapicePredict(input_path=infile, model=self.model, output_path=self.output_dir, 32 | output_given=True, force=False) 33 | predict.run() 34 | prediction_output = pd.read_csv(os.path.join(self.output_dir, 'test_output.tsv'), sep='\t') 35 | self.assertEqual(prediction_output.shape, (4, 11)) 36 | self.assertListEqual( 37 | list(prediction_output.columns), 38 | [ 39 | 'chr', 'pos', 'ref', 'alt', 'gene_name', 'gene_id', 'id_source', 'feature', 40 | 'feature_type', 'score', 'suggested_class' 41 | ] 42 | ) 43 | 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /tests/capice/cli/test_args_handler_train.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | from argparse import ArgumentParser 4 | from io import StringIO 5 | 6 | from molgenis.capice.cli.args_handler_train import ArgsHandlerTrain 7 | 8 | 9 | class TestArgsHandlerTrain(unittest.TestCase): 10 | 11 | def setUp(self): 12 | parser = ArgumentParser( 13 | description="CAPICE test" 14 | ) 15 | self.aht = ArgsHandlerTrain(parser) 16 | 17 | @patch('sys.stderr', new_callable=StringIO) 18 | def test_validate_n_threads(self, stderr): 19 | with self.assertRaises(SystemExit): 20 | self.aht.validate_n_threads(0) 21 | self.assertIn('The amount of threads has to be at least 1!', stderr.getvalue()) 22 | 23 | @patch('sys.stderr', new_callable=StringIO) 24 | def test_validate_test_split_0(self, stderr): 25 | with
self.assertRaises(SystemExit): 26 | self.aht.validate_test_split(0) 27 | self.assertIn('Test split must be a float between 0 and 1', stderr.getvalue()) 28 | 29 | @patch('sys.stderr', new_callable=StringIO) 30 | def test_validate_test_split_1(self, stderr): 31 | with self.assertRaises(SystemExit): 32 | self.aht.validate_test_split(1) 33 | self.assertIn('Test split must be a float between 0 and 1', stderr.getvalue()) 34 | 35 | def test_property_str_versions(self): 36 | args_handler = ArgsHandlerTrain(ArgumentParser()) 37 | self.assertEqual('.tsv, .tsv.gz', args_handler._extension_str()) 38 | self.assertEqual('.json', args_handler._features_extension_str()) 39 | self.assertEqual('.json, .ubj', args_handler._required_output_extensions_str()) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/template_position.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep.template import Template 7 | 8 | 9 | class TemplatePosition(Template): 10 | def __init__(self, name='Template', usable=False): 11 | super(TemplatePosition, self).__init__( 12 | name=name, 13 | usable=usable 14 | ) 15 | 16 | @property 17 | @abstractmethod 18 | def columns(self): 19 | return [None, None] 20 | 21 | @property 22 | def pos_col(self): 23 | return self.columns[0] 24 | 25 | def _process(self, dataframe: pd.DataFrame): 26 | if self.name in dataframe.select_dtypes(include='O'): 27 | if dataframe[self.name].str.split('/', expand=True).shape[1] > 1: 28 | dataframe[self.columns] = dataframe[self.name].str.split('/', expand=True) 29 | else: 30 | dataframe[self.pos_col] = dataframe[self.name] 31 | dataframe[self.columns[1]] = np.nan 32 | dataframe[self.pos_col] = dataframe[self.pos_col].str.replace('?-', '', regex=False) 33 | dataframe[self.pos_col] = dataframe[self.pos_col].str.replace('-?', '', regex=False) 34 | dataframe[self.pos_col] = dataframe[self.pos_col].str.split('-', expand=True)[0] 35 | 36 | for column in self.columns: 37 | dataframe.loc[dataframe[dataframe[column] == ''].index, column] = np.nan 38 | dataframe[column] = dataframe[column].astype(float) 39 | else: 40 | dataframe[self.pos_col] = dataframe[self.name] 41 | for col in self.columns: 42 | if col not in dataframe.columns: 43 | dataframe[col] = np.nan 44 | return dataframe 45 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_predict.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from molgenis.capice.utilities.enums import Column 4 | from tests.capice.test_templates import set_up_impute_preprocess, teardown 5 | 6 | 7 | class TestPredict(unittest.TestCase): 8 | @classmethod 9 | def setUpClass(cls): 10 | print('Setting up.') 11 | cls.main, cls.model = set_up_impute_preprocess() 12 | 13 | @classmethod 14 | def tearDownClass(cls): 15 | print('Tearing down.') 16 | teardown() 17 | 18 | def setUp(self): 19 | print('Testing case:') 20 | 21 | def test_unit_prediction(self): 22 | """ 23 | Unit test for the prediction part of CAPICE. 
24 | """ 25 | print('Prediction (unit)') 26 | self.main.predict( 27 | self.main.categorical_process( 28 | self.main.process( 29 | self.main._load_file(), process_features=self.model.vep_features.keys() 30 | )[0], processing_features=self.model.processable_features 31 | )[0] 32 | ) 33 | 34 | def test_component_prediction(self): 35 | """ 36 | Component test for prediction to see if the combined score of all is 37 | greater than 0. 38 | """ 39 | print('Prediction (component)') 40 | prediction = self.main.predict( 41 | self.main.categorical_process( 42 | self.main.process( 43 | self.main._load_file(), process_features=self.model.vep_features.keys() 44 | )[0], processing_features=self.model.processable_features 45 | )[0] 46 | ) 47 | # Combined sum of the prediction score should be higher than 0 48 | self.assertGreater(prediction[Column.score.value].sum(), 0) 49 | 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from setuptools import setup, find_namespace_packages 4 | from src.molgenis.capice import __version__ 5 | 6 | with open('README.md', 'r', encoding='utf-8') as fh: 7 | long_description = fh.read() 8 | 9 | setup( 10 | name='capice', 11 | version=__version__, 12 | packages=find_namespace_packages('src', exclude=['tests', 'scripts']), 13 | package_dir={"": "src"}, 14 | url='https://capice.molgeniscloud.org/', 15 | license='LGPL-3.0', 16 | author='Shuang Li, Robert Sietsma and Molgenis', 17 | author_email='support@molgenis.org', 18 | description='Consequence Agnostic Pathogenicity Interpretation of ' 19 | 'Clinical Exoma variations. State of the art machine learning ' 20 | 'to predict SNVs and InDels pathogenicity.', 21 | long_description=long_description, 22 | long_description_content_type='text/markdown', 23 | classifiers=[ 24 | 'Development Status :: 4 - Beta', 25 | 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)', 26 | 'Programming Language :: Python :: 3.10' 27 | ], 28 | python_requires='>=3.10', 29 | install_requires=[ 30 | 'numpy==1.26.4', 31 | 'pandas==1.5.3', 32 | 'scipy==1.14.1', 33 | 'scikit-learn==1.5.2', 34 | 'xgboost==1.7.6' 35 | ], 36 | extras_require={ 37 | 'test': [ 38 | 'pytest', # pytest 39 | 'coverage', # coverage run -m pytest --junitxml=results.xml && coverage html 40 | 'mypy', # mypy --ignore-missing-imports src/ 41 | 'flake8', # flake8 src/ tests/ 42 | 'flake8-import-order' 43 | ] 44 | }, 45 | entry_points={ 46 | 'console_scripts': [ 47 | 'capice = molgenis.capice.capice:main' 48 | ] 49 | } 50 | 51 | ) 52 | -------------------------------------------------------------------------------- /tests/capice/vep/test_cdna_position.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep import cdna_position 7 | 8 | 9 | class TestType(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | print('Setting up.') 13 | cls.cdna_pos = cdna_position.CDNAPosition() 14 | 15 | def test_process(self): 16 | dataframe = pd.DataFrame({'cDNA_position': ['305/702', '60/550', '?-/123', '-?/456']}) 17 | observed = self.cdna_pos.process(dataframe) 18 | expected = pd.DataFrame({'cDNA_position': ['305/702', '60/550', '?-/123', '-?/456'], 19 | 'cDNApos': [305.00000, 60.00000, np.nan, np.nan], 20 | 'relcDNApos': 
/tests/capice/vep/test_cdna_position.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from molgenis.capice.vep import cdna_position 7 | 8 | 9 | class TestCDNAPosition(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | print('Setting up.') 13 | cls.cdna_pos = cdna_position.CDNAPosition() 14 | 15 | def test_process(self): 16 | dataframe = pd.DataFrame({'cDNA_position': ['305/702', '60/550', '?-/123', '-?/456']}) 17 | observed = self.cdna_pos.process(dataframe) 18 | expected = pd.DataFrame({'cDNA_position': ['305/702', '60/550', '?-/123', '-?/456'], 19 | 'cDNApos': [305.00000, 60.00000, np.nan, np.nan], 20 | 'relcDNApos': [702.00000, 550.0000, 123.00000, 456.00000]}) 21 | pd.testing.assert_frame_equal(expected, observed) 22 | 23 | def test_corner_case(self): 24 | dataframe = pd.DataFrame( 25 | { 26 | 'cDNA_position': ['483-486', '162-163'] 27 | } 28 | ) 29 | observed = self.cdna_pos.process(dataframe) 30 | expected = pd.DataFrame({'cDNA_position': ['483-486', '162-163'], 31 | 'cDNApos': [483.00000, 162.00000], 32 | 'relcDNApos': [np.nan, np.nan]}) 33 | pd.testing.assert_frame_equal(expected, observed) 34 | 35 | def test_process_nan(self): 36 | dataframe = pd.DataFrame({'cDNA_position': [np.nan, np.nan]}) 37 | observed = self.cdna_pos.process(dataframe) 38 | expected = pd.DataFrame({'cDNA_position': [np.nan, np.nan], 39 | 'cDNApos': [np.nan, np.nan], 40 | 'relcDNApos': [np.nan, np.nan]}) 41 | pd.testing.assert_frame_equal(expected, observed) 42 | 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /src/molgenis/capice/vep/type.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.utilities.enums import Column 4 | from molgenis.capice.vep.template import Template 5 | 6 | 7 | class Type(Template): 8 | def __init__(self): 9 | super(Type, self).__init__( 10 | name=Column.ref.value, 11 | usable=True 12 | ) 13 | 14 | @property 15 | def columns(self): 16 | return ['Type'] 17 | 18 | @staticmethod 19 | def _ensure_column_value_is_one(column): 20 | return column.str.len() == 1 21 | 22 | def _process(self, dataframe: pd.DataFrame): 23 | """ 24 | Process variants to annotate their types. 25 | :param dataframe: a dataframe with as columns at least a ref and an alt 26 | :return: a dataframe with an added types column 27 | 28 | if len(ref) == 1 and len(alt) == 1: 29 | type = 'SNV' 30 | elif ref[0] == alt and len(alt) == 1: 31 | type = 'DEL' 32 | elif alt[0] == ref and len(ref) == 1: 33 | type = 'INS' 34 | else: 35 | type = 'DELINS' 36 | """ 37 | alt_column = dataframe[Column.alt.value] 38 | ref_column = dataframe[Column.ref.value] 39 | 40 | alt_column_value_is_1 = self._ensure_column_value_is_one(alt_column) 41 | ref_column_value_is_1 = self._ensure_column_value_is_one(ref_column) 42 | 43 | first_ref_nuc = ref_column.str.get(0) 44 | first_alt_nuc = alt_column.str.get(0) 45 | 46 | dataframe[self.columns] = 'DELINS' 47 | dataframe.loc[ 48 | dataframe[ref_column_value_is_1 & alt_column_value_is_1].index, self.columns] = 'SNV' 49 | dataframe.loc[ 50 | dataframe[ 51 | (first_ref_nuc == alt_column) & alt_column_value_is_1].index, self.columns] = 'DEL' 52 | dataframe.loc[ 53 | dataframe[ 54 | (first_alt_nuc == ref_column) & ref_column_value_is_1].index, self.columns] = 'INS' 55 | return dataframe 56 | 57 | @property 58 | def drop(self): 59 | return False 60 | --------------------------------------------------------------------------------
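The docstring of Type._process spells out the classification rule that the vectorised pandas code then applies; the same logic on single ref/alt pairs, for illustration (a standalone sketch mirroring the docstring, not the implementation above):

def sketch_variant_type(ref: str, alt: str) -> str:
    # Order follows the docstring in Type._process: SNV, then DEL, then INS, else DELINS.
    if len(ref) == 1 and len(alt) == 1:
        return 'SNV'
    if ref[0] == alt and len(alt) == 1:
        return 'DEL'
    if alt[0] == ref and len(ref) == 1:
        return 'INS'
    return 'DELINS'


for ref, alt in [('A', 'T'), ('GAT', 'G'), ('G', 'GAT'), ('AT', 'GC')]:
    print(ref, alt, sketch_variant_type(ref, alt))
# A T SNV / GAT G DEL / G GAT INS / AT GC DELINS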
/src/molgenis/capice/cli/args_handler_explain.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.main_explain import CapiceExplain 2 | from molgenis.capice.core.capice_manager import CapiceManager 3 | from molgenis.capice.cli.args_handler_parent import ArgsHandlerParent 4 | from molgenis.capice.validators.model_validator import ModelValidator 5 | 6 | 7 | class ArgsHandlerExplain(ArgsHandlerParent): 8 | """ 9 | Handler for the CAPICE submodule Explain 10 | """ 11 | 12 | def __init__(self, parser): 13 | super(ArgsHandlerExplain, self).__init__(parser=parser) 14 | 15 | @property 16 | def _extension(self): 17 | return '.json', '.ubj' 18 | 19 | @property 20 | def _required_output_extensions(self): 21 | return '.tsv', '.tsv.gz' 22 | 23 | @property 24 | def _empty_output_extension(self): 25 | return self._required_output_extensions[1] 26 | 27 | def create(self): 28 | self.parser.add_argument( 29 | '-i', 30 | '--input', 31 | action='append', 32 | type=str, 33 | required=True, 34 | help=f'path to trained model ({self._extension_str()}) (required)' 35 | ) 36 | self.parser.add_argument( 37 | '-o', 38 | '--output', 39 | action='append', 40 | type=str, 41 | help=f'path to directory or file ({self._required_output_extensions_str()}) for ' 42 | f'exporting explain output (optional)' 43 | ) 44 | self.parser.add_argument( 45 | '-f', 46 | '--force', 47 | action='store_true', 48 | help='overwrites output if it already exists' 49 | ) 50 | 51 | def _handle_module_specific_args(self, input_path, output_path, output_filename, output_given, 52 | args): 53 | model = self.load_model(input_path) 54 | validator = ModelValidator() 55 | validator.validate_has_required_attributes(model) 56 | CapiceManager().output_filename = output_filename 57 | CapiceExplain(model, output_path, output_given, self.force).run() 58 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/input_validator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | from pathlib import Path 4 | 5 | 6 | class InputValidator: 7 | """ 8 | Validator for the CLI arguments 9 | """ 10 | @staticmethod 11 | def validate_input_path(input_path: os.PathLike, extension: tuple[str, ...]): 12 | """ 13 | Function to validate if there is a file at the input location 14 | :param input_path: full path to input file 15 | :param extension: tuple of allowed extensions that the input file should end with 16 | """ 17 | if not os.path.exists(input_path): 18 | raise FileNotFoundError(f'{input_path} does not exist!') 19 | if not str(input_path).endswith(extension): 20 | raise IOError(f'{input_path} does not match required extension: ' 21 | f'{", ".join(extension)}') 22 | 23 | @staticmethod 24 | def validate_output_path(output_path): 25 | """ 26 | Function to validate if the output directory exists and, 27 | if not, make it. 28 | :param output_path: path to output folder 29 | """ 30 | # If the output directory is not present and 31 | # the parent directory is also not writeable, throw OSError 32 | if not os.path.isdir(output_path) and not os.access(Path(output_path).parent, os.W_OK): 33 | raise OSError('New output directory cannot be made in a read/execute only directory!') 34 | # If the output directory is present but not writable, throw OSError 35 | elif os.path.isdir(output_path) and not os.access(output_path, os.W_OK): 36 | raise OSError('Output directory is not writable!') 37 | # If the output directory is not yet present, 38 | # but passed the check that it is in a writable parent directory, 39 | # only warn 40 | elif not os.path.isdir(output_path): 41 | warnings.warn("Output directory does not exist, creating.") 42 | os.makedirs(output_path) 43 | # No else is required, since the else would be to place the output file 44 | # in a writeable output directory that is already present. 45 | --------------------------------------------------------------------------------
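How these checks behave at a call site, in short (the paths are invented for the illustration):

from molgenis.capice.validators.input_validator import InputValidator

validator = InputValidator()
# Passes silently for an existing path with an allowed extension; raises
# FileNotFoundError for a missing path and IOError for a mismatched extension.
validator.validate_input_path('/data/example.tsv.gz', extension=('.tsv', '.tsv.gz'))
# Raises OSError for unwritable locations; merely warns and creates the
# directory when it is absent but its parent is writable.
validator.validate_output_path('/data/capice_output')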
/tests/capice/utilities/test_file_postprocessor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from tests.capice.test_templates import teardown, set_up_manager_and_out 6 | from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor 7 | 8 | 9 | class TestFilePostProcessor(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | print('Setting up.') 13 | set_up_manager_and_out() 14 | 15 | @classmethod 16 | def tearDownClass(cls) -> None: 17 | print('Tearing down.') 18 | teardown() 19 | 20 | def setUp(self) -> None: 21 | print('Testing case:') 22 | 23 | def test_load_file_pre_processor(self): 24 | """ 25 | Test to see if the post file loading processor outputs according to 26 | expectation. Note: chromosome stays an integer until the imputer, 27 | which is why it is not marked as a string. 28 | """ 29 | print('Load file preprocessor.') 30 | data = pd.DataFrame( 31 | { 32 | "CHROM": [1, 2, 3], 33 | "POS": [100, 200, 300], 34 | "REF": ['A', 'T', 'G'], 35 | "ALT": ['T', 'G', 'A'], 36 | "SYMBOL_SOURCE": ['foo', 'foo', 'bar'], 37 | "Feature": ['bar', 'bar', 'buz'], 38 | "SYMBOL": ['g1', 'g2', 'g3'], 39 | "INTRON": [1, 0, 0], 40 | "EXON": [0, 1, 1] 41 | } 42 | ) 43 | 44 | expected_output = pd.DataFrame( 45 | { 46 | "chr": [1, 2, 3], 47 | "pos": [100, 200, 300], 48 | "REF": ['A', 'T', 'G'], 49 | "ALT": ['T', 'G', 'A'], 50 | "id_source": ['foo', 'foo', 'bar'], 51 | "feature": ['bar', 'bar', 'buz'], 52 | "gene_name": ['g1', 'g2', 'g3'], 53 | "Intron": [1, 0, 0], 54 | "Exon": [0, 1, 1] 55 | } 56 | ) 57 | 58 | processor = LoadFilePostProcessor(dataset=data) 59 | observed_output = processor.process() 60 | pd.testing.assert_frame_equal(expected_output, observed_output) 61 | 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | --------------------------------------------------------------------------------
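The test above fixes the renaming contract of LoadFilePostProcessor; the same effect can be sketched with a plain pandas rename (the mapping is copied from the expectations above, not from the processor's source):

import pandas as pd

renames = {'CHROM': 'chr', 'POS': 'pos', 'SYMBOL_SOURCE': 'id_source', 'Feature': 'feature',
           'SYMBOL': 'gene_name', 'INTRON': 'Intron', 'EXON': 'Exon'}
data = pd.DataFrame({'CHROM': [1], 'POS': [100], 'REF': ['A'], 'ALT': ['T'],
                     'SYMBOL_SOURCE': ['foo'], 'Feature': ['bar'], 'SYMBOL': ['g1'],
                     'INTRON': [1], 'EXON': [0]})
print(data.rename(columns=renames).columns.tolist())
# ['chr', 'pos', 'REF', 'ALT', 'id_source', 'feature', 'gene_name', 'Intron', 'Exon']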
37 | """ 38 | chr_pos_ref_alt = 'chr_pos_ref_alt' 39 | chr = 'chr' 40 | pos = 'pos' 41 | ref = 'REF' 42 | alt = 'ALT' 43 | gene_name = 'gene_name' 44 | gene_id = 'gene_id' 45 | id_source = 'id_source' 46 | feature = 'feature' 47 | feature_type = 'feature_type' 48 | score = 'score' 49 | suggested_class = 'suggested_class' 50 | other = 'other_CAPICE_value' 51 | 52 | 53 | class OutputClasses(Enum): 54 | """ 55 | Enums to use for the suggested output classes. 56 | """ 57 | unknown = 'VUS' 58 | # Variables already defined for future implementation 59 | tolerated = '' 60 | likely_tolerated = '' 61 | likely_damaging = '' 62 | damaging = '' 63 | 64 | 65 | class UniqueSeparator(Enum): 66 | """ 67 | Enum specific to creating a specific separator for the preservation of the chr pos ref alt 68 | columns. 69 | """ 70 | unique_separator = '_VeryUniqueCAPICESeparator_' 71 | 72 | 73 | class Versioning(Enum): 74 | VALIDATION_REGEX = (r'^(?P\d+)\.(?P\d+)\.(?P\d+)' 75 | r'(-?(?Pa|b|rc[0-9]+))?$') 76 | -------------------------------------------------------------------------------- /tests/capice/test_main_explain.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | import pandas as pd 5 | 6 | from tests.capice.test_templates import _project_root_directory, ResourceFile, load_model 7 | from molgenis.capice.core.capice_manager import CapiceManager 8 | from molgenis.capice.main_explain import CapiceExplain 9 | 10 | 11 | class TestCapiceExplain(unittest.TestCase): 12 | output_path = os.path.join(_project_root_directory, 'testing_output') 13 | output_filename = 'test_output.csv.gz' 14 | full_output_path = os.path.join(output_path, output_filename) 15 | 16 | @classmethod 17 | def setUpClass(cls) -> None: 18 | cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 19 | if not os.path.isdir(cls.output_path): 20 | os.makedirs(cls.output_path) 21 | CapiceManager().output_filename = cls.output_filename 22 | 23 | @classmethod 24 | def tearDownClass(cls) -> None: 25 | if os.path.isfile(cls.full_output_path): 26 | os.remove(cls.full_output_path) 27 | if os.path.isdir(cls.output_path): 28 | os.rmdir(cls.output_path) 29 | 30 | def test_capice_explain(self): 31 | explainer = CapiceExplain( 32 | model=self.model, 33 | output_path=self.output_path, 34 | output_given=True, 35 | force=False 36 | ) 37 | explainer.run() 38 | feature_importances = self.model.get_booster().get_score(importance_type='gain') 39 | observed = pd.read_csv(self.full_output_path, sep='\t') 40 | expected = pd.DataFrame( 41 | data=[ 42 | feature_importances.keys(), 43 | feature_importances.values() 44 | ], index=['feature', 'gain'] 45 | ).T.sort_values(by='gain', ascending=False).reset_index(drop=True) 46 | expected['gain'] = expected['gain'].astype(float) 47 | expected['total_gain'] = expected['feature'].map(self.model.get_booster().get_score( 48 | importance_type='total_gain')) 49 | expected['weight'] = expected['feature'].map(self.model.get_booster().get_score( 50 | importance_type='weight')) 51 | expected['cover'] = expected['feature'].map(self.model.get_booster().get_score( 52 | importance_type='cover')) 53 | expected['total_cover'] = expected['feature'].map(self.model.get_booster().get_score( 54 | importance_type='total_cover')) 55 | pd.testing.assert_frame_equal(observed, expected) 56 | 57 | 58 | if __name__ == '__main__': 59 | unittest.main() 60 | -------------------------------------------------------------------------------- /src/molgenis/capice/main_predict.py: 
/src/molgenis/capice/main_predict.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.main_capice import Main 2 | from molgenis.capice.utilities.predictor import Predictor 3 | from molgenis.capice.utilities.class_suggestor import ClassSuggestor 4 | from molgenis.capice.validators.predict_validator import PredictValidator 5 | from molgenis.capice.validators.post_vep_processing_validator import PostVEPProcessingValidator 6 | 7 | 8 | class CapicePredict(Main): 9 | """ 10 | Predict class of CAPICE to call the different modules to impute, 11 | process and eventually predict a score over a CAPICE annotated file. 12 | """ 13 | 14 | def __init__(self, input_path, model, output_path, output_given, force): 15 | super().__init__( 16 | input_path, 17 | output_path, 18 | output_given, 19 | force 20 | ) 21 | 22 | # Model. 23 | self.model = model 24 | 25 | def run(self): 26 | """ 27 | Function to run CAPICE in prediction mode. 28 | """ 29 | capice_data = self._load_file() 30 | capice_data = self.process( 31 | loaded_data=capice_data, 32 | process_features=list(self.model.vep_features.keys()) 33 | )[0] 34 | PostVEPProcessingValidator().validate_features_present( 35 | capice_data, self.model.vep_features.values() 36 | ) 37 | capice_data = self.categorical_process( 38 | loaded_data=capice_data, 39 | processing_features=self.model.processable_features, 40 | train_features=None 41 | )[0] 42 | capice_data = self.predict(loaded_data=capice_data) 43 | capice_data = self.apply_suggested_class(predicted_data=capice_data) 44 | self._export(dataset=capice_data, output=self.output) 45 | 46 | def predict(self, loaded_data): 47 | """ 48 | Function to call the correct model to predict CAPICE scores 49 | :return: pandas DataFrame 50 | """ 51 | validator = PredictValidator() 52 | validator.validate_data_predict_ready(loaded_data, self.model) 53 | predictor = Predictor(self.model) 54 | capice_data = predictor.predict(loaded_data) 55 | return capice_data 56 | 57 | @staticmethod 58 | def apply_suggested_class(predicted_data): 59 | """ 60 | Method to call the ClassSuggestor 61 | :return: pandas DataFrame 62 | """ 63 | suggestor = ClassSuggestor() 64 | capice_data = suggestor.apply_suggestion(predicted_data) 65 | return capice_data 66 | -------------------------------------------------------------------------------- /tests/resources/VEP104.json: -------------------------------------------------------------------------------- 1 | { 2 | "Ref": null, 3 | "Alt": null, 4 | "Consequence": null, 5 | "GC": null, 6 | "CpG": null, 7 | "motifECount": null, 8 | "motifEScoreChng": null, 9 | "motifEHIPos": null, 10 | "oAA": null, 11 | "nAA": null, 12 | "cDNApos": null, 13 | "relcDNApos": null, 14 | "CDSpos": null, 15 | "relCDSpos": null, 16 | "protPos": null, 17 | "relProtPos": null, 18 | "Domain": null, 19 | "Dst2Splice": null, 20 | "Dst2SplType": null, 21 | "minDistTSS": null, 22 | "minDistTSE": null, 23 | "SIFTcat": null, 24 | "SIFTval": null, 25 | "PolyPhenCat": null, 26 | "PolyPhenVal": null, 27 | "priPhCons": null, 28 | "mamPhCons": null, 29 | "verPhCons": null, 30 | "priPhyloP": null, 31 | "mamPhyloP": null, 32 | "verPhyloP": null, 33 | "bStatistic": null, 34 | "targetScan": null, 35 | "mirSVR-Score": null, 36 | "mirSVR-E": null, 37 | "mirSVR-Aln": null, 38 | "cHmmTssA": null, 39 | "cHmmTssAFlnk": null, 40 | "cHmmTxFlnk": null, 41 | "cHmmTx": null, 42 | "cHmmTxWk": null, 43 | "cHmmEnhG": null, 44 | "cHmmEnh": null, 45 | "cHmmZnfRpts": null, 46 | "cHmmHet": null, 47 | "cHmmTssBiv": null, 48 |
"cHmmBivFlnk": null, 49 | "cHmmEnhBiv": null, 50 | "cHmmReprPC": null, 51 | "cHmmReprPCWk": null, 52 | "cHmmQuies": null, 53 | "GerpRS": null, 54 | "GerpRSpval": null, 55 | "GerpN": null, 56 | "GerpS": null, 57 | "TFBS": null, 58 | "TFBSPeaks": null, 59 | "TFBSPeaksMax": null, 60 | "tOverlapMotifs": null, 61 | "motifDist": null, 62 | "Segway": null, 63 | "EncH3K27Ac": null, 64 | "EncH3K4Me1": null, 65 | "EncH3K4Me3": null, 66 | "EncExp": null, 67 | "EncNucleo": null, 68 | "EncOCC": null, 69 | "EncOCCombPVal": null, 70 | "EncOCDNasePVal": null, 71 | "EncOCFairePVal": null, 72 | "EncOCpolIIPVal": null, 73 | "EncOCctcfPVal": null, 74 | "EncOCmycPVal": null, 75 | "EncOCDNaseSig": null, 76 | "EncOCFaireSig": null, 77 | "EncOCpolIISig": null, 78 | "EncOCctcfSig": null, 79 | "EncOCmycSig": null, 80 | "Grantham": null, 81 | "Dist2Mutation": null, 82 | "Freq100bp": null, 83 | "Rare100bp": null, 84 | "Sngl100bp": null, 85 | "Freq1000bp": null, 86 | "Rare1000bp": null, 87 | "Sngl1000bp": null, 88 | "Freq10000bp": null, 89 | "Rare10000bp": null, 90 | "Sngl10000bp": null, 91 | "dbscSNV-ada_score": null, 92 | "dbscSNV-rf_score": null, 93 | "Type": null, 94 | "Length": null 95 | } -------------------------------------------------------------------------------- /tests/capice/validators/test_property_type_validator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from molgenis.capice.validators.property_type_validator import PropertyTypeValidator 4 | 5 | 6 | class TestPropertyTypeValidator(unittest.TestCase): 7 | @classmethod 8 | def setUpClass(cls) -> None: 9 | print('Setting up.') 10 | cls.property_validator = PropertyTypeValidator() 11 | 12 | def setUp(self) -> None: 13 | print('Testing case:') 14 | 15 | def test_property_validator_correct(self): 16 | print('Property validator correct (not None)') 17 | value = 1.1 18 | expected_type = float 19 | self.property_validator.validate_property(value, expected_type) 20 | 21 | def test_property_validator_correct_with_none(self): 22 | print('Property validator including None') 23 | value = None 24 | expected_type = float 25 | self.property_validator.validate_property( 26 | value, 27 | expected_type, 28 | include_none=True 29 | ) 30 | 31 | def test_property_validator_incorrect(self): 32 | print('Property validator incorrect (without none)') 33 | value = 1 34 | expected_type = float 35 | self.assertRaises( 36 | TypeError, 37 | self.property_validator.validate_property, 38 | value, 39 | expected_type 40 | ) 41 | 42 | def test_property_validator_incorrect_with_none(self): 43 | print('Property validator incorrect including None') 44 | value = None 45 | expected_type = float 46 | self.assertRaises( 47 | TypeError, 48 | self.property_validator.validate_property, 49 | value, 50 | expected_type 51 | ) 52 | 53 | def test_property_validator_int_bool(self): 54 | print('Property validator with expected int and value is False') 55 | value = False 56 | expected_type = int 57 | self.assertRaises( 58 | TypeError, 59 | self.property_validator.validate_property, 60 | value, 61 | expected_type 62 | ) 63 | 64 | def test_property_validator_int_bool_include_none(self): 65 | print('Property validator with expected int, value is False and ' 66 | 'include_none is True') 67 | value = False 68 | expected_type = int 69 | self.assertRaises( 70 | TypeError, 71 | self.property_validator.validate_property, 72 | value, 73 | expected_type, 74 | True 75 | ) 76 | 77 | 78 | if __name__ == '__main__': 79 | unittest.main() 80 | 
-------------------------------------------------------------------------------- /src/molgenis/capice/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import functools 3 | import warnings 4 | from pathlib import Path 5 | from collections.abc import Iterable 6 | 7 | 8 | def get_project_root_dir(): 9 | """ 10 | Function to get the project root directory 11 | :return: Path instance 12 | """ 13 | # This file lives in capice/utilities, so two parents up resolves to the capice package root. 14 | return Path(__file__).parent.parent 15 | 16 | 17 | def deprecated(func): 18 | @functools.wraps(func) 19 | def new_func(*args, **kwargs): 20 | warnings.simplefilter('always', DeprecationWarning) 21 | warnings.warn('Call to deprecated function {}.'.format(func.__name__), 22 | category=DeprecationWarning, 23 | stacklevel=2) 24 | warnings.simplefilter('default', DeprecationWarning) 25 | return func(*args, **kwargs) 26 | 27 | return new_func 28 | 29 | 30 | def check_if_in_list(list_of_lists: list[list[object]], to_check_list: Iterable): 31 | """ 32 | Checks, for every item within the inner lists of list_of_lists, whether that item 33 | (int, str, float, etc.) occurs in to_check_list. Items that do not occur are added 34 | to the returned list; items that do occur are skipped. 35 | 36 | Args: 37 | list_of_lists: 38 | List containing lists of values (object). 39 | Each value is independently checked against to_check_list; values absent 40 | from it end up in the returned list. 41 | to_check_list: 42 | Iterable over which the individual items of the list_of_lists should be checked. 43 | 44 | Returns: 45 | list: 46 | A single list containing all individual items of list_of_lists that did not occur in 47 | to_check_list. 48 | 49 | """ 50 | return_list = [] 51 | for items in list_of_lists: 52 | for item in items: 53 | if item not in to_check_list: 54 | return_list.append(item) 55 | return return_list 56 | 57 | 58 | def check_file_exist(file_path: os.PathLike[str], force: bool): 59 | """ 60 | Method to check if a file exists; raises FileExistsError when it does and force is False. 61 | No error is raised when force is True, and likewise none is raised when the file 62 | does not exist. 63 | 64 | Args: 65 | file_path: 66 | Full absolute output path, including the output filename and extension. 67 | force: 68 | Command Line Argument of the "force" argument.
69 | 70 | """ 71 | if os.path.exists(file_path) and not force: 72 | raise FileExistsError("Output file already exists!") 73 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_dynamic_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from molgenis.capice.utilities.dynamic_loader import DynamicLoader 5 | from tests.capice.test_templates import set_up_manager_and_out, _project_root_directory 6 | 7 | 8 | class TestDynamicLoader(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls) -> None: 11 | print('Setting up.') 12 | cls.manager, output = set_up_manager_and_out() 13 | cls.correct_resources = os.path.join( 14 | _project_root_directory, 15 | 'tests', 16 | 'resources', 17 | 'dynamic_loader_test_files_present' 18 | ) 19 | cls.incorrect_resources = os.path.join( 20 | _project_root_directory, 21 | 'tests', 22 | 'resources', 23 | 'dynamic_loader_test_no_files' 24 | ) 25 | cls.required_attributes = ['name', 'some_function'] 26 | 27 | @classmethod 28 | def tearDownClass(cls) -> None: 29 | print('Tearing down.') 30 | 31 | def setUp(self) -> None: 32 | print('Testing case:') 33 | 34 | def test_no_directory_given_raise(self): 35 | print('Raise OSError: no directory given') 36 | self.assertRaises( 37 | OSError, 38 | DynamicLoader, 39 | self.required_attributes, 40 | os.path.join(_project_root_directory, 'some_random_directory') 41 | ) 42 | 43 | def test_manual_annotator_loader_correct(self): 44 | print('Loading correct manual annotator') 45 | loader = DynamicLoader( 46 | required_attributes=self.required_attributes, 47 | path=self.correct_resources 48 | ) 49 | loaded_modules = loader.load_manual_annotators() 50 | names = [] 51 | for module in loaded_modules: 52 | names.append(module.name) 53 | self.assertTrue('Correct' in names) 54 | 55 | def test_manual_annotator_loader_raise(self): 56 | print('Loading raise manual annotator no module found in correct directory') 57 | loader = DynamicLoader( 58 | required_attributes=['name', 'unrelated_function'], 59 | path=self.correct_resources 60 | ) 61 | self.assertRaises(FileNotFoundError, loader.load_manual_annotators) 62 | 63 | def test_manual_annotator_loader_raise_no_module_found(self): 64 | print('Loading raise manual annotator no module found in wrong directory') 65 | loader = DynamicLoader( 66 | required_attributes=self.required_attributes, 67 | path=self.incorrect_resources 68 | ) 69 | self.assertRaises(FileNotFoundError, loader.load_manual_annotators) 70 | 71 | 72 | if __name__ == '__main__': 73 | unittest.main() 74 | --------------------------------------------------------------------------------
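From required_attributes=['name', 'some_function'] above, a resource file only qualifies for loading when it exposes both attributes; a minimal qualifying module might look like this (an illustrative sketch, not the actual correct_file.py shipped in tests/resources):

# Hypothetical contents of a manual annotator module accepted by DynamicLoader.
class Correct:
    @property
    def name(self):
        return 'Correct'

    def some_function(self):
        pass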
/tests/capice/test_templates.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | from pathlib import Path 4 | 5 | from molgenis.capice.cli.args_handler_parent import ArgsHandlerParent 6 | from molgenis.capice.core.capice_manager import CapiceManager 7 | from molgenis.capice.core.logger import Logger 8 | from molgenis.capice.main_predict import CapicePredict 9 | 10 | _project_root_directory = Path(__file__).absolute().parent.parent.parent 11 | _project_resources = os.path.join(_project_root_directory, 'resources') 12 | _project_test_resources = os.path.join(_project_root_directory, 'tests', 'resources') 13 | 14 | 15 | def set_up_manager_and_out(): 16 | """ 17 | Function to set up the CapiceManager and testing output location 18 | :return: manager instance, output_directory 19 | """ 20 | manager = CapiceManager() 21 | manager.critical_logging_only = True 22 | root_dir = _project_root_directory 23 | output_directory = os.path.join(root_dir, '.test_output') 24 | if not os.path.exists(output_directory): 25 | os.makedirs(output_directory) 26 | return manager, output_directory 27 | 28 | 29 | def teardown(): 30 | """ 31 | Function to remove any and all files from the '.test_output' folder and 32 | remove the folder itself too. 33 | """ 34 | test_folder = os.path.join(_project_root_directory, '.test_output') 35 | if os.path.isdir(test_folder): 36 | if len(os.listdir(test_folder)) > 0: 37 | for file in os.listdir(test_folder): 38 | os.remove(os.path.join(test_folder, file)) 39 | os.rmdir(test_folder) 40 | Logger.instance = None 41 | CapiceManager.instance = None 42 | 43 | 44 | def set_up_predict(): 45 | return CapicePredict( 46 | input_path=None, 47 | model=None, 48 | output_path=None, 49 | output_given=False, 50 | force=False 51 | ) 52 | 53 | 54 | def set_up_impute_preprocess(): 55 | set_up_manager_and_out() 56 | main = set_up_predict() 57 | main.infile = os.path.join(_project_root_directory, 'resources', 'predict_input.tsv.gz') 58 | model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 59 | main.model = model 60 | return main, model 61 | 62 | 63 | def load_model(file_path): 64 | return ArgsHandlerParent.load_model(file_path) 65 | 66 | 67 | class ResourceFile(Enum): 68 | """ 69 | Enum storing paths to test resource files for easy access. 70 | """ 71 | PREDICT_INPUT_TSV_GZ = os.path.join(_project_resources, 'predict_input.tsv.gz') 72 | XGB_BOOSTER_POC_JSON = os.path.join(_project_test_resources, 'xgb_booster_poc.json') 73 | 74 | 75 | class FakeResourceFile(Enum): 76 | PREDICT_INPUT_TSV_GZ = os.path.join(_project_test_resources, 77 | 'non_existing_predict_input.tsv.gz') 78 | -------------------------------------------------------------------------------- /tests/capice/validators/test_input_validator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from molgenis.capice.validators.input_validator import InputValidator 5 | from tests.capice.test_templates import _project_root_directory, ResourceFile, FakeResourceFile 6 | 7 | 8 | class TestInputValidator(unittest.TestCase): 9 | new_directory_name = '.another_test_output_directory' 10 | new_directory = os.path.join(_project_root_directory, new_directory_name) 11 | 12 | @classmethod 13 | def setUpClass(cls): 14 | print('Setting up.') 15 | cls.input_validator = InputValidator() 16 | 17 | @classmethod 18 | def tearDownClass(cls): 19 | print('Tearing down.') 20 | if os.path.isdir(cls.new_directory): 21 | os.rmdir(cls.new_directory) 22 | 23 | def setUp(self): 24 | print('Testing case:') 25 | 26 | def test_create_output_path(self): 27 | print('Creating output location') 28 | with self.assertWarns(Warning): 29 | self.input_validator.validate_output_path(self.new_directory) 30 | self.assertTrue( 31 | self.new_directory_name in os.listdir(_project_root_directory) 32 | ) 33 | 34 | def test_input_single_extension(self): 35 | allowed_extensions = ('.tsv.gz',) 36 | self.input_validator.validate_input_path(ResourceFile.PREDICT_INPUT_TSV_GZ.value, 37 | extension=allowed_extensions) 38 | 39 | def test_input_multiple_extensions(self): 40 | allowed_extensions = ('.tsv', '.tsv.gz') 41 | self.input_validator.validate_input_path(ResourceFile.PREDICT_INPUT_TSV_GZ.value, 42 | extension=allowed_extensions) 43 | 44 | def
test_input_multiple_extensions_invalid(self): 45 | allowed_extensions = ('.tsv', '.tsv.gz') 46 | with self.assertRaises(IOError) as e: 47 | self.input_validator.validate_input_path(ResourceFile.XGB_BOOSTER_POC_JSON.value, 48 | extension=allowed_extensions) 49 | 50 | self.assertEqual(f'{ResourceFile.XGB_BOOSTER_POC_JSON.value} does not match required ' 51 | f'extension: .tsv, .tsv.gz', 52 | str(e.exception)) 53 | 54 | def test_input_non_existing(self): 55 | allowed_extensions = ('.tsv', '.tsv.gz') 56 | with self.assertRaises(FileNotFoundError) as e: 57 | self.input_validator.validate_input_path(FakeResourceFile.PREDICT_INPUT_TSV_GZ.value, 58 | extension=allowed_extensions) 59 | 60 | self.assertEqual(f'{FakeResourceFile.PREDICT_INPUT_TSV_GZ.value} does not exist!', 61 | str(e.exception)) 62 | 63 | 64 | if __name__ == '__main__': 65 | unittest.main() 66 | -------------------------------------------------------------------------------- /tests/capice/test_edge_cases_predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import pandas as pd 5 | 6 | from tests.capice.test_templates import set_up_manager_and_out, teardown, set_up_predict, \ 7 | _project_root_directory, ResourceFile, load_model 8 | 9 | 10 | class TestEdgeCases(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls) -> None: 13 | cls.manager, cls.output = set_up_manager_and_out() 14 | cls.edge_cases = os.path.join( 15 | _project_root_directory, 16 | 'tests', 17 | 'resources', 18 | 'edge_cases_vep.tsv.gz' 19 | ) 20 | cls.breakpoints = os.path.join( 21 | _project_root_directory, 22 | 'tests', 23 | 'resources', 24 | 'breakends_vep.tsv.gz' 25 | ) 26 | cls.symbolic = os.path.join( 27 | _project_root_directory, 28 | 'tests', 29 | 'resources', 30 | 'symbolic_alleles_vep.tsv.gz' 31 | ) 32 | cls.model = load_model(ResourceFile.XGB_BOOSTER_POC_JSON.value) 33 | cls.main = set_up_predict() 34 | 35 | @classmethod 36 | def tearDownClass(cls) -> None: 37 | teardown() 38 | 39 | def setUp(self) -> None: 40 | self.main = set_up_predict() 41 | self.main.output = self.output 42 | self.main.model = self.model 43 | print('Testing case:') 44 | 45 | def get_observed_results(self): 46 | return pd.read_csv(os.path.join(self.output, self.manager.output_filename), sep='\t') 47 | 48 | def test_edge_cases(self): 49 | print('Edge cases') 50 | self.main.infile = self.edge_cases 51 | self.manager.output_filename = 'edge_cases_vep_capice.tsv.gz' 52 | self.main.run() 53 | observed_output = self.get_observed_results() 54 | self.assertGreater(observed_output['score'].sum(), 0) 55 | self.assertFalse(observed_output['score'].hasnans) 56 | 57 | def test_symbolic_alleles(self): 58 | print('Symbolic alleles') 59 | self.main.infile = self.symbolic 60 | self.manager.output_filename = 'symbolic_alleles_vep_capice.tsv.gz' 61 | self.main.run() 62 | observed_output = self.get_observed_results() 63 | self.assertGreater(observed_output['score'].sum(), 0) 64 | self.assertFalse(observed_output['score'].hasnans) 65 | 66 | def test_breakpoints(self): 67 | print('Breakpoints') 68 | self.main.infile = self.breakpoints 69 | self.manager.output_filename = 'breakends_vep_capice.tsv.gz' 70 | self.main.run() 71 | observed_output = self.get_observed_results() 72 | self.assertGreater(observed_output['score'].sum(), 0) 73 | self.assertFalse(observed_output['score'].hasnans) 74 | 75 | 76 | if __name__ == '__main__': 77 | unittest.main() 78 | 
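All three edge-case tests end with the same two sanity checks on the exported scores; standalone they amount to (the file name is one of the outputs written above):

import pandas as pd

observed = pd.read_csv('edge_cases_vep_capice.tsv.gz', sep='\t')
assert observed['score'].sum() > 0  # at least some non-zero signal
assert not observed['score'].hasnans  # every variant received a score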
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | results.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # IDE specific files 142 | .idea/ 143 | /capice.iml -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | os: linux 2 | dist: jammy 3 | language: java 4 | python: 5 | - '3.10' 6 | jdk: openjdk21 7 | cache: 8 | directories: 9 | - '$HOME/.cache/pip' 10 | - '$HOME/.sonar/cache' 11 | branches: 12 | only: 13 | - main 14 | - /^v\d+\.\d+(\.\d+)?(-\S*)?$/ 15 | before_install: 16 | - pip install -U pip 17 | install: 18 | - pip install -e '.[test]' 19 | script: 20 | - coverage run -m pytest --junitxml=results.xml 21 | - coverage xml 22 | - sonar-scanner 23 | addons: 24 | sonarcloud: 25 | organization: molgenis 26 | token: 27 | secure: qFkmx02PjcBy6nCpf05evyhQBcwOqq3BHycZbmyYkKE9AS9AbYH7eiGboTOvO5N45d3UoZaUud22JK+rD4mKTm9fMzhxQLrexww+EUOCs0EYRDaQkBtl9HcgnDWnHtRl55151agb8VJ2l9AFMC6vlfJjs+nYGeFimhoat0s4NxWlhQV/fEdmmlJCj9k37Z1We0SSoSuw8A+kJ5S3Lzf+7mhm2o/BF1rvjipW9m66e6uOyNmm9n2FMi9em06OdQRDoeLIXjgqvF2SvphcOwXfiL3TFJR4R3wjajRqgGlRmCcdt/HSMe5eTjcNvHiE3HmeAKSLoItixSqxBxzLrW/lBziWPp6E4GqRpyePhprUzJDIeB4nPg28CHS5mbqtsEiJFkGKwcaDWd+jY/KHIhN/5ECMGGQrBsiB0v7ENwasa46pMzsZ9m9b1oZzMO+dKxBl9N7BGmcjZp37nRv7n7zo5SGn1NMSs+w+8C542mAuVT0TCemKf1178auK2zeHguUxt+/GHw67lGyegZC17a9DAVN4IBCU+hkrFSwhfk1VqXulVM/b+TDpHggOaRLTNR5u45j+ibNTASTvTHwVDlbaGOOrcBSjoMY8yf+/777g3KzQFofXyyaiWRcVQAg1pl8XJUVNSJkULxCZlxy9CXc0dmnwRpui6fGtHgNpn/hwXvk= 28 | deploy: 29 | - provider: pypi 30 | username: __token__ 31 | password: 32 | secure: "f7iUE2wNOtqgDbB798eD9LNANJoU2VHmpnQwYqAJu/JLl7/JaMrPO2/n399i58HTXN5+7VxVo+zRHSCSoZ3R0sQI9m3rd9fN4hLtUEHDdwUW92ZoaMGoRCzj1qCslWSYb/H1yePp2hHzLPPJO7mPJMpP/ZCsTletBWl3BfeiaubXqASmJCBHCZ05ITCb5IY7w6LQWsEwZnN8QH3CPQL+T15P9xEyw5O+sP75MRxls8RHetzt41+3/cqvO/ZZpKydTdi7Whq6FKxGbDAk1CFP5I0g5CLaVxLXN2AJKuBouXs2r9J/+SZhItxtgzuU5Jdz527larMnWeKKGxVOwCA/7Zw/H1LGJ2tcDLI6MGrSjVnNd+M+/HyiXY+RmJw1zgAs9ZOH7M7hIQZ68Ld8wZ+e/OwrlzYJoB23RnpgtRFHrLH0GW2zKyONUdoS5IzOZyVUFwB/hHVQ2dEQSAkSwH5aa3tdCl9CbkF/VLoxqjPN6cRZgKF/UPK/Hvf+zVVF6upuzK91ETu1WXQ4d8M474OURvvNjHx+ZyGIprdpHAA6UFFNJVE0eySmJTxVM8Wdoqs9iUsUhOqXdjOhAweHfM9N6y2zgEnoOIKRlEfzY5WxU1oeDEHLcuX+Ll1l0aaxT3c6BRfpiKbfWE8ZJaUyaZCMlPocqorAxQFqgwCYWP3RYsM=" 33 | on: 34 | tags: true 35 | - provider: releases 36 | token: 37 | secure: 
vPXI5z2Wf5W6SLdCN96NWfsGz9dIpkp3tMmclgQls+mAmAdPxTlAqJCbDb8CoLetXcCx7U4SonWMShJokyqZPYd4KtoPK5lGZJqO3MV+pBMXHa9bO8nqoM4yC0Q+AukMoWTCzblZD/gSXbqcg/PODjplmBs9Doy3s8c81qx5H2L+rPsZPtm6GtCThgytW/bIOocJB6GilPVJfJoizHK0SHVKkZiuH0uCa5USVbM/HsWLTdJ5qItqLeU5TshgOg7o/4NMY6NrzbYL829Vcp7vqTuxqE45RG4jKckRm36pPZVZx4dlKQlqXOuxYAfkDPAdJy9+SSWAsqaaPj+alyhLii+0YTLMOoELPDcVSuNYqonS/7WZJ7HBVuQTtiFT5MU0fIQSvqptnBXCiOLUH5mNgL3FaBwwGDuVzbXmuKN4eSBflB1IWsgHftMFdhJ1NG+eS49zo5TJ3qaZBYtY+6rqUJt056ZUQE+9lqQSJDtHb0uVepc2QR7OvNxkYXaBjIU1wYpieT7dCCbo9+wnrtFYof+Ux3yiC/dDbhl2xXhJcsSKpbv1wwHBFmOjVTgvGuwGGCxYU2TVn0GdMf/ec7HPDLOxNTnhKgkqZfct4Id0BvKU40tPnS7KDlCCOjE6o1qyE3vMnjeqFw5gqPbUqVhe0ZF+ZsqH8B7Ga4VFDmVjAmY= 38 | file: "bogus.file" 39 | prerelease: true 40 | on: 41 | tags: true 42 | edge: true # opt in to dpl v2 43 | -------------------------------------------------------------------------------- /src/molgenis/capice/core/capice_exporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.core.logger import Logger 6 | from molgenis.capice.core.capice_manager import CapiceManager 7 | from molgenis.capice.utilities import check_file_exist 8 | from molgenis.capice.utilities.enums import Column, UniqueSeparator 9 | 10 | 11 | class CapiceExporter: 12 | """ 13 | Class specifically exporting files 14 | """ 15 | 16 | def __init__(self, file_path, output_given, force): 17 | self.log = Logger().logger 18 | self.capice_filename = CapiceManager().output_filename 19 | self.file_path = file_path 20 | self.output_given = output_given 21 | self.force = force 22 | self.export_cols = [ 23 | Column.chr.value, 24 | Column.pos.value, 25 | Column.ref.value.lower(), 26 | Column.alt.value.lower(), 27 | Column.gene_name.value, 28 | Column.gene_id.value, 29 | Column.id_source.value, 30 | Column.feature.value, 31 | Column.feature_type.value, 32 | Column.score.value, 33 | Column.suggested_class.value 34 | ] 35 | 36 | def export_capice_prediction(self, datafile: pd.DataFrame): 37 | """ 38 | Function specific to export the dataset created for the prediction 39 | pathway. 
40 | :param datafile: prediction pandas DataFrame 41 | """ 42 | export_path = os.path.join(self.file_path, self.capice_filename) 43 | datafile = self._post_process_split_cols(datafile) 44 | datafile = self._post_process_set_correct_dtypes(datafile) 45 | check_file_exist(export_path, self.force) 46 | datafile[self.export_cols].to_csv(export_path, sep='\t', index=False) 47 | if not self.output_given: 48 | print(f'Successfully exported CAPICE datafile to: {export_path}') 49 | 50 | @staticmethod 51 | def _post_process_split_cols(datafile: pd.DataFrame): 52 | datafile[ 53 | [Column.chr.value, Column.pos.value, Column.ref.value.lower(), Column.alt.value.lower()] 54 | ] = datafile[Column.chr_pos_ref_alt.value].str.split( 55 | UniqueSeparator.unique_separator.value, expand=True) 56 | return datafile 57 | 58 | @staticmethod 59 | def _post_process_set_correct_dtypes(datafile: pd.DataFrame): 60 | datafile[Column.gene_id.value] = pd.Series(datafile[Column.gene_id.value], dtype='Int64') 61 | return datafile 62 | 63 | def export_capice_model(self, model): 64 | """ 65 | Function specific to export a newly created CAPICE model 66 | :param model: XGBClassifier instance 67 | """ 68 | export_path = os.path.join(self.file_path, self.capice_filename) 69 | check_file_exist(export_path, self.force) 70 | model.save_model(export_path) 71 | if not self.output_given: 72 | print(f'Successfully exported CAPICE model to: {export_path}') 73 | --------------------------------------------------------------------------------
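The split in _post_process_split_cols relies on the collision-proof separator defined in the enums; in isolation (values invented for the demonstration):

import pandas as pd

sep = '_VeryUniqueCAPICESeparator_'
frame = pd.DataFrame({'chr_pos_ref_alt': [sep.join(['1', '12345', 'A', 'T'])]})
frame[['chr', 'pos', 'ref', 'alt']] = frame['chr_pos_ref_alt'].str.split(sep, expand=True)
print(frame[['chr', 'pos', 'ref', 'alt']].iloc[0].tolist())  # ['1', '12345', 'A', 'T']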
48 | """ 49 | self._add_arguments() 50 | subparsers = self.parser.add_subparsers() 51 | predictor = ArgsHandlerPredict(subparsers.add_parser('predict')) 52 | predictor.create() 53 | predictor.handle() 54 | trainer = ArgsHandlerTrain(subparsers.add_parser('train')) 55 | trainer.create() 56 | trainer.handle() 57 | explainer = ArgsHandlerExplain(subparsers.add_parser('explain')) 58 | explainer.create() 59 | explainer.handle() 60 | 61 | def _add_arguments(self): 62 | self.parser.add_argument( 63 | '-v', 64 | '--verbose', 65 | action='count', 66 | default=0, 67 | help='verbose mode. multiple -v options increase the verbosity') 68 | 69 | self.parser.add_argument( 70 | '--version', 71 | action='version', 72 | version=f'%(prog)s {self.version}' 73 | ) 74 | 75 | def _handle_args(self, args): 76 | level = None 77 | if args.verbose == 1: 78 | level = logging.INFO 79 | elif args.verbose >= 2: 80 | level = logging.DEBUG 81 | self.manager.loglevel = level 82 | -------------------------------------------------------------------------------- /tests/capice/cli/test_args_handler_parent.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from molgenis.capice.cli.args_handler_parent import ArgsHandlerParent 4 | 5 | 6 | class TestArgsHandlerPredict(unittest.TestCase): 7 | @classmethod 8 | def setUpClass(cls) -> None: 9 | cls.cli_args = '-z/--zz' 10 | 11 | def test__single_argument_retriever_single_none(self): 12 | test_input = None 13 | expected_output = None 14 | actual_output = ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 15 | has_default=False) 16 | self.assertEqual(actual_output, expected_output) 17 | 18 | def test__single_argument_retriever_empty_list(self): 19 | test_input = [] 20 | with self.assertRaises(ValueError) as context: 21 | ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 22 | has_default=False) 23 | msg = 'Empty list is given. Should be None or list with elements.' 24 | self.assertEqual(str(context.exception), msg) 25 | 26 | def test__single_argument_retriever_one_item(self): 27 | test_input = ['aa'] 28 | expected_output = 'aa' 29 | actual_output = ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 30 | has_default=False) 31 | self.assertEqual(actual_output, expected_output) 32 | 33 | def test__single_argument_retriever_default_only(self): 34 | test_input = ['aa'] 35 | expected_output = 'aa' 36 | actual_output = ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 37 | has_default=True) 38 | self.assertEqual(actual_output, expected_output) 39 | 40 | def test__single_argument_retriever_two_items_no_default(self): 41 | test_input = ['aa', 'bb'] 42 | with self.assertRaises(IOError) as context: 43 | ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 44 | has_default=False) 45 | msg = 'Argument -z/--zz is only allowed once.' 
46 | self.assertEqual(str(context.exception), msg) 47 | 48 | def test__single_argument_retriever_default_with_one_item(self): 49 | test_input = ['aa', 'bb'] 50 | expected_output = 'bb' 51 | actual_output = ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 52 | has_default=True) 53 | self.assertEqual(actual_output, expected_output) 54 | 55 | def test__single_argument_retriever_default_with_two_items(self): 56 | test_input = ['aa', 'bb', 'cc'] 57 | with self.assertRaises(IOError) as context: 58 | ArgsHandlerParent._single_argument_retriever(test_input, self.cli_args, 59 | has_default=True) 60 | msg = 'Argument -z/--zz is only allowed once.' 61 | self.assertEqual(str(context.exception), msg) 62 | 63 | 64 | if __name__ == '__main__': 65 | unittest.main() 66 | -------------------------------------------------------------------------------- /scripts/tests/test_convert_vep_vcf_to_tsv_capice.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Base paths (to current dir/script). 4 | readonly CURRENT_PATH=$(pwd) 5 | BASE_PATH=$(realpath "$0") && readonly BASE_PATH=${BASE_PATH%/*} 6 | 7 | # Variable that stores whether a single test failed. 8 | any_test_failed=false 9 | 10 | main() { 11 | # Preparations. 12 | cd ${BASE_PATH} 13 | local -r input_vcf='../../CAPICE_example/capice_input.vcf.gz' 14 | local -r expected_output='../../CAPICE_example/CAPICE_input.tsv.gz' 15 | local -r actual_output='test_output.tsv.gz' # cleanup within each test! 16 | gunzip -k ${input_vcf} # keeps original gzip 17 | gunzip -k ${expected_output} # keeps original gzip 18 | 19 | # Run tests. 20 | testValidTextInput 21 | testValidGzipInput 22 | testEmptyInputParameter 23 | testNoOutputParameter 24 | testInvalidInputFileExtension 25 | testInvalidInputFilePath 26 | 27 | # Cleanup. 28 | rm ${input_vcf%.gz} 29 | rm ${expected_output%.gz} 30 | 31 | # Returns exitcode based on whether tests failed. 32 | if [[ "${any_test_failed}" == true ]] 33 | then 34 | exit 1 35 | fi 36 | } 37 | 38 | # $1: the generated exitcode 39 | # $2: the name of the test 40 | validateIfFailed() { 41 | if [[ $1 != 1 ]] 42 | then 43 | echo "$2: has exitcode 0, but expected 1" 44 | any_test_failed=true 45 | else 46 | echo "$2: done" 47 | fi 48 | 49 | rmSilent ${actual_output} 50 | } 51 | 52 | # $1: the generated exitcode 53 | # $2: the name of the test 54 | validateOutputFile() { 55 | if [[ $1 != 0 ]] 56 | then 57 | echo "$2: has exitcode 1, but expected 0" 58 | any_test_failed=true 59 | else 60 | gunzip ${actual_output} 61 | local checksum_expected=$(shasum -a 256 ${expected_output%.gz} | cut -d ' ' -f1) 62 | shasum -a 256 -c <<< "${checksum_expected%.gz} ${actual_output%.gz}" 63 | if [[ $? == 1 ]] 64 | then 65 | any_test_failed=true 66 | fi 67 | fi 68 | 69 | rmSilent ${actual_output%.gz} 70 | } 71 | 72 | rmSilent() { 73 | rm "$1" 2> /dev/null 74 | } 75 | 76 | testValidTextInput() { 77 | bash ../convert_vep_vcf_to_tsv_capice.sh -i ${input_vcf%.gz} -o ${actual_output} &> /dev/null 78 | validateOutputFile "$?" 'testValidTextInput' 79 | } 80 | 81 | testValidGzipInput() { 82 | bash ../convert_vep_vcf_to_tsv_capice.sh -i ${input_vcf} -o ${actual_output} &> /dev/null 83 | validateOutputFile "$?" 'testValidGzipInput' 84 | } 85 | 86 | testEmptyInputParameter() { 87 | bash ../convert_vep_vcf_to_tsv_capice.sh -i "" -o ${actual_output} &> /dev/null 88 | validateIfFailed "$?" 
'testEmptyInputParameter' 89 | } 90 | 91 | testNoOutputParameter() { 92 | bash ../convert_vep_vcf_to_tsv_capice.sh -i ${input_vcf} &> /dev/null 93 | validateIfFailed "$?" 'testNoOutputParameter' 94 | } 95 | 96 | testInvalidInputFileExtension() { 97 | bash ../convert_vep_vcf_to_tsv_capice.sh -i './capice_input.vcf.zip' -o ${actual_output} &> /dev/null 98 | validateIfFailed "$?" 'testInvalidInputFileExtension' 99 | } 100 | 101 | testInvalidInputFilePath() { 102 | bash ../convert_vep_vcf_to_tsv_capice.sh -i './non_existing_dir/capice_input.vcf.gz' -o ${actual_output} &> /dev/null 103 | validateIfFailed "$?" 'testInvalidInputFilePath' 104 | } 105 | 106 | main -------------------------------------------------------------------------------- /src/molgenis/capice/vep/consequence.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from molgenis.capice.core.logger import Logger 5 | from molgenis.capice.vep.template import Template 6 | 7 | 8 | class Consequence(Template): 9 | def __init__(self): 10 | super(Consequence, self).__init__( 11 | name='Consequence', 12 | usable=True 13 | ) 14 | self.log = Logger().logger 15 | 16 | @property 17 | def drop(self): 18 | return True 19 | 20 | @property 21 | def columns(self): 22 | return ['is_regulatory_region_variant', 23 | 'is_regulatory_region_ablation', 24 | 'is_regulatory_region_amplification', 25 | 'is_missense_variant', 26 | 'is_intron_variant', 27 | 'is_upstream_gene_variant', 28 | 'is_downstream_gene_variant', 29 | 'is_synonymous_variant', 30 | 'is_TF_binding_site_variant', 31 | 'is_splice_donor_variant', 32 | 'is_coding_sequence_variant', 33 | 'is_splice_region_variant', 34 | 'is_stop_gained', 35 | 'is_splice_acceptor_variant', 36 | 'is_frameshift_variant', 37 | 'is_3_prime_UTR_variant', 38 | 'is_inframe_insertion', 39 | 'is_inframe_deletion', 40 | 'is_5_prime_UTR_variant', 41 | 'is_start_lost', 42 | 'is_non_coding_transcript_exon_variant', 43 | 'is_non_coding_transcript_variant', 44 | 'is_TFBS_ablation', 45 | 'is_TFBS_amplification', 46 | 'is_protein_altering_variant', 47 | 'is_stop_lost', 48 | 'is_stop_retained_variant', 49 | 'is_transcript_ablation', 50 | 'is_intergenic_variant', 51 | 'is_start_retained_variant', 52 | 'is_transcript_amplification', 53 | 'is_incomplete_terminal_codon_variant', 54 | 'is_mature_miRNA_variant', 55 | 'is_NMD_transcript_variant', 56 | 'is_feature_elongation', 57 | 'is_feature_truncation', 58 | 'is_splice_donor_5th_base_variant', 59 | 'is_splice_donor_region_variant', 60 | 'is_splice_polypyrimidine_tract_variant' 61 | ] 62 | 63 | @staticmethod 64 | def _fillna(): 65 | return 0 66 | 67 | def _process(self, dataframe: pd.DataFrame): 68 | splitted_consequence = dataframe[self.name].str.split('&', expand=True) 69 | raw_consequences = [] 70 | for consequence in self.columns: 71 | current_consequence = consequence.split('is_')[1] 72 | dataframe[consequence] = np.where( 73 | np.isin(splitted_consequence, current_consequence).any(axis=1), 1, 0 74 | ) 75 | raw_consequences.append(current_consequence) 76 | 77 | self._validate_consequences(splitted_consequence, raw_consequences) 78 | return dataframe 79 | 80 | def _validate_consequences(self, consequences: pd.DataFrame, supported_consequences: list): 81 | unique_consequences = pd.Series(pd.unique(consequences.values.ravel('K'))).dropna() 82 | for consequence in unique_consequences: 83 | if consequence not in supported_consequences: 84 | self.log.warning('Supplied VEP consequence: %s is not
supported in the ' 85 | 'Consequence processor!', consequence) 86 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/post_file_parse_validator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from molgenis.capice.core.logger import Logger 4 | from molgenis.capice.utilities.enums import InputColumn 5 | from molgenis.capice.utilities.column_utils import ColumnUtils 6 | 7 | 8 | class PostFileParseValidator: 9 | MINIMUM_REQUIRED_COLUMNS = { 10 | InputColumn.chr, 11 | InputColumn.pos, 12 | InputColumn.ref, 13 | InputColumn.alt, 14 | InputColumn.gene_name, 15 | InputColumn.gene_id, 16 | InputColumn.gene_name_source, 17 | InputColumn.feature, 18 | InputColumn.feature_type 19 | } 20 | 21 | def __init__(self): 22 | self.log = Logger().logger 23 | 24 | def validate_n_columns(self, dataset): 25 | """ 26 | Validator to make sure that the number of loaded columns is at least equal to 27 | MINIMUM_REQUIRED_COLUMNS. Does NOT check for the names of these columns! 28 | """ 29 | if isinstance(dataset, pd.Series) or dataset.shape[1] < len(self.MINIMUM_REQUIRED_COLUMNS): 30 | error_message = 'Loaded dataset does NOT have enough features! ' \ 31 | 'Is there a header present that does not start ' \ 32 | 'with ##?' 33 | self.log.critical(error_message) 34 | raise KeyError(error_message) 35 | 36 | def validate_variants_present(self, dataset): 37 | """ 38 | Validator to make sure that there is at least one variant present. 39 | """ 40 | if dataset.shape[0] == 0: 41 | error_message = 'Loaded dataset does not contain variants!' 42 | self.log.critical(error_message) 43 | raise ValueError(error_message) 44 | 45 | def validate_minimally_required_columns( 46 | self, dataset, additional_required_features: list | None = None 47 | ): 48 | """ 49 | Validator for both predict and train to check that the minimally 50 | required columns (chr, pos, ref, alt and the gene/feature columns) are 51 | present, along with any additional required columns. 52 | """ 53 | column_utils = ColumnUtils() 54 | column_utils.set_specified_columns( 55 | {x.col_name for x in PostFileParseValidator.MINIMUM_REQUIRED_COLUMNS} 56 | ) 57 | if additional_required_features is not None: 58 | column_utils.add_to_specified_columns(additional_required_features) 59 | columns_not_present = column_utils.get_missing_diff_with(dataset.columns) 60 | if len(columns_not_present) > 0: 61 | error_message = 'Detected required column %s not present within input dataset!' 62 | if len(columns_not_present) > 1: 63 | error_message = 'Detected required columns %s not present within input dataset!' 64 | self.log.critical(error_message, ', '.join(columns_not_present)) 65 | raise KeyError(error_message % ', '.join(columns_not_present)) 66 | 67 | def validate_chrom_pos(self, dataset): 68 | """ 69 | Function to check if all values of the columns Chr and Pos are present. 70 | """ 71 | if dataset[InputColumn.chr.col_name].isnull().values.any(): 72 | error_message = 'Detected gap in Chromosome column! Please supply a valid dataset.' 73 | self.log.critical(error_message) 74 | raise ValueError(error_message) 75 | if dataset[InputColumn.pos.col_name].isnull().values.any(): 76 | error_message = 'Detected gap in Position column! Please supply a valid dataset.'
77 | self.log.critical(error_message) 78 | raise ValueError(error_message) 79 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/manual_vep_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.core.logger import Logger 6 | from molgenis.capice.utilities import get_project_root_dir 7 | from molgenis.capice.utilities.dynamic_loader import DynamicLoader 8 | 9 | 10 | class ManualVEPProcessor: 11 | """ 12 | Class ManualVEPProcessor, to process the (unusable) VEP-like features into 13 | features that are more usable. 14 | """ 15 | 16 | def __init__(self): 17 | self.log = Logger().logger 18 | self.feature_processing_tracker = {} 19 | 20 | def process(self, dataset: pd.DataFrame, process_features: list[str]) -> pd.DataFrame: 21 | """ 22 | Callable method for the ManualVEPProcessor to start processing. 23 | Loads all the VEP processors dynamically from /src/molgenis/capice/vep. 24 | 25 | Args: 26 | dataset: The input dataset over which the VEP features should be processed. 27 | process_features: A collection of all input features that should be used in either 28 | training or predicting over which VEP processing should happen. 29 | 30 | Returns: 31 | pandas.DataFrame: The input dataset, with the manual VEP features processed 32 | 33 | """ 34 | self.log.info('Starting manual VEP feature processing.') 35 | vep_annotators = self._load_vep_processors() 36 | dropping_columns = [] 37 | n_feats_processed = 0 38 | for processor in vep_annotators: 39 | if ( 40 | processor.name in dataset.columns and 41 | processor.name in process_features and 42 | processor.usable 43 | ): 44 | self.log.debug('Processing: %s', processor.name) 45 | self._add_feature_tracking(processor.name, processor.columns) 46 | dataset = processor.process(dataset) 47 | if processor.drop and processor.name not in dropping_columns: 48 | dropping_columns.append(processor.name) 49 | n_feats_processed += 1 50 | else: 51 | self.log.warning('Could not use processor %s on input dataset!', processor.name) 52 | self.log.debug('Property drop was set True for columns: %s', ', '.join(dropping_columns)) 53 | dataset.drop(columns=dropping_columns, inplace=True) 54 | self.log.info('Processing successful.') 55 | self.log.debug('Processed %d features.', n_feats_processed) 56 | return dataset 57 | 58 | def _add_feature_tracking(self, processor_name: str, processor_features: list[str]): 59 | if processor_name not in self.feature_processing_tracker.keys(): 60 | self.feature_processing_tracker[processor_name] = processor_features 61 | else: 62 | self.feature_processing_tracker[processor_name].extend(processor_features) 63 | 64 | def get_feature_processes(self) -> dict[str, list[str]]: 65 | """ 66 | Getter for the dictionary containing all the processed features and their output features.
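        For illustration (hypothetical values): after a dataset containing a
        VEP Consequence column has been processed, this tracker could hold
        {'Consequence': ['is_missense_variant', 'is_intron_variant', ...]},
        i.e. each input feature mapped to the output features it produced.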
67 | 68 | Returns: 69 | dict: 70 | Input VEP processing features (key) and their output features (values) 71 | """ 72 | return self.feature_processing_tracker 73 | 74 | def _load_vep_processors(self): 75 | location = os.path.join(get_project_root_dir(), 'vep') 76 | self.log.debug('Loading modules at %s', location) 77 | loader = DynamicLoader(required_attributes=['name', 'process'], path=location) 78 | loaded_modules = loader.load_manual_annotators() 79 | self.log.debug('Loaded %d modules.', len(loaded_modules)) 80 | return loaded_modules 81 | -------------------------------------------------------------------------------- /tests/capice/validators/test_post_file_parse_validator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from tests.capice.test_templates import teardown 6 | from molgenis.capice.validators.post_file_parse_validator import PostFileParseValidator 7 | 8 | 9 | class TestPostFileParseValidator(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | print('Setting up.') 13 | cls.dataset = pd.DataFrame( 14 | { 15 | 'chr': [1, 2], 16 | 'pos': [100, 200], 17 | 'REF': ['A', 'A'], 18 | 'ALT': ['T', 'T'], 19 | 'gene_name': ['UBA1', 'TFE3'], 20 | 'gene_id': [7317, 7030], 21 | 'id_source': ['EntrezGene', 'EntrezGene'], 22 | 'feature': ['NM_003334.4', 'NM_006521.6'], 23 | 'feature_type': ['Transcript', 'Transcript'], 24 | 'feat1': ['foo', 'bar'] 25 | } 26 | ) 27 | cls.validator = PostFileParseValidator() 28 | 29 | @classmethod 30 | def tearDownClass(cls) -> None: 31 | print('Tearing down.') 32 | teardown() 33 | 34 | def test_validation_correct_n_columns(self): 35 | print('Correct validation n_columns') 36 | self.validator.validate_n_columns(self.dataset) 37 | 38 | def test_validation_incorrect_n_columns(self): 39 | print('KeyError raised in n_columns due to too few columns (incorrectly loaded)') 40 | incorrectly_loaded_dataset = self.dataset[self.dataset.columns].astype(str).agg( 41 | '_'.join, axis=1 42 | ) 43 | self.assertRaises( 44 | KeyError, 45 | self.validator.validate_n_columns, 46 | incorrectly_loaded_dataset 47 | ) 48 | 49 | def test_no_variants_present(self): 50 | print('ValueError raised in validate_variants_present') 51 | dataset = pd.DataFrame(columns=self.dataset.columns) 52 | self.assertRaises( 53 | ValueError, 54 | self.validator.validate_variants_present, 55 | dataset 56 | ) 57 | 58 | def test_validation_correct_required_columns(self): 59 | print('Correct validation required_columns') 60 | self.validator.validate_minimally_required_columns( 61 | self.dataset, 62 | additional_required_features=['feat1'] 63 | ) 64 | 65 | def test_validation_incorrect_required_columns_preset_required(self): 66 | print('KeyError raised due to missing ref column') 67 | self.assertRaises( 68 | KeyError, 69 | self.validator.validate_minimally_required_columns, 70 | self.dataset.drop(columns='REF'), 71 | additional_required_features='feat1' 72 | ) 73 | 74 | def test_validation_incorrect_required_columns(self): 75 | print('KeyError raised due to missing feat2 column') 76 | self.assertRaises( 77 | KeyError, 78 | self.validator.validate_minimally_required_columns, 79 | self.dataset, 80 | additional_required_features=('feat1', 'feat2') 81 | ) 82 | 83 | def test_validation_correct_chrom_pos(self): 84 | print('Correct validation chrom_pos not empty') 85 | self.validator.validate_chrom_pos(self.dataset) 86 | 87 | def test_validation_incorrect_chrom_pos(self): 88 | print('ValueError raised 
due to gap in pos column') 89 | incorrect_dataset = self.dataset.copy(deep=True) 90 | incorrect_dataset.iloc[1, 1] = None 91 | self.assertRaises( 92 | ValueError, 93 | self.validator.validate_chrom_pos, 94 | incorrect_dataset 95 | ) 96 | 97 | 98 | if __name__ == '__main__': 99 | unittest.main() 100 | -------------------------------------------------------------------------------- /src/molgenis/capice/cli/args_handler_predict.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice import __version__ 2 | from molgenis.capice.main_predict import CapicePredict 3 | from molgenis.capice.core.capice_manager import CapiceManager 4 | from molgenis.capice.cli.args_handler_parent import ArgsHandlerParent 5 | from molgenis.capice.validators.model_validator import ModelValidator 6 | from molgenis.capice.validators.version_validator import VersionValidator 7 | 8 | 9 | class ArgsHandlerPredict(ArgsHandlerParent): 10 | """ 11 | Child class ArgsHandlerPredict, specific to the predict part of CAPICE 12 | """ 13 | 14 | def __init__(self, parser): 15 | super(ArgsHandlerPredict, self).__init__(parser=parser) 16 | 17 | @property 18 | def _extension(self): 19 | return '.tsv', '.tsv.gz' 20 | 21 | @property 22 | def _model_extension(self) -> tuple[str]: 23 | # Ignored because the number of values in the tuple does not matter. 24 | return '.json', '.ubj' # type: ignore 25 | 26 | def _model_extension_str(self) -> str: 27 | return self._join_extensions(self._model_extension) 28 | 29 | @property 30 | def _required_output_extensions(self): 31 | return '.tsv', '.tsv.gz' 32 | 33 | @property 34 | def _empty_output_extension(self): 35 | return self._required_output_extensions[1] 36 | 37 | def create(self): 38 | self.parser.add_argument( 39 | '-i', 40 | '--input', 41 | action='append', 42 | type=str, 43 | required=True, 44 | help=f'path to annotated variants file ({self._extension_str()}) (required)' 45 | ) 46 | self.parser.add_argument( 47 | '-m', 48 | '--model', 49 | action='append', 50 | type=str, 51 | required=True, 52 | help=f'path to trained model ({self._model_extension_str()}) (required)' 53 | ) 54 | self.parser.add_argument( 55 | '-o', 56 | '--output', 57 | action='append', 58 | type=str, 59 | help=f'path to directory or file ({self._required_output_extensions_str()}) ' 60 | f'for exporting prediction output (optional)' 61 | ) 62 | self.parser.add_argument( 63 | '-f', 64 | '--force', 65 | action='store_true', 66 | help='overwrites output if it already exists' 67 | ) 68 | 69 | def _handle_module_specific_args(self, input_path, output_path, output_filename, output_given, 70 | args): 71 | model_path = self._retrieve_argument_from_list(args.model, '-m/--model') 72 | model = self.validate_model(model_path) 73 | CapiceManager().output_filename = output_filename 74 | CapicePredict(input_path, model, output_path, output_given, self.force).run() 75 | 76 | def validate_model(self, model_path): 77 | """ 78 | Function to validate if the given model location is indeed a valid 79 | model file and matches the current CAPICE version.
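        Illustrative note (derived from the VersionValidator further below):
        versions are considered compatible when the major versions match,
        e.g. framework 5.1.2 with model 5.0.0 passes, while a 4.x model is
        rejected; for pre-releases the minor, patch and pre-release parts
        must match as well.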
80 | :param model_path: str, path-like, path to the model 81 | :return: model, xgb.XGBClassifier class 82 | """ 83 | try: 84 | self.input_validator.validate_input_path(model_path, extension=self._model_extension) 85 | except FileNotFoundError as cm: 86 | self.parser.error(str(cm)) 87 | model = self.load_model(model_path) 88 | model_validator = ModelValidator() 89 | model_validator.validate_has_required_attributes(model) 90 | version_validator = VersionValidator() 91 | try: 92 | version_validator.validate_model_version(model.CAPICE_version) 93 | version_validator.validate_versions_compatible(__version__, model.CAPICE_version) 94 | except ValueError as cm: 95 | self.parser.error(str(cm)) 96 | return model 97 | -------------------------------------------------------------------------------- /src/molgenis/capice/validators/version_validator.py: -------------------------------------------------------------------------------- 1 | import re 2 | from re import match 3 | 4 | from molgenis.capice.utilities.enums import Versioning 5 | 6 | 7 | class VersionValidator: 8 | def __init__(self): 9 | self.regex = Versioning.VALIDATION_REGEX.value 10 | 11 | def validate_capice_version(self, capice_version: str): 12 | """ 13 | Validates if the CAPICE version adheres to the CAPICE versioning standards, which originate from 14 | the Semantic versioning standards. 15 | 16 | For instance: 17 | 3.0.0 18 | 3.1.0 19 | 3.1.1 20 | 3.1.1-rc1 21 | 3.1.1rc1 22 | Are all valid versions. 23 | 24 | Parameters 25 | ---------- 26 | capice_version : str 27 | The version of the CAPICE framework. 28 | 29 | Raises 30 | ------ 31 | ValueError 32 | Raised when the CAPICE framework version does not adhere to the versioning standards. 33 | """ 34 | if match(self.regex, capice_version) is None: 35 | raise ValueError(f'CAPICE version does not adhere to correct format: {capice_version}') 36 | 37 | def validate_model_version(self, model_version: str): 38 | """ 39 | Validates if the model version adheres to the CAPICE versioning standards, which originate from 40 | the Semantic versioning standards. 41 | 42 | For instance: 43 | 3.0.0 44 | 3.1.0 45 | 3.1.1 46 | 3.1.1-rc1 47 | 3.1.1rc1 48 | Are all valid versions. 49 | 50 | Parameters 51 | ---------- 52 | model_version : str 53 | The version of the CAPICE model. 54 | 55 | Raises 56 | ------ 57 | ValueError 58 | Raised when the model version does not adhere to the versioning standards. 59 | """ 60 | if match(self.regex, model_version) is None: 61 | raise ValueError(f'Model version does not adhere to correct format: {model_version}') 62 | 63 | def validate_versions_compatible(self, capice_version: str, model_version: str): 64 | """ 65 | Validates if the model version and the CAPICE framework versions are compatible with each 66 | other. 67 | 68 | Parameters 69 | ---------- 70 | capice_version : str 71 | The version of the CAPICE framework. 72 | 73 | 74 | model_version : str 75 | The version of the CAPICE model. 76 | 77 | Raises 78 | ------ 79 | ValueError 80 | Raised when the model and framework versions are not compatible. 81 | """ 82 | # All mypy ignores here are because attributes are not found. 83 | capice = match(self.regex, capice_version) 84 | model = match(self.regex, model_version) 85 | if capice.group('major') != model.group('major'): # type: ignore 86 | raise ValueError( 87 | f'CAPICE major version {capice.string} ' # type: ignore 88 | f'does not match with the model ' 89 | f'{model.string}!'
# type: ignore 90 | ) 91 | 92 | if capice.group('prerelease') or model.group('prerelease'): # type: ignore 93 | self._validate_prerelease(capice, model) # type: ignore 94 | 95 | @staticmethod 96 | def _validate_prerelease(capice_version: re.Match, 97 | model_version: re.Match): 98 | matches = ['minor', 'patch', 'prerelease'] 99 | for m in matches: 100 | if capice_version.group(m) != model_version.group(m): 101 | raise ValueError( 102 | f'CAPICE {m} version {capice_version.string} does not match the model {m} ' 103 | f'version {model_version.string} (should match for pre-releases)!' 104 | ) 105 | -------------------------------------------------------------------------------- /src/molgenis/capice/core/capice_manager.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.validators.property_type_validator import PropertyTypeValidator 2 | 3 | 4 | class CapiceManager: 5 | """ 6 | Global CAPICE manager, to keep track of variables used throughout 7 | the entirety of CAPICE. 8 | """ 9 | 10 | class __CapiceManager: 11 | def __init__(self): 12 | self.property_checker = PropertyTypeValidator() 13 | self.loglevel = None 14 | self.critical_logging_only = False 15 | self.output_filename = '' 16 | 17 | @property 18 | def loglevel(self): 19 | return self._loglevel 20 | 21 | @loglevel.setter 22 | def loglevel(self, value): 23 | self.property_checker.validate_property( 24 | value=value, expected_type=int, include_none=True) 25 | self._loglevel = value 26 | 27 | @property 28 | def critical_logging_only(self): 29 | return self._critical_logging_only 30 | 31 | @critical_logging_only.setter 32 | def critical_logging_only(self, value): 33 | self.property_checker.validate_property(value=value, expected_type=bool) 34 | self._critical_logging_only = value 35 | 36 | @property 37 | def output_filename(self): 38 | return self._output_filename 39 | 40 | @output_filename.setter 41 | def output_filename(self, value): 42 | self.property_checker.validate_property(value=value, expected_type=str) 43 | self._output_filename = value 44 | 45 | instance = None 46 | 47 | @property 48 | def loglevel(self): 49 | """ 50 | Getter for setter loglevel 51 | 52 | :return: None or int 53 | """ 54 | return self._loglevel 55 | 56 | @loglevel.setter 57 | def loglevel(self, value): 58 | """ 59 | Singleton property loglevel, to set the loglevel in int that will be 60 | used in the session of CAPICE. 61 | 62 | Raises TypeError if not supplied with int or None 63 | 64 | :param value: int or None 65 | """ 66 | pass 67 | 68 | @property 69 | def critical_logging_only(self): 70 | """ 71 | Getter for setter critical_logging_only 72 | 73 | :return: boolean 74 | """ 75 | return self._critical_logging_only 76 | 77 | @critical_logging_only.setter 78 | def critical_logging_only(self, value): 79 | """ 80 | Singleton property critical_logging_only, 81 | to tell the logger to only log CRITICAL loglevel events to file / 82 | STDout and STDerr. 83 | Raises TypeError if not supplied with a boolean. 84 | 85 | :param value: boolean 86 | """ 87 | pass 88 | 89 | @property 90 | def output_filename(self): 91 | """ 92 | Getter for setter output_filename 93 | 94 | :return: path-like 95 | """ 96 | return self._output_filename 97 | 98 | @output_filename.setter 99 | def output_filename(self, value): 100 | """ 101 | Singleton property output_filename, 102 | to set the output file name that CAPICE prediction will produce. 
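        Illustrative usage (hypothetical filename): because CapiceManager
        always returns the same instance, setting
        CapiceManager().output_filename = 'example_capice.tsv.gz' in one
        module makes that value visible to every other module that reads
        the property.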
103 | 104 | :param value: path-like 105 | """ 106 | pass 107 | 108 | def __new__(cls): 109 | """ 110 | Class method to set CapiceManager instance 111 | :return: instance 112 | """ 113 | if not CapiceManager.instance: 114 | CapiceManager.instance = CapiceManager.__CapiceManager() 115 | return CapiceManager.instance 116 | 117 | def __init__(self): 118 | """ 119 | __init__ method to set instance to CapiceManager.__CapiceManager() 120 | """ 121 | if not CapiceManager.instance: 122 | CapiceManager.instance = CapiceManager.__CapiceManager() 123 | 124 | def __getattr__(self, name): 125 | """ 126 | Method to return the value of the named attribute of name 127 | :param name: str 128 | :return: str 129 | """ 130 | return getattr(self.instance, name) 131 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/input_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | from molgenis.capice.utilities import check_file_exist 5 | 6 | 7 | class InputProcessor: 8 | def __init__(self, input_path, output_path, force, default_extension): 9 | """ 10 | InputProcessor checks the input directory, output directory 11 | (being either call_dir if output_path is None or output_path) and 12 | the force flag to build the output directory and output filename. 13 | :param input_path: str, path-like 14 | :param output_path: str, path-like (if missing: supply None) 15 | :param force: bool, force flag present or not 16 | :param default_extension: str, the default extension the output file should get in case 17 | output is missing from CLI 18 | 19 | Use getter get_output_filename() to get the output filename after 20 | initialization and get_output_directory() to get the output directory. 21 | (output directory is not yet checked for writability and existence) 22 | 23 | Note: when only the input_path is provided or only a file directory is 24 | supplied, InputProcessor will strip the extension from the input path 25 | and use the input path file name as reference for the output 26 | filename. (so input with example.tsv.gz will become example). 27 | Extension has to be manually added within the argument parser. 28 | """ 29 | self.call_dir = str(Path('.').absolute()) 30 | self.input_path = input_path 31 | self.output_path = output_path 32 | self.output_given = False 33 | self.force = force 34 | self.default_extension = default_extension 35 | self.output_directory = '' 36 | self.output_filename = '' 37 | self._handle_input_output_directories() 38 | 39 | def _handle_input_output_directories(self): 40 | """ 41 | Function to validate the input location, output location and filename to 42 | tell the exporter where to place what file.
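        For illustration (hypothetical paths): with input
        '/data/example.tsv.gz' and no output given, the exporter is pointed
        at the calling directory with filename 'example_capice' plus the
        default extension; with output '/out/result.tsv.gz', both the
        directory and the filename are taken from the output argument.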
43 | """ 44 | if self.output_path is None: 45 | filename = self.get_filename_from_path(self.input_path) 46 | self._set_output_path(self.call_dir, filename) 47 | else: 48 | # Check if it is a path or else just a filename 49 | if len(os.path.dirname(self.output_path)) > 0 or self.output_path == '.': 50 | # Then I know it's an output filepath + possibly name 51 | if os.path.splitext(self.output_path)[1] != '': 52 | # Then I know it is a full path + filename 53 | self._set_output_path(os.path.dirname(self.output_path), 54 | os.path.basename(self.output_path)) 55 | self.output_given = True 56 | else: 57 | # Then I know it's a full path 58 | filename = self.get_filename_from_path(self.input_path) 59 | self._set_output_path(self.output_path, filename) 60 | else: 61 | # Then I know it's an output filename 62 | self._set_output_path(self.call_dir, self.output_path) 63 | 64 | self._check_force() 65 | 66 | def _check_force(self): 67 | full_output_path = os.path.join(self.output_directory, self.output_filename) 68 | check_file_exist(full_output_path, self.force) 69 | 70 | def _set_output_path(self, directory, filename): 71 | self.output_directory = directory 72 | self.output_filename = filename 73 | 74 | def get_filename_from_path(self, path): 75 | """ 76 | Function to get the filename of a file from a given input 77 | path or input filename. 78 | :param path: string 79 | :return: filename (string) 80 | """ 81 | no_path = os.path.basename(path) 82 | splitted_path = no_path.split('.') 83 | filename = splitted_path[0] 84 | return f'{filename}_capice{self.default_extension}' 85 | 86 | def get_output_filename(self): 87 | return self.output_filename 88 | 89 | def get_output_directory(self): 90 | return self.output_directory 91 | 92 | def get_output_given(self): 93 | return self.output_given 94 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_column_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from molgenis.capice.utilities import column_utils 6 | 7 | 8 | class TestUtilities(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | print('Setting up') 12 | 13 | def setUp(self): 14 | print('\nTesting case:') 15 | self.column_utils = column_utils.ColumnUtils() 16 | self.column_utils.specified_columns = set(['a', 'b', 'c']) 17 | 18 | def test_set_specified_columns(self): 19 | """ 20 | Test set_specified_columns 21 | Should set specified_columns of class 22 | """ 23 | print('Setting specified columns') 24 | columns = ['x', 'y', 'z'] 25 | self.column_utils.set_specified_columns(columns) 26 | self.assertEqual(set(columns), self.column_utils.get_specified_columns()) 27 | 28 | def test_add_to_specified_columns_single(self): 29 | """ 30 | Test add_to_specified_columns 31 | Should add string to specified_columns of class 32 | """ 33 | print('Adding to specified columns') 34 | self.column_utils.add_to_specified_columns('da') 35 | self.assertEqual(set(['a', 'b', 'c', 'da']), self.column_utils.get_specified_columns()) 36 | 37 | def test_add_to_specified_columns_multiple(self): 38 | """ 39 | Test add_to_specified_columns 40 | Should merge list of columns with specified_columns of class 41 | """ 42 | print('Adding to specified columns') 43 | self.column_utils.add_to_specified_columns(['d', 'd', 'e']) 44 | self.assertEqual(set(['a', 'b', 'c', 'd', 'e']), self.column_utils.get_specified_columns()) 45 | 46 | def test_column_in_specified_columns(self): 
47 | """ 48 | Test column_in_specified_columns 49 | Should return true because column is in specified_columns 50 | """ 51 | print('Column is in specified columns') 52 | column = 'a' 53 | self.assertTrue(self.column_utils.column_in_specified_columns(column)) 54 | 55 | def test_column_not_in_specified_columns(self): 56 | """ 57 | Test column_in_specified_columns 58 | Should return false because column is not in specified_columns 59 | """ 60 | print('Column not is in specified columns') 61 | column = 'x' 62 | self.assertFalse(self.column_utils.column_in_specified_columns(column)) 63 | 64 | def test_add_to_specified_columns_set(self): 65 | """ 66 | Test add_to_specified_columns 67 | Should merge set of columns with specified_columns of class 68 | """ 69 | print('Adding to specified columns') 70 | self.column_utils.add_to_specified_columns({'d', 'e'}) 71 | self.assertEqual({'a', 'b', 'c', 'd', 'e'}, self.column_utils.get_specified_columns()) 72 | 73 | def test_get_missing_diff_with_list(self): 74 | """ 75 | Test get_missing_diff_with with type list 76 | Should return "c" as missing column. 77 | """ 78 | print('Get missings of diff with columns as list') 79 | columns = ['a', 'b', 'x', 'y'] 80 | missing = self.column_utils.get_missing_diff_with(columns) 81 | self.assertEqual(['c'], missing) 82 | 83 | def test_get_missing_diff_with_dict_keys(self): 84 | """ 85 | Test get_missing_diff_with with type dict keys 86 | Should return "c" as missing column. 87 | """ 88 | print('Get missings of diff with columns of type dict keys') 89 | data = {'a': 0, 'b': 1, 'x': 3, 'y': 4} 90 | missing = self.column_utils.get_missing_diff_with(data.keys()) 91 | self.assertEqual(['c'], missing) 92 | 93 | def test_get_missing_diff_with_pd_df(self): 94 | """ 95 | Test get_missing_diff_with with type pandas dataframe 96 | Should return "cd" as missing column. 97 | """ 98 | print('Get missings of diff with columns of type pandas dataframe') 99 | data = {'ab': 0, 'bc': 1, 'x': 3, 'y': 4} 100 | d = {'ab': [1, 2], 'bc': [3, 4], 'cd': [6, 8]} 101 | df = pd.DataFrame(data=d) 102 | self.column_utils.set_specified_columns(df.columns) 103 | missing = self.column_utils.get_missing_diff_with(data.keys()) 104 | self.assertEqual(['cd'], missing) 105 | 106 | 107 | if __name__ == '__main__': 108 | unittest.main() 109 | -------------------------------------------------------------------------------- /src/molgenis/capice/cli/args_handler_train.py: -------------------------------------------------------------------------------- 1 | from molgenis.capice.main_train import CapiceTrain 2 | from molgenis.capice.core.capice_manager import CapiceManager 3 | from molgenis.capice.cli.args_handler_parent import ArgsHandlerParent 4 | 5 | 6 | class ArgsHandlerTrain(ArgsHandlerParent): 7 | """ 8 | Command-line argument handler for train sub-command. 9 | Parses, validates and executes function. 
10 | """ 11 | 12 | def __init__(self, parser): 13 | super(ArgsHandlerTrain, self).__init__(parser=parser) 14 | self.split_default = 0.2 15 | self.n_threads_default = 1 16 | 17 | @property 18 | def _extension(self): 19 | return '.tsv', '.tsv.gz' 20 | 21 | @property 22 | def _features_extension(self) -> tuple[str]: 23 | return '.json', 24 | 25 | def _features_extension_str(self) -> str: 26 | return self._join_extensions(self._features_extension) 27 | 28 | @property 29 | def _required_output_extensions(self): 30 | return '.json', '.ubj' 31 | 32 | @property 33 | def _empty_output_extension(self): 34 | return self._required_output_extensions[1] 35 | 36 | def create(self): 37 | self.parser.add_argument( 38 | '-i', 39 | '--input', 40 | action='append', 41 | type=str, 42 | required=True, 43 | help=f'path to annotated variants file ({self._extension_str()}) (required)' 44 | ) 45 | self.parser.add_argument( 46 | '-e', 47 | '--features', 48 | action='append', 49 | type=str, 50 | required=True, 51 | help=f'path to the features file ({self._features_extension_str()}) (required)' 52 | ) 53 | self.parser.add_argument( 54 | '-s', 55 | '--split', 56 | action='append', 57 | default=[self.split_default], 58 | type=float, 59 | help=f'proportion of the input data to include in the test split (default: ' 60 | f'{self.split_default}) (optional)' 61 | ) 62 | self.parser.add_argument( 63 | '-o', 64 | '--output', 65 | action='append', 66 | type=str, 67 | help=f'path to directory or file ({self._required_output_extensions_str()}) for ' 68 | f'exporting model (optional)' 69 | ) 70 | self.parser.add_argument( 71 | '-f', 72 | '--force', 73 | action='store_true', 74 | help='overwrites output if it already exists' 75 | ) 76 | self.parser.add_argument( 77 | '-t', 78 | '--threads', 79 | action='append', 80 | default=[self.n_threads_default], 81 | type=int, 82 | help=f'The amount of threads that can be used by XGBoost to parallel train (default: ' 83 | f'{self.n_threads_default})' 84 | ) 85 | 86 | def _handle_module_specific_args(self, input_path, output_path, output_filename, output_given, 87 | args): 88 | features = self._retrieve_argument_from_list(args.features, '-e/--features') 89 | self.input_validator.validate_input_path(features, extension=self._features_extension) 90 | 91 | test_split = self._retrieve_argument_from_list(args.split, '-s/--split', has_default=True) 92 | self.validate_test_split(test_split) 93 | 94 | n_threads = self._retrieve_argument_from_list(args.threads, '-t/--threads', 95 | has_default=True) 96 | self.validate_n_threads(n_threads) 97 | 98 | CapiceManager().output_filename = output_filename 99 | CapiceTrain( 100 | input_path, 101 | features, 102 | test_split, 103 | output_path, 104 | output_given, 105 | self.force, 106 | n_threads 107 | ).run() 108 | 109 | def validate_n_threads(self, n_threads): 110 | """ 111 | Function to validate that the amount of threads is at least 1. 
112 | """ 113 | if n_threads < 1: 114 | self.parser.error('The amount of threads has to be at least 1!') 115 | 116 | def validate_test_split(self, test_split): 117 | """ 118 | Validator for test split to make sure it lies between 0 and 1 119 | (since the CLA is already set to type float, I do not have to validate 120 | it here too) 121 | """ 122 | if test_split <= 0 or test_split >= 1: 123 | self.parser.error('Test split must be a float between 0 and 1') 124 | -------------------------------------------------------------------------------- /src/molgenis/capice/utilities/dynamic_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | from importlib import util 3 | 4 | from molgenis.capice.core.logger import Logger 5 | 6 | 7 | class DynamicLoader: 8 | def __init__(self, required_attributes: list, path): 9 | """ 10 | Dynamic Loader for both the imputer and preprocessor 11 | 12 | :param required_attributes: list, list containing all the required 13 | attritubes the loaded modules have to have. 14 | :param path: Path-like, path to the potential modules. 15 | 16 | Use `load_impute_preprocess_modules()` to load the modules required for 17 | the imputer and preprocessor. Use `load_manual_annotators()` to load 18 | the manual VEP annotation processors. 19 | """ 20 | self.log = Logger().logger 21 | self.path = path 22 | self._check_dir_exists() 23 | self.required_attributes = required_attributes 24 | self.modules: dict[str, object] = {} 25 | 26 | def load_manual_annotators(self): 27 | """ 28 | Load the VEP annotation modules within path. 29 | 30 | :return: list, list containing all the usable VEP modules within path. 31 | 32 | :raises: FileNotFoundError, if no VEP annotation module is found within 33 | path. 34 | """ 35 | self._load_modules() 36 | # Since the manual annotator doesn't require VEP version, GRCh build or 37 | # overwrite, this loading is done. 38 | return self.modules.values() 39 | 40 | def _load_modules(self, required_attributes=None): 41 | self._check_dir_exists() 42 | if required_attributes: 43 | set_required = required_attributes 44 | else: 45 | set_required = self.required_attributes 46 | modules = self._load_modules_from_path(self.path) 47 | self._check_n_modules(modules) 48 | imported_modules = self._import(modules) 49 | for path, module in imported_modules.items(): 50 | if all(item in dir(module) for item in set_required): 51 | self.modules[path] = module 52 | self._check_n_modules(self.modules) 53 | self.log.info('Successfully loaded %s modules.', len(self.modules)) 54 | 55 | def _check_dir_exists(self): 56 | if not os.path.exists(self.path): 57 | error_message = "%s is not a path!" 58 | self.log.critical(error_message, self.path) 59 | raise OSError(error_message % self.path) 60 | 61 | def _check_n_modules(self, modules_dict): 62 | if len(modules_dict) < 1: 63 | self._raise_no_module_found_error() 64 | 65 | def _raise_no_module_found_error(self): 66 | error_message = "No usable modules are found within %s!" 
67 | self.log.critical(error_message, self.path) 68 | raise FileNotFoundError(error_message % self.path) 69 | 70 | @staticmethod 71 | def _load_modules_from_path(path): 72 | """ 73 | Function to dynamically load in modules in the given path 74 | :param path: path to the modules 75 | :return: list 76 | """ 77 | modules = [] 78 | for module in os.listdir(path): 79 | module = os.path.join(path, module) 80 | if (module.endswith('.py') and not module.endswith('__.py') 81 | and not module.endswith('abstract.py')): 82 | modules.append(module) 83 | return modules 84 | 85 | def _import(self, usable_modules: list[str]) -> dict[str, object]: 86 | """ 87 | Function to dynamically load in the modules using the 88 | import_module library. 89 | :param usable_modules: list of absolute paths to potential modules 90 | :return: list of usable modules 91 | """ 92 | # For some reason, mypy wants this line to be Typed instead of the method. 93 | return_modules: dict[str, object] = {} 94 | for module in usable_modules: 95 | name = os.path.basename(module).split('.py')[0] 96 | spec = util.spec_from_file_location(name=name, location=module) 97 | loaded_module = self._process_spec(spec) 98 | if loaded_module and module not in return_modules.keys(): 99 | return_modules[module] = loaded_module 100 | return return_modules 101 | 102 | @staticmethod 103 | def _process_spec(spec): 104 | return_spec = None 105 | loaded_spec = util.module_from_spec(spec) 106 | spec.loader.exec_module(loaded_spec) 107 | for attribute in dir(loaded_spec): 108 | if not attribute.startswith('Template') and not attribute.startswith('__'): 109 | get_attribute = getattr(loaded_spec, attribute) 110 | if ('name' in dir(get_attribute) and 'usable' in dir(get_attribute) 111 | and get_attribute().usable is True): 112 | return_spec = get_attribute() 113 | return return_spec 114 | -------------------------------------------------------------------------------- /tests/capice/utilities/test_input_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from pathlib import Path 4 | 5 | from tests.capice.test_templates import _project_root_directory 6 | from molgenis.capice.utilities.input_processor import InputProcessor 7 | 8 | 9 | class TestInputProcessor(unittest.TestCase): 10 | 11 | __FILE__ = 'file_capice.txt' 12 | __GZIPFILE__ = 'file_capice.txt.gz' 13 | 14 | def setUp(self): 15 | print('Setting up.') 16 | output = os.path.join( 17 | _project_root_directory, 18 | 'tests', 19 | 'resources', 20 | 'input_processor', 21 | 'filename.txt' 22 | ) 23 | self.processor = InputProcessor('/test/input/file.txt', output, True, '.txt') 24 | 25 | def tearDown(self) -> None: 26 | potential_file = os.path.join( 27 | _project_root_directory, 28 | 'tests', 29 | 'resources', 30 | self.__FILE__ 31 | ) 32 | if os.path.isfile(potential_file): 33 | os.remove(potential_file) 34 | second_potential_file = os.path.join( 35 | _project_root_directory, 36 | 'tests', 37 | 'resources', 38 | self.__GZIPFILE__ 39 | ) 40 | if os.path.isfile(second_potential_file): 41 | os.remove(second_potential_file) 42 | 43 | def test__set_output_path(self): 44 | output_dir = '/test/input/dir' 45 | filename = 'filename.txt' 46 | self.processor._set_output_path(output_dir, filename) 47 | self.assertEqual(self.processor.get_output_directory(), output_dir) 48 | self.assertEqual(self.processor.get_output_filename(), filename) 49 | 50 | def test_get_filename_from_path(self): 51 | path = '/test/input/dir/filename.txt' 52 | actual = 
self.processor.get_filename_from_path(path) 53 | self.assertEqual(actual, 'filename_capice.txt') 54 | 55 | def test__check_force(self): 56 | self.processor.force = False 57 | self.assertRaises(FileExistsError, self.processor._check_force) 58 | 59 | def test___handle_input_output_directories_case1(self): 60 | self.processor.output_path = None 61 | self.processor._handle_input_output_directories() 62 | self.assertEqual(str(Path('.').absolute()), self.processor.get_output_directory()) 63 | self.assertEqual(self.__FILE__, self.processor.get_output_filename()) 64 | 65 | def test___handle_input_output_directories_case2(self): 66 | self.processor.output_path = '' 67 | self.processor._handle_input_output_directories() 68 | self.assertEqual(str(Path('.').absolute()), self.processor.get_output_directory()) 69 | self.assertEqual('', self.processor.get_output_filename()) 70 | 71 | def test___handle_input_output_directories_case3(self): 72 | self.processor.output_path = '/something' 73 | self.processor._handle_input_output_directories() 74 | self.assertEqual('/something', self.processor.get_output_directory()) 75 | self.assertEqual(self.__FILE__, self.processor.get_output_filename()) 76 | 77 | def test___handle_input_output_directories_case4(self): 78 | self.processor.output_path = '/directory/file.txt' 79 | self.processor._handle_input_output_directories() 80 | self.assertEqual('/directory', self.processor.get_output_directory()) 81 | self.assertEqual('file.txt', self.processor.get_output_filename()) 82 | 83 | def test___handle_input_output_directories_case5(self): 84 | self.processor.output_path = '.' 85 | self.processor._handle_input_output_directories() 86 | self.assertEqual('.', self.processor.get_output_directory()) 87 | self.assertEqual(self.__FILE__, self.processor.get_output_filename()) 88 | 89 | def test___handle_input_output_directories_case6(self): 90 | self.processor.output_path = './file.txt' 91 | self.processor._handle_input_output_directories() 92 | self.assertEqual('.', self.processor.get_output_directory()) 93 | self.assertEqual('file.txt', self.processor.get_output_filename()) 94 | 95 | def test_force_false_output_missing_output_exists(self): 96 | # This test mimics what happens when output is left empty from the CLI 97 | # and the output file + _capice + default_extension already exists 98 | with open( 99 | os.path.join( 100 | _project_root_directory, 101 | 'tests', 102 | 'resources', 103 | self.__FILE__ 104 | ), 'wt' 105 | ) as some_file: 106 | some_file.write('SomeString') 107 | self.processor.force = False 108 | self.assertRaises( 109 | FileExistsError, 110 | self.processor._handle_input_output_directories 111 | ) 112 | 113 | 114 | if __name__ == '__main__': 115 | unittest.main() 116 | -------------------------------------------------------------------------------- /src/molgenis/capice/main_explain.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | import xgboost as xgb 5 | 6 | from molgenis.capice.main_capice import Main 7 | from molgenis.capice.core.logger import Logger 8 | from molgenis.capice.utilities import check_file_exist 9 | from molgenis.capice.core.capice_manager import CapiceManager 10 | 11 | 12 | class CapiceExplain(Main): 13 | def __init__(self, model, output_path, output_given, force): 14 | super().__init__( 15 | input_path=None, 16 | output_path=output_path, 17 | output_given=output_given, 18 | force=force 19 | ) 20 | self.model = model 21 | self.output = output_path 22 | self.log = 
Logger().logger 23 | 24 | def run(self): 25 | gain_importances = self._extract_features_importances_gain(self.model) 26 | total_gain_importances = self._extract_features_importances_total_gain(self.model) 27 | weight_importances = self._extract_features_importances_weight(self.model) 28 | cover_importances = self._extract_features_importances_cover(self.model) 29 | total_cover_importances = self._extract_features_importances_total_cover(self.model) 30 | importances = self._convert_importances_to_dataframe(gain_importances, 31 | total_gain_importances, 32 | weight_importances, 33 | cover_importances, 34 | total_cover_importances) 35 | self._order_importances(importances) 36 | self._export(importances, self.output) 37 | 38 | def _extract_features_importances_gain(self, model: xgb.XGBClassifier): 39 | self.log.info('Extracting gain from model.') 40 | feature_importances = model.get_booster().get_score(importance_type='gain') 41 | self.log.debug('Extracted %d gain features from model.', len(feature_importances.keys())) 42 | return feature_importances 43 | 44 | def _extract_features_importances_total_gain(self, model: xgb.XGBClassifier): 45 | self.log.info('Extracting total gain from model.') 46 | feature_importances = model.get_booster().get_score(importance_type='total_gain') 47 | self.log.debug('Extracted %d total_gain features from model.', 48 | len(feature_importances.keys())) 49 | return feature_importances 50 | 51 | def _extract_features_importances_weight(self, model: xgb.XGBClassifier): 52 | self.log.info('Extracting weight from model.') 53 | feature_importances = model.get_booster().get_score(importance_type='weight') 54 | self.log.debug('Extracted %d weight features from model.', 55 | len(feature_importances.keys())) 56 | return feature_importances 57 | 58 | def _extract_features_importances_cover(self, model: xgb.XGBClassifier): 59 | self.log.info('Extracting cover from model.') 60 | feature_importances = model.get_booster().get_score(importance_type='cover') 61 | self.log.debug('Extracted %d cover features from model.', 62 | len(feature_importances.keys())) 63 | return feature_importances 64 | 65 | def _extract_features_importances_total_cover(self, model: xgb.XGBClassifier): 66 | self.log.info('Extracting total cover from model.') 67 | feature_importances = model.get_booster().get_score(importance_type='total_cover') 68 | self.log.debug('Extracted %d total_cover features from model.', 69 | len(feature_importances.keys())) 70 | return feature_importances 71 | 72 | def _convert_importances_to_dataframe(self, gain: dict, total_gain: dict, weight: dict, 73 | cover: dict, total_cover: dict): 74 | self.log.info('Converting importances to dataframe.') 75 | feature_importances = pd.DataFrame(data=[gain.keys(), gain.values()], 76 | index=['feature', 'gain']).T 77 | feature_importances['total_gain'] = feature_importances['feature'].map(total_gain) 78 | feature_importances['weight'] = feature_importances['feature'].map(weight) 79 | feature_importances['cover'] = feature_importances['feature'].map(cover) 80 | feature_importances['total_cover'] = feature_importances['feature'].map(total_cover) 81 | self.log.debug('Converted %d features into the dataframe', feature_importances.shape[0]) 82 | self.log.debug('Converted all %d importance types into the dataframe', 83 | feature_importances.shape[1]) 84 | return feature_importances 85 | 86 | def _order_importances(self, importances: pd.DataFrame): 87 | self.log.info('Ordering feature importances.') 88 | importances.sort_values(by='gain', 
ascending=False, inplace=True) 89 | 90 | def _export(self, dataset, output): 91 | output_path = os.path.join(output, CapiceManager().output_filename) 92 | check_file_exist(output_path, self.force) 93 | dataset.to_csv(output_path, index=False, sep='\t') 94 | if not self.output_given: 95 | print(f'Successfully exported explain to: {output_path}') 96 | -------------------------------------------------------------------------------- /tests/capice/core/test_capice_exporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from molgenis.capice.utilities.enums import Column 8 | from molgenis.capice.core.capice_exporter import CapiceExporter 9 | from tests.capice.test_templates import set_up_manager_and_out, teardown 10 | 11 | 12 | class TestCapiceExporter(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | print('Setting up.') 16 | cls.prediction_output_dataframe = pd.DataFrame( 17 | { 18 | Column.chr_pos_ref_alt.value: [ 19 | '1_VeryUniqueCAPICESeparator_100' 20 | '_VeryUniqueCAPICESeparator_A_VeryUniqueCAPICESeparator_C', 21 | '2_VeryUniqueCAPICESeparator_200' 22 | '_VeryUniqueCAPICESeparator_T_VeryUniqueCAPICESeparator_G' 23 | ], 24 | Column.gene_name.value: ['foo', 'bar'], 25 | Column.gene_id.value: [1000, 2000], 26 | Column.id_source.value: ['foo', 'bar'], 27 | Column.feature.value: ['TRANS_01', 'TRANS_02'], 28 | Column.feature_type.value: ['Transcript', 'RegulatoryFeature'], 29 | Column.score.value: [0.01, 0.998], 30 | Column.suggested_class.value: ['VUS', 'VUS'] 31 | } 32 | ) 33 | cls.expected_prediction_output_dataframe = pd.DataFrame( 34 | { 35 | 'chr': ['1', '2'], 36 | 'pos': [100, 200], 37 | 'ref': ['A', 'T'], 38 | 'alt': ['C', 'G'], 39 | 'gene_name': ['foo', 'bar'], 40 | 'gene_id': [1000, 2000], 41 | 'id_source': ['foo', 'bar'], 42 | 'feature': ['TRANS_01', 'TRANS_02'], 43 | 'feature_type': ['Transcript', 'RegulatoryFeature'], 44 | 'score': [0.01, 0.998], 45 | 'suggested_class': ['VUS', 'VUS'] 46 | } 47 | ) 48 | cls.export_dataset = pd.DataFrame( 49 | { 50 | 'chr': [1, 2], 51 | 'pos': [100, 200], 52 | 'ref': ['A', 'A'], 53 | 'alt': ['C', 'G'], 54 | 'feature_1': [0.001, 0.2], 55 | 'feature_2': [0.02, 5.5] 56 | } 57 | ) 58 | 59 | @classmethod 60 | def tearDownClass(cls): 61 | print('Tearing down.') 62 | teardown() 63 | 64 | def setUp(self): 65 | print('Testing case:') 66 | manager, self.output_path = set_up_manager_and_out() 67 | self.exporter = CapiceExporter(file_path=self.output_path, output_given=True, force=False) 68 | 69 | def test_prediction_output(self): 70 | print('Prediction output') 71 | filename = 'test_output.tsv' 72 | filename_path = os.path.join(self.output_path, filename) 73 | self.exporter.capice_filename = filename 74 | self.exporter.export_capice_prediction(datafile=self.prediction_output_dataframe) 75 | self.assertTrue(os.path.isfile(filename_path)) 76 | exported_data = pd.read_csv(filename_path, sep='\t') 77 | exported_data[Column.chr.value] = exported_data[Column.chr.value].astype(str) 78 | pd.testing.assert_frame_equal(exported_data, self.expected_prediction_output_dataframe) 79 | 80 | def test_exporter_force_pass(self): 81 | """ 82 | Since force is dealt with at the very start of CAPICE and raises an 83 | error if the output file is already present unless the force flag is 84 | True, this test just makes sure that the overwritten file is correct. 
85 | """ 86 | print('Filename generator (with force=True)') 87 | present_file = 'already_present_file.tsv' 88 | present_file_path = os.path.join(self.output_path, present_file) 89 | with open(present_file_path, 'wt') as present_file_conn: 90 | present_file_conn.write('This file is already present') 91 | self.exporter.capice_filename = present_file 92 | self.exporter.force = True 93 | self.exporter.export_capice_prediction(datafile=self.prediction_output_dataframe) 94 | forced_file = pd.read_csv(present_file_path, sep='\t') 95 | forced_file[Column.chr.value] = forced_file[Column.chr.value].astype(str) 96 | pd.testing.assert_frame_equal(forced_file, self.expected_prediction_output_dataframe) 97 | 98 | def test_post_process_set_correct_dtypes(self): 99 | print('Test post process set correct dtypes') 100 | some_data = pd.DataFrame( 101 | { 102 | 'foo': [1, 2, 3], 103 | Column.gene_id.value: [1, np.nan, 3] 104 | } 105 | ) 106 | expected_output = some_data.copy(deep=True) 107 | expected_output[Column.gene_id.value] = pd.Series( 108 | expected_output[Column.gene_id.value], dtype='Int64' 109 | ) 110 | out_data = self.exporter._post_process_set_correct_dtypes(some_data) 111 | pd.testing.assert_frame_equal( 112 | out_data.sort_index(axis=1), 113 | expected_output.sort_index(axis=1) 114 | ) 115 | 116 | 117 | if __name__ == '__main__': 118 | unittest.main() 119 | -------------------------------------------------------------------------------- /scripts/convert_vep_vcf_to_tsv_capice.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stops script if any error occurs. 4 | set -e 5 | 6 | # Defines error echo. 7 | errcho() { echo "$@" 1>&2; } 8 | 9 | # Usage. 10 | readonly USAGE="VEP VCF output to CAPICE TSV converter 11 | Usage: 12 | convert_vep_to_tsv_capice.sh -p -i -o [-t] [-f] 13 | -p required: The path to the BCFTools image. (available at: https://download.molgeniscloud.org/downloads/vip/images/bcftools-1.14.sif) 14 | -i required: The VEP output VCF. 15 | -o required: The directory and output filename for the CAPICE .tsv.gz. 16 | -f optional: enable force. 17 | -t optional: enable train. Adds the ID column to the output. 18 | 19 | Please note that this script expects apptainer binds to be set correctly by the system administrator. 20 | Additional apptainer binds can be set by setting the environment variable APPTAINER_BIND. 21 | If using SLURM, please export this environment variable to the sbatch instance too. 22 | 23 | Example: 24 | bash convert_vep_vcf_to_tsv_capice.sh -p /path/to/bcftools.sif -i vep_out.vcf.gz -o capice_in.tsv.gz 25 | 26 | Requirements: 27 | - Apptainer (although Singularity should work too, please change the script and adjust apptainer to singularity) 28 | - BCFTools image. (available at: https://download.molgeniscloud.org/downloads/vip/images/bcftools-1.14.sif) 29 | " 30 | 31 | # Global variables 32 | FORCE=false 33 | TRAIN=false 34 | 35 | 36 | main() { 37 | digestCommandLine "$@" 38 | processFile 39 | } 40 | 41 | digestCommandLine() { 42 | while getopts p:i:o:hft flag 43 | do 44 | case "${flag}" in 45 | p) bcftools_path=${OPTARG};; 46 | i) input=${OPTARG};; 47 | o) output=${OPTARG};; 48 | h) 49 | echo "${USAGE}" 50 | exit;; 51 | t) 52 | TRAIN=true;; 53 | f) 54 | FORCE=true;; 55 | \?) 
56 | errcho "Error: invalid option" 57 | echo "${USAGE}" 58 | exit 1;; 59 | esac 60 | done 61 | 62 | if [[ ${TRAIN} == true ]] 63 | then 64 | HEADER="CHROM\tPOS\tID\tREF\tALT\t" 65 | FORMAT="%CHROM\t%POS\t%ID\t%REF\t%ALT\t%CSQ\n" 66 | else 67 | HEADER="CHROM\tPOS\tREF\tALT\t" 68 | FORMAT="%CHROM\t%POS\t%REF\t%ALT\t%CSQ\n" 69 | fi 70 | 71 | validateCommandLine 72 | } 73 | 74 | validateCommandLine() { 75 | local valid_command_line=true 76 | 77 | # Validate if BCFTools image is set & not empty 78 | if [ -z "${bcftools_path}" ] 79 | then 80 | valid_command_line=false 81 | errcho "BCFTools image not set/empty" 82 | else 83 | if [ ! -f "${bcftools_path}" ] 84 | then 85 | valid_command_line=false 86 | errcho "BCFTools image does not exist" 87 | fi 88 | fi 89 | 90 | # Validate if input is set & not empty. 91 | if [ -z "${input}" ] 92 | then 93 | valid_command_line=false 94 | errcho "input file not set/empty" 95 | else 96 | # Validate if input file exists. 97 | if [ ! -f "${input}" ] 98 | then 99 | valid_command_line=false 100 | errcho "input file does not exist" 101 | else 102 | # Validate allowed input filetype. 103 | case $(file --mime-type -b "${input}") in 104 | text/plain);; 105 | application/*gzip);; 106 | *) 107 | valid_command_line=false 108 | errcho "input file has invalid type (plain text/gzip allowed)";; 109 | esac 110 | fi 111 | fi 112 | 113 | # Validate if output is set & not empty. 114 | if [ -z "${output}" ] 115 | then 116 | valid_command_line=false 117 | errcho "output file not set/empty" 118 | else 119 | # Validates proper output filename. 120 | if [[ "${output}" != *.tsv.gz ]] 121 | then 122 | valid_command_line=false 123 | errcho "output filename must end with '.tsv.gz'" 124 | else 125 | # Validates if the output file doesn't already exist. 126 | if [ -f "${output}" ] 127 | then 128 | if [[ ${FORCE} == true ]] 129 | then 130 | echo "output file exists, enforcing output" 131 | rm "${output}" 132 | else 133 | errcho "output file exists and force flag is not called" 134 | valid_command_line=false 135 | fi 136 | fi 137 | fi 138 | fi 139 | 140 | # If the command line arguments are invalid, exits with code 1. 141 | if [[ "${valid_command_line}" == false ]]; then errcho "Exiting."; exit 1; fi 142 | } 143 | 144 | processFile() { 145 | local output="${output%.gz}" 146 | 147 | local args=() 148 | args+=("exec") 149 | args+=("${bcftools_path}") 150 | args+=("bcftools") 151 | args+=("+split-vep") 152 | 153 | # Header 154 | 155 | echo "Obtaining header" 156 | 157 | header_args=("${args[@]}") 158 | header_args+=("-l" "${input}") 159 | 160 | present_features=$(apptainer "${header_args[@]}" | cut -f 2 | tr "\n" "\t" | sed "s/\t$//") 161 | 162 | echo -e "${HEADER}$present_features" > ${output} 163 | 164 | # VEP VCF file content 165 | 166 | echo "Obtaining VCF content" 167 | 168 | file_args=("${args[@]}") 169 | file_args+=("-d") 170 | file_args+=("-f" "${FORMAT}") 171 | file_args+=("-A" "tab") 172 | file_args+=("${input}") 173 | 174 | apptainer "${file_args[@]}" >> ${output} 175 | 176 | echo "BCFTools finished." 177 | 178 | echo "Gzipping output file." 179 | 180 | gzip "${output}" 181 | 182 | echo "Done."
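    # Note (comment added for clarity): bcftools +split-vep is used twice
    # above: '-l' lists the CSQ subfields to build the header, and '-f'
    # together with '-d' writes one tab-separated row per consequence
    # (duplicating records with multiple transcripts), so the rows match
    # the header order before the file is gzipped.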
183 | }
184 |
185 | main "$@"
186 |
--------------------------------------------------------------------------------
/tests/capice/core/test_logger.py:
--------------------------------------------------------------------------------
1 | import io
2 | import sys
3 | import logging
4 | import unittest
5 |
6 | from molgenis.capice.core.logger import Logger
7 | from tests.capice.test_templates import teardown
8 | from molgenis.capice.core.capice_manager import CapiceManager
9 |
10 |
11 | class TestLogger(unittest.TestCase):
12 |     @classmethod
13 |     def setUpClass(cls):
14 |         print('Setting up.')
15 |         cls.manager = CapiceManager()
16 |         cls.manager.critical_logging_only = False
17 |         cls.not_present_string = 'Not present string'
18 |
19 |     @classmethod
20 |     def tearDownClass(cls):
21 |         print('Tearing down.')
22 |         teardown()
23 |
24 |     def capture_stdout_call(self):
25 |         old_stdout = sys.stdout
26 |         listener = io.StringIO()
27 |         sys.stdout = listener
28 |         log = Logger().logger
29 |         log.info('SomeString')
30 |         log.debug('SomeString')
31 |         out = listener.getvalue()
32 |         sys.stdout = old_stdout
33 |         self.assertGreater(len(out), 0)
34 |         return out
35 |
36 |     def capture_stderr_call(self):
37 |         old_stderr = sys.stderr
38 |         listener = io.StringIO()
39 |         sys.stderr = listener
40 |         log = Logger().logger
41 |         log.critical('SomeString')
42 |         log.error('SomeString')
43 |         out = listener.getvalue()
44 |         sys.stderr = old_stderr
45 |         self.assertGreater(len(out), 0)
46 |         return out
47 |
48 |     def setUp(self):
49 |         print('Testing case:')
50 |
51 |     def tearDown(self) -> None:
52 |         print('Resetting arguments.')
53 |         Logger.instance = None
54 |         self.manager.critical_logging_only = False
55 |         self.manager.loglevel = None
56 |         print('Arguments reset.')
57 |
58 |     def test_isenabled_false_debug(self):
59 |         print('isEnabledFor(logging.DEBUG) is False')
60 |         self.manager.loglevel = 20
61 |         log = Logger().logger
62 |         self.assertFalse(log.isEnabledFor(logging.DEBUG))
63 |
64 |     def test_isenabled_true_debug(self):
65 |         print('isEnabledFor(logging.DEBUG) is True')
66 |         self.manager.loglevel = 10
67 |         log = Logger().logger
68 |         self.assertTrue(log.isEnabledFor(logging.DEBUG))
69 |
70 |     def test_isenabled_false_warning(self):
71 |         print('isEnabledFor(logging.WARNING) is False')
72 |         self.manager.critical_logging_only = True
73 |         log = Logger().logger
74 |         self.assertFalse(log.isEnabledFor(logging.WARNING))
75 |
76 |     def test_isenabled_true_warning(self):
77 |         print('isEnabledFor(logging.WARNING) is True')
78 |         log = Logger().logger
79 |         self.assertTrue(log.isEnabledFor(logging.WARNING))
80 |         self.assertFalse(log.isEnabledFor(logging.INFO))
81 |
82 |     def test_set_multiple_loglevels(self):
83 |         print('isEnabledFor(logging.DEBUG) is False with '
84 |               'CapiceManager().critical_logging_only set to True')
85 |         self.manager.critical_logging_only = True
86 |         self.manager.loglevel = 10
87 |         log = Logger().logger
88 |         self.assertFalse(log.isEnabledFor(logging.DEBUG))
89 |
90 |     def test_loglevel_nonverbose(self):
91 |         """
92 |         Testing INFO messages just became a lot harder since the logger is set
93 |         to logging.NOTSET by default, with its StreamHandlers taking care of
94 |         the messages themselves, especially the stdout StreamHandler.
95 | """ 96 | print('Loglevel info') 97 | self.manager.loglevel = 20 98 | out = self.capture_stdout_call() 99 | self.assertIn('INFO', out) 100 | self.assertNotIn('DEBUG', out) 101 | 102 | def test_loglevel_verbose(self): 103 | print('Loglevel verbose') 104 | self.manager.loglevel = 10 105 | out = self.capture_stdout_call() 106 | self.assertIn('INFO', out) 107 | self.assertIn('DEBUG', out) 108 | 109 | def test_loglevel_critical_logging_only(self): 110 | print('Critical logging only') 111 | self.manager.critical_logging_only = True 112 | out = self.capture_stderr_call() 113 | self.assertIn('CRITICAL', out) 114 | self.assertNotIn('ERROR', out) 115 | 116 | def test_stderr(self): 117 | print('Levels INFO and DEBUG not present in stderr') 118 | self.manager.loglevel = 10 119 | 120 | old_stderr = sys.stderr 121 | listener = io.StringIO() 122 | sys.stderr = listener 123 | 124 | log = Logger().logger 125 | log.info(self.not_present_string) 126 | log.debug(self.not_present_string) 127 | 128 | out = listener.getvalue() 129 | sys.stderr = old_stderr 130 | self.assertNotIn(self.not_present_string, out) 131 | 132 | def test_stdout(self): 133 | print('Levels WARNING, ERROR and CRITICAL not present in stdout') 134 | old_stdout = sys.stdout 135 | listener = io.StringIO() 136 | sys.stdout = listener 137 | 138 | log = Logger().logger 139 | log.warning(self.not_present_string) 140 | log.error(self.not_present_string) 141 | log.critical(self.not_present_string) 142 | 143 | out = listener.getvalue() 144 | sys.stdout = old_stdout 145 | 146 | self.assertNotIn(self.not_present_string, out) 147 | 148 | def test_logger_class(self): 149 | print('Logger class') 150 | self.assertEqual(str(Logger().logger.__class__), "") 151 | 152 | 153 | if __name__ == '__main__': 154 | unittest.main() 155 | -------------------------------------------------------------------------------- /src/molgenis/capice/main_capice.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | 4 | import pandas as pd 5 | 6 | from molgenis.capice.core.logger import Logger 7 | from molgenis.capice.core.capice_manager import CapiceManager 8 | from molgenis.capice.utilities.input_parser import InputParser 9 | from molgenis.capice.core.capice_exporter import CapiceExporter 10 | from molgenis.capice.utilities.manual_vep_processor import ManualVEPProcessor 11 | from molgenis.capice.utilities.categorical_processor import CategoricalProcessor 12 | from molgenis.capice.utilities.load_file_postprocessor import LoadFilePostProcessor 13 | from molgenis.capice.validators.post_file_parse_validator import PostFileParseValidator 14 | 15 | 16 | class Main(ABC): 17 | """ 18 | Main class of CAPICE that contains methods to help the different modes to 19 | function. 20 | """ 21 | 22 | def __init__(self, input_path, output_path, output_given, force): 23 | # Assumes CapiceManager has been initialized & filled. 24 | self.manager = CapiceManager() 25 | self.log = Logger().logger 26 | 27 | self.log.info('Initiating selected mode.') 28 | 29 | # Input file. 30 | self.infile = input_path 31 | self.log.debug('Input argument -i / --input confirmed: %s', self.infile) 32 | 33 | # Output file. 
34 |         self.output = output_path
35 |         self.log.debug('Output directory -o / --output confirmed: %s', self.output)
36 |         self.output_given = output_given
37 |
38 |         self.force = force
39 |         self.log.debug('Force output if exists: %s', self.force)
40 |
41 |     @abstractmethod
42 |     def run(self):
43 |         pass
44 |
45 |     def _load_file(self, additional_required_features: list | None = None):
46 |         """
47 |         Function to load the input TSV file into Main.
48 |         :return: pandas DataFrame
49 |         """
50 |         input_parser = InputParser()
51 |         input_file = input_parser.parse(input_file_path=self.infile)
52 |         post_load_processor = LoadFilePostProcessor(dataset=input_file)
53 |         input_file = post_load_processor.process()
54 |         validator = PostFileParseValidator()
55 |         # Individual calls to the validator for error readability
56 |         validator.validate_variants_present(input_file)
57 |         validator.validate_chrom_pos(input_file)
58 |         validator.validate_n_columns(input_file)
59 |         validator.validate_minimally_required_columns(
60 |             input_file,
61 |             additional_required_features=additional_required_features
62 |         )
63 |         return input_file
64 |
65 |     @staticmethod
66 |     def process(loaded_data: pd.DataFrame, process_features: list[str]) -> tuple[
67 |         pd.DataFrame, dict[str, list[str]]
68 |     ]:
69 |         # Returns might look funky, but Google pydoc does not support multiple return statements.
70 |         """
71 |         Function to call the ManualVEPProcessor over loaded_data using the supplied
72 |         process_features list.
73 |
74 |         Args:
75 |             loaded_data:
76 |                 The pandas dataframe over which the VEP features should be processed.
77 |
78 |             process_features:
79 |                 List containing either all input features, possibly containing VEP features (in
80 |                 the case of train) or already all input features that can be VEP processed (in
81 |                 case of predict).
82 |
83 |         Returns:
84 |             tuple:
85 |                 Tuple [0] containing: The output dataframe containing all VEP processed features
86 |                 according to process_features. Depending on the property "drop", the source
87 |                 feature from process_features is dropped from the columns of the output dataframe.
88 |                 Tuple [1] containing: The output dictionary containing the VEP feature (key)
89 |                 and the derivative features that originate from said VEP feature (value).
90 |                 The property "drop" is of no influence here.
91 |         """
92 |         processor = ManualVEPProcessor()
93 |         processed_data = processor.process(loaded_data, process_features)
94 |         processed_features = processor.get_feature_processes()
95 |         # No validation, since that is specific to predict.
96 |         # Also predict doesn't technically need processed_features, but within predict the first
97 |         # argument in the tuple can just be indexed.
98 |         # Still returning both is relevant, in case we want to validate the processed_features in
99 |         # the future for predict.
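        # Illustrative example (hypothetical feature names): for an input with a
        # 'SIFT' column, processed_data could gain derivative columns such as
        # 'SIFTcat' and 'SIFTval' (with 'SIFT' itself dropped when that processor's
        # "drop" property is True), and processed_features would then contain
        # {'SIFT': ['SIFTcat', 'SIFTval']}.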
100 |         return processed_data, processed_features
101 |
102 |     @staticmethod
103 |     def categorical_process(loaded_data: pd.DataFrame,
104 |                             processing_features: dict[str, list[str]] | None = None,
105 |                             train_features: list | None = None):
106 |         processor = CategoricalProcessor()
107 |         capice_data, processed_features = processor.process(
108 |             loaded_data,
109 |             processable_features=train_features,
110 |             predetermined_features=processing_features
111 |         )
112 |         return capice_data, processed_features
113 |
114 |     def _export(self, dataset: pd.DataFrame, output: os.PathLike):
115 |         """
116 |         Function to prepare the data to be exported
117 |         """
118 |         CapiceExporter(
119 |             file_path=output,
120 |             output_given=self.output_given,
121 |             force=self.force
122 |         ).export_capice_prediction(datafile=dataset)
--------------------------------------------------------------------------------
/src/molgenis/capice/core/logger.py:
--------------------------------------------------------------------------------
1 | """
2 | File: logger.py
3 | Created: 2019/10/11
4 | Last Changed:
5 | Author(s): M.Vochteloo and R. J. Sietsma
6 |
7 | Copyright 2019 M. Vochteloo and R. J. Sietsma
8 |
9 | Licensed under the Apache License, Version 2.0 (the "License");
10 | you may not use this file except in compliance with the License.
11 | You may obtain a copy of the License at
12 |
13 | https://www.apache.org/licenses/LICENSE-2.0
14 |
15 | Unless required by applicable law or agreed to in writing, software
16 | distributed under the License is distributed on an "AS IS" BASIS,
17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | See the License for the specific language governing permissions and
19 | limitations under the License.
20 | """
21 |
22 | import sys
23 | import logging
24 |
25 | from molgenis.capice.core.capice_manager import CapiceManager
26 | from molgenis.capice.utilities.custom_logging_filter import CustomLoggingFilter
27 |
28 |
29 | class Logger:
30 |     """
31 |     Singleton logger class developed by both:
32 |     - Martijn Vochteloo
33 |     - Robert Jarik Sietsma.
34 |     Facilitates the python logging library
35 |     """
36 |
37 |     class __Logger:
38 |         def __init__(self):
39 |             self.global_settings = CapiceManager()
40 |             self.stdout = False
41 |             self.stdout_filter = []
42 |             self.stderr_loglevel = 50
43 |             self.min_loglevel = 50
44 |             self.set_stderr_loglevel()
45 |             # self.logger is always None at this point; load_logger() initialises it.
46 |             self.logger = None
47 |             self.load_logger()
48 |
49 |         def set_stderr_loglevel(self):
50 |             """
51 |             Function to set the log level at which messages are printed or
52 |             logged. For more information, see:
53 |             https://docs.python.org/3/library/logging.html#logging-levels
54 |             Sets self.stderr_loglevel and self.min_loglevel in place.
55 |             """
56 |             if not self.global_settings.critical_logging_only:
57 |                 self.stderr_loglevel = 30
58 |                 self.min_loglevel = 30
59 |             if self.global_settings.loglevel and self.stderr_loglevel < 50:
60 |                 self.stdout = True
61 |                 self._set_stdout_filter()
62 |
63 |         def _set_stdout_filter(self):
64 |             """
65 |             Required because otherwise WARNING, ERROR and CRITICAL messages
66 |             would also be printed to sys.stdout.
67 |             """
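            # Maps the configured loglevel to the record levels allowed through
            # to stdout: 10 (verbose) passes INFO and DEBUG, 20 passes INFO only.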
67 | """ 68 | logging_info = [logging.INFO] 69 | logging_debug = logging_info + [logging.DEBUG] 70 | dict_of_levels = {10: logging_debug, 20: logging_info} 71 | self.stdout_filter = dict_of_levels[self.global_settings.loglevel] 72 | self.min_loglevel = self.global_settings.loglevel 73 | 74 | def load_logger(self): 75 | """ 76 | Function to set up the logger instance with the stdout and stderr 77 | StreamHandlers (stdout assuming verbose flag is called) and the 78 | formatter. 79 | """ 80 | # Making a root logger to make sure the level is set correctly. 81 | logger = logging.getLogger() 82 | # Now renaming it to CAPICE. 83 | logger.name = 'CAPICE' 84 | 85 | # Capture warnings 86 | logging.captureWarnings(True) 87 | 88 | formatter = logging.Formatter( 89 | "%(asctime)s " 90 | "%(levelname)8s: " 91 | "%(message)s", 92 | datefmt='%Y-%m-%d %H:%M:%S' 93 | ) 94 | 95 | # Setting the log level to debug, but with an applied filter 96 | logger.setLevel(self.min_loglevel) 97 | 98 | # sys.stdout (if critical logging only isn't called and one of 99 | # the verbose flags is called. 100 | if self.stdout: 101 | stdout_handler = logging.StreamHandler(sys.stdout) 102 | stdout_handler.setLevel(self.global_settings.loglevel) 103 | stdout_handler.setFormatter(formatter) 104 | # Filter out warning, error and critical messages. 105 | stdout_handler.addFilter(CustomLoggingFilter(self.stdout_filter)) 106 | logger.addHandler(stdout_handler) 107 | 108 | # sys.stderr 109 | stderr_handler = logging.StreamHandler(sys.stderr) 110 | stderr_handler.setLevel(self.stderr_loglevel) 111 | stderr_handler.setFormatter(formatter) 112 | logger.addHandler(stderr_handler) 113 | self.logger = logger 114 | 115 | @property 116 | def logger(self): 117 | """ 118 | Property to get the logger instance. 119 | 120 | :return: logging.Logger 121 | """ 122 | return self._logger 123 | 124 | @logger.setter 125 | def logger(self, value): 126 | """ 127 | Setter for the logger instance. 128 | 129 | :param value: 130 | :return: 131 | """ 132 | self._logger = value 133 | 134 | @property 135 | def logger(self): 136 | """ 137 | Property to get the logger instance. 
134 |     instance = None
135 |
136 |     def __new__(cls):
137 |         """
138 |         Class method to set Logger instance
139 |         :return: instance
140 |         """
141 |         if not Logger.instance:
142 |             Logger.instance = Logger.__Logger()
143 |         return Logger.instance
144 |
145 |     def __init__(self):
146 |         """
147 |         __init__ method to set instance to Logger.__Logger()
148 |         """
149 |         if not Logger.instance:
150 |             Logger.instance = Logger.__Logger()
151 |
152 |     def __getattr__(self, name):
153 |         """
154 |         Method to return the named attribute of the singleton instance.
155 |         :param name: str
156 |         :return: the attribute of Logger.instance
157 |         """
158 |         return getattr(self.instance, name)
--------------------------------------------------------------------------------
/tests/capice/vep/test_consequence.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from molgenis.capice.vep.consequence import Consequence
7 |
8 |
9 | class TestConsequence(unittest.TestCase):
10 |     def setUp(self) -> None:
11 |         self.data = pd.DataFrame(
12 |             {
13 |                 'Consequence': ['transcript_ablation&stop_lost', 'start_lost', np.nan]
14 |             }
15 |         )
16 |         self.expected_data = pd.DataFrame(
17 |             {
18 |                 'is_regulatory_region_variant': {0: 0, 1: 0, 2: 0},
19 |                 'is_regulatory_region_ablation': {0: 0, 1: 0, 2: 0},
20 |                 'is_regulatory_region_amplification': {0: 0, 1: 0, 2: 0},
21 |                 'is_missense_variant': {0: 0, 1: 0, 2: 0},
22 |                 'is_intron_variant': {0: 0, 1: 0, 2: 0},
23 |                 'is_upstream_gene_variant': {0: 0, 1: 0, 2: 0},
24 |                 'is_downstream_gene_variant': {0: 0, 1: 0, 2: 0},
25 |                 'is_synonymous_variant': {0: 0, 1: 0, 2: 0},
26 |                 'is_TF_binding_site_variant': {0: 0, 1: 0, 2: 0},
27 |                 'is_splice_donor_variant': {0: 0, 1: 0, 2: 0},
28 |                 'is_coding_sequence_variant': {0: 0, 1: 0, 2: 0},
29 |                 'is_splice_region_variant': {0: 0, 1: 0, 2: 0},
30 |                 'is_stop_gained': {0: 0, 1: 0, 2: 0},
31 |                 'is_splice_acceptor_variant': {0: 0, 1: 0, 2: 0},
32 |                 'is_frameshift_variant': {0: 0, 1: 0, 2: 0},
33 |                 'is_3_prime_UTR_variant': {0: 0, 1: 0, 2: 0},
34 |                 'is_inframe_insertion': {0: 0, 1: 0, 2: 0},
35 |                 'is_inframe_deletion': {0: 0, 1: 0, 2: 0},
36 |                 'is_5_prime_UTR_variant': {0: 0, 1: 0, 2: 0},
37 |                 'is_start_lost': {0: 0, 1: 1, 2: 0},
38 |                 'is_non_coding_transcript_exon_variant': {0: 0, 1: 0, 2: 0},
39 |                 'is_non_coding_transcript_variant': {0: 0, 1: 0, 2: 0},
40 |                 'is_TFBS_ablation': {0: 0, 1: 0, 2: 0},
41 |                 'is_TFBS_amplification': {0: 0, 1: 0, 2: 0},
42 |                 'is_protein_altering_variant': {0: 0, 1: 0, 2: 0},
43 |                 'is_stop_lost': {0: 1, 1: 0, 2: 0},
44 |                 'is_stop_retained_variant': {0: 0, 1: 0, 2: 0},
45 |                 'is_transcript_ablation': {0: 1, 1: 0, 2: 0},
46 |                 'is_intergenic_variant': {0: 0, 1: 0, 2: 0},
47 |                 'is_start_retained_variant': {0: 0, 1: 0, 2: 0},
48 |                 'is_transcript_amplification': {0: 0, 1: 0, 2: 0},
49 |                 'is_incomplete_terminal_codon_variant': {0: 0, 1: 0, 2: 0},
50 |                 'is_mature_miRNA_variant': {0: 0, 1: 0, 2: 0},
51 |                 'is_NMD_transcript_variant': {0: 0, 1: 0, 2: 0},
52 |                 'is_feature_elongation': {0: 0, 1: 0, 2: 0},
53 |                 'is_feature_truncation': {0: 0, 1: 0, 2: 0},
54 |                 'is_splice_donor_5th_base_variant': {0: 0, 1: 0, 2: 0},
55 |                 'is_splice_donor_region_variant': {0: 0, 1: 0, 2: 0},
56 |                 'is_splice_polypyrimidine_tract_variant': {0: 0, 1: 0, 2: 0}
57 |             }
58 |
59 |         )
60 |
61 |     def test_consequence(self):
62 |         data_copy = self.data.copy(deep=True)
63 |         observed = Consequence().process(self.data)
64 |         # If the numpy.array dtype is not given, the type will be determined as the
65 |         # minimum type required to hold the objects in the sequence; this minimal
66 |         # type is system dependent.
67 |         expected = pd.concat(
68 |             [
69 |                 data_copy,
70 |                 self.expected_data
71 |             ], axis=1
72 |         )
73 |         pd.testing.assert_frame_equal(observed.sort_index(axis=1), expected.sort_index(
74 |             axis=1), check_dtype=False)
75 |
76 |     def test_non_coding(self):
77 |         data = pd.DataFrame({
78 |             'variants': ['variant_1', 'variant_2', 'variant_3'],
79 |             'Consequence': [np.nan, np.nan, np.nan]
80 |         })
81 |         columns = data.columns
82 |         expected_altered = self.expected_data.copy(deep=True)
83 |         # Easier to locate the ones in self.expected_data than to hardcode a new one
84 |         expected_altered.loc[1, 'is_start_lost'] = 0
85 |         expected_altered.loc[0, 'is_stop_lost'] = 0
86 |         expected_altered.loc[0, 'is_transcript_ablation'] = 0
87 |         expected = pd.concat([data, expected_altered], axis=1)
88 |         observed = Consequence().process(data)
89 |         self.assertFalse(observed[observed.columns.difference(columns)].isnull().values.any())
90 |         pd.testing.assert_frame_equal(
91 |             observed.sort_index(axis=1),
92 |             expected.sort_index(axis=1)
93 |         )
94 |
95 |     def test_consequence_warning(self):
96 |         """
97 |         Tests that a warning is raised when a consequence is encountered that is
98 |         not supported by the processor.
99 |         """
100 |         dataframe = pd.DataFrame(
101 |             {
102 |                 'Consequence': ['transcript_ablation&stop_lost', 'start_lost', 'fake_consequence']
103 |             }
104 |         )
105 |         dataframe_copy = dataframe.copy(deep=True)
106 |         with self.assertLogs() as captured:
107 |             observed = Consequence().process(dataframe)
108 |         expected = pd.concat(
109 |             [
110 |                 dataframe_copy,
111 |                 self.expected_data
112 |             ], axis=1
113 |         )
114 |
115 |         pd.testing.assert_frame_equal(observed.sort_index(axis=1), expected.sort_index(
116 |             axis=1), check_dtype=False)
117 |         self.assertEqual('Supplied VEP consequence: fake_consequence is not supported in the '
118 |                          'Consequence processor!', captured.records[0].getMessage())
119 |
120 |
121 | if __name__ == '__main__':
122 |     unittest.main()
123 |
--------------------------------------------------------------------------------
/src/molgenis/capice/cli/args_handler_parent.py:
--------------------------------------------------------------------------------
1 | import os
2 | from abc import ABCMeta, abstractmethod
3 |
4 | import xgboost as xgb
5 |
6 | from molgenis.capice import __version__
7 | from molgenis.capice.utilities.input_processor import InputProcessor
8 | from molgenis.capice.validators.input_validator import InputValidator
9 | from molgenis.capice.validators.version_validator import VersionValidator
10 |
11 |
12 | class ArgsHandlerParent(metaclass=ABCMeta):
13 |     """
14 |     Parent class of all module specific argument parsers / handlers.
15 |     """
16 |
17 |     def __init__(self, parser):
18 |         self.parser = parser
19 |         self.input_validator = InputValidator()
20 |         self.force = False
21 |
22 |     @property
23 |     @abstractmethod
24 |     def _extension(self) -> tuple[str, ...]:
25 |         """
26 |         Property to define what extension(s) are allowed for an input file for
27 |         each module parser.
28 |         """
29 |         pass
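    # Illustrative override (hypothetical extensions): a concrete subclass could
    # implement the abstract property along these lines:
    #   @property
    #   def _extension(self) -> tuple[str, ...]:
    #       return '.tsv', '.tsv.gz'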
28 | """ 29 | pass 30 | 31 | def _extension_str(self) -> str: 32 | """ 33 | String representation of `_extension()` 34 | """ 35 | return self._join_extensions(self._extension) 36 | 37 | @property 38 | @abstractmethod 39 | def _required_output_extensions(self) -> tuple[str]: 40 | """ 41 | Property to define what the output file extensions are allowed for each 42 | module parser. 43 | """ 44 | pass 45 | 46 | def _required_output_extensions_str(self) -> str: 47 | """ 48 | String representation of `_required_output_extensions()` 49 | """ 50 | return self._join_extensions(self._required_output_extensions) 51 | 52 | @property 53 | @abstractmethod 54 | def _empty_output_extension(self) -> str: 55 | """ 56 | Property to define what extension an output file should get if no 57 | output file extension was given. 58 | 59 | Preferably, use: self._required_output_extensions[] 60 | """ 61 | pass 62 | 63 | @abstractmethod 64 | def create(self): 65 | """ 66 | Method to define what parser options should be available for the module. 67 | Use self.parser.add_argument() to add an argument to the subparser. 68 | """ 69 | pass 70 | 71 | def handle(self): 72 | """ 73 | Superclass handler to set the arguments set in create(). Also calls the 74 | parser to proceed with parsing the module specific arguments, validate 75 | them and run the CAPICE code. 76 | """ 77 | self.parser.set_defaults(func=self._handle_args) 78 | 79 | def _handle_args(self, args): 80 | """ 81 | Superclass handle args to parse and validate the input and output 82 | arguments. Also parses the output filename. 83 | """ 84 | version_validator = VersionValidator() 85 | try: 86 | version_validator.validate_capice_version(__version__) 87 | except ValueError as cm: 88 | self.parser.error(str(cm)) 89 | input_path = self._retrieve_argument_from_list(args.input, '-i/--input') 90 | try: 91 | self.input_validator.validate_input_path(input_path, extension=self._extension) 92 | except FileNotFoundError as cm: 93 | self.parser.error(str(cm)) 94 | output_path = self._retrieve_argument_from_list(args.output, '-o/--output') 95 | self.force = args.force 96 | try: 97 | processor = InputProcessor( 98 | input_path=input_path, 99 | output_path=output_path, 100 | force=self.force, 101 | default_extension=self._empty_output_extension 102 | ) 103 | except FileExistsError as cm: 104 | self.parser.error(str(cm)) 105 | output_filename = processor.get_output_filename() 106 | output_filename = self._handle_output_filename(output_filename) 107 | output_given = processor.get_output_given() 108 | output_path = processor.get_output_directory() 109 | try: 110 | self.input_validator.validate_output_path(output_path) 111 | except OSError as cm: 112 | self.parser.error(str(cm)) 113 | self._handle_module_specific_args(input_path, output_path, output_filename, output_given, 114 | args) 115 | 116 | def _retrieve_argument_from_list(self, 117 | arg: list | None, 118 | arg_name: str, 119 | has_default: bool = False) -> None | str: 120 | try: 121 | return self._single_argument_retriever(arg, arg_name, has_default) 122 | except IOError as e: 123 | self.parser.error(e) 124 | return None 125 | 126 | @staticmethod 127 | def _single_argument_retriever(arg: list | None, 128 | arg_name: str, 129 | has_default: bool) -> None | str: 130 | """ 131 | Retrieves the user-argument from a list. 
132 |         the argument once (combined with `action='append'` for argument parsing), resulting in a
133 |         list of length:
134 |         - 0 (no arguments given & no default value)
135 |         - 1 (1 argument given or default_value is present)
136 |         - 2 (1 argument given and default value present)
137 |
138 |         If `has_default`==True, the first list item is assumed to be the default one (set through
139 |         `default=[]`), with any extra items in the list being user input.
140 |
141 |         Args:
142 |             arg: List of arguments (or None if no arguments were generated and no defaults were
143 |                  present either)
144 |             arg_name: The name of the user-argument to which `arg` belongs
145 |             has_default: whether a default arg is present in the given arg list
146 |         Returns:
147 |             None (if args is None) or a single item from the given list.
148 |         Raises:
149 |             ValueError: If empty list is given (=programming error)
150 |             IOError: If list contains more items than expected (>2 if has_default, else >1).
151 |
152 |         """
153 |         # None is simply returned.
154 |         if arg is None:
155 |             return arg
156 |
157 |         arg_len = len(arg)
158 |
159 |         # Empty list indicates programming bug.
160 |         if arg_len == 0:
161 |             raise ValueError('Empty list is given. Should be None or list with elements.')
162 |
163 |         # Retrieve value to be used for CLI argument.
164 |         if arg_len > 2 or (arg_len > 1 and not has_default):
165 |             raise IOError(f'Argument {arg_name} is only allowed once.')
166 |         else:
167 |             return arg[arg_len - 1]
168 |
169 |     @abstractmethod
170 |     def _handle_module_specific_args(self, input_path, output_path, output_filename, output_given,
171 |                                      args):
172 |         """
173 |         Method to be filled in by the module specific parsers. Should perform
174 |         additional validation over args specific to the parser. Should then call
175 |         the module to continue execution.
176 |         """
177 |         pass
178 |
179 |     def _handle_output_filename(self, output_filename: str):
180 |         """
181 |         Method to validate that an output filename complies with the
182 |         required output extension.
183 |         """
184 |         if '.' in output_filename and not output_filename.endswith(
185 |                 self._required_output_extensions):
186 |             self.parser.error(
187 |                 f'Output file extension is incorrect. Expected output extension: '
188 |                 f'{self._required_output_extensions}'
189 |             )
190 |         else:
191 |             return output_filename
192 |
193 |     @staticmethod
194 |     def load_model(model_path: os.PathLike) -> xgb.XGBClassifier:
195 |         model = xgb.XGBClassifier()
196 |         model.load_model(model_path)
197 |         return model
198 |
199 |     @staticmethod
200 |     def _join_extensions(extensions: tuple[str, ...]) -> str:
201 |         return ', '.join(extensions)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. 
If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | --------------------------------------------------------------------------------