├── .gitattributes ├── .github └── workflows │ └── main.yml ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── cogstack.py ├── credentials.py ├── data ├── ReadMe.md ├── cogstack_search_results │ └── ReadMe.md ├── medcattrainer_export │ └── ReadMe.md ├── media │ ├── cogstack_logo.png │ ├── foresight_logo_unofficial.png │ ├── medcat_logo.png │ ├── medcat_pipeline_summary.png │ └── nhs_logo.png ├── snomed │ ├── ReadMe.md │ ├── preprocessing_snomed_ct.ipynb │ └── umls_enricher.py └── umls │ ├── NLM_umls_download.py │ ├── ReadMe.md │ └── working_with_umls.ipynb ├── medcat ├── 1_create_model │ ├── create_cdb │ │ ├── create_cdb.py │ │ └── create_umls_cdb.py │ ├── create_modelpack │ │ └── create_modelpack.py │ └── create_vocab │ │ └── create_vocab.py ├── 2_train_model │ ├── 1_unsupervised_training │ │ ├── splitter.py │ │ ├── unsupervised training.ipynb │ │ ├── unsupervised_medcattraining.py │ │ └── unsupervised_training.py │ ├── 2_supervised_training │ │ ├── meta_annotation_training.ipynb │ │ ├── meta_annotation_training_advanced.ipynb │ │ └── supervised training.ipynb │ └── ReadMe.md ├── 3_run_model │ ├── ReadMe.md │ ├── run_model.ipynb │ └── run_model.py ├── ReadMe.md ├── compare_models │ ├── cmp_utils.py │ ├── comp_nbhelper.py │ ├── compare.py │ ├── compare_annotations.py │ ├── compare_cdb.py │ ├── data │ │ ├── demo-physio-mobility │ │ │ ├── cui_filter.csv │ │ │ └── intechopen_2cols_3.csv │ │ └── some_synthetic_data.csv │ ├── model_comparison.ipynb │ ├── output.py │ ├── tests │ │ ├── __init__.py │ │ ├── resources │ │ │ ├── docs │ │ │ │ └── not_real.csv │ │ │ ├── mct_export │ │ │ │ ├── medcat_trainer_expoert2.json │ │ │ │ └── medcat_trainer_export.json │ │ │ └── model_pack │ │ │ │ ├── cdb.dat │ │ │ │ └── vocab.dat │ │ ├── test_compare.py │ │ ├── test_compare_annotations.py │ │ ├── test_compare_cdb.py │ │ └── test_output.py │ └── validation.py └── evaluate_mct_export │ ├── __init__.py │ ├── mct_analysis.py │ └── mct_export_summary.ipynb ├── models ├── ReadMe.md ├── cdb │ └── .keep ├── modelpack │ └── ReadMe.md └── vocab │ └── .keep ├── mypy.ini ├── projects ├── ReadMe.md └── demo_project_stucture │ └── ReadMe.md ├── requirements-dev.txt ├── requirements.txt ├── search ├── .gitattributes ├── ReadMe.md └── search_template.ipynb ├── tests ├── __init__.py └── medcat │ ├── 1_create_model │ ├── __init__.py │ ├── create_cdb │ │ ├── __init__.py │ │ └── test_create_cdb.py │ ├── create_modelpack │ │ ├── __init__.py │ │ └── test_create_modelpack.py │ └── create_vocab │ │ ├── __init__.py │ │ └── test_create_vocab.py │ ├── 2_train_model │ ├── 1_unsupervised_training │ │ ├── __init__.py │ │ └── test_splitter.py │ └── __init__.py │ ├── __init__.py │ ├── evaluate_mct_export │ ├── __init__.py │ ├── offline_test_mct_analysis.py │ └── test_mct_analysis.py │ └── resources │ ├── MCT_export_example.json │ ├── cdb.dat │ ├── example_cdb_input_snomed.csv │ ├── example_cdb_input_umls.csv │ ├── example_file_to_split.csv │ └── vocab.dat ├── update.py └── utils ├── clinical_note_splitter.py └── ethnicity_map.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb filter=strip-notebook-output 2 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | native-py: 11 | 12 | runs-on: ubuntu-24.04 13 | strategy: 
14 | matrix: 15 | python-version: [ '3.9', '3.10', '3.11', '3.12' ] 16 | max-parallel: 4 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r requirements.txt 28 | pip install -r requirements-dev.txt 29 | - name: Typing 30 | # run mypy on all tracked non-test python modules 31 | # and use explicit package base since the project 32 | # is not set up as a python package 33 | run: | 34 | python -m mypy `git ls-tree --full-tree --name-only -r HEAD | grep ".py$" | grep -v "tests/"` --explicit-package-bases --follow-imports=normal 35 | - name: Test 36 | run: | 37 | python -m unittest discover 38 | python -m unittest discover -s medcat/compare_models 39 | # TODO - in the future, we might want to add automated tests for notebooks as well 40 | # though it's not really possible right now since the notebooks are designed 41 | # in a way that assumes interaction (i.e specifying model pack names) 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore all credential files 2 | credentials.py 3 | 4 | # Ignore ipynotebook checkpoints 5 | *.ipynb_checkpoints 6 | 7 | # data folders 8 | data/snomed/ 9 | data/medcattrainer_export/ 10 | data/cogstack_search_results/ 11 | 12 | # Default environments 13 | venv 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Cogstack_logo 3 | Nhs_logo 4 | Medcat_logo 5 |

6 | 7 | 8 | # Working with CogStack 9 | This repository contains all tools relevant to interacting with an NHS deployment of CogStack. 10 | 11 | It contains: 12 | 1) Easy-to-follow templates and instructions to interact with and search CogStack. 13 | 2) Recommended workflows to create, train, and run MedCAT models. 14 | 15 | For further discussions or questions, please join our official [CogStack/MedCAT forum!](https://discourse.cogstack.org/) 16 | 17 | __NOTE__ this section is currently in development. Let me know if there is anything 18 | else to add! 19 | 20 | 21 | ## Setup 22 | 23 | Users can follow these steps to quickly set up and deploy this repository on their machine. 24 | 25 | Any code to enter in these instructions will be represented as `code to enter`. 26 | 27 | Please replace anything within `` with your own specific details. 28 | 29 | ### Step 1: Clone this repository locally 30 | 31 | 1. Enter the directory where you would like to store these files: `cd path/to/where/you/want/this/repository` 32 | 33 | 2. Clone the online repository: `git clone https://github.com/CogStack/working_with_cogstack.git` 34 | 35 | For further instructions and self-help with git and git clone, please visit this [link.](https://github.com/git-guides/git-clone) 36 | 37 | If you choose to use GitHub Desktop rather than the terminal, please refer to the [official GitHub Desktop guides.](https://docs.github.com/en/desktop) 38 | 39 | 3. Optional: To update to the latest release of this repository: `git pull` 40 | 41 | ### Step 2: Creating a virtual environment and required packages 42 | (Requires Python 3.7+) 43 | 44 | __Windows__ 45 | 1. Create a new virtual env: `python3 -m venv venv` 46 | 2. Load the virtual environment: `.\venv\Scripts\activate` 47 | 3. Install relevant packages and libraries: `pip install -r requirements.txt` 48 | 49 | 50 | __Linux/macOS__ 51 | 1. Create a new virtual env: `python3 -m venv venv` 52 | 2. Load the virtual environment: `source venv/bin/activate` 53 | 3. Install relevant packages and libraries: `pip install -r requirements.txt` 54 | 55 | *Optional: If no Jupyter instance is installed.* 56 | 1. In the main folder of this repository, activate your virtual environment using the (Step 2) command for your respective OS. 57 | 2. Start JupyterLab: `jupyter-lab` 58 | 59 | 60 | ### Step 3: Enter credentials and login details 61 | In the main folder of this repository you can populate the [credentials.py](credentials.py) file with your own CogStack hostnames, username and password. 62 | 63 | For an automatic authentication experience, the credentials.py contents can be prepopulated with your CogStack instance credentials: 64 | ``` 65 | hosts = [] # This is a list of your cogstack elasticsearch instances. 66 | 67 | # These are your login details (either via http_auth or API) 68 | username = None 69 | password = None 70 | ``` 71 | For shared machines it is recommended that you leave the password blank. This will trigger a prompt when accessing a CogStack instance. 72 | 73 | If you have any questions or issues obtaining these details please contact your local CogStack administrator. 74 | 75 | ## Contents 76 | 77 | ## [How to search using CogStack](search) 78 | This directory contains the basic search templates. 79 | 80 | For further information on CogStack please visit their [GitHub](https://github.com/CogStack) 81 | or [wiki page](https://cogstack.org/). 82 | 83 | ## [How to create a watcher](watcher) 84 | This directory contains the basic watcher job templates.
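
As a quick illustration of the search workflow described under *How to search using CogStack* above, the snippet below shows how the `CogStack` helper class in [cogstack.py](cogstack.py) can be used once `credentials.py` has been filled in. This is a minimal sketch: the index name (`my_index`) and the free-text field (`body_analysed`) are placeholders and should be replaced with the names used in your own CogStack deployment.

```python
from credentials import hosts, username, password
from cogstack import CogStack

# Connect using the details from credentials.py (a prompt appears if they are left blank).
cs = CogStack(hosts, username=username, password=password, api=False)

# Example: pull every document whose free-text field mentions "diabetes" into a DataFrame.
query = {"query": {"match": {"body_analysed": "diabetes"}}}
df = cs.cogstack2df(query=query, index="my_index")
print(df.shape)
```

The notebook in the [search](search) folder (`search_template.ipynb`) covers this workflow in notebook form.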
85 | 86 | ## [MedCAT](medcat) 87 | An overview of this process is shown below. 88 | 89 | 90 | 91 | 92 | Further information about MedCAT can be found from their [github](https://github.com/CogStack/MedCAT) 93 | or via their official documentation [here](https://medcat.readthedocs.io/en/latest/). 94 | 95 | General MedCAT tutorials can be found [here](https://github.com/CogStack/MedCATtutorials). 96 | 97 | 98 | ### Demo 99 | A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III to annotate 100 | SNOMED-CT concepts. __Note:__ No supervised training has been provided to this model and therefore should only be used for demonstration 101 | purposes only. 102 | 103 | ### MedCAT Citation 104 | ``` 105 | @ARTICLE{Kraljevic2021-ln, 106 | title="Multi-domain clinical natural language processing with {MedCAT}: The Medical Concept Annotation Toolkit", 107 | author="Kraljevic, Zeljko and Searle, Thomas and Shek, Anthony and Roguski, Lukasz and Noor, Kawsar and Bean, Daniel and Mascio, Aurelie and Zhu, Leilei and Folarin, Amos A and Roberts, Angus and Bendayan, Rebecca and Richardson, Mark P and Stewart, Robert and Shah, Anoop D and Wong, Wai Keong and Ibrahim, Zina and Teo, James T and Dobson, Richard J B", 108 | journal="Artif. Intell. Med.", 109 | volume=117, 110 | pages="102083", 111 | month=jul, 112 | year=2021, 113 | issn="0933-3657", 114 | doi="10.1016/j.artmed.2021.102083" 115 | } 116 | ``` 117 | 118 | 119 | # Foresight (Coming soon...) 120 | Demo is available [here](https://foresight.sites.er.kcl.ac.uk/) 121 | 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/__init__.py -------------------------------------------------------------------------------- /cogstack.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | from typing import Dict, List, Any, Optional, Iterable, Tuple 3 | import elasticsearch 4 | import elasticsearch.helpers 5 | import pandas as pd 6 | from tqdm.notebook import tqdm 7 | import eland as ed 8 | 9 | import warnings 10 | warnings.filterwarnings("ignore") 11 | 12 | from credentials import * 13 | 14 | 15 | class CogStack(object): 16 | """ 17 | A class for interacting with Elasticsearch. 18 | 19 | Args: 20 | hosts (List[str]): A list of Elasticsearch host URLs. 21 | username (str, optional): The username to use when connecting to Elasticsearch. If not provided, the user will be prompted to enter a username. 22 | password (str, optional): The password to use when connecting to Elasticsearch. If not provided, the user will be prompted to enter a password. 23 | api (bool, optional): A boolean value indicating whether to use API keys or basic authentication to connect to Elasticsearch. Defaults to False (i.e., use basic authentication). Elasticsearch 7.17. 24 | api_key (str, optional): The API key to use when connecting to Elasticsearch. 25 | When provided along with `api=True`, this takes precedence over username/password. Only available when using Elasticsearch 8.17. 
26 | """ 27 | def __init__(self, hosts: List, username: Optional[str] = None, password: Optional[str] = None, 28 | api: bool = False, timeout: Optional[int]=60, api_key: Optional[str] = None): 29 | 30 | if api_key and api: 31 | self.elastic = elasticsearch.Elasticsearch(hosts=hosts, 32 | api_key=api_key, 33 | verify_certs=False, 34 | timeout=timeout) 35 | 36 | 37 | elif api: 38 | api_username, api_password = self._check_auth_details(username, password) 39 | self.elastic = elasticsearch.Elasticsearch(hosts=hosts, 40 | api_key=(api_username, api_password), 41 | verify_certs=False, 42 | timeout=timeout) 43 | 44 | else: 45 | username, password = self._check_auth_details(username, password) 46 | self.elastic = elasticsearch.Elasticsearch(hosts=hosts, 47 | basic_auth=(username, password), 48 | verify_certs=False, 49 | timeout=timeout) 50 | 51 | 52 | def _check_auth_details(self, username=None, password=None) -> Tuple[str, str]: 53 | """ 54 | Prompt the user for a username and password if the values are not provided as function arguments. 55 | 56 | Args: 57 | api_username (str, optional): The API username. If not provided, the user will be prompted to enter a username. 58 | api_password (str, optional): The API password. If not provided, the user will be prompted to enter a password. 59 | 60 | Returns: 61 | Tuple[str, str]: A tuple containing the API username and password. 62 | """ 63 | if username is None: 64 | username = input("Username: ") 65 | if password is None: 66 | password = getpass.getpass("Password: ") 67 | return username, password 68 | 69 | def get_docs_generator(self, index: List, query: Dict, es_gen_size: int=800, request_timeout: Optional[int] = 300): 70 | """ 71 | Retrieve a generator object that can be used to iterate through documents in an Elasticsearch index. 72 | 73 | Args: 74 | index (List[str]): A list of Elasticsearch index names to search. 75 | query (Dict): A dictionary containing the search query parameters. 76 | es_gen_size (int, optional): The number of documents to retrieve per batch. Defaults to 800. 77 | request_timeout (int, optional): The time in seconds to wait for a response from Elasticsearch before timing out. Defaults to 300. 78 | 79 | Returns: 80 | generator: A generator object that can be used to iterate through the documents in the specified Elasticsearch index. 81 | """ 82 | docs_generator = elasticsearch.helpers.scan(self.elastic, 83 | query=query, 84 | index=index, 85 | size=es_gen_size, 86 | request_timeout=request_timeout) 87 | return docs_generator 88 | 89 | def cogstack2df(self, query: Dict, index: str, column_headers=None, es_gen_size: int=800, request_timeout: int=300, 90 | show_progress: bool = True): 91 | """ 92 | Retrieve documents from an Elasticsearch index and convert them to a Pandas DataFrame. 93 | 94 | Args: 95 | query (Dict): A dictionary containing the search query parameters. 96 | index (str): The name of the Elasticsearch index to search. 97 | column_headers (List[str], optional): A list of column headers to use for the DataFrame. If not provided, the DataFrame will have default column names. 98 | es_gen_size (int, optional): The number of documents to retrieve per batch. Defaults to 800. 99 | request_timeout (int, optional): The time in seconds to wait for a response from Elasticsearch before timing out. Defaults to 300. 100 | show_progress (bool, optional): Whether to show the progress in console. Defaults to true. 101 | 102 | Returns: 103 | pandas.DataFrame: A DataFrame containing the retrieved documents. 
104 | """ 105 | docs_generator = elasticsearch.helpers.scan(self.elastic, 106 | query=query, 107 | index=index, 108 | size=es_gen_size, 109 | request_timeout=request_timeout) 110 | temp_results = [] 111 | results = self.elastic.count(index=index, query=query['query'], request_timeout=300) # type: ignore 112 | for hit in tqdm(docs_generator, total=results['count'], desc="CogStack retrieved...", disable=not show_progress): 113 | row = dict() 114 | row['_index'] = hit['_index'] 115 | row['_id'] = hit['_id'] 116 | row['_score'] = hit['_score'] 117 | row.update(hit['_source']) 118 | temp_results.append(row) 119 | if column_headers: 120 | df_headers = ['_index', '_id', '_score'] 121 | df_headers.extend(column_headers) 122 | df = pd.DataFrame(temp_results, columns=df_headers) 123 | else: 124 | df = pd.DataFrame(temp_results) 125 | return df 126 | 127 | def DataFrame(self, index: str, columns: Optional[List[str]] = None): 128 | """ 129 | Fast method to return a pandas dataframe from a CogStack search. 130 | 131 | Args: 132 | index (str): A list of indices to search. 133 | columns (List[str], optional): A list of column names to include in the DataFrame. If not provided, all columns will be included. 134 | 135 | Returns: 136 | DataFrame: A pd.DataFrame like object containing the retrieved documents. 137 | """ 138 | return ed.DataFrame(es_client=self.elastic, es_index_pattern=index, columns=columns) 139 | 140 | 141 | def list_chunker(user_list: List[Any], n: int) -> List[List[Any]]: 142 | """ 143 | Divide a list into sublists of a specified size. 144 | 145 | Args: 146 | user_list (List[Any]): The list to be divided. 147 | n (int): The size of the sublists. 148 | 149 | Returns: 150 | List[List[Any]]: A list of sublists containing the elements of the input list. 151 | """ 152 | n=max(1, n) 153 | return [user_list[i:i+n] for i in range(0, len(user_list), n)] 154 | 155 | 156 | def _no_progress_bar(iterable: Iterable, **kwargs): 157 | return iterable 158 | 159 | -------------------------------------------------------------------------------- /credentials.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | # CogStack login details 3 | ## Any questions on what these details are please contact your local CogStack administrator. 4 | 5 | hosts: List[str] = [] # This is a list of your CogStack ElasticSearch instances. 6 | 7 | ## These are your login details (either via http_auth or API) Should be in str format 8 | username = None 9 | password = None 10 | 11 | api_key = None # Encoded api key issued by your cogstack administrator. 12 | 13 | # NLM authentication 14 | # The UMLS REST API requires a UMLS account for the authentication described below. 15 | # If you do not have a UMLS account, you may apply for a license on the UMLS Terminology Services (UTS) website. 16 | # https://documentation.uts.nlm.nih.gov/rest/authentication.html 17 | 18 | # UMLS api key auth 19 | umls_apikey = None 20 | 21 | # SNOMED authentication from NHS TRUD. International releases will require different API access creds. 
22 | # api key auth from NHS TRUD 23 | # For more information please see: https://isd.digital.nhs.uk/trud/users/guest/filters/0/api 24 | snomed_apikey = None 25 | -------------------------------------------------------------------------------- /data/ReadMe.md: -------------------------------------------------------------------------------- 1 | # Storage location for all data/models 2 | 3 | To keep the repository clean, all data and models should be stored here in their appropriate folder. 4 | 5 | This directory has been organised to assist in the workflow of Working with [CogStack](https://github.com/CogStack/CogStack-NiFi) 6 | and creating/evaluating [MedCAT models](https://github.com/CogStack/MedCAT). 7 | 8 | ## Retrieval and Storage of Data 9 | All raw data relating to a CogStack request should be stored here. 10 | 11 | 12 | ## SNOMED 13 | 14 | Placeholder for all SNOMED related content and downloads [here](/data/snomed). 15 | For other terminologies (UMLS/RxNorm etc.) please create a separate folder and store them within this directory. 16 | 17 | ## MedCAT Models 18 | 19 | All model components and model packs should be stored [here](/data/medcat_models) 20 | 21 | 22 | ## MedCATtrainer 23 | 24 | All MedCATtrainer JSON exports should be stored [here](data/medcattrainer_export). 25 | Scripts to produce export summaries of all annotations and work done can be found [here](TODO: ). 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /data/cogstack_search_results/ReadMe.md: -------------------------------------------------------------------------------- 1 | # CogStack search results/requests 2 | 3 | Add subdirectories to more efficiently manage and collate search results relating to a single CogStack search request. 4 | 5 | 6 | ## Standardise Workflows (Optional) 7 | 8 | The following are guidelines/recommendations to standardise workflow: 9 | 10 | - Good practice is to name files with the following structure *YYYYMMDD_filename* 11 |
12 | 13 | A recommended format for the directory structure to efficiently manage each request is as follows: 14 | Ideally the *project_name* should correspond to your CogStack request ID. 15 | 16 | 17 | ``` 18 | project_name/ 19 | --- input/ # raw data files 20 | --- ref/ # reference files 21 | --- result/ # final results 22 | --- src/ # functions to source 23 | --- work/ # intermediate data 24 | --- main.py 25 | --- analysis.py 26 | 27 | ``` 28 | 29 | __[input/]__: Contains the original, or raw, data files. Contents in this folder should be treated as read-only. 30 | 31 | __[ref/]__: Contains reference files, i.e. from research. 32 | 33 | __[result/]__: Contains the final results and explanatory markdown files. 34 | 35 | __[src/]__: Contains functions that are sourced from the main console code. 36 | 37 | __[work/]__: The working directory, should be used to store temporary data files. 38 | With the final scripts (main.py and other analysis scripts...) held directly in the project folder outside of the sub-folders. 39 | Any intermediate data that one may want to reference later should be stored in the work sub-folder. 40 | -------------------------------------------------------------------------------- /data/medcattrainer_export/ReadMe.md: -------------------------------------------------------------------------------- 1 | # Placeholder for MedCATtrainer exports 2 | 3 | All materials exported from medcattrainer should be stored here. 4 | 5 | 6 | MedCATtrainer exports should be placed [here](data/medcattrainer_exports) -------------------------------------------------------------------------------- /data/media/cogstack_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/data/media/cogstack_logo.png -------------------------------------------------------------------------------- /data/media/foresight_logo_unofficial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/data/media/foresight_logo_unofficial.png -------------------------------------------------------------------------------- /data/media/medcat_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/data/media/medcat_logo.png -------------------------------------------------------------------------------- /data/media/medcat_pipeline_summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/data/media/medcat_pipeline_summary.png -------------------------------------------------------------------------------- /data/media/nhs_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/data/media/nhs_logo.png -------------------------------------------------------------------------------- /data/snomed/ReadMe.md: -------------------------------------------------------------------------------- 1 | # SNOMED 2 | 3 | Place holder for all SNOMED related content and downloads here 4 | 5 | -------- 6 | 7 | ## MedCAT proprocessing of SNOMED 8 | 9 | 
All scripts to preprocess SNOMED CT release files can be found in `medcat.utils.preprocess_snomed`: 10 | 11 | from medcat.utils.preprocess_snomed import Snomed 12 | 13 | snomed = Snomed({path_to_unzipped_snomed}) 14 | df = snomed.to_concept_df() 15 | 16 | SNOMED UK edition or drug extension special releases can be preprocessed via: 17 | 18 | from medcat.utils.preprocess_snomed import Snomed 19 | 20 | snomed = Snomed({path_to_unzipped_snomed}, uk_ext=True, uk_drug_ext=False) 21 | df = snomed.to_concept_df() 22 | 23 | Further information can be found [here](https://github.com/CogStack/MedCAT/blob/master/medcat/utils/preprocess_snomed.py) 24 | 25 | ## About 26 | SNOMED CT is a standardised clinical terminology consisting of >350,000 unique concepts. It is owned, maintained and distributed by SNOMED International. 27 | 28 | Please visit and explore https://www.snomed.org/ to find out further information about the various SNOMED CT products and services which they offer. 29 | 30 | 31 | ## What is SNOMED CT? 32 | 33 | SNOMED CT is a clinical terminology containing concepts with unique meanings and formal logic based definitions organised into hierarchies. For further information please see: https://confluence.ihtsdotools.org/display/DOCSTART/4.+SNOMED+CT+Basics 34 | 35 | ## SNOMED CT Design 36 | SNOMED CT content is represented by 3 main types of components: 37 | 38 | - Concepts representing clinical meanings that are organised into hierarchies. 39 | - Descriptions which link appropriate human readable terms to concepts 40 | - Relationships which link each concept to other related concepts 41 | 42 | It also contains mappings to classification systems such as: 43 | - ICD (International Classification of Diseases) 44 | - OPCS (Office of Population Censuses and Surveys) (SNOMED UK extension only) 45 | 46 | --------- 47 | 48 | 49 | ## Access to SNOMED CT release files 50 | 51 | You may download SNOMED CT from your Member country’s designated website. The use of SNOMED CT in Member countries is free. Follow this [link](https://www.snomed.org/our-stakeholders/members) to find out if your country is a member state and to find where your national SNOMED CT distribution is held. 52 | 53 | E.g. 54 | * UK -> [NHS TRUD](https://isd.digital.nhs.uk/trud3/user/guest/group/0/home) 55 | 56 | * US -> [NIH National Library of Medicine](https://www.nlm.nih.gov/healthit/snomedct/international.html) Alternative clinical terminologies such as UMLS can also be found here. 57 | 58 | 59 | The following steps use services provided by SNOMED International for organizations and individuals to request use of, and access to, the International Release of SNOMED CT in non-Member countries. 60 | 61 | __To access SNOMED CT files from non-member countries:__ 62 | 63 | 1. Please visit the SNOMED [Member Licensing and Distribution Service](https://mlds.ihtsdotools.org/#/landing) and read their terms and conditions for use. 64 | 65 | 2. Login or register for an account and wait to be granted access. 66 | 67 | 3. Once you have been granted access, log in and visit the ["Release Packages"](https://mlds.ihtsdotools.org/#/viewReleases) tab to retrieve the release of SNOMED CT that you would like. Alternatively, for the international SNOMED release simply visit the [International releases](https://mlds.ihtsdotools.org/#/viewReleases/viewRelease/167).
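
To feed the preprocessing output shown at the top of this ReadMe into the CDB-building step, the concept DataFrame only needs to be written to this folder as a CSV. A minimal sketch (the file name is just a suggestion, matching the default mentioned in [create_cdb.py](/medcat/1_create_model/create_cdb/create_cdb.py); the path assumes you are running from the repository root):

    # `df` is the DataFrame returned by snomed.to_concept_df() in the example above
    df.to_csv("data/snomed/preprocessed_snomed.csv", index=False)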
68 | 69 | ---------- -------------------------------------------------------------------------------- /data/snomed/umls_enricher.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | import pandas as pd 4 | from umls_downloader import download_umls 5 | from tqdm.autonotebook import tqdm 6 | 7 | api_key = '' 8 | version = '2022AA' 9 | outfile = f'{version}_UMLS_english.csv' 10 | 11 | umls_rows = [] 12 | path = download_umls(version=version, api_key=api_key) 13 | with zipfile.ZipFile(path) as zip_file: 14 | with zip_file.open("MRCONSO.RRF", mode='r') as file: 15 | with tqdm(total=sum(1 for _ in file), unit='line') as pbar: 16 | file.seek(0) # reset file pointer to the begining of the file 17 | for line in file: 18 | umls_rows.append(line.decode('UTF-8').split('|')[:-1]) 19 | pbar.update(1) 20 | columns = [ 21 | 'CUI', 22 | 'LAT', 23 | 'TS', 24 | 'LUI', 25 | 'STT', 26 | 'SUI', 27 | 'ISPREF', 28 | 'AUI', 29 | 'SAUI', 30 | 'SCUI', 31 | 'SDUI', 32 | 'SAB', 33 | 'TTY', 34 | 'CODE', 35 | 'STR', 36 | 'SRL', 37 | 'SUPPRESS', 38 | 'CVF', 39 | ] 40 | 41 | umls_df = pd.DataFrame(columns=columns, data=umls_rows) 42 | eng_umls = umls_df[umls_df['LAT'] == 'ENG'] 43 | del umls_df 44 | outfile = f'{version}_UMLS_english.csv' 45 | eng_umls.to_csv(outfile, index=False) 46 | print(f'file saved as {outfile}') 47 | 48 | medcat_csv_mapper = { 49 | 'CUI':'cui', 50 | 'STR':'name', 51 | 'SAB':'ontologies', 52 | 'ISPREF':'name_status', 53 | 'TUI':'type_ids', 54 | } 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /data/umls/NLM_umls_download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Automating UMLS Terminology Services (UTS) Downloads 3 | The following instructions will allow you to automate the download of RxNorm, UMLS, or SNOMED CT files. 4 | 5 | 6 | Step 1: Get your API key from your UTS profile. 7 | You can find the API key in the UTS ‘My Profile’ area after signing in. An API key remains active as long as 8 | the associated UTS account is active. 9 | https://uts.nlm.nih.gov/uts/?_gl=1*veo3ht*_ga*MTkwNzE1ODcyOC4xNjYyOTcxNDg3*_ga_P1FPTH9PL4*MTY2Mjk3MTQ4Ni4xLjEuMTY2Mjk3MzA0OS4wLjAuMA.. 10 | 11 | """ 12 | import requests 13 | import sys 14 | 15 | apikey = '' # please add apikey 16 | DOWNLOAD_URL = 'https://download.nlm.nih.gov/umls/kss/2022AA/umls-2022AA-full.zip' # Change this to service required 17 | PATH_TO_DOWNLOAD = '' # Default outfile path will be written to current working directory 18 | 19 | print(DOWNLOAD_URL) 20 | value = DOWNLOAD_URL.split('/') 21 | 22 | if not apikey: 23 | sys.exit("Please enter you api key ") 24 | 25 | if not DOWNLOAD_URL: 26 | print("Usage: curl-uts-downloads-apikey.sh download_url ") 27 | print(" For full UMLS:") 28 | print(" e.g. curl-uts-download-apikey.sh https://download.nlm.nih.gov/umls/kss/2022AA/umls-2022AA-full.zip") 29 | print(" For RxNorm:") 30 | print(" e.g. 
curl-uts-download-apikey.sh https://download.nlm.nih.gov/umls/kss/rxnorm/RxNorm_full_current.zip") 31 | print(" curl-uts-download-apikey.sh https://download.nlm.nih.gov/umls/kss/rxnorm/RxNorm_weekly_current.zip") 32 | sys.exit("Download_url is empty") 33 | 34 | url = 'https://utslogin.nlm.nih.gov/cas/v1/api-key' 35 | param = {'apikey': apikey} 36 | headers = {'Content-type': 'application/x-www-form-urlencoded'} 37 | 38 | TGTresponse = requests.post(url, headers=headers, data=param) 39 | first, second = TGTresponse.text.split('api-key/') 40 | TGTTicket, fourth = second.split('" method') 41 | 42 | print(TGTTicket) 43 | 44 | url = 'https://utslogin.nlm.nih.gov/cas/v1/tickets/'+TGTTicket 45 | param = {'service': DOWNLOAD_URL} 46 | headers = {'Content-type': 'application/x-www-form-urlencoded'} 47 | 48 | STResponse = requests.post(url, headers=headers, data=param) 49 | 50 | print(STResponse.text) 51 | 52 | url = DOWNLOAD_URL+'?ticket='+STResponse.text 53 | r = requests.get(url, allow_redirects=True) 54 | 55 | with open(PATH_TO_DOWNLOAD + value[len(value)-1], 'wb') as f: 56 | f.write(r.content) 57 | 58 | # Retrieve HTTP meta-data 59 | print(r.status_code) 60 | print(r.headers['content-type']) 61 | print(r.encoding) 62 | 63 | print(f'File saved to: {str(PATH_TO_DOWNLOAD + value[len(value)-1])}') 64 | print('Download completed') 65 | -------------------------------------------------------------------------------- /data/umls/ReadMe.md: -------------------------------------------------------------------------------- 1 | # UMLS - The Unified Medical Language System® 2 | 3 | Place holder for all UMLS related content and downloads here 4 | 5 | -------- 6 | 7 | ## About 8 | The UMLS integrates and distributes key terminology, classification and coding standards, 9 | and associated resources to promote creation of more effective and interoperable biomedical information systems and services, 10 | including electronic health records. 11 | 12 | The UMLS, or Unified Medical Language System, is a set of files and software that brings together many health and 13 | biomedical vocabularies and standards to enable interoperability between computer systems. 14 | 15 | ## Access 16 | 17 | [Request a license](https://uts.nlm.nih.gov/uts/?_gl=1*1791eyk*_ga*MTkwNzE1ODcyOC4xNjYyOTcxNDg3*_ga_P1FPTH9PL4*MTY2Mjk3ODA3OS4yLjEuMTY2Mjk3OTQ4Mi4wLjAuMA..) 18 | and sign up for a UMLS Terminology Services (UTS) account. 19 | 20 | - UMLS licenses are issued only to individuals and not to groups or organizations. 21 | - There is no charge for licensing the UMLS from NLM. NLM is a member of [SNOMED International](http://www.snomed.org/) 22 | (owner of SNOMED CT), and there is no charge for SNOMED CT use in the United States and other [member countries](http://www.snomed.org/our-customers/members). 23 | Some uses of the UMLS may require additional agreements with individual terminology vendors. 24 | - Your UTS account provides access to the Unified Medical Language System (UMLS), the Value Set Authority Center (VSAC), 25 | RxNorm downloads, SNOMED CT downloads and more. 
26 | - For more, visit [how to license and access UMLS data](https://www.nlm.nih.gov/databases/umls.html) 27 | 28 | 29 | Further information can be found on the [nlm website](https://www.nlm.nih.gov/research/umls/index.html) 30 | 31 | 32 | 33 | ## API Home 34 | 35 | ### Authentication 36 | All users of this terminology require registration with NLM, to download UMLS data (Warning: some restriction may apply depending on country; see UMLS licence and its SNOMED CT appendix): 37 | 38 | https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html 39 | 40 | Documentation for User Authentication can be found [here](https://documentation.uts.nlm.nih.gov/rest/authentication.html) 41 | 42 | 43 | For further information about UMLS API Technical Documentation can be found [here.](https://documentation.uts.nlm.nih.gov/rest/home.html) 44 | 45 | 46 | ### Downloading UMLS 47 | 48 | One can use the scripts found in [NLM_umls_download.py](/data/umls/NLM_umls_download.py) to download the entire UMLS 49 | Knowledge Source. 50 | 51 | Otherwise, one can access the UMLS Knowledge Sources directly: File Downloads can be found [here](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html) 52 | 53 | Alternatively, you can simply follow the scripts in the [working with UMLS notebook](/data/umls/working_with_umls.ipynb). This script will download UMLS and convert the 54 | MRCONSO.RFF.ZIP file to a DataFrame. You can then process this file to get ready to build a MedCAT Concept Database! 55 | 56 | ## Citing the UMLS 57 | If you use UMLS in your work, please cite the original article: 58 | 59 | Bodenreider O. The Unified Medical Language System (UMLS): integrating biomedical terminology. Nucleic Acids Res. 2004 Jan 1;32(Database issue):D267-70. doi: 10.1093/nar/gkh061. PubMed PMID: 14681409; PubMed Central PMCID: PMC308795. 60 | 61 | 62 | -------------------------------------------------------------------------------- /data/umls/working_with_umls.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Working with UMLS\n", 8 | "\n", 9 | "This scripts will walk you through how to:\n", 10 | "1) Download a specific version of UMLS\n", 11 | "\n", 12 | "2) Process the MRCONSO.RFF.ZIP files to a pandas df whcih you can then manipulate\n", 13 | "\n", 14 | "__Note:__ Keep in mind that the UMLS file sets are very large!" 
15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Part 1: Downloading UMLS" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import os\n", 33 | "from umls_downloader import download_umls" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Get this from https://uts.nlm.nih.gov/uts/edit-profile\n", 43 | "api_key = ''\n", 44 | "version = '2022AA' # Change this to the UMLS version that you require" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "path = download_umls(version=version, api_key=api_key)\n", 54 | "print(path) # This is where the UMLS files are now saved" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Part 2: Working with UMLS\n", 62 | "\n", 63 | "The part of UMLS that we require is stored in the MRCONSO.RFF files. The file layout is as follows:\n", 64 | "\n", 65 | "__Concept Names and Sources (File = MRCONSO.RRF)__\n", 66 | "\n", 67 | "|Col.|Description|\n", 68 | "|---|---|\n", 69 | "|CUI|\tUnique identifier for concept|\n", 70 | "|LAT|\tLanguage of term|\n", 71 | "|TS|\tTerm status|\n", 72 | "|LUI|\tUnique identifier for term|\n", 73 | "|STT|\tString type|\n", 74 | "|SUI|\tUnique identifier for string|\n", 75 | "|ISPREF|\tAtom status - preferred (Y) or not (N) for this string within this concept|\n", 76 | "|AUI|\tUnique identifier for atom - variable length field, 8 or 9 characters|\n", 77 | "|SAUI|\tSource asserted atom identifier [optional]|\n", 78 | "|SCUI|\tSource asserted concept identifier [optional]|\n", 79 | "|SDUI|\tSource asserted descriptor identifier [optional]|\n", 80 | "|SAB|\tAbbreviated source name (SAB). Maximum field length is 20 alphanumeric characters. Two source abbreviations are assigned: Root Source Abbreviation (RSAB) — short form, no version information, for example, AI/RHEUM, 1993, has an RSAB of \"AIR\" Versioned Source Abbreviation (VSAB) — includes version information, for example, AI/RHEUM, 1993, has an VSAB of \"AIR93\" Official source names, RSABs, and VSABs are included on the UMLS Source Vocabulary Documentation page.\n", 81 | "|TTY|\tAbbreviation for term type in source vocabulary, for example PN (Metathesaurus Preferred Name) or CD (Clinical Drug). Possible values are listed on the Abbreviations Used in Data Elements page.|\n", 82 | "CODE|\tMost useful source asserted identifier (if the source vocabulary has more than one identifier), or a Metathesaurus-generated source entry identifier (if the source vocabulary has none)|\n", 83 | "|STR|\tString|\n", 84 | "|SRL|\tSource restriction level|\n", 85 | "|SUPPRESS|\tSuppressible flag. Values = O, E, Y, or N O: All obsolete content, whether they are obsolesced by the source or by NLM. These will include all atoms having obsolete TTYs, and other atoms becoming obsolete that have not acquired an obsolete TTY (e.g. RxNorm SCDs no longer associated with current drugs, LNC atoms derived from obsolete LNC concepts). E: Non-obsolete content marked suppressible by an editor. These do not have a suppressible SAB/TTY combination. Y: Non-obsolete content deemed suppressible during inversion. These can be determined by a specific SAB/TTY combination explicitly listed in MRRANK. N: None of the above. 
Default suppressibility as determined by NLM (i.e., no changes at the Suppressibility tab in MetamorphoSys) should be used by most users, but may not be suitable in some specialized applications. See the MetamorphoSys Help page for information on how to change the SAB/TTY suppressibility to suit your requirements. NLM strongly recommends that users not alter editor-assigned suppressibility, and MetamorphoSys cannot be used for this purpose.|\n", 86 | "|CVF|\tContent View Flag. Bit field used to flag rows included in Content View. This field is a varchar field to maximize the number of bits available for use.|" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "import zipfile\n", 96 | "import pandas as pd" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "umls_rows = []\n", 106 | "with zipfile.ZipFile(path) as zip_file:\n", 107 | " with zip_file.open(\"MRCONSO.RRF\", mode=\"r\") as file:\n", 108 | " for line in file:\n", 109 | " umls_rows.append(line.decode('UTF-8').split('|')[:-1])" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "columns = [\n", 119 | " \"CUI\",\n", 120 | " \"LAT\",\n", 121 | " \"TS\",\n", 122 | " \"LUI\",\n", 123 | " \"STT\",\n", 124 | " \"SUI\",\n", 125 | " \"ISPREF\",\n", 126 | " \"AUI\",\n", 127 | " \"SAUI\",\n", 128 | " \"SCUI\",\n", 129 | " \"SDUI\",\n", 130 | " \"SAB\",\n", 131 | " \"TTY\",\n", 132 | " \"CODE\",\n", 133 | " \"STR\",\n", 134 | " \"SRL\",\n", 135 | " \"SUPPRESS\",\n", 136 | " \"CVF\", \n", 137 | "]" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "umls_df = pd.DataFrame(columns=columns, data=umls_rows)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "umls_df.head()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Free free to now manipulate the dataframe as you would like!" 
163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Python 3", 176 | "language": "python", 177 | "name": "python3" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": { 181 | "name": "ipython", 182 | "version": 3 183 | }, 184 | "file_extension": ".py", 185 | "mimetype": "text/x-python", 186 | "name": "python", 187 | "nbconvert_exporter": "python", 188 | "pygments_lexer": "ipython3", 189 | "version": "3.8.2" 190 | } 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 1 194 | } 195 | -------------------------------------------------------------------------------- /medcat/1_create_model/create_cdb/create_cdb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from medcat.config import Config 4 | from medcat.cdb_maker import CDBMaker 5 | 6 | pd.options.mode.chained_assignment = None # type: ignore 7 | 8 | # relative to file path 9 | _FILE_DIR = os.path.dirname(__file__) 10 | # relative path to working_with_cogstack folder 11 | _REL_PATH = os.path.join("..", "..", "..") 12 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH) 13 | # absolute path to working_with_cogstack folder 14 | BASE_PATH = os.path.abspath(_BASE_PATH) 15 | 16 | EXPECTED_CSV_PATH = os.path.join(_REL_PATH, "data", "snomed") 17 | 18 | csv_path = input(f"Enter specific SNOMED pre-cdb csv found in the path {EXPECTED_CSV_PATH}: ") 19 | # The preprocessing files for snomed can be found here ../../../data/snomed/: 20 | # The default is output is ../../../data/snomed/preprocessed_snomed.csv 21 | release = csv_path[-12:-4] 22 | # doing it here so that it can later be used for CDBMaker 23 | csv_path = os.path.join(EXPECTED_CSV_PATH, csv_path) 24 | 25 | model_dir = os.path.join(BASE_PATH, "models", "cdb") 26 | output_cdb = os.path.join(model_dir, f"{release}_SNOMED_cdb.dat") 27 | csv = pd.read_csv(csv_path) 28 | 29 | # Remove null values 30 | sctid_null_index = csv[csv['name'].isnull()].index.copy() 31 | csv['name'].iloc[sctid_null_index] = "N/A" 32 | 33 | # Only filter acronyms for specific Semantic tags 34 | csv['acronym'] = csv[~csv['description_type_ids'].str.contains("assessment scale|" 35 | "core metadata concept|" 36 | "metadata|" 37 | "foundation metadata concept" 38 | "|OWL metadata concept")]['name'].str.\ 39 | extract("([A-Z]{2,6}) - ", expand=True) 40 | 41 | print("Cleaning acronyms...") 42 | for i, row in csv[(~csv['acronym'].isnull()) & (csv['name_status'] == 'A')][['name', 'acronym']].iterrows(): 43 | if row['name'][0:len(row['acronym'])] == row['acronym']: 44 | csv['name'].iloc[i] = row['acronym'] # type: ignore 45 | 46 | print("acronyms complete") 47 | 48 | csv = csv.drop_duplicates(keep='first').reset_index(drop=True) 49 | csv.pop('acronym') 50 | 51 | # Setup config 52 | config = Config() 53 | config.general['spacy_model'] = 'en_core_web_md' 54 | config.cdb_maker['remove_parenthesis'] = 1 55 | config.general['cdb_source_name'] = f'SNOMED_{release}' 56 | 57 | maker = CDBMaker(config) 58 | 59 | # Create your CDB 60 | # Add more cdbs to the list 61 | csv_paths = [csv_path] 62 | 63 | cdb = maker.prepare_csvs(csv_paths, full_build=True) 64 | 65 | # Add type_id pretty names to cdb 66 | cdb.addl_info['type_id2name'] = pd.Series(csv.description_type_ids.values, index=csv.type_ids.astype(str)).to_dict() 67 | cdb.config.linking['filters']['cuis'] = set(csv['cui'].tolist()) # 
Add all cuis to filter out legacy terms. 68 | 69 | # save model 70 | cdb.save(output_cdb) 71 | print(f"CDB Model saved successfully as: {output_cdb}") 72 | -------------------------------------------------------------------------------- /medcat/1_create_model/create_cdb/create_umls_cdb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from medcat.config import Config 4 | from medcat.cdb_maker import CDBMaker 5 | 6 | pd.options.mode.chained_assignment = None # type: ignore 7 | 8 | # relative to file path 9 | _FILE_DIR = os.path.dirname(__file__) 10 | # relative path to working_with_cogstack folder 11 | _REL_PATH = os.path.join("..", "..", "..") 12 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH) 13 | # absolute path to working_with_cogstack folder 14 | BASE_PATH = os.path.abspath(_BASE_PATH) 15 | 16 | EXPECTED_CSV_PATH = os.path.join(_REL_PATH, "data", "umls") 17 | 18 | # this is expected to be output from medcat.utils.preprocess_umls 19 | # i.e not the raw UMLS files 20 | csv_path = input(f"Enter specific UMLS pre-cdb csv found in the path data/umls in {EXPECTED_CSV_PATH}: ") 21 | # doing it here so that it can later be used for CDBMaker 22 | csv_path = os.path.join(EXPECTED_CSV_PATH, csv_path) 23 | release = '2022AA' # or as appropriate 24 | 25 | if not os.path.exists('models'): 26 | os.makedirs('models') 27 | print("Creating a 'models' folder to store model") 28 | 29 | model_dir = os.path.join(BASE_PATH, "models", "cdb") 30 | output_cdb = os.path.join(model_dir, f"{release}_UMLS_cdb.dat") 31 | csv = pd.read_csv(csv_path) 32 | 33 | # Remove null values 34 | sctid_null_index = csv[csv['name'].isnull()].index.copy() 35 | csv['name'].iloc[sctid_null_index] = "N/A" 36 | 37 | csv = csv.drop_duplicates(keep='first').reset_index(drop=True) 38 | 39 | 40 | # Setup config 41 | config = Config() 42 | config.general['spacy_model'] = 'en_core_web_md' 43 | config.cdb_maker['remove_parenthesis'] = 1 44 | config.general['cdb_source_name'] = f'UMLS_{release}' 45 | 46 | maker = CDBMaker(config) 47 | 48 | 49 | # Create your CDB 50 | # Add more cdbs to the list 51 | csv_paths = [csv_path] 52 | cdb = maker.prepare_csvs(csv_paths, full_build=True) 53 | 54 | # Add type_id pretty names to cdb 55 | cdb.config.linking['filters']['cuis'] = set(csv['cui'].tolist()) # Add all cuis to filter out legacy terms. 
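# Note: a non-empty `config.linking['filters']['cuis']` set means that, at annotation
# time, only concepts listed in this release's CSV can be linked; anything else that
# may be present in the CDB is ignored.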
56 | 57 | # save model 58 | cdb.save(output_cdb) 59 | print(f"CDB Model saved successfully as: {output_cdb}") 60 | -------------------------------------------------------------------------------- /medcat/1_create_model/create_modelpack/create_modelpack.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from medcat.vocab import Vocab 4 | from medcat.cdb import CDB 5 | from medcat.cat import CAT 6 | 7 | # relative to file path 8 | _FILE_DIR = os.path.dirname(__file__) 9 | # relative path to working_with_cogstack folder 10 | _REL_PATH = os.path.join("..", "..", "..") 11 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH) 12 | # absolute path to working_with_cogstack folder 13 | BASE_PATH = os.path.abspath(_BASE_PATH) 14 | 15 | DEFAULT_CDB_FOLDER = os.path.join(BASE_PATH, "models", "cdb") 16 | 17 | DEFAULT_VOCAB_FOLDER = os.path.join(BASE_PATH, "models", "vocab") 18 | DEFAULT_VOCAB_PATH = os.path.join(DEFAULT_VOCAB_FOLDER, 'vocab.dat') 19 | 20 | DEFAULT_MODELPACK_FOLDER = os.path.join(BASE_PATH, "models", "modelpack") 21 | 22 | model_name = "" # Change to specific cdb of interest 23 | modelpack_name = ".dat" # Change to the name of your model 24 | 25 | def load_cdb_and_save_modelpack(cdb_path: str, 26 | modelpack_name: str, 27 | modelpack_path: str = DEFAULT_MODELPACK_FOLDER, 28 | vocab_path: str = DEFAULT_VOCAB_PATH) -> str: 29 | """Load a CDB and save it as a model pack along with the default Vocab. 30 | 31 | Args: 32 | cdb_path (str): The CDB path to load. 33 | modelpack_name (str): The model pack name to write to. 34 | modelpack_path (str): The folder to write the model pack to. 35 | Defaults to `DEFAULT_MODELPACK_FOLDER`. 36 | vocab_path (str): The vocab path. Defaults to `DEFAULT_VOCAB_PATH`. 37 | 38 | Returns: 39 | str: The model pack path. 
40 | """ 41 | # Load cdb 42 | cdb = CDB.load(cdb_path) 43 | 44 | # Set cdb configuration 45 | # technically we already created this during the cdb creation 46 | cdb.config.ner['min_name_len'] = 2 47 | cdb.config.ner['upper_case_limit_len'] = 3 48 | cdb.config.general['spell_check'] = True 49 | cdb.config.linking['train_count_threshold'] = 10 50 | cdb.config.linking['similarity_threshold'] = 0.3 51 | cdb.config.linking['train'] = True 52 | cdb.config.linking['disamb_length_limit'] = 4 53 | cdb.config.general['full_unlink'] = True 54 | 55 | # Load vocab 56 | vocab = Vocab.load(vocab_path) 57 | 58 | # Initialise the model 59 | cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab) 60 | 61 | # Create and save model pack 62 | return cat.create_model_pack(save_dir_path=modelpack_path, model_pack_name=modelpack_name) 63 | 64 | 65 | def load_cdb_and_save_modelpack_in_def_location(cdb_name: str, 66 | modelpack_name: str) -> str: 67 | cdb_path = os.path.join(DEFAULT_CDB_FOLDER, cdb_name) 68 | return load_cdb_and_save_modelpack(cdb_path, modelpack_name, 69 | DEFAULT_MODELPACK_FOLDER, 70 | DEFAULT_VOCAB_PATH) 71 | 72 | def main(): 73 | load_cdb_and_save_modelpack_in_def_location(model_name, modelpack_name) 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /medcat/1_create_model/create_vocab/create_vocab.py: -------------------------------------------------------------------------------- 1 | from medcat.vocab import Vocab 2 | import os 3 | 4 | vocab = Vocab() 5 | 6 | # relative to file path 7 | _FILE_DIR = os.path.dirname(__file__) 8 | # relative path to working_with_cogstack folder 9 | _REL_PATH = os.path.join("..", "..", "..") 10 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH) 11 | # absolute path to working_with_cogstack folder 12 | BASE_PATH = os.path.abspath(_BASE_PATH) 13 | vocab_dir = os.path.join(BASE_PATH, "models", "vocab") 14 | 15 | # the vocab.txt file need to be in the tab sep format: \t\t 16 | # Current vocab uses pre-calculated vector embedding from Word2Vec, future use embeddings calculated from BERT tokeniser 17 | # embeddings of 300 dimensions is standard 18 | 19 | vocab.add_words(os.path.join(vocab_dir, 'vocab_data.txt'), replace=True) 20 | vocab.make_unigram_table() 21 | vocab.save(os.path.join(vocab_dir, "vocab.dat")) 22 | -------------------------------------------------------------------------------- /medcat/2_train_model/1_unsupervised_training/splitter.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | import pydantic 5 | 6 | 7 | class SplitIdentifier: 8 | start_line_start_pattern: re.Pattern = re.compile('(\d+),(\d+),') 9 | 10 | def is_first_line(self, line: str) -> bool: 11 | """Check if the line in question is a suitable first line for an entry. 12 | 13 | The schema: 14 | "subject_id","hadm_id","chartdate","charttime","text","category","description" 15 | 16 | However, "text" is often multiline. 
17 | So an example first line could be: 18 | 24776,139951,"2154-11-15 00:00:00","2154-11-15 17:48:00","HPI: 19 | That is, "subject_id","hadm_id","chartdate","charttime" and the start of "text" 20 | 21 | So currently, I am checking that the line: 22 | a) Starts with 2 integers separated by a comma 23 | b) Has an uneven number of quotation marks (i.e ends with an open quote) 24 | c) The number of quotes is greater than 4 25 | 26 | Args: 27 | line (str): The line in question 28 | 29 | Returns: 30 | bool: True if it's a suitable first line 31 | """ 32 | # check if starts with 2 integers separated by comma 33 | if not self.start_line_start_pattern.match(line): 34 | return False 35 | nr_of_quotes = line.count('"') 36 | return (nr_of_quotes // 2) != (nr_of_quotes / 2) and nr_of_quotes > 4 37 | 38 | def is_last_line(self, line: str) -> bool: 39 | """Check if the lin in question is a suitable last line for an entry. 40 | 41 | The schema: 42 | "subject_id","hadm_id","chartdate","charttime","text","category","description" 43 | 44 | However, "text" is often multiline. 45 | So an example last line could be: 46 | ","Physician ","Physician Resident Progress Note" 47 | That is, the end of "text" and then "category","description" 48 | 49 | So currently I am checking that the line: 50 | a) Has an uneven number of quotation marks (i.e starts with an open quote) 51 | b) Number of quotes is greater than 4 52 | 53 | Args: 54 | line (str): The line in question 55 | 56 | Returns: 57 | bool: True if it's a suitable last line 58 | """ 59 | nr_of_quotes = line.count('"') 60 | return (nr_of_quotes // 2) != (nr_of_quotes / 2) and nr_of_quotes > 4 61 | 62 | 63 | class SplitOptions(pydantic.BaseModel): 64 | lines_at_a_time: int 65 | out_file_format: str 66 | header_length: int = 1 67 | 68 | 69 | class SplitBuffer: 70 | 71 | def __init__(self, file_nr: int, opts: SplitOptions, split_identifier: SplitIdentifier, header: str) -> None: 72 | self.file_nr = file_nr 73 | self.opts = opts 74 | self.split_identifier = split_identifier 75 | self.lines: list = [header] 76 | self.prev_line_is_last = False 77 | self._is_done = False 78 | 79 | def save(self) -> None: 80 | file_name = self.opts.out_file_format % self.file_nr 81 | print('Saving', len(self.lines), 'to file nr', 82 | self.file_nr, ':', file_name) 83 | with open(file_name, 'w') as fw: 84 | fw.writelines(self.lines) 85 | 86 | def process_or_write(self, line_nr: int, line: str) -> 'SplitBuffer': 87 | """Process line and write if needed. 88 | 89 | If processing a line results in saving the data into a file, a new SplitBuffer is returned. 90 | This new instance will have the first line added to it already. 91 | If processing did not result in saving the data, the same instance is returned. 
92 | 93 | Args: 94 | line_nr (int): The number of the line in the original 95 | line (str): The line contents 96 | 97 | Returns: 98 | SplitBuffer: Returns an instance of the buffer that should be used 99 | """ 100 | if self._is_done: 101 | raise ValueError('Cannot reuse a SplitBuffer - create a new one') 102 | # line = line.replace('\n', '') 103 | has_passed_req_line = line_nr >= self.opts.lines_at_a_time * self.file_nr 104 | cur_line_is_last = self.split_identifier.is_last_line(line) 105 | cur_line_is_first = self.split_identifier.is_first_line(line) 106 | if has_passed_req_line and self.prev_line_is_last and cur_line_is_first: 107 | print('Currently at line', line_nr) 108 | self.save() 109 | # print('Saving', len(self.lines), 'up until', line_nr, 'to file number', self.file_nr, ':', out_file) 110 | # print('PREV line:\n', self.lines[-1]) 111 | # print('NEW line:\n', line) 112 | self._is_done = True 113 | buffer = SplitBuffer( 114 | self.file_nr + 1, self.opts, self.split_identifier, header=self.lines[0]) 115 | return buffer.process_or_write(line_nr, line) 116 | if cur_line_is_last: 117 | self.prev_line_is_last = cur_line_is_last 118 | self.lines.append(line) 119 | return self 120 | 121 | 122 | class Splitter: 123 | 124 | def __init__(self, opts: SplitOptions, split_identifier: SplitIdentifier) -> None: 125 | self.opts = opts 126 | self.split_identifier = split_identifier 127 | 128 | def split(self, in_file: str): 129 | with open(in_file, 'r') as f: 130 | buffer = None 131 | for line_nr, line in enumerate(f): 132 | if buffer is None: # for the first line, just consider the header 133 | buffer = SplitBuffer( 134 | 1, self.opts, self.split_identifier, header=line) 135 | continue 136 | buffer = buffer.process_or_write(line_nr, line) 137 | if buffer and len(buffer.lines) > 1: # if there's more than just a header 138 | buffer.save() # saver remaining 139 | 140 | 141 | def split_file(in_file: str, nr_of_lines: int, out_file_format: str) -> None: 142 | """Splits a file into multiple files of the specified number of lines (or close to it). 143 | 144 | PS! This splitting is currently only designed for a narrow type of CSV files. 145 | This was created to split the MIMIC-III notes into parts. It may work with 146 | later MIMIC releases but is unlikely to work for other datasets. 
147 | 148 | Args: 149 | in_file (str): _description_ 150 | nr_of_lines (int): _description_ 151 | out_file_format (str): _description_ 152 | """ 153 | opts = SplitOptions(lines_at_a_time=nr_of_lines, 154 | out_file_format=out_file_format) 155 | split_identifier = SplitIdentifier() 156 | splitter = Splitter(opts, split_identifier) 157 | splitter.split(in_file) 158 | 159 | 160 | if __name__ == '__main__': 161 | import sys 162 | if len(sys.argv) < 3: 163 | print('Need to specify in original file name and target file format') 164 | sys.exit(2) 165 | orig_file = sys.argv[1] 166 | target_format = sys.argv[2] 167 | if '%d' not in target_format: 168 | print('Target format needs to contain "%d" for including number in the file names') 169 | sys.exit(2) 170 | nr_of_lines = 300000 171 | if len(sys.argv) > 3: 172 | try: 173 | nr_of_lines = int(sys.argv[3]) 174 | except ValueError: 175 | print( 176 | 'Third argument needs to be numeric (for the number of lines per each split)') 177 | sys.exit(2) 178 | split_file(orig_file, nr_of_lines, target_format) 179 | -------------------------------------------------------------------------------- /medcat/2_train_model/1_unsupervised_training/unsupervised training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from medcat.cat import CAT\n", 10 | "import pandas as pd\n", 11 | "import os" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# relative path to working_with_cogstack folder\n", 21 | "_rel_path = os.path.join(\"..\", \"..\", \"..\")\n", 22 | "# absolute path to working_with_cogstack folder\n", 23 | "base_path = os.path.abspath(_rel_path)\n", 24 | "data_dir = os.path.join(base_path, \"data\")\n", 25 | "data_file = '' # file containing training material.\n", 26 | "\n", 27 | "model_dir = os.path.join(data_dir, \"medcat_models\", \"modelpack\")\n", 28 | "\n", 29 | "modelpack = ''\n", 30 | "model_pack_path = os.path.join(model_dir, modelpack)\n", 31 | "\n", 32 | "output_modelpack = '' # Save name for new model\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Initialise model" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# Load modelpack\n", 49 | "cat = CAT.load_model_pack(model_pack_path)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "cat.cdb.print_stats()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "scrolled": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "# Load training data\n", 70 | "data = pd.read_csv(os.path.join(data_dir, data_file))\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "data.shape" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "scrolled": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "# Print statistics on the CDB before training\n", 91 | "cat.cdb.print_stats()\n", 92 | "\n", 93 | "# Run the annotation procedure over all the documents we have,\n", 94 | "# given that we have a large number of documents this 
can take quite some time.\n", 95 | "\n", 96 | "for i, text in enumerate(data['text'].values):\n", 97 | " # This will now run the training in the background \n", 98 | " try:\n", 99 | " _ = cat(text, do_train=True)\n", 100 | " except TypeError:\n", 101 | " pass\n", 102 | " \n", 103 | " # So we know how things are moving\n", 104 | " if i % 10000 == 0:\n", 105 | " print(\"Finished {} - text blocks\".format(i))\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# Print statistics on the CDB after training\n", 115 | "cat.cdb.print_stats()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# save modelpack\n", 125 | "cat.create_model_pack(save_dir_path=model_dir, model_pack_name=output_modelpack)\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "End of script" 133 | ] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "medcat", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.10.8 (main, Nov 24 2022, 08:08:27) [Clang 14.0.6 ]" 153 | }, 154 | "vscode": { 155 | "interpreter": { 156 | "hash": "4e4ccc64ca47f932c34194843713e175cf3a19af3798844e4190152d16ba61ca" 157 | } 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 5 162 | } 163 | -------------------------------------------------------------------------------- /medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py: -------------------------------------------------------------------------------- 1 | from medcat.cat import CAT 2 | import logging 3 | import sys 4 | import os 5 | sys.path.append('../../../') 6 | from cogstack import CogStack 7 | from credentials import * 8 | 9 | medcat_logger = logging.getLogger('medcat') 10 | fh = logging.FileHandler('medcat.log') 11 | medcat_logger.addHandler(fh) 12 | 13 | ###Change parameters here### 14 | cogstack_indices: list = [] # list of cogstack indexes here 15 | text_columns = ['body_analysed'] # list of all text containing fields 16 | # relative to file path 17 | _FILE_DIR = os.path.dirname(__file__) 18 | # relative path to working_with_cogstack folder 19 | _REL_PATH = os.path.join("..", "..", "..") 20 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH) 21 | # absolute path to working_with_cogstack folder 22 | BASE_PATH = os.path.abspath(_BASE_PATH) 23 | model_pack_path = os.path.join(BASE_PATH, 'data', 'medcat_models', 'modelpack') 24 | model_pack_name = '' 25 | output_modelpack_name = '' # name of modelpack to save 26 | 27 | cs = CogStack(hosts, username=username, password=password, api=True) 28 | df = cs.DataFrame(index=cogstack_indices, columns=text_columns) # type: ignore 29 | 30 | cat = CAT.load_model_pack(model_pack_path+model_pack_name) 31 | cat.cdb.print_stats() 32 | cat.train(data_iterator=df[text_columns].iterrows(), 33 | nepochs=1, 34 | fine_tune=True, 35 | progress_print=10000, 36 | is_resumed=False) 37 | 38 | cat.cdb.print_stats() 39 | 40 | cat.create_model_pack(save_dir_path=model_pack_path, model_pack_name=output_modelpack_name) 41 | 
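If the notes export is too large to hold in memory at once, the same training loop can be run in chunks. The following is only a sketch - the file name, chunk size and model paths are placeholders, and it assumes a CSV with a 'text' column; it reuses the cat.train and create_model_pack calls shown above.

# Sketch: chunked unsupervised training to limit memory use (placeholder paths and sizes).
import pandas as pd
from medcat.cat import CAT

cat = CAT.load_model_pack('<path-to-modelpack.zip>')  # placeholder path

for chunk in pd.read_csv('notes.csv', chunksize=100_000):  # 'notes.csv' is a placeholder
    texts = chunk['text'].dropna().astype(str).values
    cat.train(texts, progress_print=10_000)

cat.create_model_pack(save_dir_path='<model-dir>', model_pack_name='<output-modelpack-name>')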
-------------------------------------------------------------------------------- /medcat/2_train_model/1_unsupervised_training/unsupervised_training.py: -------------------------------------------------------------------------------- 1 | from medcat.cat import CAT 2 | import pandas as pd 3 | import os 4 | import logging 5 | 6 | # python medcat/2_train_model/1_unsupervised_training/splitter.py 7 | # medcat/2_train_model/1_unsupervised_training/all_notes.csv medcat/2_train_model/1_unsupervised_training/split_notes_5M_%d.csv 5000000 8 | # 9 | 10 | all_notes_file = 'all_notes.csv' # CHANGE AS NEEDED 11 | 12 | # in my case, I needed to split the notes into parts. Otherwise, the work just crashed at some point 13 | # I chose to split into 19 parts, around 5000000 lines at a time. 14 | split_format = 'split_notes_5M_%d.csv' 15 | nr_of_lines = 5000000 16 | 17 | from splitter import split_file 18 | 19 | if not os.path.exists(split_format%1): 20 | print(f'\n\nSplitting file into {nr_of_lines} line at a time. This will probably take some time\n\n') 21 | split_file(all_notes_file, nr_of_lines, split_format) 22 | print('\n\nDone with the split!\n\n') 23 | else: 24 | print('\n\nNB!Expecting the split files to already exist\n\n') 25 | 26 | data_dir = '.' # CHANGE AS NEEDED 27 | 28 | 29 | # relative to file path 30 | _FILE_DIR = os.path.dirname(__file__) 31 | # relative path to working_with_cogstack folder 32 | _REL_PATH = os.path.join("..", "..", "..") 33 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH) 34 | # absolute path to working_with_cogstack folder 35 | BASE_PATH = os.path.abspath(_BASE_PATH) 36 | 37 | model_dir = os.path.join(BASE_PATH, "data", "medcat_models", "modelpack") 38 | 39 | modelpack = 'umls_model2_zip_0d4ccc7b9ae1ecd2.zip' # CHANGE AS NEEDED 40 | model_pack_path = os.path.join(model_dir, modelpack) 41 | 42 | output_modelpack = 'umls_self_train_model' # Save name for new model 43 | 44 | # Load modelpack 45 | print('Loading modelpack') 46 | cat = CAT.load_model_pack(model_pack_path) 47 | cat.log.addHandler(logging.StreamHandler()) # add console output 48 | 49 | print('STATS:') 50 | cat.cdb.print_stats() 51 | 52 | # CHANGE AS NEEDED - if the number of spligt files is different 53 | all_data_files = [f'split_notes_5M_{nr}.csv' for nr in range(1, 20)] # file containing training material. 
54 | for i, data_file in enumerate(all_data_files): 55 | # Load training data 56 | print('Load data for', i, 'from', data_file) 57 | data = pd.read_csv(os.path.join(data_dir, data_file)) 58 | cat.train(data.text.values, progress_print=100) 59 | 60 | print('Stats now, after', i) 61 | cat.cdb.print_stats() 62 | 63 | # save modelpack 64 | cat.create_model_pack(save_dir_path=model_dir, model_pack_name=f"{output_modelpack}_{i}") 65 | 66 | # save modelpack - ALL 67 | cat.create_model_pack(save_dir_path=model_dir, model_pack_name=output_modelpack) 68 | 69 | -------------------------------------------------------------------------------- /medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d58c720d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import json\n", 11 | "import os\n", 12 | "from datetime import date\n", 13 | "from medcat.cat import CAT\n", 14 | "from medcat.meta_cat import MetaCAT\n", 15 | "from medcat.config_meta_cat import ConfigMetaCAT\n", 16 | "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "id": "ca80af0e", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# if you want to enable info level logging\n", 27 | "import logging\n", 28 | "logging.basicConfig(level=logging.INFO,force=True)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "b1c5b9b0", 34 | "metadata": {}, 35 | "source": [ 36 | "#### 💡 To understand the model loading and other functionalities, please refer to the 'meta_annotation_training.ipynb' notebook" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "id": "a2c0431f", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "model_pack = '' # .zip model pack location\n", 47 | "mctrainer_export = \"\" # name of your mct export" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "808c27c1", 53 | "metadata": {}, 54 | "source": [ 55 | "We won't load the models at this stage as they need to be seperately loaded later.
Let's check for meta models in the directory" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "id": "675eab49", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# Iterate through the meta_models contained in the model\n", 66 | "meta_model_names = []\n", 67 | "for dirpath, dirnames, filenames in os.walk(model_pack):\n", 68 | " for dirname in dirnames:\n", 69 | " if dirname.startswith('meta_'):\n", 70 | " meta_model_names.append(dirname[5:])\n", 71 | "\n", 72 | "print(\"Meta models:\",meta_model_names)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "9e499198", 78 | "metadata": {}, 79 | "source": [ 80 | "# Class weights \n", 81 | "\n", 82 | "Adjusting class weights to give more importance to specific classes. Generally, class weights are used in favour of minority classes(classes with less number of samples) to boost their performance.\n", 83 | "

To use class weights, we have 2 options:\n", 84 | "
1. calculate class weights based on class distribution\n", 85 | "
2. use specified class weights\n", 86 | "\n", 87 | "\n", 88 | "#option 1
\n", 89 | "metacat.config.train['class_weights'] = []
\n", 90 | "metacat.config.train['compute_class_weights'] = True
\n", 91 | "
\n", 92 | "#option 2
\n", 93 | "metacat.config.train['class_weights'] = [0.4,0.3,0.1]
" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "fc07f3e9", 99 | "metadata": {}, 100 | "source": [ 101 | "NOTE: Make sure to correctly map the class weights to their corresponding class index.
To check the index assigned to the classes, use:
`print(mc.config.general['category_value2id'])`\n", 102 | "
This will print a dictionary where the class names and their corresponding IDs (indices) are displayed.
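One way to put the mapping and the weights together is sketched below. The label counts are made up for illustration - in practice they would come from your own annotation export - and `mc` is the loaded MetaCAT instance used elsewhere in this notebook.

# Sketch: order class weights to match category_value2id (label counts are hypothetical).
label_counts = {"Affirmed": 1200, "Negated": 300, "Hypothetical": 100}  # made-up counts

cat2id = mc.config.general['category_value2id']   # e.g. {"Affirmed": 0, "Negated": 1, "Hypothetical": 2}
total = sum(label_counts.values())
class_weights = [0.0] * len(cat2id)
for name, idx in cat2id.items():
    # inverse-frequency ("balanced") weighting: minority classes get larger weights
    class_weights[idx] = total / (len(cat2id) * label_counts[name])

mc.config.train['class_weights'] = class_weights
mc.config.train['compute_class_weights'] = False  # explicit weights, i.e. option 2 rather than option 1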
\n", 103 | "The first position in the class weight list corresponds to the class with ID 0 in the dictionary, and so on." 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "id": "6a92aa60", 109 | "metadata": {}, 110 | "source": [ 111 | "# 2 phase learning for training\n", 112 | "\n", 113 | "2 phase learning is used to mitigate class imbalance. In 2 phase learning, the models are trained twice:
\n", 114 | "Phase 1: trains for minority class(es) by undersampling data so that there is no class imbalance\n", 115 | "
Phase 2: trains for all classes\n", 116 | "\n", 117 | "Phase 1 ensures that the model learns minority class(es) and captures the details correctly.\n", 118 | "
Phase 2 is when the model is expected to learn the majority class as it is trained on the entire dataset.\n", 119 | "\n", 120 | "Paper reference - https://ieeexplore.ieee.org/document/7533053\n", 121 | "
Make sure to use class weights in favour of minority classes with 2 phase learning" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "id": "5a86b839", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "#--------------------------------Phase 1--------------------------------\n", 132 | "def run_phase_1(meta_model,class_wt_phase1 = None):\n", 133 | " #Loading the pre-defined config for phase 1\n", 134 | " config_ph_1_path = os.path.join(model_pack,\"meta_\"+meta_model,\"config_ph1.json\")\n", 135 | " with open(config_ph_1_path) as f:\n", 136 | " config_ph1 = json.load(f)\n", 137 | " mc = MetaCAT.load(save_dir_path=os.path.join(model_pack,\"meta_\"+meta_model),config_dict = config_ph1)\n", 138 | "\n", 139 | " if class_wt_phase1:\n", 140 | " mc.config.train['class_weights'] = class_wt_phase1\n", 141 | "\n", 142 | " #You can change the number of epochs, remember to keep them higher for phase 1\n", 143 | " mc.config.train['nepochs'] = 40 \n", 144 | "\n", 145 | " results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n", 146 | " # Save results\n", 147 | " json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase1.json'), 'w'))\n", 148 | "\n", 149 | "#--------------------------------Phase 2--------------------------------\n", 150 | "def run_phase_2(meta_model,class_wt_phase2 = None): \n", 151 | " #Loading the pre-defined config for phase 2\n", 152 | " config_ph_2_path = os.path.join(model_pack,\"meta_\"+meta_model,\"config_ph2.json\")\n", 153 | " with open(config_ph_2_path) as f:\n", 154 | " config_ph2 = json.load(f)\n", 155 | "\n", 156 | " mc = MetaCAT.load(save_dir_path=os.path.join(model_pack,\"meta_\"+meta_model),config_dict = config_ph2)\n", 157 | "\n", 158 | " if class_wt_phase2:\n", 159 | " mc.config.train['class_weights'] = class_wt_phase2\n", 160 | "\n", 161 | " #You can change the number of epochs\n", 162 | " mc.config.train['nepochs'] = 20\n", 163 | "\n", 164 | " results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n", 165 | " # Save results\n", 166 | " json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase2.json'), 'w'))\n", 167 | "\n", 168 | "#--------------------------------Driver--------------------------------\n", 169 | "# Train the first meta cat model\n", 170 | "meta_model = meta_model_names[0]\n", 171 | "\n", 172 | "# to overwrite the existing model, resave the fine-tuned model with the same model pack dir\n", 173 | "meta_cat_task = meta_model\n", 174 | "save_dir_path = os.path.join(model_pack,\"meta_\"+ meta_cat_task)\n", 175 | "\n", 176 | "# To use your own class weights instead of the pre-defined ones for the 2 phases, put the weights in the lists below\n", 177 | "class_wt_phase1 = [] # Example [0.4,0.4,0.2]\n", 178 | "class_wt_phase2 = [] # Example [0.4,0.3,0.3]\n", 179 | "\n", 180 | "\n", 181 | "# Train 2 phase learning\n", 182 | "print(\"*** Training meta cat: \",meta_model)\n", 183 | "print(\"Beginning Phase 1...\")\n", 184 | "run_phase_1(meta_model,class_wt_phase1)\n", 185 | "print(\"Beginning Phase 2...\")\n", 186 | "run_phase_2(meta_model,class_wt_phase2)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "60f0e878", 192 | "metadata": {}, 193 | "source": [ 194 | "# Generating synthetic data\n", 195 | "\n", 196 | "You can generate synthetic data to help mitigate class imbalance.
Use this code to generate synthetic data using an LLM - [link](https://gist.github.com/shubham-s-agarwal/401ef8bf6cbbd66fa0c76a8fbfc1f6c4)
NOTE: the generated data will require manual quality check to ensure that high quality and relevant data is used for training. " 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "id": "431e1002", 202 | "metadata": {}, 203 | "source": [ 204 | "The data generated from the gist code and the format of the data required by MedCAT are different, requiring manual formatting at the moment. We will update this module to include the code to handle the same." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "4d07d437", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "# To run the training with original + synthetic data\n", 215 | "# Follow all the same steps till and load the model\n", 216 | "\n", 217 | "# the format expected is [[['text','of','the','document'], [index of medical entity], \"label\" ],\n", 218 | "# ['text','of','the','document'], [index of medical entity], \"label\" ]]\n", 219 | "\n", 220 | "synthetic_data_export = [[],[],[]]\n", 221 | "\n", 222 | "results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path,data_oversampled=synthetic_data_export)\n", 223 | "\n", 224 | "# Save results\n", 225 | "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))" 226 | ] 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | "display_name": "pytorch_medcat_clean", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 3 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.10.14" 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 5 250 | } 251 | -------------------------------------------------------------------------------- /medcat/2_train_model/2_supervised_training/supervised training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import json\n", 11 | "import pandas as pd\n", 12 | "from datetime import date\n", 13 | "from medcat.cat import CAT" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# will be used to date the trained model\n", 23 | "today = str(date.today())\n", 24 | "today = today.replace(\"-\",\"\")" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "ann_dir = \"working_with_cogstack/data/annotated_docs/\"\n", 34 | "mctrainer_export_path = ann_dir + \"MedCAT_Export_With_Text_2021-08-25_19_55_45.json\" # name of your mct export\n", 35 | "\n", 36 | "model_dir = 'working_with_cogstack/models/modelpack'\n", 37 | "\n", 38 | "modelpack = '' # name of modelpack\n", 39 | "model_pack_path = os.path.join(model_dir, modelpack)\n", 40 | "\n", 41 | "output_modelpack = model_dir + f\"{today}_trained_model\"\n", 42 | "\n", 43 | "# Add training filter if needed\n", 44 | "snomed_filter_path = None # path to snomed filter" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Create CAT - the main class from medcat used for concept 
annotation\n", 54 | "cat = CAT.load_model_pack(model_pack_path)\n", 55 | "cat.config.linking['filters'] = {'cuis':set()} # To remove exisitng filters" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# Set filter\n", 63 | "\n", 64 | "This will speed up the training time. As you will only train a select number of concepts at once." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# Add extra training filter if required.\n", 74 | "if snomed_filter_path:\n", 75 | " snomed_filter = set(json.load(open(snomed_filter_path)))\n", 76 | "else:\n", 77 | " snomed_filter = set(cat.cdb.cui2preferred_name.keys())\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# Train" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "cat.train_supervised_from_json(data_path=mctrainer_export_path, \n", 94 | " nepochs=3,\n", 95 | " reset_cui_count=False,\n", 96 | " print_stats=True,\n", 97 | " use_filters=True,\n", 98 | " extra_cui_filter=snomed_filter, # If not filter is set remove this line\n", 99 | " )\n" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "# Stats" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "data = json.load(open(mctrainer_export_path))" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "scrolled": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = cat._print_stats(data, use_filters=True)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "cui = \"22298006\" # Myocardial infarction\n", 136 | "print(cui_f1[cui], cui_prec[cui], cui_rec[cui])" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "# Save\n", 151 | "\n", 152 | "Also remember that you can save specific components within the modelpack. 
Rather than create a new one" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# save modelpack\n", 162 | "cat.create_model_pack(os.path.join(model_dir, output_modelpack))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "# Test" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "text = \"The pateint has hypertension and an MI\"\n", 179 | "doc = cat.get_entities(text)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "doc" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.9.6 (default, Sep 26 2022, 11:37:49) \n[Clang 14.0.0 (clang-1400.0.29.202)]" 216 | }, 217 | "vscode": { 218 | "interpreter": { 219 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" 220 | } 221 | } 222 | }, 223 | "nbformat": 4, 224 | "nbformat_minor": 1 225 | } 226 | -------------------------------------------------------------------------------- /medcat/2_train_model/ReadMe.md: -------------------------------------------------------------------------------- 1 | # MedCAT Model Training 2 | 3 | A MedCAT model will undergo Two steps of training. 4 | The first is an unsupervised step which should only be done once. 5 | 6 | The Supervised training step ("Human in the loop") should be done to fine-tune and evaluate models. 7 | This step can be an iterative process where models can be "further" fine-tuned to further its understanding of concepts. 8 | 9 | -------------------------------------------------------------------------------- /medcat/3_run_model/ReadMe.md: -------------------------------------------------------------------------------- 1 | # Running a model to annotate text 2 | 3 | 4 | 5 | ## Command line tips 6 | To run_model.py in the background: 7 | 8 | Running scripts in the python background. Outputs here are saved to an nohup.out `nohup python3 run_model.py &` 9 | 10 | 11 | 12 | You can find the process and it's Process ID (PID): `ps ax | grep run_model.py` 13 | 14 | or list running processes in python 15 | `ps -fA | grep python` 16 | 17 | If you want to stop the execution, you can kill with this command. 
18 | `kill PID` 19 | 20 | -------------------------------------------------------------------------------- /medcat/3_run_model/run_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "from medcat.cat import CAT\n", 11 | "from medcat import cat\n", 12 | "import pandas as pd\n", 13 | "import json\n", 14 | "from tqdm.notebook import tqdm\n", 15 | "import re\n", 16 | "import pickle" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Add file logger\n", 26 | "import logging\n", 27 | "medcat_logger = logging.getLogger('medcat')\n", 28 | "fh = logging.FileHandler('medcat.log')\n", 29 | "medcat_logger.addHandler(fh)" 30 | ] 31 | }, 32 | { 33 | "attachments": {}, 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# Paths and Config" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# relative path to working_with_cogstack folder\n", 47 | "_rel_path = os.path.join(\"..\", \"..\", \"..\")\n", 48 | "# absolute path to working_with_cogstack folder\n", 49 | "base_path = os.path.abspath(_rel_path)\n", 50 | "vocab_dir = os.path.join(base_path, \"models\", \"vocab\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Changes these according to your project\n", 60 | "project_name = 'test_project' # Name of your project. Annotated documents relating to this project will be stored here.\n", 61 | "documents_to_annotate = \"cogstack_search_results/example documents to annotate.csv\" # Add your data file here\n", 62 | "\n", 63 | "modelpack = '' # enter your model here. 
Should the the output of trained 'output_modelpack'.\n", 64 | "snomed_filter_path = None\n", 65 | "\n", 66 | "\n", 67 | "# Constants (nothing to change below)\n", 68 | "data_dir = 'working_with_cogstack/data'\n", 69 | "\n", 70 | "data_path = os.path.join(base_path, data_dir, documents_to_annotate)\n", 71 | "doc_id_column = \"id\"\n", 72 | "doc_text_column = \"description\"\n", 73 | "\n", 74 | "model_dir = 'working_with_cogstack/models/modelpack'\n", 75 | "model_pack_path = os.path.join(base_path, model_dir, modelpack)\n", 76 | "\n", 77 | "ann_folder_path = os.path.join(base_path, data_dir, f'annotated_docs', project_name)\n", 78 | "if not os.path.exists(ann_folder_path):\n", 79 | " os.makedirs(ann_folder_path)\n", 80 | " print(f'Created folder to store annotations here: {ann_folder_path}')\n", 81 | " \n", 82 | "save_path_annotations_per_doc = os.path.join(base_path, ann_folder_path, \".json\")\n" 83 | ] 84 | }, 85 | { 86 | "attachments": {}, 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "# Load MedCAT model" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# Create CAT - the main class from medcat used for concept annotation\n", 100 | "cat = CAT.load_model_pack(model_pack_path)" 101 | ] 102 | }, 103 | { 104 | "attachments": {}, 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "# Annotate" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# Set snomed filter if needed\n", 118 | "# This is a white list filter of concepts\n", 119 | "if snomed_filter_path:\n", 120 | " snomed_filter = set(json.load(open(snomed_filter_path)))\n", 121 | "else:\n", 122 | " print('There is no concept filter set')\n", 123 | " snomed_filter = set(cat.cdb.cui2preferred_name.keys())\n", 124 | "\n", 125 | "cat.config.linking['filters']['cuis'] = snomed_filter \n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "df = pd.read_csv(data_path)[[doc_id_column, doc_text_column]] # Not necessary to filter at this step. 
But this loads only what is required\n" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "scrolled": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "# Create generator object\n", 146 | "def data_iterator(data, doc_name, doc_text):\n", 147 | " for id, row in data.iterrows():\n", 148 | " yield (row[doc_name], row[doc_text])" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "batch_char_size = 50000 # Batch size (BS) in number of characters\n", 158 | "cat.multiprocessing_batch_char_size(data_iterator(df, doc_id_column, doc_text_column),\n", 159 | " batch_size_chars=batch_char_size,\n", 160 | " only_cui=False,\n", 161 | " nproc=8, # Number of processors\n", 162 | " out_split_size_chars=20*batch_char_size,\n", 163 | " save_dir_path=ann_folder_path,\n", 164 | " min_free_memory=0.1,\n", 165 | " )\n", 166 | "\n", 167 | "medcat_logger.warning(f'Annotation process complete!')\n" 168 | ] 169 | }, 170 | { 171 | "attachments": {}, 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "### Double check if everything has been annotated.\n", 176 | "\n", 177 | "This does not check meta-annotations" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "# Check if everything has run smoothly. If an error has been raised check the logs\n", 187 | "try:\n", 188 | " # Path to your pickle file\n", 189 | " pickle_file_path = os.path.join(ann_folder_path, \"annotated_ids.pickle\")\n", 190 | " # Open the pickle file in read mode\n", 191 | " with open(pickle_file_path, \"rb\") as pickle_file:\n", 192 | " loaded_data = pickle.load(pickle_file)\n", 193 | " assert len(df) == len(loaded_data[0])\n", 194 | "except AssertionError as e:\n", 195 | " print(\"Error:\", \"There are documents which havent been annotated! 
Check 'medcat.log' for more info\")\n" 196 | ] 197 | }, 198 | { 199 | "attachments": {}, 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "END OF SCRIPT" 204 | ] 205 | }, 206 | { 207 | "attachments": {}, 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [] 211 | }, 212 | { 213 | "attachments": {}, 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "### Inspect the model" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "text = \"He was diagnosed with heart failure\"\n", 227 | "doc = cat(text)\n", 228 | "print(doc.ents)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "# Display Snomed codes\n", 238 | "for ent in doc.ents:\n", 239 | " print(ent, \" - \", ent._.cui, \" - \", cat.cdb.cui2preferred_name[ent._.cui])" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "# To show semantic types for each entity\n", 249 | "for ent in doc.ents:\n", 250 | " print(ent, \" - \", cat.cdb.cui2type_ids.get(ent._.cui))" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "# Display\n", 260 | "from spacy import displacy\n", 261 | "displacy.render(doc, style='ent', jupyter=True)" 262 | ] 263 | }, 264 | { 265 | "attachments": {}, 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "# Alternative approach" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "scrolled": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "# This approach does not use multiprocessing. 
But iterates line by line through your dataset.\n", 281 | "\n", 282 | "docs = {}\n", 283 | "print(f\"Len of df: {len(df)}\") \n", 284 | "\n", 285 | "for i, row in tqdm(df.iterrows(), total=df.shape[0]):\n", 286 | " text = str(row[doc_text_column])\n", 287 | " \n", 288 | " # Skip text if under 10 characters,\n", 289 | " if len(text) > 10:\n", 290 | " docs[row[doc_id_column]] = cat.get_entities(text)\n", 291 | " else:\n", 292 | " docs[row[doc_id_column]] = []" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "cat.cdb.print_stats()" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "# Save to file (docs is docs 2 annotations)\n", 311 | "json.dump(docs, open(save_path_annotations_per_doc, \"w\"))\n" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [] 320 | } 321 | ], 322 | "metadata": { 323 | "kernelspec": { 324 | "display_name": "Python 3", 325 | "language": "python", 326 | "name": "python3" 327 | }, 328 | "language_info": { 329 | "codemirror_mode": { 330 | "name": "ipython", 331 | "version": 3 332 | }, 333 | "file_extension": ".py", 334 | "mimetype": "text/x-python", 335 | "name": "python", 336 | "nbconvert_exporter": "python", 337 | "pygments_lexer": "ipython3", 338 | "version": "3.10.8" 339 | }, 340 | "vscode": { 341 | "interpreter": { 342 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" 343 | } 344 | } 345 | }, 346 | "nbformat": 4, 347 | "nbformat_minor": 4 348 | } 349 | -------------------------------------------------------------------------------- /medcat/3_run_model/run_model.py: -------------------------------------------------------------------------------- 1 | from medcat.cat import CAT 2 | import os 3 | import pandas as pd 4 | import json 5 | 6 | import logging 7 | medcat_logger = logging.getLogger('medcat') 8 | fh = logging.FileHandler('medcat.log') 9 | medcat_logger.addHandler(fh) 10 | 11 | import sys 12 | sys.path.append(os.path.join('..', '..')) 13 | from credentials import * 14 | from cogstack import CogStack 15 | 16 | 17 | # relative to file path 18 | _FILE_DIR = os.path.dirname(__file__) 19 | # relative path to working_with_cogstack folder 20 | _REL_PATH = os.path.join("..", "..", "..") 21 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH) 22 | # absolute path to working_with_cogstack folder 23 | BASE_PATH = os.path.abspath(_BASE_PATH) 24 | vocab_dir = os.path.join(BASE_PATH, "models", "vocab") 25 | 26 | # Initialise search 27 | cs = CogStack(hosts=hosts, username=username, password=password, api=True) 28 | 29 | cogstack_indices = [''] # Enter your list of relevant cogstack indices here 30 | 31 | # log size of indices 32 | df = cs.DataFrame(index=cogstack_indices, columns=['body_analysed']) # type: ignore 33 | medcat_logger.warning(f'The index size is {df.shape[0]}!') 34 | del df 35 | 36 | # Initialise the model 37 | base_path = BASE_PATH 38 | model_dir = os.path.join('models', 'modelpack') 39 | 40 | modelpack = '' # enter your model here. Should be the output of trained 'output_modelpack' from step 2. 
41 | model_pack_path = os.path.join(base_path, model_dir, modelpack) 42 | 43 | snomed_filter_path = None 44 | 45 | data_dir = 'data' 46 | ann_folder_path = os.path.join(base_path, data_dir, f'annotated_docs') 47 | if not os.path.exists(ann_folder_path): 48 | os.makedirs(ann_folder_path) 49 | 50 | medcat_logger.warning(f'Anntotations will be saved here: {ann_folder_path}') 51 | 52 | # Load CAT - the main class from medcat used fro concept annotation 53 | cat = CAT.load_model_pack(model_pack_path) 54 | 55 | # Set snomed filter if needed 56 | # This is a white list filter of concepts 57 | if snomed_filter_path: 58 | snomed_filter = set(json.load(open(snomed_filter_path))) 59 | else: 60 | snomed_filter = set(cat.cdb.cui2preferred_name.keys()) 61 | 62 | cat.config.linking['filters']['cuis'] = snomed_filter 63 | del snomed_filter 64 | 65 | # build query, change as appropriate 66 | query = { 67 | "query": { 68 | "match_all": {} 69 | }, 70 | "_source":["_id", "body_analysed"] 71 | } 72 | 73 | search_gen = cs.get_docs_generator(index=cogstack_indices, query=query, request_timeout=None) 74 | 75 | def relevant_text_gen(generator, doc_id = '_id', text_col='body_analysed'): 76 | for i in generator: 77 | try: 78 | yield (i[doc_id], i['_source'][text_col]) 79 | except KeyError: 80 | # medcat_logger.warning(f'KeyError {text_col} not found') 81 | continue 82 | 83 | batch_char_size = 500000 # Batch size (BS) in number of characters 84 | 85 | cat.multiprocessing_batch_char_size(relevant_text_gen(search_gen), 86 | batch_size_chars=batch_char_size, 87 | only_cui=False, 88 | nproc=8, # Number of processors 89 | out_split_size_chars=20*batch_char_size, 90 | save_dir_path=ann_folder_path, 91 | min_free_memory=0.1, 92 | ) 93 | 94 | medcat_logger.warning(f'Annotation process complete!') 95 | 96 | -------------------------------------------------------------------------------- /medcat/ReadMe.md: -------------------------------------------------------------------------------- 1 | # Medical oncept Annotation Tool 2 | 3 | This directory contains information on retrieving data and creating models 4 | All details regarding creating, building and running the NLP model are stored here. 5 | 6 | ## Locations for storing data: 7 | 8 | - The [data](/data) directory stores textual content. 9 | Methods for retrieving data should be stored in the [retrieve_data](/search) folder. 10 | 11 | - The [MedCAT models](/data/medcat_models) directory holds models. 12 | 13 | ## Order of processing steps 14 | 15 | #### [__Step 1__](/medcat/1_create_model): Create the model 16 | 17 | Each of the model components are found [here.](/medcat/1_create_model) 18 | This directory contains all the components required to initialise a model pack. 19 | 20 | All models should be stored [here.](/models) 21 | 22 | 23 | #### [__Step 2__](/medcat/2_train_model): Perform training 24 | 25 | - [__Step 2.1__](/medcat/2_train_model/1_unsupervised_training): Unsupervised training 26 | 27 | The unsupervised training steps can be found within unsupervised_training folder. 28 | 29 | 30 | - [__Step 2.2__](/medcat/2_train_model/2_supervised_training): Supervised training 31 | 32 | After providing supervised labels with MedCATtrainer. 33 | The supervised training steps can be found within supervised_training folder. 34 | 35 | #### [__Step 3__](/medcat/3_run_model): Run model 36 | 37 | Run model on your corpus of documents and write to csv/sql db. 
38 | Instructions on how to do this can be found within [run_model](/medcat/3_run_model/run_model.ipynb) 39 | 40 | 41 | ## General guidance on how to run an NER annotation project 42 | 43 | 1. Establish your Aims, Hypothesis and Scope. 44 | 45 | 2. Define your cohort/dataset. How will you identify your cohort and relevant documents? 46 | 47 | 3. Select a standardised clinical terminology and version most suitable fit your use case. 48 | 49 | 4. Select an existing model or create your own. 50 | 51 | 5. Produce annotation guidelines. Create a “gold standard”. Manually label you’re a sample of your dataset through annotations. This sample must be as representative as possible to ensure optimal model performance. 52 | 53 | 6. Train and compare the model to your “gold standard”. These annotations can be used for supervised training or benchmarking model performance. 54 | 55 | 7. Calculate performance metrics against the annotation sample. 56 | 57 | 8. Run over your entire data set. 58 | 59 | 9. Random stratified subsample review of performance. 60 | 61 | 10. (Optional generalisability) Test model at an external site/dataset validation of steps 8,9. 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /medcat/compare_models/cmp_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Type, TypeVar, Generic, Iterable, Callable, Optional 2 | 3 | import sqlite3 4 | import re 5 | from pydantic import BaseModel 6 | 7 | 8 | T = TypeVar('T', bound=BaseModel) 9 | 10 | 11 | def sanitize_table_name(name, max_length=64): 12 | # Replace any characters not allowed in table names with underscores 13 | name = re.sub(r'[^a-zA-Z0-9_$]', '_', name) 14 | # Truncate the name if it's too long 15 | name = name[:max_length] 16 | return name 17 | 18 | 19 | class SaveOptions(BaseModel): 20 | use_db: bool = False 21 | db_file_name: Optional[str] = None 22 | clean_callback: Optional[Callable[[], None]] = None 23 | 24 | 25 | class DifferenceDatabase(Generic[T]): 26 | 27 | def __init__(self, db_file: str, part: str, model_type: Type[T], 28 | batch_size: int = 100): 29 | self.db_file = db_file 30 | self.part = sanitize_table_name(part) 31 | self.model_type = model_type 32 | self.conn = sqlite3.connect(self.db_file) 33 | self.cursor = self.conn.cursor() 34 | self._create_table() 35 | self._len = 0 36 | self._batch_size = batch_size 37 | 38 | def _create_table(self): 39 | self.cursor.execute(f'''CREATE TABLE IF NOT EXISTS differences_{self.part} 40 | (id INTEGER PRIMARY KEY, data TEXT)''') 41 | self.conn.commit() 42 | 43 | def append(self, difference: T): 44 | data = difference.json() 45 | self.cursor.execute(f"INSERT INTO differences_{self.part} (data) VALUES (?)", (data,)) 46 | self.conn.commit() 47 | self._len += 1 48 | 49 | def __iter__(self) -> Iterable[T]: 50 | self.cursor.execute(f"SELECT data FROM differences_{self.part}") 51 | while True: 52 | rows = self.cursor.fetchmany(self._batch_size) 53 | if not rows: 54 | break 55 | for row in rows: 56 | yield self.model_type.parse_raw(row[0]) 57 | 58 | def __len__(self) -> int: 59 | return self._len 60 | 61 | def __del__(self): 62 | self.conn.close() 63 | -------------------------------------------------------------------------------- /medcat/compare_models/comp_nbhelper.py: -------------------------------------------------------------------------------- 1 | from ipyfilechooser import FileChooser 2 | from ipywidgets import widgets 3 | from IPython.display import 
display 4 | import os 5 | from typing import List, Optional 6 | 7 | 8 | from compare import get_diffs_for 9 | from output import parse_and_show, show_dict_deep, compare_dicts 10 | 11 | 12 | _def_path = '../../models/modelpack' 13 | _def_path = _def_path if os.path.exists(_def_path) else '.' 14 | 15 | 16 | class NBComparer: 17 | 18 | def __init__(self, model_path_1: str, model_path_2: str, 19 | documents_file: str, doc_limit: int, is_mct_export_compare: bool, 20 | cui_filter: str, filter_children: bool) -> None: 21 | self.model_path_1 = model_path_1 22 | self.model_path_2 = model_path_2 23 | self.documents_file = documents_file 24 | self.doc_limit = doc_limit 25 | self.is_mct_export_compare = is_mct_export_compare 26 | self.cui_filter = cui_filter 27 | self.filter_children = filter_children 28 | self._run_comparison() 29 | 30 | def _run_comparison(self): 31 | (self.cdb_comp, self.tally1, self.tally2, self.ann_diffs) = get_diffs_for( 32 | self.model_path_1, self.model_path_2, self.documents_file, 33 | cui_filter=self.cui_filter, include_children_in_filter=self.filter_children, 34 | supervised_train_comparison_model=self.is_mct_export_compare, doc_limit=self.doc_limit) 35 | 36 | def show_all(self): 37 | parse_and_show(self.cdb_comp, self.tally1, self.tally2, self.ann_diffs) 38 | 39 | def show_per_document(self, limit: int = -1, print_delimiter: bool = True, 40 | ignore_empty: bool = True): 41 | cnt = 0 42 | for key in self.ann_diffs.per_doc_results.keys(): 43 | comp_dict = self.ann_diffs.per_doc_results[key].nr_of_comparisons 44 | if not ignore_empty or comp_dict: # ignore empty ones 45 | if print_delimiter: 46 | print('='*20,f'\n{key}', f'\n{"="*20}') 47 | show_dict_deep(self.ann_diffs.per_doc_results[key].nr_of_comparisons) 48 | cnt += 1 49 | if limit > -1 and cnt == limit: 50 | break 51 | 52 | def diffs_to_csv(self, file_path: str) -> None: 53 | self.ann_diffs.to_csv(file_path) 54 | 55 | def compare_for_cui(self, cui: str, include_children: int = 2) -> None: 56 | per_cui1 = self.tally1.get_for_cui(cui, include_children=include_children) 57 | per_cui2 = self.tally2.get_for_cui(cui, include_children=include_children) 58 | compare_dicts(per_cui1, per_cui2) 59 | 60 | def show_docs(self, docs: List[str], show_delimiter: bool = True, 61 | omit_identical: bool = True): 62 | for doc_name, pair in self.ann_diffs.iter_ann_pairs(docs=docs, omit_identical=omit_identical): 63 | if show_delimiter: 64 | print('='*20,f'\n{doc_name} ({pair.comparison_type})', f'\n{"="*20}') 65 | # NOTE: if only one of the two has an annotation, the other one will be None 66 | # the following will deal with that automatically, though 67 | compare_dicts(pair.one, pair.two) 68 | 69 | 70 | class NBInputter: 71 | models_overall_title = "Models and data" 72 | mc1_title = "Choose model 1" 73 | mc2_title = "Choose model 2 (or an MCT export)" 74 | docs_title = "Choose the documents file (.csv with 'text' field)" 75 | docs_limit_title = "Limit the number of documents to run (-1 to disable)" 76 | mct_export_title = "Is the 2nd path an MCT export (instead of a model)?" 77 | cui_filter_title_overall = "CUI Filter" 78 | cui_filter_title_file_chooser = "Choose file with comma-separated CUIs" 79 | cui_filter_title_text = "List comma-separated CUIs" 80 | cui_children_title = "How many layers of children of concepts to include?" 
81 | 82 | def __init__(self) -> None: 83 | self.model1_chooser = FileChooser(_def_path) 84 | self.model2_chooser = FileChooser(_def_path) 85 | self.documents_chooser = FileChooser(".") 86 | self.doc_limit = widgets.IntText(-1) 87 | self.ckbox = widgets.Checkbox(description="MCT export compare") 88 | 89 | self.cui_filter_chooser = FileChooser(".", description="The CUI filter file") 90 | self.cui_filter_box = widgets.Textarea(description="CUI list") 91 | self.cui_children = widgets.IntText(description="Children", value=-1) 92 | 93 | def show_all(self): 94 | model_choosers = widgets.VBox([ 95 | widgets.HTML(f"

{self.models_overall_title}

"), 96 | widgets.VBox([widgets.Label(self.mc1_title), self.model1_chooser]), 97 | widgets.VBox([widgets.Label(self.mc2_title), self.model2_chooser]), 98 | widgets.VBox([widgets.Label(self.docs_title), self.documents_chooser]), 99 | widgets.VBox([widgets.Label(self.docs_limit_title), self.doc_limit]), 100 | widgets.VBox([widgets.Label(self.mct_export_title), self.ckbox]) 101 | ]) 102 | 103 | cui_filter = widgets.VBox([ 104 | widgets.HTML(f"

{self.cui_filter_title_overall}

"), 105 | widgets.VBox([widgets.Label(self.cui_filter_title_file_chooser), self.cui_filter_chooser]), 106 | widgets.VBox([widgets.Label(self.cui_filter_title_text), self.cui_filter_box]), 107 | widgets.VBox([widgets.Label(self.cui_children_title), self.cui_children]) 108 | ]) 109 | 110 | # Combine all sections into a main VBox 111 | main_box = widgets.VBox([ 112 | model_choosers, 113 | cui_filter 114 | ]) 115 | display(main_box) 116 | 117 | 118 | def _get_params(self): 119 | model_path_1 = self.model1_chooser.selected 120 | model_path_2 = self.model2_chooser.selected 121 | documents_file = self.documents_chooser.selected 122 | doc_limit = self.doc_limit.value 123 | is_mct_export_compare = self.ckbox.value 124 | if not is_mct_export_compare: 125 | print(f"For models, selected:\nModel1: {model_path_1}\nModel2: {model_path_2}" 126 | f"\nDocuments: {documents_file}") 127 | else: 128 | print(f"Selected:\nModel: {model_path_1}\nMCT export: {model_path_2}" 129 | f"\nDocuments: {documents_file}") 130 | # CUI filter 131 | cui_filter = None 132 | filter_children = None 133 | if self.cui_filter_chooser.selected: 134 | cui_filter = self.cui_filter_chooser.selected 135 | elif self.cui_filter_box.value: 136 | cui_filter = self.cui_filter_box.value 137 | if self.cui_children.value and self.cui_children.value > 0: 138 | filter_children = self.cui_children.value 139 | print(f"For CUI filter, selected:\nFilter: {cui_filter}\nChildren: {filter_children}") 140 | return (model_path_1, model_path_2, documents_file, doc_limit, is_mct_export_compare, cui_filter, filter_children) 141 | 142 | def get_comparison(self) -> NBComparer: 143 | return NBComparer(*self._get_params()) 144 | -------------------------------------------------------------------------------- /medcat/compare_models/compare.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Dict, Set, Optional, Union, Iterator 2 | from functools import partial 3 | import glob 4 | 5 | from medcat.cat import CAT 6 | 7 | import pandas as pd 8 | import tqdm 9 | import tempfile 10 | from itertools import islice 11 | 12 | from compare_cdb import compare as compare_cdbs, CDBCompareResults 13 | from compare_annotations import ResultsTally, PerAnnotationDifferences 14 | from output import parse_and_show 15 | from cmp_utils import SaveOptions 16 | from validation import validate_input 17 | 18 | 19 | 20 | def load_documents(file_name: str, doc_limit: int = -1) -> Iterator[Tuple[str, str]]: 21 | with open(file_name) as f: 22 | df = pd.read_csv(f, names=["id", "text"]) 23 | if df.iloc[0].id == "id" and df.iloc[0].text == "text": 24 | # removes the header 25 | # but also messes up the index a little 26 | df = df.iloc[1:, :] 27 | if doc_limit == -1: 28 | yield from df.itertuples(index=False) 29 | else: 30 | yield from islice(df.itertuples(index=False), doc_limit) 31 | 32 | 33 | def do_counting(cat1: CAT, cat2: CAT, 34 | ann_diffs: PerAnnotationDifferences, 35 | doc_limit: int = -1) -> ResultsTally: 36 | def cui2name(cat, cui): 37 | if cui in cat.cdb.cui2preferred_name: 38 | return cat.cdb.cui2preferred_name[cui] 39 | all_names = cat.cdb.cui2names[cui] 40 | # longest anme 41 | return sorted(all_names, key=lambda name: len(name), reverse=True)[0] 42 | res1 = ResultsTally(pt2ch=_get_pt2ch(cat1), cat_data=cat1.cdb.make_stats(), 43 | cui2name=partial(cui2name, cat1)) 44 | res2 = ResultsTally(pt2ch=_get_pt2ch(cat2), cat_data=cat2.cdb.make_stats(), 45 | cui2name=partial(cui2name, cat2)) 46 | total = doc_limit if 
doc_limit != -1 else None 47 | for per_doc in tqdm.tqdm(ann_diffs.per_doc_results.values(), total=total): 48 | res1.count(per_doc.raw1) 49 | res2.count(per_doc.raw2) 50 | return res1, res2 51 | 52 | 53 | def _get_pt2ch(cat: CAT) -> Optional[Dict]: 54 | return cat.cdb.addl_info.get("pt2ch", None) 55 | 56 | 57 | def get_per_annotation_diffs(cat1: CAT, cat2: CAT, documents: Iterator[Tuple[str, str]], 58 | show_progress: bool = True, 59 | keep_raw: bool = True, 60 | doc_limit: int = -1 61 | ) -> PerAnnotationDifferences: 62 | pt2ch1: Optional[Dict] = _get_pt2ch(cat1) 63 | pt2ch2: Optional[Dict] = _get_pt2ch(cat2) 64 | temp_file = tempfile.NamedTemporaryFile() 65 | save_opts = SaveOptions(use_db=True, db_file_name=temp_file.name, 66 | clean_callback=temp_file.close) 67 | pad = PerAnnotationDifferences(pt2ch1=pt2ch1, pt2ch2=pt2ch2, 68 | model1_cuis=set(cat1.cdb.cui2names), 69 | model2_cuis=set(cat2.cdb.cui2names), 70 | keep_raw=keep_raw, 71 | save_options=save_opts) 72 | total = doc_limit if doc_limit != -1 else None 73 | for doc_id, doc in tqdm.tqdm(documents, disable=not show_progress, total=total): 74 | pad.look_at_doc(cat1.get_entities(doc), cat2.get_entities(doc), doc_id, doc) 75 | pad.finalise() 76 | return pad 77 | 78 | 79 | def load_cui_filter(filter_file: str) -> Set[str]: 80 | with open(filter_file) as f: 81 | str_list = f.read().split(',') 82 | return set(item.strip() for item in str_list) 83 | 84 | 85 | def _add_all_children(cat: CAT, cui_filter: Set[str], include_children: int) -> None: 86 | if include_children <= 0: 87 | return 88 | if "pt2ch" not in cat.cdb.addl_info: 89 | return 90 | pt2ch = cat.cdb.addl_info["pt2ch"] 91 | children = set(ch for cui in cui_filter for ch in pt2ch.get(cui, [])) 92 | if include_children > 1: 93 | _add_all_children(cat, children, include_children=include_children-1) 94 | cui_filter.update(children) 95 | 96 | 97 | def load_and_train(model_pack_path: str, mct_export_path: str) -> CAT: 98 | cat = CAT.load_model_pack(model_pack_path) 99 | # NOTE: Allowing mct_export_path to contain wildcat ("*"). 
100 | # And in such a case, iterating over all matching files 101 | if "*" not in mct_export_path: 102 | cat.train_supervised_from_json(mct_export_path) 103 | else: 104 | for file in glob.glob(mct_export_path): 105 | cat.train_supervised_from_json(file) 106 | return cat 107 | 108 | 109 | def get_diffs_for(model_pack_path_1: str, 110 | model_pack_path_2: str, 111 | documents_file: str, 112 | cui_filter: Optional[Union[Set[str], str]] = None, 113 | show_progress: bool = True, 114 | include_children_in_filter: Optional[int] = None, 115 | supervised_train_comparison_model: bool = False, 116 | keep_raw: bool = True, 117 | doc_limit: int = -1, 118 | ) -> Tuple[CDBCompareResults, ResultsTally, ResultsTally, PerAnnotationDifferences]: 119 | validate_input(model_pack_path_1, model_pack_path_2, documents_file, cui_filter, supervised_train_comparison_model) 120 | documents = load_documents(documents_file, doc_limit=doc_limit) 121 | if show_progress: 122 | print("Loading [1]", model_pack_path_1) 123 | cat1 = CAT.load_model_pack(model_pack_path_1) 124 | if show_progress: 125 | print("Loading [2]", model_pack_path_2) 126 | if not supervised_train_comparison_model: 127 | cat2 = CAT.load_model_pack(model_pack_path_2) 128 | else: 129 | if show_progress: 130 | print("Reloading model pack 1", model_pack_path_1) 131 | print("And subsequently training on", model_pack_path_2) 132 | print("This may take a while, depending on the amount of " 133 | "data is being trained on") 134 | cat2 = load_and_train(model_pack_path_1, model_pack_path_2) 135 | if show_progress: 136 | print("Per annotations diff finding") 137 | if cui_filter: 138 | if isinstance(cui_filter, str): 139 | cui_filter = load_cui_filter(cui_filter) 140 | if show_progress: 141 | print("Applying filter to CATs:", len(cui_filter), 'CUIs') 142 | if include_children_in_filter: 143 | if show_progress: 144 | print("Adding all children of", include_children_in_filter, 145 | "or lower level from first model") 146 | _add_all_children(cat1, cui_filter, include_children_in_filter) 147 | if show_progress: 148 | print("After adding children from 1st model have a total of", 149 | len(cui_filter), "CUIs") 150 | _add_all_children(cat2, cui_filter, include_children_in_filter) 151 | if show_progress: 152 | print("After adding children from 2nd model have a total of", 153 | len(cui_filter), "CUIs") 154 | cat1.config.linking.filters.cuis = cui_filter 155 | cat2.config.linking.filters.cuis = cui_filter 156 | ann_diffs = get_per_annotation_diffs(cat1, cat2, documents, keep_raw=keep_raw, 157 | doc_limit=doc_limit) 158 | if show_progress: 159 | print("Counting [1&2]") 160 | res1, res2 = do_counting(cat1, cat2, ann_diffs, doc_limit=doc_limit) 161 | if show_progress: 162 | print("CDB compare") 163 | cdb_diff = compare_cdbs(cat1.cdb, cat2.cdb) 164 | return cdb_diff, res1, res2, ann_diffs 165 | 166 | 167 | def main(mpn1: str, mpn2: str, documents_file: str): 168 | cdb_diff, res1, res2, ann_diffs = get_diffs_for(mpn1, mpn2, documents_file, show_progress=False) 169 | print("Results:") 170 | parse_and_show(cdb_diff, res1, res2, ann_diffs) 171 | 172 | 173 | if __name__ == "__main__": 174 | import sys 175 | main(*sys.argv[1:]) -------------------------------------------------------------------------------- /medcat/compare_models/compare_cdb.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Set, Tuple 2 | 3 | from medcat.cdb import CDB 4 | 5 | import tqdm 6 | from itertools import chain 7 | 8 | from pydantic import 
BaseModel 9 | 10 | 11 | class DictCompareKeys(BaseModel): 12 | """This is based on the keys.""" 13 | total1: int 14 | """The total number of keys in 1st dict""" 15 | total2: int 16 | """The total number of keys in 2nd dict""" 17 | joint: int 18 | """The total number of keys (intersection)""" 19 | not_in_1: int 20 | """The number of keys in 2nd but not in 1st dict""" 21 | not_in_2: int 22 | """The number of keys in 1st but not in 2nd dict""" 23 | 24 | @classmethod 25 | def get(cls, d1: dict, d2: dict) -> "DictCompareKeys": 26 | # helpers 27 | all1 = set(d1) 28 | all2 = set(d2) 29 | # total keys 30 | total1 = len(all1) 31 | total2 = len(all2) 32 | # non-common keys 33 | joint = len(all1 & all2) 34 | all_combined = len(all1 | all2) 35 | not_in_1 = all_combined - total1 36 | not_in_2 = all_combined - total2 37 | return cls(total1=total1, total2=total2, joint=joint, 38 | not_in_1=not_in_1, not_in_2=not_in_2) 39 | 40 | 41 | class DictCompareValues(BaseModel): 42 | """This is based on the notion of the values being sets. 43 | 44 | With respect to the difference between `not_in_1` and `unique_in_2`: 45 | - If we have {"1": {"a", "b"}} and {"2": {"a", "b"}} 46 | - The values are identical overall (`unique_in_1==unique_in_2==0`) 47 | - However, the values are under different keys 48 | - So `not_in_1==not_in_2==2` (since this is per key) 49 | """ 50 | total1: int 51 | """The total number of values in 1st dict""" 52 | total2: int 53 | """The total number of values in 2nd dict""" 54 | not_in_1: int 55 | """The number of values in 2nd, but not in 1st (per key)""" 56 | not_in_2: int 57 | """The number of values in 1st, but not in 2nd (per key)""" 58 | joint: int 59 | """Total number of values in both 1st and 2nd dict (overall)""" 60 | unique_in_1: int 61 | """The number of unique values in 1nd (overall)""" 62 | unique_in_2: int 63 | """The number of unique values in 2nd (overall)""" 64 | 65 | @classmethod 66 | def get(cls, d1: dict, d2: dict, progress: bool = True) -> "DictCompareValues": 67 | # helpers 68 | all_keys = set(d1) | set(d2) 69 | vals_in_1 = set(chain.from_iterable(d1.values())) 70 | vals_in_2 = set(chain.from_iterable(d2.values())) 71 | # total names 72 | total1 = sum(len(v) for v in d1.values()) 73 | total2 = sum(len(v) for v in d2.values()) 74 | # names ... 
75 | not_in_1 = 0 76 | not_in_2 = 0 77 | for key in tqdm.tqdm(all_keys, desc="keys", disable=not progress): 78 | n1 = d1.get(key, set()) 79 | n2 = d2.get(key, set()) 80 | all_vals4key = len(n1 | n2) 81 | not_in_1 += all_vals4key - len(n1) 82 | not_in_2 += all_vals4key - len(n2) 83 | # names in common 84 | joint = len(vals_in_1 & vals_in_2) 85 | # names unique to one of the two 86 | vals_in_one_but_not_both = vals_in_1 ^ vals_in_2 87 | unique_in_1 = len(vals_in_one_but_not_both & vals_in_1) 88 | unique_in_2 = len(vals_in_one_but_not_both & vals_in_2) 89 | return cls(total1=total1, total2=total2, not_in_1=not_in_1, 90 | not_in_2=not_in_2, joint=joint, 91 | unique_in_1=unique_in_1, unique_in_2=unique_in_2) 92 | 93 | 94 | class DictComparisonResults(BaseModel): 95 | keys: DictCompareKeys 96 | values: DictCompareValues 97 | 98 | @classmethod 99 | def get(cls, d1: dict, d2: dict, progress: bool = True) -> "DictComparisonResults": 100 | return cls(keys=DictCompareKeys.get(d1, d2), 101 | values=DictCompareValues.get(d1, d2, progress=progress)) 102 | 103 | 104 | class CDBCompareResults(BaseModel): 105 | names: DictComparisonResults 106 | snames: DictComparisonResults 107 | 108 | 109 | def compare(cdb1: CDB, 110 | cdb2: CDB, 111 | show_progress: bool = True) -> CDBCompareResults: 112 | """_summary_ 113 | 114 | Args: 115 | cdb1 (CDB): _description_ 116 | cdb2 (CDB): _description_ 117 | show_progress (bool, optional): _description_. Defaults to True. 118 | 119 | Returns: 120 | CDBCompareResults: _description_ 121 | """ 122 | reg = DictComparisonResults.get(cdb1.cui2names, cdb2.cui2names, progress=show_progress) 123 | snames = DictComparisonResults.get(cdb1.cui2snames, cdb2.cui2snames, progress=show_progress) 124 | return CDBCompareResults(names=reg, snames=snames) 125 | -------------------------------------------------------------------------------- /medcat/compare_models/data/demo-physio-mobility/cui_filter.csv: -------------------------------------------------------------------------------- 1 | 
289001005,289004002,226207007,165224005,129043005,129040008,129041007,704440004,704439001,704437004,129065005,129039006,129035000,165232002,1080000000000000,716422006,45850009,284908004,129045003,129062008,714887007,714916007,719024002,715127003,714915006,282882001,302040002,302043000,165243005,365112008,165255004,105504002,301563003,301497008,160680006,301589003,165248001,165249009,362000000000000,270469004,160729004,248000000000000,160734000,405807003,160685001,285035003,285038001,285034004,428483004,1912002,431188001,864000000000000,301563003,1070000000000000,301497008,78459008,284915007,307439001,229798009,229799001,229797004,31000000000000,818000000000000,763692001,404934007,863721000000101,1100000017,1100000016,1100000030,1100000030,1100000011,863721000000101,863721000000101,863721000000101,282884000,282884000,282884000,1100000027,361721000000103,1100000028,361721000000103,1100000028,361721000000103,1100000027,361721000000103,361721000000103,1100000027,1100000028,1100000027,361721000000103,361721000000103,1100000028,1100000031,31031000119102,310131003,895488007,895488007,394923006,394923006,394923006,394923006,394923006,248171000000108,248171000000108,248171000000108,248171000000108,1100000015,1100000015,1100000012,1100000012,1100000013,1100000012,302046008,25711000087100,165233007,1100000029,718705001,718360006,282871009,895486006,895486006,895486006,895486006,699650006,699650006,8510008,273302005,306171006,257301003,404930003,404930003,404930003,224221006,261001000,184156005,184156005,183376001,154091000119106,301627005,301627005,725594005,445414007,165803005,323701000000101,72042002,24029004,282971008,10610811000001107,161903000,979501000000100,301477003,282966001,1149222004,371153006,311925007,225602000,763264000,249902000,249902000,223600005,386323002,37013008,205511000000108,325831000000100,1073861000000108,273469003,129032002,286489001,761481000000107,129072006,1073311000000100,286489001,286490005,129026007,160689007,286493007,129031009,1069991000000102,1071641000000109,960681000000109 -------------------------------------------------------------------------------- /medcat/compare_models/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/medcat/compare_models/tests/__init__.py -------------------------------------------------------------------------------- /medcat/compare_models/tests/resources/docs/not_real.csv: -------------------------------------------------------------------------------- 1 | "id","text" 2 | "-1","Not real text. 
Just Virus and Virus Z" 3 | "-2","Really not real Virus text" -------------------------------------------------------------------------------- /medcat/compare_models/tests/resources/mct_export/medcat_trainer_expoert2.json: -------------------------------------------------------------------------------- 1 | {"projects": 2 | [ 3 | { 4 | "name": "SAMPLE FAKE PROJECT", 5 | "id": -2, 6 | "cuis": "", 7 | "tuis": "", 8 | "documents": [ 9 | { 10 | "id": -2, 11 | "name": "FAKE-TEXT", 12 | "text": "FAKE TEXT WITH fake concepts, i.e Virus Z, and Virus.", 13 | "annotations": [ 14 | {"id": -3, "user": "fake", "cui": "C0000139", "value": "gastroesophageal reflux", "start": 34, "end": 41, "validated": true, "correct": true, "deleted": false, "alternative": false, "killed": false, "last_modified": "2024-04-16 11:54:00.00000+00:00", "manually_created": false, "acc": 1.0, "meta_anns": []}, 15 | {"id": -4, "user": "fake", "cui": "C0000039", "value": "hypertension", "start": 47, "end": 52, "validated": true, "correct": true, "deleted": false, "alternative": false, "killed": false, "last_modified": "2020-04-01 22:06:30.394941+00:00", "manually_created": false, "acc": 1.0, "meta_anns": []} 16 | ] 17 | } 18 | ] 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /medcat/compare_models/tests/resources/mct_export/medcat_trainer_export.json: -------------------------------------------------------------------------------- 1 | {"projects": 2 | [ 3 | { 4 | "name": "SAMPLE FAKE PROJECT", 5 | "id": -1, 6 | "cuis": "", 7 | "tuis": "", 8 | "documents": [ 9 | { 10 | "id": -1, 11 | "name": "FAKE-TEXT", 12 | "text": "FAKE TEXT WITH fake concepts, i.e Virus, and Virus Z.", 13 | "annotations": [ 14 | {"id": -1, "user": "fake", "cui": "C0000039", "value": "gastroesophageal reflux", "start": 34, "end": 39, "validated": true, "correct": true, "deleted": false, "alternative": false, "killed": false, "last_modified": "2024-04-16 11:54:00.00000+00:00", "manually_created": false, "acc": 1.0, "meta_anns": []}, 15 | {"id": -2, "user": "fake", "cui": "C0000139", "value": "hypertension", "start": 45, "end": 52, "validated": true, "correct": true, "deleted": false, "alternative": false, "killed": false, "last_modified": "2020-04-01 22:06:30.394941+00:00", "manually_created": false, "acc": 1.0, "meta_anns": []} 16 | ] 17 | } 18 | ] 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /medcat/compare_models/tests/resources/model_pack/cdb.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/medcat/compare_models/tests/resources/model_pack/cdb.dat -------------------------------------------------------------------------------- /medcat/compare_models/tests/resources/model_pack/vocab.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/medcat/compare_models/tests/resources/model_pack/vocab.dat -------------------------------------------------------------------------------- /medcat/compare_models/tests/test_compare.py: -------------------------------------------------------------------------------- 1 | import unittest.mock 2 | from compare import _add_all_children 3 | from compare import get_diffs_for 4 | from compare import (CDBCompareResults, ResultsTally, 5 | ResultsTally, 
PerAnnotationDifferences) 6 | import unittest 7 | import os 8 | 9 | from medcat.cat import CAT 10 | 11 | 12 | class FakeCDBWithPt2Ch: 13 | 14 | def __init__(self, pt2ch: dict) -> None: 15 | self.pt2ch = pt2ch 16 | self.addl_info = {"pt2ch": self.pt2ch} 17 | 18 | 19 | class FakeCATWithCDBAndPt2Ch: 20 | 21 | def __init__(self, pt2ch: dict) -> None: 22 | self.cdb = FakeCDBWithPt2Ch(pt2ch) 23 | 24 | 25 | _PT2CH = { 26 | "C1": ["C11", "C12", "C13"], 27 | "C2": ["C21"], 28 | # grandchildren 29 | "C11": ["C111", "C112", "C113"], 30 | "C13": ["C131", "C132"], 31 | # great grandchildren 32 | "C132": ["C1321", "C1322"], 33 | } 34 | 35 | 36 | class AddAllChildrenTests(unittest.TestCase): 37 | pt2ch = _PT2CH 38 | fake_cat = FakeCATWithCDBAndPt2Ch(pt2ch) 39 | 40 | _cui_filter = set(['C1', 'C2']) 41 | a = [c for c in pt2ch.get("", [])] 42 | children_1st_order = set(ch for cui in _cui_filter for ch in _PT2CH.get(cui, [])) 43 | children_2nd_order = set(gch for ch in children_1st_order for gch in _PT2CH.get(ch, [])) 44 | 45 | @property 46 | def cui_filter(self) -> set: 47 | return set(self._cui_filter) 48 | 49 | def test_adds_no_children_with_0(self): 50 | f = self.cui_filter # copy 51 | _add_all_children(self.fake_cat, f, include_children=0) 52 | self.assertEqual(f, self.cui_filter) 53 | 54 | def test_add_first_children_with_1(self): 55 | f = self.cui_filter 56 | _add_all_children(self.fake_cat, f, include_children=1) 57 | self.assertGreater(f, self.cui_filter) 58 | self.assertEqual(f, self.cui_filter | self.children_1st_order) 59 | # no grandchildren 60 | self.assertFalse(f & self.children_2nd_order) 61 | 62 | def test_add_grandchildren_with_2(self): 63 | f = self.cui_filter 64 | _add_all_children(self.fake_cat, f, include_children=2) 65 | self.assertGreater(f, self.cui_filter) 66 | self.assertGreater(f, self.cui_filter | self.children_1st_order) 67 | self.assertEqual(f, self.cui_filter | self.children_1st_order | self.children_2nd_order) 68 | 69 | 70 | class TrainAndCompareTests(unittest.TestCase): 71 | _file_dir = os.path.dirname(__file__) 72 | _resources_path = os.path.join(_file_dir, "resources") 73 | cat_path = os.path.join(_resources_path, "model_pack") 74 | mct_export_path_1 = os.path.join(_resources_path, "mct_export", "medcat_trainer_export.json") 75 | mct_export_path_glob = os.path.join(_resources_path, "mct_export", "medcat_trainer_export*.json") 76 | docs_file = os.path.join(_resources_path, "docs", "not_real.csv") 77 | 78 | # this tests that the training is called 79 | @classmethod 80 | @unittest.mock.patch("medcat.cat.CAT.train_supervised_from_json") 81 | def _get_diffs(cls, mct_export_path: str, method): 82 | diffs = get_diffs_for(cls.cat_path, mct_export_path, cls.docs_file, 83 | supervised_train_comparison_model=True) 84 | cls.assertTrue(cls, method.called) 85 | return diffs 86 | 87 | 88 | @classmethod 89 | def setUpClass(cls) -> None: 90 | ann_diffs1 = cls._get_diffs(cls.mct_export_path_1) 91 | cls.cdb_comp1, cls.tally1_1, cls.tally1_2, cls.ann_diffs1 = ann_diffs1 92 | ann_diffs_many = cls._get_diffs(cls.mct_export_path_glob) 93 | cls.cdb_comp_many, cls.tally_many_1, cls.tally_many_2, cls.ann_diffs_many = ann_diffs_many 94 | 95 | def test_compares_with_one_file(self): 96 | self.assertIsInstance(self.cdb_comp1, CDBCompareResults) 97 | self.assertIsInstance(self.tally1_1, ResultsTally) 98 | self.assertIsInstance(self.tally1_2, ResultsTally) 99 | self.assertIsInstance(self.ann_diffs1, PerAnnotationDifferences) 100 | 101 | def test_compares_with_multiple_file(self): 102 | 
self.assertIsInstance(self.cdb_comp_many, CDBCompareResults) 103 | self.assertIsInstance(self.tally_many_1, ResultsTally) 104 | self.assertIsInstance(self.tally_many_2, ResultsTally) 105 | self.assertIsInstance(self.ann_diffs_many, PerAnnotationDifferences) 106 | -------------------------------------------------------------------------------- /medcat/compare_models/tests/test_compare_cdb.py: -------------------------------------------------------------------------------- 1 | import compare_cdb 2 | 3 | import unittest 4 | EXAMPLE1 = { 5 | "C0": {"n01", "n02", "n03"}, # 1 non-unique (#2 CS) 6 | "C1": {"n11", "n12" }, 7 | 8 | "C3": {"n31", "n33"}, # adds 1 CUI, 2 names 9 | 10 | "C5": { "n53"}, # adds 1 CUI, 1 name 11 | } 12 | EXAMPLE2 = { 13 | "C0": {"n01", "n02", "n03"}, # 1 non-unique (CS) 14 | "C1": {"n11", "n12", "n13"}, # adds 1 name 15 | "C2": {"n21", "n23"}, # adds 1 CUI, 2 names 16 | 17 | "C4": {"n41", "n42", "n43"}, # adds 1 CUI, 3 names; 1 non-unique (CS) 18 | 19 | "CS": {"n01", "n42", }, # adds 1 CUI, no names 20 | } 21 | # this should be equivalent to the above 22 | EXPECTED_VALUES_MAN = compare_cdb.DictCompareValues(total1=8, 23 | total2=13, 24 | not_in_1=8, # n13, n21, n23, n41, n42, n43, "n01", "n42" 25 | not_in_2=3, # n31, n33, n53 26 | joint=5, # n01, n02, n03, n11, n12 27 | unique_in_1=3, # overall unique in 1st 28 | unique_in_2=6, # overall unique in 2nd 29 | ) 30 | 31 | keys1 = set(EXAMPLE1.keys()) 32 | keys2 = set(EXAMPLE2.keys()) 33 | EXPECTED_KEYS = compare_cdb.DictCompareKeys(total1=len(keys1), 34 | total2=len(keys2), 35 | joint=len(keys1 & keys2), 36 | not_in_1=(len(keys1 | keys2)) - len(keys1), 37 | not_in_2=(len(keys1 | keys2)) - len(keys2),) 38 | # this should be equivalent to the above 39 | EXPECTED_KEYS_MAN = compare_cdb.DictCompareKeys(total1=4, # C0, C1, C3, C5 40 | total2=5, # C0, C1, C2, C4, CS 41 | joint=2, # C0, C1 42 | not_in_1=3, # C2, C4, CS 43 | not_in_2=2, # C3, C5 44 | ) 45 | vals1 = set(e for v in EXAMPLE1.values() for e in v) 46 | total1 = sum(len(v) for v in EXAMPLE1.values()) 47 | vals2 = set(e for v in EXAMPLE2.values() for e in v) 48 | total2 = sum(len(v) for v in EXAMPLE2.values()) 49 | EXPECTED_VALUES = compare_cdb.DictCompareValues(total1=total1, 50 | total2=total2, 51 | not_in_1=8, # the new/misplaced CUIs in 2nd 52 | not_in_2=3, # the new/misplaced CUIs in 1st 53 | joint=len(vals1 & vals2), 54 | unique_in_1=3, # overall unique in 1st 55 | unique_in_2=6, # overall unique in 2nd 56 | ) 57 | 58 | 59 | class CompareDictTests(unittest.TestCase): 60 | 61 | def test_compare_keys_works(self, d1=EXAMPLE1, d2=EXAMPLE2, exp=EXPECTED_KEYS, exp_man=EXPECTED_KEYS_MAN): 62 | res = compare_cdb.DictCompareKeys.get(d1, d2) 63 | self.assertEqual(res.dict(), exp.dict()) 64 | self.assertEqual(res.dict(), exp_man.dict()) 65 | 66 | def test_compare_values_works(self, d1=EXAMPLE1, d2=EXAMPLE2, exp=EXPECTED_VALUES, exp_man=EXPECTED_VALUES_MAN): 67 | res = compare_cdb.DictCompareValues.get(d1, d2, progress=False) 68 | self.assertEqual(res.dict(), exp.dict()) 69 | self.assertEqual(res.dict(), exp_man.dict()) 70 | 71 | -------------------------------------------------------------------------------- /medcat/compare_models/tests/test_output.py: -------------------------------------------------------------------------------- 1 | import output 2 | 3 | import contextlib 4 | import io 5 | import sys 6 | 7 | import unittest 8 | 9 | 10 | @contextlib.contextmanager 11 | def nostdout(): 12 | save_stdout = sys.stdout 13 | sys.stdout = io.StringIO() 14 | yield 15 | sys.stdout = 
save_stdout 16 | 17 | 18 | class CompareDictTests(unittest.TestCase): 19 | example_dict = {"k1": "v1", 20 | "k2": "v2", 21 | "k3": {"sk1": 1.0}} 22 | example_dict2 = {'pretty_name': 'Genus Quercus', 23 | 'cui': '53347009', 24 | 'type_ids': ['81102976'], 25 | 'types': [''], 26 | 'source_value': 'Oak', 27 | 'detected_name': 'oak', 28 | 'acc': 0.6368384509248382, 29 | 'context_similarity': 0.6368384509248382, 30 | 'start': 43, 31 | 'end': 46, 32 | 'icd10': [], 33 | 'ontologies': 34 | ['20220803_SNOMED_UK_CLINICAL_EXT'], 35 | 'snomed': [], 36 | 'id': 3, 37 | 'meta_anns': { 38 | 'Presence': {'value': 'True', 'confidence': 0.999996542930603, 'name': 'Presence'}, 39 | 'Subject': {'value': 'Patient', 'confidence': 0.9396798014640808, 'name': 'Subject'}, 40 | 'Time': {'value': 'Recent', 'confidence': 0.9999940395355225, 'name': 'Time'} 41 | } 42 | } 43 | expected_nulled_dict2 = {'pretty_name': '', 44 | 'cui': '', 45 | 'type_ids': '', 46 | 'types': '', 47 | 'source_value': '', 48 | 'detected_name': '', 49 | 'acc': '', 50 | 'context_similarity': '', 51 | 'start': '', 52 | 'end': '', 53 | 'icd10': '', 54 | 'ontologies': '', 55 | 'snomed': '', 56 | 'id': '', 57 | 'meta_anns': {} 58 | } 59 | 60 | def setUp(self) -> None: 61 | self.nulled = output._get_nulled_copy(self.example_dict) 62 | self.nulled2 = output._get_nulled_copy(self.example_dict2) 63 | 64 | def test_compare_dicts_works_1st_None(self): 65 | with nostdout(): 66 | output.compare_dicts(None, self.example_dict) 67 | 68 | def test_compare_dicts_works_2nd_None(self): 69 | with nostdout(): 70 | output.compare_dicts(self.example_dict, None) 71 | 72 | def test_expected_nulled_real(self): 73 | self.assertEqual(self.nulled2, self.expected_nulled_dict2) 74 | 75 | def test_compare_dicts_1st_only_real(self): 76 | with nostdout(): 77 | output.compare_dicts(self.example_dict2, None) 78 | -------------------------------------------------------------------------------- /medcat/compare_models/validation.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union, Set 2 | import os 3 | import glob 4 | 5 | 6 | def _is_mct_export(file_path: str) -> bool: 7 | if "*" in file_path: 8 | nr_of_matching_files = len(list(glob.iglob(file_path))) 9 | print("GLOB w", nr_of_matching_files, nr_of_matching_files > 0) 10 | return nr_of_matching_files > 0 11 | print("MCT EXPORT (no-glob?", os.path.exists(file_path), file_path.endswith(".json")) 12 | return os.path.exists(file_path) and file_path.endswith(".json") 13 | 14 | 15 | def validate_input(model_path1: str, model_path2: str, documents_file: str, 16 | cui_filter: Optional[Union[Set[str], str]], 17 | supevised_train_comp: bool): 18 | if not os.path.exists(model_path1): 19 | raise ValueError(f"No model found at specified path (1st model): {model_path1}") 20 | if not is_medcat_model(model_path1): 21 | raise ValueError(f"Not a medcat model: {model_path1}") 22 | if not os.path.exists(model_path2): 23 | if supevised_train_comp and not _is_mct_export(model_path2): 24 | raise ValueError(f"No matching MCT export found for: {model_path2}") 25 | elif not supevised_train_comp: 26 | raise ValueError(f"No file found at specified path (2nd model): {model_path2}") 27 | if supevised_train_comp: 28 | if not os.path.isfile(model_path2) and not _is_mct_export(model_path2): 29 | raise ValueError(f"MCT export provided should be a file not a folder: {model_path2}") 30 | if not model_path2.lower().endswith(".json"): 31 | raise ValueError(f"MCT export expected in .json format, Got: 
{model_path2}") 32 | elif not is_medcat_model(model_path2): 33 | raise ValueError(f"Not a medcat model: {model_path2}") 34 | if cui_filter is not None: 35 | if isinstance(cui_filter, str): 36 | if not os.path.exists(cui_filter): 37 | raise ValueError(f"File passed as CUI filter does not exist: {cui_filter}") 38 | if not os.path.exists(documents_file): 39 | raise ValueError(f"No documents file found: {documents_file}") 40 | if not documents_file.lower().endswith(".csv"): 41 | raise ValueError(f"Expected a .csv file for documnets, got: {documents_file}") 42 | 43 | 44 | def _is_medcat_model_folder(model_folder: str): 45 | # needs to have CDB and vocab 46 | cdb_path = os.path.join(model_folder, 'cdb.dat') 47 | vocab_path = os.path.join(model_folder, "vocab.dat") 48 | return ((os.path.exists(cdb_path) and os.path.isfile(cdb_path)) and 49 | (os.path.exists(vocab_path) and os.path.isfile(vocab_path))) 50 | 51 | 52 | def is_medcat_model(model_path: str) -> bool: 53 | if os.path.isdir(model_path): 54 | return _is_medcat_model_folder(model_path) 55 | model_folder = model_path[:-len(".zip")] 56 | if os.path.exists(model_folder): 57 | # NOTE: if the model folder doesn't exist, it will 58 | # be extracted upon loading the model 59 | return _is_medcat_model_folder(model_folder) 60 | # NOTE: this does not actually guarantee that it's a model pack 61 | # but it would be outside the scope of this method 62 | # to try and extract or list the contents 63 | return model_path.endswith(".zip") 64 | -------------------------------------------------------------------------------- /medcat/evaluate_mct_export/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/medcat/evaluate_mct_export/__init__.py -------------------------------------------------------------------------------- /medcat/evaluate_mct_export/mct_export_summary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Evaluate a MedCATtrainer project export\n", 9 | "\n", 10 | "Replace all <<\\>> with a custom name that refers to a file name." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from mct_analysis import MedcatTrainer_export" 20 | ] 21 | }, 22 | { 23 | "attachments": {}, 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Load MCT exports and MedCAT model" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "scrolled": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "lst_mct_export=[\n", 39 | " '../../data/medcattrainer_export/<<>>', # mct_export .json here\n", 40 | " ] \n", 41 | "\n", 42 | "mct_model = \"../../models/modelpack/<<>>\" # Enter your medcat model here\n", 43 | "\n", 44 | "\n", 45 | "mct = MedcatTrainer_export(mct_export_paths=lst_mct_export, model_pack_path= mct_model)\n", 46 | "# You can just jump to the generate the report section. 
The following code is a breakdown of the intermediate steps" 47 | ] 48 | }, 49 | { 50 | "attachments": {}, 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "# Evaluate model card" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "scrolled": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "# Load the model card\n", 66 | "mct.cat.get_model_card(as_dict=True)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# look to potentially remove any filters that exist in the model\n", 76 | "\"\"\"\n", 77 | "mct.cat.config.linking['filters']\n", 78 | "\"\"\"" 79 | ] 80 | }, 81 | { 82 | "attachments": {}, 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "# Evaluate MCT export" 87 | ] 88 | }, 89 | { 90 | "attachments": {}, 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "### View all Annotations and Meta-annotations created" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "scrolled": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# Load all annotations created\n", 106 | "anns_df = mct.annotation_df()\n", 107 | "anns_df" 108 | ] 109 | }, 110 | { 111 | "attachments": {}, 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Summarise all Meta-annotations" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Meta_annotation summary\n", 125 | "for col in anns_df.loc[:,'acc':].iloc[:,1:]:\n", 126 | " print(anns_df[col].value_counts())\n", 127 | " print()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "# Meta_annotation summary of combinations\n", 137 | "for k,v in anns_df.loc[:,'acc':].iloc[:,1:].value_counts().items():\n", 138 | " print(k,v)" 139 | ] 140 | }, 141 | { 142 | "attachments": {}, 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "### Overview of the entire MCT export\n", 147 | "This includes all names of all projects within the export and the document ids." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "scrolled": true 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "# projects\n", 159 | "anns_df['project'].unique()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# documents\n", 169 | "anns_df['document_name'].unique()" 170 | ] 171 | }, 172 | { 173 | "attachments": {}, 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "# Annotation Summary" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "performance_summary_df = mct.concept_summary()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "scrolled": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "performance_summary_df" 198 | ] 199 | }, 200 | { 201 | "attachments": {}, 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "# Annotator stats" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "# User Stats\n", 215 | "mct.user_stats()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "mct.plot_user_stats(save_fig=True, save_fig_filename='<<>>.html')" 225 | ] 226 | }, 227 | { 228 | "attachments": {}, 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "# Generate report\n", 233 | "All of the above functions added into a single Excel file report" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "# Example of function description and parameters\n", 243 | "help(mct.generate_report)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "scrolled": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "mct.generate_report(path='<<>>.xlsx')" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [] 263 | }, 264 | { 265 | "attachments": {}, 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "# Meta Annotations\n", 270 | "\n", 271 | "helper function to rename meta_task and meta_task values.\n", 272 | "\n", 273 | "__TODO:__ This Section is incomplete" 274 | ] 275 | }, 276 | { 277 | "attachments": {}, 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "### Rename meta annotation tasks" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# select which meta tasks to rename\n", 291 | "rename_meta_anns = {'Subject/Experiencer':'Subject'}\n", 292 | "# select which meta values for the corresponding meta tasks.\n", 293 | "rename_meta_anns_values = {'Subject':{'Relative':'Other'}}\n", 294 | "# run the renaming\n", 295 | "mct.rename_meta_anns(meta_anns2rename=rename_meta_anns, meta_ann_values2rename=rename_meta_anns_values)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "anns_df = mct.annotation_df()\n", 305 | 
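    "# check that the renamed meta-annotation tasks and values now appear in the dataframe\n",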
"anns_df.head()" 306 | ] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "Python 3 (ipykernel)", 312 | "language": "python", 313 | "name": "python3" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 3 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython3", 325 | "version": "3.7.3" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 5 330 | } 331 | -------------------------------------------------------------------------------- /models/ReadMe.md: -------------------------------------------------------------------------------- 1 | # Space to store all components of a MedCAT model. 2 | 3 | MedCAT modelpacks are generally comprised for 3 components: 4 | 1) CDB 5 | 2) Vocab 6 | 3) Config 7 | 8 | Other components include preprocessing tools such as SpaCY package etc.. 9 | -------------------------------------------------------------------------------- /models/cdb/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/models/cdb/.keep -------------------------------------------------------------------------------- /models/modelpack/ReadMe.md: -------------------------------------------------------------------------------- 1 | All MedCAT modelpacks should be placed here. 2 | 3 | To create a modelpack please see [create_modelpack.py](/medcat/1_create_model/create_modelpack/create_modelpack.py) for further details. 4 | -------------------------------------------------------------------------------- /models/vocab/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/models/vocab/.keep -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | # Global options: 2 | 3 | [mypy] 4 | ignore_missing_imports = True 5 | allow_redefinition = True 6 | -------------------------------------------------------------------------------- /projects/ReadMe.md: -------------------------------------------------------------------------------- 1 | # Projects 2 | 3 | This directory is a placeholder for where project workflows are organised and all relevant information relevant to a particular usecase is stored in a single location. 4 | 5 | The [demo project structure](./demo_project_stucture) is a template which you can copy and follow to meet the requirements of your usecase. 6 | 7 | ``` 8 | $ cp -r projects/demo_project_stucture projects/ 9 | ``` 10 | 11 | The majority of this information is also held throughout other sections of this repository and thus this section is simply an alternative workflow which keeps all relevant data and file pertaining to a project together. 12 | 13 | The folder names should correspond to the project and project ID for easy reference. 14 | 15 | ## Standarise Workflows (Optional) 16 | The following is just guidelines/recommendations to standardise workflow: 17 | 18 | -

Good practice is to name files with the following structure *YYYYMMDD_filename* (see the example below) 19 |
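For example (hypothetical file names): a CogStack search run on 16 April 2024 could be saved as *20240416_1_search.ipynb*, with its results written to *20240416_cogstack_search_results.csv*.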

20 | 21 | 22 | This working directory, should be used to store temporary data files. With the final scripts (main.py and other analysis scripts...) held directly in the project folder outside of the sub-folders. Any raw or intermediate data that one may want to reference later should be stored in their respective directories. 23 | 24 | A recommended format for the directory structure to efficiently manage each request is as follows: 25 | * Ideally the project_name should correspond to your CogStack request ID. 26 | 27 | 28 | ``` 29 | ── project_name/ 30 | ├── raw_data/ # raw data files 31 | │ └── cogstack_search_hits/ # search results 32 | ├── processed_data/ # intermediate reference files 33 | │ └── ann_folder_path/ # annotated documents 34 | ├── results/ # final results 35 | ├── 1_search.ipynb # search scripts 36 | ├── 2_run_model.ipynb # run model 37 | ├── 3_pipeline.ipynb # convert annotation to output pipeline 38 | ├── 4_evaluation.ipynb # evaluation of the output compared to a gold standard dataset 39 | ``` 40 | 41 | 42 | __[raw_data/]__: Contains the original, or raw, data files. Contents in this folder should be treated as read-only. 43 | 44 | __[raw_data/cogstack_search_hits/]__: Contains the search results from cogstack. Once retreived from cogstack this dataset is static. 45 | 46 | __[processed_data/]__: Contains manipulated files or partially processed files 47 | 48 | __[processed_data/ann_folder_path/]__: All direct annotation output from a medcat model should be stored here. Acts as a checkpoint from which analysis can be conducted. 49 | 50 | __[results/]__: Contains the final results and ideally explanatory markdown files. 51 | 52 | -------------------------------------------------------------------------------- /projects/demo_project_stucture/ReadMe.md: -------------------------------------------------------------------------------- 1 | # \ 2 | 3 | ## \ 4 | 5 | Feel free to edit the meta data of the contents 6 | and write information about the project and objectives here. -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | mypy 2 | pandas-stubs 3 | types-tqdm 4 | types-requests 5 | types-regex -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | spacy>=3.6.0,<4.0 2 | medcat~=1.16.0 3 | plotly~=5.19.0 4 | eland==8.12.1 5 | en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl 6 | ipyfilechooser 7 | jupyter_contrib_nbextensions 8 | -------------------------------------------------------------------------------- /search/.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb filter=strip-notebook-output 2 | -------------------------------------------------------------------------------- /search/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | # Login and search 3 | This directory contains all the scripts necessary to login and conduct a search. 4 | 5 | ## Login details 6 | 1. Create a [credentials.py](../credentials.py) 7 | 2. Populate it with your cogstack instance and login details 8 | An example template can be seen below: 9 | ``` 10 | hosts = [] # This is a list of your cogstack elasticsearch instances. 
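# e.g. (hypothetical address; use the host(s) supplied by your CogStack administrator):
# hosts = ["https://cogstack-elastic-1.example.org:9200"]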
11 | 12 | # These are your login details (either via http_auth or API) 13 | username = None 14 | password = None 15 | ``` 16 | 17 | __Note__: If these fields are left blank then the user will be prompted to enter the details themselves. 18 | 19 | If you are unsure about the above information please contact your CogStack system administrator. 20 | 21 | ## How to build a Search query 22 | 23 | A core component of cogstack is Elasticsearch which is a search engine built on top of Apache Lucene. 24 | 25 | Lucene has a custom query syntax for querying its indexes (Lucene Query Syntax). This query syntax allows for features such as Keyword matching, Wildcard matching, Regular expression, Proximity matching, Range searches. 26 | 27 | Full documentation for this syntax is available as part of Elasticsearch [query string syntax](https://www.elastic.co/guide/en/elasticsearch/reference/8.5/query-dsl-query-string-query.html#query-string-syntax). -------------------------------------------------------------------------------- /search/search_template.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Seaching CogStack\n", 8 | "\n", 9 | "This script is designed to be a template for cogstack searches" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import sys\n", 19 | "sys.path.append('..')\n", 20 | "from credentials import *\n", 21 | "from cogstack import CogStack" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Login and Initialise" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "cs = CogStack(hosts, username=username, password=password, api=True)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# Check the list of Indices and columns" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "for i in cs.elastic.indices.get_mapping().keys():\n", 54 | " print(i)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Check the list of columns in that index\n", 64 | "index = ''\n", 65 | "for col in cs.elastic.indices.get_mapping(index=index)[index]['mappings']['properties'].keys():\n", 66 | " print(col)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "# Set parameters" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "pt_list = [] # example list of patients' patient_TrustNumber here" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Columns of interest\n", 90 | "\n", 91 | "Select your fields and list in order of output columns" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "columns = []" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Build query\n", 108 | "\n", 109 | "For further information on [how to build a query can be found 
here](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)\n", 110 | "\n", 111 | "Further information on [free text string queries can be found here](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html)\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "# Example query structure\n", 121 | "query = {\n", 122 | " \"from\": 0,\n", 123 | " \"size\": 10000,\n", 124 | " \"query\": {\n", 125 | " \"bool\": {\n", 126 | " \"filter\": {\n", 127 | " \"terms\": {\"patient_TrustNumber\": pt_list}\n", 128 | " },\n", 129 | " \"must\": [\n", 130 | " {\"query_string\": {\n", 131 | " \"query\": \"***YOUR LUCENE QUERY HERE***\"}\n", 132 | " }\n", 133 | " ]\n", 134 | " }\n", 135 | " },\n", 136 | " \"_source\": columns # This is a search column filter. remove if all columns are to be retrieved\n", 137 | "}" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "tags": [] 144 | }, 145 | "source": [ 146 | "# Search, Process, and Save" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "df = cs.cogstack2df(query=query, index=index, column_headers=columns)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Process" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# Whatever you want here\n", 172 | "df.head()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## Save" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "path_to_results = \"../data/cogstack_search_results\"\n", 189 | "file_name = \"file_name.csv\"" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "df.to_csv(path_to_results + '/' +file_name, index=False)" 199 | ] 200 | } 201 | ], 202 | "metadata": { 203 | "kernelspec": { 204 | "display_name": "Python 3", 205 | "language": "python", 206 | "name": "python3" 207 | }, 208 | "language_info": { 209 | "codemirror_mode": { 210 | "name": "ipython", 211 | "version": 3 212 | }, 213 | "file_extension": ".py", 214 | "mimetype": "text/x-python", 215 | "name": "python", 216 | "nbconvert_exporter": "python", 217 | "pygments_lexer": "ipython3", 218 | "version": "3.9.6 (default, Sep 26 2022, 11:37:49) \n[Clang 14.0.0 (clang-1400.0.29.202)]" 219 | }, 220 | "vscode": { 221 | "interpreter": { 222 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" 223 | } 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 4 228 | } 229 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/__init__.py -------------------------------------------------------------------------------- /tests/medcat/1_create_model/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/1_create_model/__init__.py -------------------------------------------------------------------------------- /tests/medcat/1_create_model/create_cdb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/1_create_model/create_cdb/__init__.py -------------------------------------------------------------------------------- /tests/medcat/1_create_model/create_cdb/test_create_cdb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import medcat.cdb 4 | 5 | _FILE_DIR = os.path.dirname(__file__) 6 | 7 | # because this project isn't (at least of of writing this) 8 | # set up as a python project, there are no __init__.py 9 | # files in each folder 10 | # as such, in order to gain access to the relevant module, 11 | # I'll need to add the path manually 12 | _WWC_BASE_FOLDER = os.path.join(_FILE_DIR, "..", "..", "..", "..") 13 | MEDCAT_EVAL_MCT_EXPORT_FOLDER = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "medcat", "1_create_model", "create_cdb")) 14 | sys.path.append(MEDCAT_EVAL_MCT_EXPORT_FOLDER) 15 | # now we are able to import create_cdb and/or create_umls_cdb 16 | 17 | import unittest 18 | from unittest.mock import patch 19 | 20 | # SNOMED pre-cdb csv 21 | PRE_CDB_CSV_PATH_SNOMED = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources", "example_cdb_input_snomed.csv") 22 | PRE_CDB_CSV_PATH_UMLS = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources", "example_cdb_input_umls.csv") 23 | 24 | 25 | def get_mock_input(output: str): 26 | def mock_input(prompt: str): 27 | return output 28 | return mock_input 29 | 30 | 31 | class CreateCDBTest(unittest.TestCase): 32 | 33 | def setUp(self) -> None: 34 | self.output_cdb = None 35 | 36 | def tearDown(self) -> None: 37 | if self.output_cdb is not None and os.path.exists(self.output_cdb): 38 | os.remove(self.output_cdb) 39 | 40 | def assertHasCDB(self, path: str): 41 | self.assertTrue(os.path.exists(path)) 42 | self.assertTrue(path.endswith(".dat")) 43 | cdb = medcat.cdb.CDB.load(path) 44 | self.assertIsInstance(cdb, medcat.cdb.CDB) 45 | 46 | def test_snomed_cdb_creation(self): 47 | # Replace the 'input' function with 'mock_input' 48 | with patch('builtins.input', side_effect=get_mock_input(PRE_CDB_CSV_PATH_SNOMED)): 49 | import create_cdb 50 | self.output_cdb = create_cdb.output_cdb 51 | self.assertHasCDB(self.output_cdb) 52 | 53 | def test_umls_cdb_creation(self): 54 | with patch('builtins.input', side_effect=get_mock_input(PRE_CDB_CSV_PATH_UMLS)): 55 | import create_umls_cdb 56 | self.output_cdb = create_umls_cdb.output_cdb 57 | self.assertHasCDB(self.output_cdb) 58 | -------------------------------------------------------------------------------- /tests/medcat/1_create_model/create_modelpack/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/1_create_model/create_modelpack/__init__.py -------------------------------------------------------------------------------- /tests/medcat/1_create_model/create_modelpack/test_create_modelpack.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 
3 | 4 | import unittest 5 | # import unittest.mock 6 | 7 | import tempfile 8 | 9 | 10 | # relative to file path 11 | _FILE_DIR = os.path.dirname(__file__) 12 | # because this project isn't (at least of of writing this) 13 | # set up as a python project, there are no __init__.py 14 | # files in each folder 15 | # as such, in order to gain access to the relevant module, 16 | # I'll need to add the path manually 17 | _WWC_BASE_FOLDER = os.path.join(_FILE_DIR, "..", "..", "..", "..") 18 | MEDCAT_CREATE_MODELPACK_FOLDER = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "medcat", "1_create_model", "create_modelpack")) 19 | sys.path.append(MEDCAT_CREATE_MODELPACK_FOLDER) 20 | # now we are able to import create_modelpack 21 | 22 | import create_modelpack 23 | 24 | RESOURCES_FOLDER = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources") 25 | DEFAULT_CDB_PATH = os.path.join(RESOURCES_FOLDER, "cdb.dat") 26 | DEFAULT_VOCAB_PATH = os.path.join(RESOURCES_FOLDER, "vocab.dat") 27 | 28 | 29 | class CreateModelPackTests(unittest.TestCase): 30 | 31 | @classmethod 32 | def setUpClass(cls): 33 | cls.tempfolder = tempfile.TemporaryDirectory() 34 | cls.model_pack_name = "TEMP_MODEL_PACK" 35 | cls.partial_model_pack_path = os.path.join(cls.tempfolder.name, cls.model_pack_name) 36 | 37 | @classmethod 38 | def tearDownClass(cls): 39 | cls.tempfolder.cleanup() 40 | 41 | def test_a(self): 42 | model_pack_name = create_modelpack.load_cdb_and_save_modelpack( 43 | DEFAULT_CDB_PATH, self.model_pack_name, 44 | self.tempfolder.name, DEFAULT_VOCAB_PATH) 45 | self.assertTrue(model_pack_name.startswith(self.model_pack_name)) 46 | model_pack_path = os.path.join(self.tempfolder.name, model_pack_name) 47 | self.assertTrue(os.path.exists(model_pack_path)) 48 | -------------------------------------------------------------------------------- /tests/medcat/1_create_model/create_vocab/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/1_create_model/create_vocab/__init__.py -------------------------------------------------------------------------------- /tests/medcat/1_create_model/create_vocab/test_create_vocab.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import medcat.vocab 5 | 6 | _FILE_DIR = os.path.dirname(__file__) 7 | 8 | # because this project isn't (at least of of writing this) 9 | # set up as a python project, there are no __init__.py 10 | # files in each folder 11 | # as such, in order to gain access to the relevant module, 12 | # I'll need to add the path manually 13 | _WWC_BASE_FOLDER = os.path.join(_FILE_DIR, "..", "..", "..", "..") 14 | MEDCAT_EVAL_MCT_EXPORT_FOLDER = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "medcat", "1_create_model", "create_vocab")) 15 | sys.path.append(MEDCAT_EVAL_MCT_EXPORT_FOLDER) 16 | # now we are able to import create_cdb and/or create_umls_cdb 17 | 18 | import unittest 19 | from unittest.mock import patch, mock_open 20 | 21 | 22 | VOCAB_INPUT_PATH = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "models", "vocab", "vocab_data.txt")) 23 | VOCAB_OUTPUT_PATH = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "models", "vocab", "vocab.dat")) 24 | VOCAB_INPUT = [ 25 | "house 34444 0.3232 0.123213 1.231231" 26 | "dog 14444 0.76762 0.76767 1.45454" 27 | ] 28 | 29 | orig_open = open 30 | 31 | 32 | def custom_open(file, mode="r", *args, **kwargs): 33 | if 
'r' in mode: 34 | return mock_open(read_data="\n".join(VOCAB_INPUT))(file, mode, *args, **kwargs) 35 | return orig_open(file, mode, *args, **kwargs) 36 | 37 | 38 | class CreateVocabTest(unittest.TestCase): 39 | temp_vocab_path = "temp_vocab_for_test_create_vocab" 40 | 41 | def setUp(self) -> None: 42 | if os.path.exists(VOCAB_OUTPUT_PATH): 43 | os.rename(VOCAB_OUTPUT_PATH, self.temp_vocab_path) 44 | self.moved = True 45 | else: 46 | self.moved = False 47 | 48 | def tearDown(self) -> None: 49 | if os.path.exists(VOCAB_OUTPUT_PATH): 50 | os.remove(VOCAB_OUTPUT_PATH) 51 | if self.moved: 52 | os.rename(self.temp_vocab_path, VOCAB_OUTPUT_PATH) 53 | 54 | def test_creating_vocab(self): 55 | with patch('builtins.open', side_effect=custom_open): 56 | import create_vocab 57 | vocab_path = os.path.join(create_vocab.vocab_dir, "vocab.dat") 58 | self.assertEqual(os.path.abspath(vocab_path), VOCAB_OUTPUT_PATH) 59 | self.assertTrue(os.path.exists(vocab_path)) 60 | vocab = medcat.vocab.Vocab.load(vocab_path) 61 | self.assertIsInstance(vocab, medcat.vocab.Vocab) 62 | -------------------------------------------------------------------------------- /tests/medcat/2_train_model/1_unsupervised_training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/2_train_model/1_unsupervised_training/__init__.py -------------------------------------------------------------------------------- /tests/medcat/2_train_model/1_unsupervised_training/test_splitter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import tempfile 4 | import os 5 | import sys 6 | import pandas as pd 7 | 8 | _FILE_DIR = os.path.dirname(__file__) 9 | 10 | # because this project isn't (at least of of writing this) 11 | # set up as a python project, there are no __init__.py 12 | # files in each folder 13 | # as such, in order to gain access to the relevant module, 14 | # I'll need to add the path manually 15 | _WWC_BASE_FOLDER = os.path.join(_FILE_DIR, "..", "..", "..", "..") 16 | MEDCAT_EVAL_MCT_EXPORT_FOLDER = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "medcat", "2_train_model", "1_unsupervised_training")) 17 | sys.path.append(MEDCAT_EVAL_MCT_EXPORT_FOLDER) 18 | # now we are able to import splitter 19 | 20 | import splitter 21 | 22 | FILE_TO_SPLIT = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources", "example_file_to_split.csv") 23 | NR_OF_LINES_IN_FILE = 125 24 | NR_OF_COLUMNS_IN_FILE = 20 25 | 26 | 27 | class SplitFileTests(unittest.TestCase): 28 | # lines per file - we want 4 rows, on average 29 | nr_of_lines = 4 * NR_OF_LINES_IN_FILE // NR_OF_COLUMNS_IN_FILE 30 | # NOTE: If the number of lines is not a multiple of the number of lines 31 | # the expected number of files needs to be one greater 32 | files_expected = NR_OF_LINES_IN_FILE // nr_of_lines 33 | 34 | @classmethod 35 | def setUpClass(cls): 36 | cls.temp_folder = tempfile.TemporaryDirectory() 37 | cls.save_format = os.path.join(cls.temp_folder.name, "split_%03d.csv") 38 | # do the splitting 39 | splitter.split_file(FILE_TO_SPLIT, cls.nr_of_lines, cls.save_format) 40 | 41 | @classmethod 42 | def tearDownClass(cls): 43 | cls.temp_folder.cleanup() 44 | 45 | def test_has_correct_number_of_files(self): 46 | files = list(os.listdir(self.temp_folder.name)) 47 | found = len(files) 48 | self.assertEqual(found, self.files_expected) 49 | 50 | def 
test_contains_same_content(self): 51 | df_orig = pd.read_csv(FILE_TO_SPLIT) 52 | file_names = [os.path.join(self.temp_folder.name, fn) for fn in os.listdir(self.temp_folder.name)] 53 | # need to sort for order 54 | files_to_read = sorted(file_names) 55 | to_concat = [pd.read_csv(f) for f in files_to_read] 56 | df_split = pd.concat(to_concat, ignore_index=True) 57 | for nr, (lo, ls) in enumerate(zip(df_orig.iterrows(), df_split.iterrows())): 58 | for pnr, (p1, p2) in enumerate(zip(lo, ls)): 59 | with self.subTest(f"L-{nr}; P-{pnr}"): 60 | if isinstance(p1, pd.Series): 61 | self.assertTrue(p1.equals(p2)) 62 | else: 63 | self.assertEqual(p1, p2) 64 | -------------------------------------------------------------------------------- /tests/medcat/2_train_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/2_train_model/__init__.py -------------------------------------------------------------------------------- /tests/medcat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/__init__.py -------------------------------------------------------------------------------- /tests/medcat/evaluate_mct_export/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/evaluate_mct_export/__init__.py -------------------------------------------------------------------------------- /tests/medcat/evaluate_mct_export/offline_test_mct_analysis.py: -------------------------------------------------------------------------------- 1 | """This module is meant to be tested offline (i.e not in a GitHub actions settings). 2 | The main reason is the access to various models it requires. 
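Run it locally once the model pack referenced below is available, e.g. with something like `python -m unittest tests.medcat.evaluate_mct_export.offline_test_mct_analysis` (assumed invocation from the repository root).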
3 | """ 4 | import os 5 | import sys 6 | 7 | 8 | # because this project isn't (at least as of writing this) 9 | # set up as a python project, there are no __init__.py 10 | # files in each folder 11 | # as such, in order to gain access to the relevant module, 12 | # I'll need to add the path manually 13 | from .test_mct_analysis import (MEDCAT_EVAL_MCT_EXPORT_FOLDER, RESOURCE_DIR, MCT_EXPORT_JSON_PATH, 14 | BaseMCTExportTests) 15 | sys.path.append(MEDCAT_EVAL_MCT_EXPORT_FOLDER) 16 | # and now we can import from mct_analysis 17 | from mct_analysis import MedcatTrainer_export 18 | 19 | 20 | MODEL_PACK_PATH = os.path.join(RESOURCE_DIR, "offline", 21 | "medmen_wstatus_2021_oct.zip") 22 | 23 | 24 | class MCTExportBasicTests(BaseMCTExportTests): 25 | report_path = 'mct_report.xlsx' 26 | 27 | @classmethod 28 | def setUpClass(cls) -> None: 29 | cls.export = MedcatTrainer_export([MCT_EXPORT_JSON_PATH, ], MODEL_PACK_PATH) 30 | 31 | # these would need a CAT instance 32 | def test_can_full_annotation_df(self): 33 | full_ann_df = self.export.full_annotation_df() 34 | self.assertNonEmptyDataframe(full_ann_df) 35 | 36 | def test_can_meta_anns_concept_summary(self): 37 | meta_anns_summary_df = self.export.meta_anns_concept_summary() 38 | # this will be empty since I don't think I have anything 39 | # of note regarding meta annotations 40 | self.assertIsNotNone(meta_anns_summary_df) 41 | 42 | def test_generate_report(self): 43 | self.export.generate_report(path=self.report_path) 44 | self.assertTrue(os.path.exists(self.report_path)) 45 | -------------------------------------------------------------------------------- /tests/medcat/evaluate_mct_export/test_mct_analysis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pandas as pd 5 | 6 | import unittest 7 | 8 | 9 | _FILE_DIR = os.path.dirname(__file__) 10 | 11 | # because this project isn't (at least as of writing this) 12 | # set up as a python project, there are no __init__.py 13 | # files in each folder 14 | # as such, in order to gain access to the relevant module, 15 | # I'll need to add the path manually 16 | _WWC_BASE_FOLDER = os.path.join(_FILE_DIR, "..", "..", "..") 17 | MEDCAT_EVAL_MCT_EXPORT_FOLDER = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "medcat", "evaluate_mct_export")) 18 | sys.path.append(MEDCAT_EVAL_MCT_EXPORT_FOLDER) 19 | # and now we can import from mct_analysis 20 | from mct_analysis import MedcatTrainer_export 21 | 22 | # add path to MCT export 23 | RESOURCE_DIR = os.path.abspath(os.path.join(_FILE_DIR, "..", "resources")) 24 | MCT_EXPORT_JSON_PATH = os.path.join(RESOURCE_DIR, "MCT_export_example.json") 25 | 26 | 27 | class MCTExportInitTests(unittest.TestCase): 28 | 29 | def test_can_init(self): 30 | inst = MedcatTrainer_export([MCT_EXPORT_JSON_PATH, ], None) 31 | self.assertIsInstance(inst, MedcatTrainer_export) 32 | 33 | 34 | class BaseMCTExportTests(unittest.TestCase): 35 | 36 | @classmethod 37 | def setUpClass(cls) -> None: 38 | cls.export = MedcatTrainer_export([MCT_EXPORT_JSON_PATH, ], None) 39 | 40 | def assertNonEmptyDataframe(self, df): 41 | self.assertIsInstance(df, pd.DataFrame) 42 | self.assertFalse(df.empty) 43 | 44 | 45 | class MCTExportBasicTests(BaseMCTExportTests): 46 | 47 | def test_can_get_annotations(self): 48 | annotation_df = self.export.annotation_df() 49 | self.assertNonEmptyDataframe(annotation_df) 50 | 51 | def test_can_get_summary(self): 52 | summary_df = self.export.concept_summary() 53 | 
self.assertNonEmptyDataframe(summary_df) 54 | 55 | def test_can_get_user_stats(self): 56 | users_stats = self.export.user_stats() 57 | self.assertNonEmptyDataframe(users_stats) 58 | 59 | def test_can_rename_meta_anns_empty_no_change(self): 60 | ann_df1 = self.export.annotation_df() 61 | self.export.rename_meta_anns() 62 | ann_df2 = self.export.annotation_df() 63 | self.assertTrue(all(ann_df1 == ann_df2)) 64 | 65 | 66 | class MCTExportUsageTests(BaseMCTExportTests): 67 | 68 | def assertDataFrameHasRowsColumns(self, df, 69 | exp_rows: int, 70 | exp_columns: int): 71 | self.assertEqual(len(df.index), exp_rows) 72 | self.assertEqual(len(df.columns), exp_columns) 73 | 74 | def test_has_correct_projects(self, exp_proj=['MartTestAnnotation']): 75 | got = self.export.project_names 76 | self.assertEqual(len(got), len(exp_proj)) 77 | self.assertEqual(got, exp_proj) 78 | 79 | def test_has_correct_documents(self, exp_docs=['Doc 1', 'Doc 2', 'Doc 3', 'Doc 4', 'Doc 5']): 80 | got = self.export.document_names 81 | self.assertEqual(len(got), len(exp_docs)) 82 | self.assertEqual(got, exp_docs) 83 | 84 | def test_rename_meta_anns_empty_does_not_add_project_and_doc_names(self): 85 | self.export.rename_meta_anns() 86 | self.test_has_correct_projects() 87 | self.test_has_correct_documents() 88 | 89 | def test_annotations_has_correct_rows_columns(self, 90 | exp_rows=362, 91 | exp_columns=19): 92 | ann_df = self.export.annotation_df() 93 | self.assertDataFrameHasRowsColumns(ann_df, exp_rows, exp_columns) 94 | 95 | def test_summary_has_correct_rows_columns(self, 96 | exp_rows=197, 97 | exp_columns=5): 98 | summary_df = self.export.concept_summary() 99 | self.assertDataFrameHasRowsColumns(summary_df, exp_rows, exp_columns) 100 | 101 | def test_cuser_stats_has_correct_rows_columns(self, 102 | exp_rows=1, 103 | exp_columns=2): 104 | users_stats = self.export.user_stats() 105 | self.assertDataFrameHasRowsColumns(users_stats, exp_rows, exp_columns) 106 | 107 | def test_cuser_stats_has_correct_user(self, expected="mart"): 108 | unique_users = self.export.user_stats()["user"].unique().tolist() 109 | self.assertEqual(len(unique_users), 1) 110 | self.assertEqual(unique_users[0], expected) 111 | 112 | 113 | class MCTExportMetaAnnRenameTests(unittest.TestCase): 114 | NAMES2RENAME = {"Status": "VERSION"} 115 | VALUES2RENAME = {"Status": {"Affirmed": "Got it!"}} 116 | # can only rename values if renaming names 117 | # so need a mapping from the same name to the same name 118 | # for each name used in values 119 | VALUES_RENAME_HELPER = dict((n, n) for n in VALUES2RENAME) 120 | 121 | def setUp(self) -> None: 122 | self.export = MedcatTrainer_export([MCT_EXPORT_JSON_PATH, ], None) 123 | 124 | def _get_all_meta_anns(self): 125 | for proj in self.export.mct_export['projects']: 126 | for doc in proj['documents']: 127 | for ann in doc['annotations']: 128 | for meta_ann in ann["meta_anns"].items(): 129 | yield meta_ann 130 | 131 | def _check_names(self, prev_anns: list): 132 | for (meta_ann_name, _), (prev_name, _) in zip(self._get_all_meta_anns(), prev_anns): 133 | for name, replacement_name in self.NAMES2RENAME.items(): 134 | with self.subTest(f"{name} -> {replacement_name} ({meta_ann_name})"): 135 | self.assertNotEqual(meta_ann_name, name) 136 | if prev_name == name: 137 | self.assertEqual(meta_ann_name, replacement_name) 138 | 139 | def test_meta_annotations_renamed_names(self): 140 | prev_anns = list(self._get_all_meta_anns()) 141 | self.export.rename_meta_anns(meta_anns2rename=self.NAMES2RENAME) 142 | 
self._check_names(prev_anns) 143 | 144 | def _check_values(self, prev_anns: list, only_values: bool = True): 145 | for (name, ann), (prev_name, prev_ann) in zip(self._get_all_meta_anns(), prev_anns): 146 | with self.subTest(f"{prev_ann} -> {ann}"): 147 | if only_values: 148 | # if only changing values, not names themselves 149 | self.assertEqual(name, prev_name, "Names should not change") 150 | for target_name, value_map in self.VALUES2RENAME.items(): 151 | # if correct target and has a value that can be remapped 152 | if name == target_name and prev_ann["value"] in value_map: 153 | with self.subTest(f"{target_name} with {value_map}"): 154 | start_value = prev_ann["value"] 155 | new_value = ann["value"] 156 | exp_value = value_map[start_value] 157 | self.assertEqual(new_value, exp_value) 158 | 159 | def test_meta_annotations_renamed_values(self): 160 | prev_anns = list(self._get_all_meta_anns()) 161 | self.export.rename_meta_anns(meta_anns2rename=self.VALUES_RENAME_HELPER, 162 | meta_ann_values2rename=self.VALUES2RENAME) 163 | self._check_values(prev_anns) 164 | 165 | def test_meta_annotations_renamed_names_and_values(self): 166 | prev_anns = list(self._get_all_meta_anns()) 167 | self.export.rename_meta_anns(meta_anns2rename=self.NAMES2RENAME, 168 | meta_ann_values2rename=self.VALUES2RENAME) 169 | self._check_names(prev_anns) 170 | self._check_values(prev_anns, only_values=False) 171 | 172 | def test_meta_annotations_renamed_values_only(self): 173 | prev_anns = list(self._get_all_meta_anns()) 174 | self.export.rename_meta_anns(meta_ann_values2rename=self.VALUES2RENAME) 175 | self._check_values(prev_anns, only_values=True) 176 | -------------------------------------------------------------------------------- /tests/medcat/resources/cdb.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/resources/cdb.dat -------------------------------------------------------------------------------- /tests/medcat/resources/example_cdb_input_snomed.csv: -------------------------------------------------------------------------------- 1 | cui,name,name_status,ontologies,description_type_ids,type_ids 2 | 101009,Quilonia ethiopica (organism),P,SNOMED-CT,organism,81102976 3 | 102002,Hemoglobin Okaloosa (substance),P,SNOMED-CT,substance,91187746 4 | 103007,Squirrel fibroma virus (organism),P,SNOMED-CT,organism,81102976 5 | 104001,Excision of lesion of patella (procedure),P,SNOMED-CT,procedure,28321150 6 | 106004,Structure of posterior carpal region (body structure),P,SNOMED-CT,body structure,37552161 7 | 107008,Structure of fetal part of placenta (body structure),P,SNOMED-CT,body structure,37552161 8 | 108003,Entire condylar emissary vein (body structure),P,SNOMED-CT,body structure,37552161 9 | 109006,Anxiety disorder of childhood OR adolescence (disorder),P,SNOMED-CT,disorder,9090192 10 | 110001,Structure of visceral layer of Bowman's capsule (body structure),P,SNOMED-CT,body structure,37552161 11 | 111002,Parathyroid structure (body structure),P,SNOMED-CT,body structure,37552161 12 | 112009,Bembrops anatirostris (organism),P,SNOMED-CT,organism,81102976 13 | 113004,Type-casting-machine operator (occupation),P,SNOMED-CT,occupation,16939031 14 | 114005,Feline calicivirus (organism),P,SNOMED-CT,organism,81102976 15 | 115006,Removable appliance therapy (procedure),P,SNOMED-CT,procedure,28321150 16 | 116007,Subcutaneous tissue structure of medial surface of index 
finger (body structure),P,SNOMED-CT,body structure,37552161 17 | 117003,Rhipicephalus sanguineus (organism),P,SNOMED-CT,organism,81102976 18 | 118008,Black buffalo weaver (organism),P,SNOMED-CT,organism,81102976 19 | 119000,Thoracoscopic partial lobectomy of lung (procedure),P,SNOMED-CT,procedure,28321150 20 | 120006,Ornithine racemase (substance),P,SNOMED-CT,substance,91187746 21 | 122003,Choroidal hemorrhage (disorder),P,SNOMED-CT,disorder,9090192 22 | 124002,Structure of coronoid process of mandible (body structure),P,SNOMED-CT,body structure,37552161 23 | 125001,Ferrous (59-Fe) sulfate (substance),P,SNOMED-CT,substance,91187746 24 | 126000,Galactosyl-N-acetylglucosaminylgalactosylglucosylceramide alpha-galactosyltransferase (substance),P,SNOMED-CT,substance,91187746 25 | 127009,Miscarriage with laceration of cervix (disorder),P,SNOMED-CT,disorder,9090192 26 | 128004,Hand microscope examination of skin (procedure),P,SNOMED-CT,procedure,28321150 27 | 129007,Homoiothermia (finding),P,SNOMED-CT,finding,67667581 28 | 130002,Hemoglobin Hopkins-II (substance),P,SNOMED-CT,substance,91187746 29 | 131003,Dolichyl-phosphate mannosyltransferase (substance),P,SNOMED-CT,substance,91187746 30 | 132005,Serraniculus pumilio (organism),P,SNOMED-CT,organism,81102976 31 | 133000,Percutaneous implantation of neurostimulator electrodes into neuromuscular component (procedure),P,SNOMED-CT,procedure,28321150 32 | 134006,Decreased hair growth (finding),P,SNOMED-CT,finding,67667581 33 | 135007,Arthrotomy of wrist joint with exploration and biopsy (procedure),P,SNOMED-CT,procedure,28321150 34 | 136008,Acacia erioloba (organism),P,SNOMED-CT,organism,81102976 35 | 138009,No past history of (contextual qualifier) (qualifier value),P,SNOMED-CT,qualifier value,7882689 36 | 139001,Felid herpesvirus 1 (organism),P,SNOMED-CT,organism,81102976 37 | 140004,Chronic pharyngitis (disorder),P,SNOMED-CT,disorder,9090192 38 | 142007,"Excision of tumor from shoulder area, deep, intramuscular (procedure)",P,SNOMED-CT,procedure,28321150 39 | 144008,Normal peripheral vision (finding),P,SNOMED-CT,finding,67667581 40 | 145009,Colloid milium (morphologic abnormality),P,SNOMED-CT,morphologic abnormality,33782986 41 | 146005,Repair of nonunion of metatarsal with bone graft (procedure),P,SNOMED-CT,procedure,28321150 42 | 148006,Preliminary diagnosis (contextual qualifier) (qualifier value),P,SNOMED-CT,qualifier value,7882689 43 | 149003,"Central pair of microtubules, cilium or flagellum, not bacterial (cell structure)",P,SNOMED-CT,cell structure,66527446 44 | 150003,Abnormal bladder continence (finding),P,SNOMED-CT,finding,67667581 45 | 151004,Gonococcal meningitis (disorder),P,SNOMED-CT,disorder,9090192 46 | 153001,Cystourethroscopy with resection of ureterocele (procedure),P,SNOMED-CT,procedure,28321150 47 | 154007,Rubber molding-press operator (occupation),P,SNOMED-CT,occupation,16939031 48 | 155008,Structure of deep circumflex iliac artery (body structure),P,SNOMED-CT,body structure,37552161 49 | 156009,"Spine board, device (physical object)",P,SNOMED-CT,physical object,32816260 50 | 158005,Salmonella Irumu (organism),P,SNOMED-CT,organism,81102976 51 | 159002,Ferrocyanide salt (substance),P,SNOMED-CT,substance,91187746 52 | 160007,Removal of foreign body of tendon and/or tendon sheath (procedure),P,SNOMED-CT,procedure,28321150 53 | 161006,Thermal injury (morphologic abnormality),P,SNOMED-CT,morphologic abnormality,33782986 54 | 162004,Severe manic bipolar I disorder without psychotic features (disorder),P,SNOMED-CT,disorder,9090192 
55 | 163009,Bacteroides stercoris (organism),P,SNOMED-CT,organism,81102976 56 | 164003,Phosphoenolpyruvate-protein phosphotransferase (substance),P,SNOMED-CT,substance,91187746 57 | 165002,Accident prone (finding),P,SNOMED-CT,finding,67667581 58 | 166001,Behavioral therapy (regime/therapy),P,SNOMED-CT,regime/therapy,47503797 59 | 167005,Structure of supraclavicular part of brachial plexus (body structure),P,SNOMED-CT,body structure,37552161 60 | 168000,Typhlolithiasis (disorder),P,SNOMED-CT,disorder,9090192 61 | 169008,Product containing hypothalamic releasing factor (product),P,SNOMED-CT,product,91776366 62 | 170009,"Special potency disk identification, vancomycin test (procedure)",P,SNOMED-CT,procedure,28321150 63 | 171008,Injury of ascending right colon without open wound into abdominal cavity (disorder),P,SNOMED-CT,disorder,9090192 64 | 172001,Endometritis following molar AND/OR ectopic pregnancy (disorder),P,SNOMED-CT,disorder,9090192 65 | 173006,Micrognathus crinitus (organism),P,SNOMED-CT,organism,81102976 66 | 174000,Harrison-Richardson operation on vagina (procedure),P,SNOMED-CT,procedure,28321150 67 | 175004,Supraorbital neuralgia (finding),P,SNOMED-CT,finding,67667581 68 | 176003,Anastomosis of rectum (procedure),P,SNOMED-CT,procedure,28321150 69 | 177007,Poisoning by sawfly larvae (disorder),P,SNOMED-CT,disorder,9090192 70 | 178002,Uridine diphosphate galactose (substance),P,SNOMED-CT,substance,91187746 71 | 179005,Apraxia of dressing (finding),P,SNOMED-CT,finding,67667581 72 | 180008,Genus Fijivirus (organism),P,SNOMED-CT,organism,81102976 73 | 181007,Hemorrhagic bronchopneumonia (disorder),P,SNOMED-CT,disorder,9090192 74 | 182000,Canalization (morphologic abnormality),P,SNOMED-CT,morphologic abnormality,33782986 75 | 183005,Autoimmune pancytopenia (disorder),P,SNOMED-CT,disorder,9090192 76 | 184004,Withdrawal arrhythmia (disorder),P,SNOMED-CT,disorder,9090192 77 | 186002,Human leukocyte antigen Cw9 (substance),P,SNOMED-CT,substance,91187746 78 | 187006,Cyanocobalamin (57-Co) (substance),P,SNOMED-CT,substance,91187746 79 | 188001,Injury of intercostal artery (disorder),P,SNOMED-CT,disorder,9090192 80 | 189009,Excision of lesion of artery (procedure),P,SNOMED-CT,procedure,28321150 81 | 191001,Lednice virus (organism),P,SNOMED-CT,organism,81102976 82 | 192008,Congenital syphilitic hepatomegaly (disorder),P,SNOMED-CT,disorder,9090192 83 | 193003,Benign hypertensive renal disease (disorder),P,SNOMED-CT,disorder,9090192 84 | 194009,Notropis whipplei (organism),P,SNOMED-CT,organism,81102976 85 | 196006,Concave shape (qualifier value),P,SNOMED-CT,qualifier value,7882689 86 | 197002,Mold to yeast conversion test (procedure),P,SNOMED-CT,procedure,28321150 87 | 198007,Disease caused by Filoviridae (disorder),P,SNOMED-CT,disorder,9090192 88 | 199004,Decreased lactation (finding),P,SNOMED-CT,finding,67667581 89 | 200001,Berberine (substance),P,SNOMED-CT,substance,91187746 90 | 201002,Oligopus claudei (organism),P,SNOMED-CT,organism,81102976 91 | 202009,Structure of anterior division of renal artery (body structure),P,SNOMED-CT,body structure,37552161 92 | 205006,Entire left commissure of aortic valve (body structure),P,SNOMED-CT,body structure,37552161 93 | 206007,Structure of gluteus maximus muscle (body structure),P,SNOMED-CT,body structure,37552161 94 | 207003,European edible frog (organism),P,SNOMED-CT,organism,81102976 95 | 209000,Plover (organism),P,SNOMED-CT,organism,81102976 96 | 210005,"Arrow, device (physical object)",P,SNOMED-CT,physical object,32816260 97 | 211009,Product 
containing norethandrolone (medicinal product),P,SNOMED-CT,medicinal product,37785117 98 | 213007,Simian enterovirus 7 (organism),P,SNOMED-CT,organism,81102976 99 | 214001,Streptococcus mutans (organism),P,SNOMED-CT,organism,81102976 100 | 216004,Delusion of persecution (finding),P,SNOMED-CT,finding,67667581 101 | -------------------------------------------------------------------------------- /tests/medcat/resources/example_cdb_input_umls.csv: -------------------------------------------------------------------------------- 1 | cui,name_status,ontologies,name,type_ids 2 | C0000005,Y,MSH,(131)I-Macroaggregated Albumin,T116 3 | C0000005,Y,MSH,(131)I-Macroaggregated Albumin,T121 4 | C0000005,Y,MSH,(131)I-Macroaggregated Albumin,T130 5 | C0000005,Y,MSH,(131)I-MAA,T116 6 | C0000005,Y,MSH,(131)I-MAA,T121 7 | C0000005,Y,MSH,(131)I-MAA,T130 8 | C0000039,N,RXNORM,"1,2-dipalmitoylphosphatidylcholine",T109 9 | C0000039,N,RXNORM,"1,2-dipalmitoylphosphatidylcholine",T121 10 | C0000039,Y,MTH,"1,2-dipalmitoylphosphatidylcholine",T109 11 | C0000039,Y,MTH,"1,2-dipalmitoylphosphatidylcholine",T121 12 | C0000039,N,SNMI,Dipalmitoylphosphatidylcholine,T109 13 | C0000039,N,SNMI,Dipalmitoylphosphatidylcholine,T121 14 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T109 15 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T121 16 | C0000039,N,SNOMEDCT_US,Dipalmitoylphosphatidylcholine,T109 17 | C0000039,N,SNOMEDCT_US,Dipalmitoylphosphatidylcholine,T121 18 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T109 19 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T121 20 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T109 21 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T121 22 | C0000039,Y,MSH,Dipalmitoylphosphatidylcholine,T109 23 | C0000039,Y,MSH,Dipalmitoylphosphatidylcholine,T121 24 | C0000039,Y,MSH,Dipalmitoylglycerophosphocholine,T109 25 | C0000039,Y,MSH,Dipalmitoylglycerophosphocholine,T121 26 | C0000039,Y,MSH,Dipalmitoyllecithin,T109 27 | C0000039,Y,MSH,Dipalmitoyllecithin,T121 28 | C0000039,Y,MSH,"Phosphatidylcholine, Dipalmitoyl",T109 29 | C0000039,Y,MSH,"Phosphatidylcholine, Dipalmitoyl",T121 30 | C0000052,N,MSH,"1,4-alpha-Glucan Branching Enzyme",T116 31 | C0000052,N,MSH,"1,4-alpha-Glucan Branching Enzyme",T126 32 | C0000052,Y,MTH,"1,4-alpha-Glucan Branching Enzyme",T116 33 | C0000052,Y,MTH,"1,4-alpha-Glucan Branching Enzyme",T126 34 | C0000052,N,SNMI,Branching enzyme,T116 35 | C0000052,N,SNMI,Branching enzyme,T126 36 | C0000052,Y,SNOMEDCT_US,Branching enzyme,T116 37 | C0000052,Y,SNOMEDCT_US,Branching enzyme,T126 38 | C0000052,Y,MSH,"Enzyme, Branching",T116 39 | C0000052,Y,MSH,"Enzyme, Branching",T126 40 | C0000052,Y,MSH,"Glycosyltransferase, Branching",T116 41 | C0000052,Y,MSH,"Glycosyltransferase, Branching",T126 42 | C0000052,Y,MSH,Starch Branching Enzyme,T116 43 | C0000052,Y,MSH,Starch Branching Enzyme,T126 44 | C0000052,Y,SNM,alpha-Glucan-branching glycosyltransferase,T116 45 | C0000052,Y,SNM,alpha-Glucan-branching glycosyltransferase,T126 46 | C0000074,Y,MSH,1-Alkyl-2-Acylphosphatidates,T109 47 | C0000074,Y,MSH,1 Alkyl 2 Acylphosphatidates,T109 48 | C0000084,Y,MSH,1-Carboxyglutamic Acid,T116 49 | C0000084,Y,MSH,1-Carboxyglutamic Acid,T123 50 | C0000084,Y,MSH,gamma-Carboxyglutamic Acid,T116 51 | C0000084,Y,MSH,gamma-Carboxyglutamic Acid,T123 52 | C0000096,Y,MSH,1-Methyl-3-isobutylxanthine,T109 53 | C0000096,Y,MSH,1-Methyl-3-isobutylxanthine,T121 54 | C0000096,Y,MSH,3-Isobutyl-1-methylxanthine,T109 55 | C0000096,Y,MSH,3-Isobutyl-1-methylxanthine,T121 56 | C0000096,Y,MSH,IBMX,T109 57 | 
C0000096,Y,MSH,IBMX,T121 58 | C0000096,Y,MSH,Isobutyltheophylline,T109 59 | C0000096,Y,MSH,Isobutyltheophylline,T121 60 | C0000097,Y,MSH,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",T109 61 | C0000097,Y,MSH,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",T131 62 | C0000097,N,CSP,MPTP,T109 63 | C0000097,N,CSP,MPTP,T131 64 | C0000097,N,PSY,MPTP,T109 65 | C0000097,N,PSY,MPTP,T131 66 | C0000097,Y,MSH,MPTP,T109 67 | C0000097,Y,MSH,MPTP,T131 68 | C0000097,Y,CHV,mptp,T109 69 | C0000097,Y,CHV,mptp,T131 70 | C0000097,N,RCD,Methylphenyltetrahydropyridine,T109 71 | C0000097,N,RCD,Methylphenyltetrahydropyridine,T131 72 | C0000097,N,LCH_NW,Methylphenyltetrahydropyridine,T109 73 | C0000097,N,LCH_NW,Methylphenyltetrahydropyridine,T131 74 | C0000097,N,PSY,Methylphenyltetrahydropyridine,T109 75 | C0000097,N,PSY,Methylphenyltetrahydropyridine,T131 76 | C0000097,Y,SNOMEDCT_US,Methylphenyltetrahydropyridine,T109 77 | C0000097,Y,SNOMEDCT_US,Methylphenyltetrahydropyridine,T131 78 | C0000097,Y,CSP,methylphenyltetrahydropyridine,T109 79 | C0000097,Y,CSP,methylphenyltetrahydropyridine,T131 80 | C0000098,Y,MSH,1-Methyl-4-phenylpyridinium,T109 81 | C0000098,Y,MSH,1-Methyl-4-phenylpyridinium,T131 82 | C0000098,Y,MSH,1-Methyl-4-phenylpyridinium Ion,T109 83 | C0000098,Y,MSH,1-Methyl-4-phenylpyridinium Ion,T131 84 | C0000098,Y,MSH,Cyperquat,T109 85 | C0000098,Y,MSH,Cyperquat,T131 86 | C0000098,Y,CSP,MPP+,T109 87 | C0000098,Y,CSP,MPP+,T131 88 | C0000098,Y,MSH,N-Methyl-4-phenylpyridine,T109 89 | C0000098,Y,MSH,N-Methyl-4-phenylpyridine,T131 90 | C0000098,Y,MSH,1-Methyl-4-phenylpyridine,T109 91 | C0000098,Y,MSH,1-Methyl-4-phenylpyridine,T131 92 | C0000098,Y,MSH,N METHYL 4 PHENYLPYRIDINIUM,T109 93 | C0000098,Y,MSH,N METHYL 4 PHENYLPYRIDINIUM,T131 94 | C0000098,Y,MSH,"Pyridinium, 1-methyl-4-phenyl-",T109 95 | C0000098,Y,MSH,"Pyridinium, 1-methyl-4-phenyl-",T131 96 | C0000102,Y,MSH,1-Naphthylamine,T109 97 | C0000102,Y,MSH,1-Naphthylamine,T131 98 | C0000102,Y,CHV,1-naphthylamine,T109 99 | C0000102,Y,CHV,1-naphthylamine,T131 100 | C0000102,Y,MSH,alpha-Naphthylamine,T109 101 | -------------------------------------------------------------------------------- /tests/medcat/resources/example_file_to_split.csv: -------------------------------------------------------------------------------- 1 | "subject_id","hadm_id","chartdate","charttime","text","category","description" 2 | 12345,67890,"2024-01-18","12:30:00","EPR: 3 | Patient, a 55-year-old male, was admitted with complaints of severe chest pain radiating to the left arm and associated shortness of breath. 4 | On examination, the patient appeared diaphoretic with blood pressure elevated at 160/90 mmHg, heart rate of 110 beats per minute, and respiratory rate of 22 breaths per minute. 5 | Initial ECG revealed sinus tachycardia with ST-segment elevation in leads II, III, and aVF. Troponin levels were elevated, suggestive of acute myocardial infarction. 6 | The patient was promptly started on aspirin, clopidogrel, and intravenous nitroglycerin. Cardiology consultation requested for further management and possible catheterization. 7 | ","Admission Note","Acute Myocardial Infarction" 8 | 56789,54321,"2024-01-18","08:45:00","EPR: 9 | Follow-up note for a 65-year-old female with a history of type 2 diabetes mellitus. 10 | Blood glucose levels have been well-controlled with recent HbA1c within the target range. 11 | Medication reconciliation performed, and adjustments made to the insulin regimen to optimize glycemic control. 
12 | Patient educated on the importance of regular blood glucose monitoring, proper diet, and exercise. 13 | Follow-up appointment scheduled in 3 months for continued management. 14 | ","Follow-up Note","Diabetes Mellitus Management" 15 | 98765,43210,"2024-01-18","15:20:00","EPR: 16 | Emergency department note for a 40-year-old male involved in a motor vehicle accident. 17 | The patient was brought in by ambulance with complaints of severe right leg pain. 18 | Physical examination revealed tenderness and swelling over the right femur. 19 | CT scan of the pelvis and femur confirmed a displaced fracture of the right femoral shaft. 20 | Orthopedic surgery consulted for further evaluation and management. 21 | The patient was given analgesia and placed in traction pending surgical intervention. 22 | ","Emergency Note","Trauma and Fracture Evaluation" 23 | 23456,78901,"2024-01-18","10:10:00","EPR: 24 | Psychiatric evaluation for a 30-year-old female presenting with symptoms of depression. 25 | The patient reports a persistent low mood, loss of interest in activities, poor appetite, and difficulty sleeping. 26 | No significant suicidal ideation reported. 27 | Past psychiatric history includes a previous episode of major depressive disorder. 28 | Started the patient on sertraline and provided psychoeducation on coping strategies. 29 | Referral made to a therapist for ongoing support. 30 | ","Psychiatric Note","Major Depressive Disorder" 31 | 87654,21098,"2024-01-18","14:00:00","EPR: 32 | Consultation note for a 25-year-old male admitted with severe right lower quadrant abdominal pain. 33 | Physical examination consistent with suspected appendicitis. 34 | Laboratory results showed an elevated white blood cell count. 35 | CT abdomen and pelvis ordered, revealing acute appendicitis with localized abscess formation. 36 | The patient placed on NPO status, started on broad-spectrum antibiotics, and surgical intervention scheduled for appendectomy.","Consultation Note","Appendicitis Evaluation and Management" 37 | 34567,87654,"2024-01-19","09:15:00","EPR: 38 | Admission note for a 45-year-old female presenting with acute respiratory distress. 39 | History reveals a recent upper respiratory tract infection. 40 | On examination, the patient is tachypneic with bilateral crackles on auscultation. 41 | Chest X-ray shows diffuse infiltrates consistent with viral pneumonia. 42 | Oxygen supplementation initiated, and antiviral therapy prescribed. 43 | Close monitoring for respiratory status ongoing. 44 | ","Admission Note","Viral Pneumonia" 45 | 78901,23456,"2024-01-19","13:45:00","EPR: 46 | Follow-up note for a 60-year-old male with a history of hypertension. 47 | Blood pressure well-controlled on current medication regimen. 48 | Discussion on lifestyle modifications, including a low-sodium diet and regular exercise. 49 | Patient advised on the importance of regular follow-up appointments for ongoing blood pressure management. 50 | ","Follow-up Note","Hypertension Management" 51 | 21098,65432,"2024-01-19","16:30:00","EPR: 52 | Emergency department note for a 35-year-old female involved in a fall from a height. 53 | Complaints of back pain and numbness in both lower extremities. 54 | Neurological examination indicates sensory deficits. 55 | CT spine ordered, revealing a thoracic spine fracture. 56 | Neurosurgery consultation requested for further evaluation and management. 
57 | ","Emergency Note","Traumatic Spinal Injury" 58 | 54321,78909,"2024-01-19","11:00:00","EPR: 59 | Psychiatric evaluation for a 28-year-old male with symptoms of anxiety and panic attacks. 60 | The patient reports palpitations, sweating, and a sense of impending doom during episodes. 61 | No significant past psychiatric history. 62 | Started on selective serotonin reuptake inhibitors (SSRIs) and referred for cognitive-behavioral therapy. 63 | ","Psychiatric Note","Generalized Anxiety Disorder" 64 | 67890,12345,"2024-01-19","14:20:00","EPR: 65 | Consultation note for a 50-year-old female with abdominal pain and distension. 66 | Physical examination consistent with ascites. 67 | Paracentesis performed, revealing elevated white cell count and protein levels. 68 | Further workup initiated for the underlying cause of ascites. 69 | Gastroenterology consultation requested for comprehensive evaluation. 70 | ","Consultation Note","Ascites Evaluation" 71 | 12312,67845,"2024-01-20","10:45:00","EPR: 72 | Admission note for a 22-year-old male presenting with a seizure episode. 73 | No prior history of seizures reported. Neurological examination unremarkable. 74 | CT scan of the brain performed, showing no acute abnormalities. 75 | The patient started on antiepileptic medication, and an electroencephalogram (EEG) scheduled for further evaluation. 76 | ","Admission Note","First Seizure Evaluation" 77 | 45678,34567,"2024-01-20","14:15:00","EPR: 78 | Follow-up note for a 70-year-old female with a history of congestive heart failure. 79 | Recent exacerbation managed with diuretic adjustment and oxygen therapy. 80 | Patient educated on sodium restriction and fluid management. 81 | Close outpatient follow-up scheduled to monitor symptoms and optimize heart failure management. 82 | ","Follow-up Note","Congestive Heart Failure Management" 83 | 78909,45678,"2024-01-20","16:50:00","EPR: 84 | Emergency department note for a 40-year-old male with a laceration to the right hand from a work-related injury. 85 | Wound cleaned and sutured. Tetanus prophylaxis administered. 86 | Occupational health referral made for further assessment and follow-up. 87 | ","Emergency Note","Hand Laceration Management" 88 | 23456,56789,"2024-01-20","12:30:00","EPR: 89 | Psychiatric evaluation for a 25-year-old female presenting with symptoms of post-traumatic stress disorder (PTSD) following a recent traumatic event. 90 | The patient experiences intrusive thoughts and nightmares. 91 | Started on a selective serotonin-norepinephrine reuptake inhibitor (SNRI) and referred for trauma-focused therapy. 92 | ","Psychiatric Note","Post-Traumatic Stress Disorder" 93 | 67890,89012,"2024-01-20","09:00:00","EPR: 94 | Consultation note for a 55-year-old male with persistent epigastric pain. 95 | Upper endoscopy performed, revealing erosive gastritis. 96 | Proton pump inhibitor prescribed, and lifestyle modifications discussed. 97 | Gastroenterology follow-up recommended for ongoing management. 98 | ","Consultation Note","Gastritis Evaluation and Management" 99 | 54321,98765,"2024-01-21","11:30:00","EPR: 100 | Admission note for a 38-year-old female presenting with a mysterious neurological syndrome. 101 | The patient experiences sudden and transient episodes of total paralysis, lasting a few minutes. 102 | Extensive neurological workup initiated, including genetic testing for a rare hereditary paralysis disorder. 103 | Neurology and genetics consultations requested for further evaluation. 
104 | ","Admission Note","Familial Transient Paralysis Syndrome" 105 | 87654,23456,"2024-01-21","14:20:00","EPR: 106 | Follow-up note for a 45-year-old male with a history of unexplained fevers and skin lesions resembling butterfly wings. 107 | Extensive infectious disease and rheumatological workup inconclusive. 108 | Immunology and dermatology consultations ongoing to explore the possibility of a novel autoimmune disorder. 109 | ","Follow-up Note","Butterfly Wing Syndrome" 110 | 21098,76543,"2024-01-21","09:45:00","EPR: 111 | Emergency department note for a 28-year-old male presenting with acute respiratory distress and bizarre neuropsychiatric symptoms. 112 | Preliminary investigations inconclusive. 113 | Suspected rare autoimmune encephalitis with respiratory involvement. 114 | Immunotherapy initiated, and neurology and pulmonology consulted for collaborative management. 115 | ","Emergency Note","Autoimmune Encephalitis with Respiratory Distress" 116 | 67890,32109,"2024-01-21","16:10:00","EPR: 117 | Psychiatric evaluation for a 32-year-old female with sudden-onset obsessive-compulsive behaviors, including a compulsion to count objects in prime numbers. 118 | No history of psychiatric illness. 119 | Neurology and psychiatry consultations in progress for consideration of a rare neurodevelopmental disorder. 120 | ","Psychiatric Note","Prime Number Obsessive-Compulsive Disorder" 121 | 12345,65432,"2024-01-21","13:00:00","EPR: 122 | Consultation note for a 50-year-old male with chronic abdominal pain and gastrointestinal bleeding. 123 | nitial investigations inconclusive. 124 | Gastroenterology and hematology consulted for further evaluation of a suspected rare vascular malformation disorder affecting the gastrointestinal tract. 125 | ","Consultation Note","Gastrointestinal Vascular Malformation Syndrome" -------------------------------------------------------------------------------- /tests/medcat/resources/vocab.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/resources/vocab.dat -------------------------------------------------------------------------------- /update.py: -------------------------------------------------------------------------------- 1 | # Script to update the working_with_cogstack repo from github branch main 2 | import subprocess 3 | 4 | # Command to stash local changes 5 | stash_command = ["git", "stash", "save", "Stashing local changes"] 6 | 7 | # Command to list changes in the stash 8 | list_changes_command = ["git", "stash", "show", "-p", "--name-only"] 9 | 10 | # Command to perform a git pull 11 | pull_command = ["git", "pull"] 12 | 13 | # Replace this with the actual path of the file you want to restore 14 | files_to_restore = ["credentials.py"] 15 | 16 | try: 17 | # Run the command to stash local changes 18 | subprocess.run(stash_command, check=True) 19 | 20 | print("Local changes stashed.") 21 | 22 | # Check if there are stash entries 23 | stash_entries = subprocess.check_output(list_changes_command, text=True) 24 | 25 | # Run the command to list changes in the stash 26 | changes_output = subprocess.check_output(list_changes_command, text=True) 27 | 28 | print("Changes in the stash:") 29 | print(changes_output) 30 | if input(f"Only {', '.join(files_to_restore)} will be preserved.\nAre you sure you want to continue? 
(y/n)") == 'y': 31 | # Run the command to perform a git pull 32 | subprocess.run(pull_command, check=True) 33 | 34 | print("Pull complete.") 35 | 36 | # Run the command to restore the specific file 37 | for file_to_restore in files_to_restore: 38 | # Command to restore a specific file from the stash 39 | subprocess.run(["git", "checkout", "stash@{0}", "--", file_to_restore], check=True) 40 | 41 | print(f"File {file_to_restore} restored from stash.") 42 | else: 43 | print("Operation cancelled.") 44 | 45 | except subprocess.CalledProcessError as e: 46 | if e.returncode == 1: 47 | print("No stash entries found. Continuing with git pull.") 48 | # Run the command to perform a git pull 49 | subprocess.run(pull_command, check=True) 50 | else: 51 | print("An error occurred:") 52 | print(e) 53 | 54 | -------------------------------------------------------------------------------- /utils/clinical_note_splitter.py: -------------------------------------------------------------------------------- 1 | # This script is specific to certain hospital sites and is not part of the main repository. 2 | import regex 3 | import logging 4 | 5 | 6 | def normalize_date(date, id_, start, end): 7 | """Normalizes different dates encountered in the clinical notes. 8 | Current accepted formats: 9 | 28 Feb 2013 04:50 10 | Thu 28 Feb 2013 04:50 11 | 28-Feb-2013 04:50 12 | Output: 13 | 28 Feb 2013 04:50 14 | """ 15 | 16 | if '-' in date: 17 | date = date.replace("-", " ").strip() 18 | elif date.strip()[0].isalpha(): 19 | date = date[date.index(' '):].strip() 20 | elif date.strip()[0].isnumeric(): 21 | # all good 22 | date = date.strip() 23 | else: 24 | logging.warning("Unsupported date format: %s for id: %s with start: %s, end: %s", date, id_, start, end) 25 | return None 26 | 27 | return date 28 | 29 | 30 | def split_one_note(id_, text): 31 | """Splits the text of one note by date. 32 | 33 | Returns: 34 | List[Dict]: 35 | Returns a list of dictionaries in the format: {'start': <start>, 36 | 'end': <end>, 37 | 'text': <text>, 38 | 'date': <date>} 39 | """ 40 | r = r'\n\w{0,5}\s*\d{1,2}(\s|-)[a-zA-Z]{3,5}(\s|-)\d{4}\s+\d{2}\:\d{2}' 41 | dates = regex.finditer(r, text) 42 | start = 0 43 | end = -1 44 | split_note = [] 45 | previous_date = None 46 | 47 | for date in dates: 48 | if start == 0: 49 | start = date.span()[0] 50 | previous_date = date.captures()[0] 51 | elif previous_date is None or date.captures()[0] != previous_date: 52 | end = date.span()[0] 53 | note_text = text[start:end] 54 | if 'entered on -' in note_text.lower(): 55 | if len(regex.findall(r'entered on -', note_text)) > 1: 56 | logging.warning("Possible problems for span with start: %s and end: %s for note with id: %s", start, end, id_) 57 | split_note.append({'start': start, 'end': end, 'text': note_text, 'date': normalize_date(previous_date, id_, start, end)}) 58 | start = end 59 | previous_date = date.captures()[0] 60 | # Add the last note 61 | if previous_date is not None and 'entered on -' in text[start:].lower(): 62 | split_note.append({'start': start, 'end': len(text), 'text': text[start:], 'date': normalize_date(previous_date, id_, start, len(text))}) 63 | else: 64 | logging.warning("No date/entered-on detected for id: %s with start: %s, end: %s and text:\n%s...", id_, start, end, text[0:300]) 65 | 66 | return split_note 67 | 68 | 69 | def split_clinical_notes(clinical_notes): 70 | """Splits clinical notes. 71 | 72 | Args: 73 | clinical_notes(dict): 74 | Dictionary in the form {<note_id>: <note_text>, ...}. 75 | 76 | Returns: 77 | Dict: 78 | The split notes. 
79 | """ 80 | split_notes = {} 81 | for id_text, text in clinical_notes.items(): 82 | split_notes[id_text] = split_one_note(id_text, text) 83 | return split_notes 84 | -------------------------------------------------------------------------------- /utils/ethnicity_map.py: -------------------------------------------------------------------------------- 1 | # Mapped on top-level of 2001 NHS Data Dictionary; https://datadictionary.nhs.uk/data_elements/ethnic_category.html 2 | ethnicity_map = {'Algerian': 'Black', 3 | 'Any Other Group': 'Other', 4 | 'Asian and Chinese': 'Asian', 5 | 'Bangladeshi': 'Asian', 6 | 'Black African': 'Black', 7 | 'Black British': 'Black', 8 | 'British': 'White', 9 | 'Caribbean': 'Black', 10 | 'Chinese': 'Asian', 11 | 'Cypriot (Part nt st)': 'White', 12 | 'Ecuadorian': 'Other', 13 | 'English': 'White', 14 | 'Ethiopian': 'Black', 15 | 'Filipino': 'Asian', 16 | 'Ghanaian': 'Black', 17 | 'Greek Cypriot': 'White', 18 | 'Indian/British India': 'Asian', 19 | 'Iranian': 'Other', 20 | 'Italian': 'White', 21 | 'Mixed Black': 'Black', 22 | 'Mixed Caribbean': 'Black', 23 | 'Nigerian': 'Black', 24 | 'Not Given': 'Unknown', 25 | 'Not Specified': 'Unknown', 26 | 'Not Stated': 'Unknown', 27 | 'OTHER ASIAN BACKGROU': 'Asian', 28 | 'Other Asian Unspecif': 'Asian', 29 | 'OTHER BLACK BACKGROU': 'Black', 30 | 'Other Black Unspecif': 'Black', 31 | 'Other Ethnic Group': 'Other', 32 | 'Other Latin American': 'Other', 33 | 'OTHER WHITE BACK GRO': 'White', 34 | 'Other White Unspecif': 'White', 35 | 'Other White/Mixed Eu': 'White', 36 | 'Pakistani/British Pa': 'Asian', 37 | 'Portuguese': 'White', 38 | 'Somali': 'Black', 39 | 'Spanish': 'White', 40 | 'Sri Lankan': 'Asian', 41 | 'Sudanese': 'Black', 42 | 'Turkish': 'Other', 43 | 'Ugandan': 'Black', 44 | 'Vietnamese': 'Asian', 45 | 'White Irish': 'White', 46 | 'Former USSR Rep': 'White', 47 | 'POLISH': 'White', 48 | 'Iraqi': 'Other', 49 | 'Albanian': 'Other', 50 | 'Columbian': 'Other', 51 | 'Scottish': 'White', 52 | 'Not stated': 'Unknown', 53 | 'OTHER MIXED BACKGROU': 'Mixed', 54 | 'Welsh': 'White', 55 | 'British Asian': 'Asian', 56 | 'Caribbean Asian': 'Asian', 57 | 'Eritrean': 'Black', 58 | 'Turkish Cypriot': 'Other', 59 | 'Sinhalese': 'Asian', 60 | 'White and Asian': 'Asian', 61 | 'Other Mixed': 'Mixed', 62 | 'Mixed Asian': 'Asian', 63 | 'Greek': 'White', 64 | 'Arab': 'Other', 65 | 'MULTIPLE CODES': 'MULTIPLE CODES', 66 | 'Irish': 'White', 67 | 'Japanese': 'Asian', 68 | 'Middle East': 'Other', 69 | 'Croatian': 'White', 70 | 'Black and Asian': 'Mixed', 71 | 'Black and White': 'Mixed'} 72 | 73 | # Mapped on bottom-level of 2001 NHS Data Dictionary; https://datadictionary.nhs.uk/data_elements/ethnic_category.html 74 | ethnicity_map_detail = {'Algerian': 'Black or Black British - African', 75 | 'Any Other Group': 'Other Ethnic Groups - Any other ethnic group', 76 | 'Asian and Chinese': 'Other Ethnic Groups - Chinese', 77 | 'Bangladeshi': 'Asian or Asian British - Pakistani', 78 | 'Black African': 'Black or Black British - African', 79 | 'Black British': 'Black or Black British - Any Other Black background', 80 | 'British': 'White - British', 81 | 'Caribbean': 'Black or Black British - Caribbean', 82 | 'Chinese': 'Other Ethnic Groups - Chinese', 83 | 'Cypriot (Part nt st)': 'White - Any other White background', 84 | 'Ecuadorian': 'Other Ethnic Groups - Any other ethnic group', 85 | 'English': 'White - British', 86 | 'Ethiopian': 'Black or Black British - African', 87 | 'Filipino': 'Asian or Asian British - Any other Asian background', 88 | 
'Ghanaian': 'Black or Black British - African', 89 | 'Greek Cypriot': 'White - Any other White background', 90 | 'Indian/British India': 'Asian or Asian British - Indian', 91 | 'Iranian': 'Other Ethnic Groups - Any other ethnic group', 92 | 'Italian': 'White - Any other White background', 93 | 'Mixed Black': 'Black or Black British - Any other Black background', 94 | 'Mixed Caribbean': 'Black or Black British - Caribbean', 95 | 'Nigerian': 'Black or Black British - African', 96 | 'Not Given': 'Not stated', 97 | 'Not Specified': 'Not stated', 98 | 'Not Stated': 'Not stated', 99 | 'OTHER ASIAN BACKGROU': 'Asian or Asian British - Any other Asian background', 100 | 'Other Asian Unspecif': 'Asian or Asian British - Any other Asian background', 101 | 'OTHER BLACK BACKGROU': 'Black or Black British - Any Other Black background', 102 | 'Other Black Unspecif': 'Black or Black British - Any Other Black background', 103 | 'Other Ethnic Group': 'Other Ethnic Groups - Any other ethnic group', 104 | 'Other Latin American': 'Other Ethnic Groups - Any other ethnic group', 105 | 'OTHER WHITE BACK GRO': 'White - Any other White background', 106 | 'Other White Unspecif': 'White - Any other White background', 107 | 'Other White/Mixed Eu': 'White - Any other White background', 108 | 'Pakistani/British Pa': 'Asian or Asian British - Pakistani', 109 | 'Portuguese': 'White - Any other White background', 110 | 'Somali': 'Black or Black British - African', 111 | 'Spanish': 'White - Any other White background', 112 | 'Sri Lankan': 'Asian or Asian British - Any other Asian background', 113 | 'Sudanese': 'Black or Black British - African', 114 | 'Turkish': 'Other Ethnic Groups - Any other ethnic group', 115 | 'Ugandan': 'Black or Black British - African', 116 | 'Vietnamese': 'Other Ethnic Groups - Any other ethnic group', 117 | 'White Irish': 'White - Irish', 118 | 'Former USSR Rep': 'White - Any other White background', 119 | 'POLISH': 'White - Any other White background', 120 | 'Iraqi': 'Other Ethnic Groups - Any other ethnic group', 121 | 'Albanian': 'White - Any other White background', 122 | 'Columbian': 'Other Ethnic Groups - Any other ethnic group', 123 | 'Scottish': 'White - British', 124 | 'Not stated': 'Not stated', 125 | 'OTHER MIXED BACKGROU': 'Mixed - Any other mixed background', 126 | 'Welsh': 'White - British', 127 | 'British Asian': 'Asian or Asian British - Any other Asian background', 128 | 'Caribbean Asian': 'Mixed - Any other mixed background', 129 | 'Eritrean': 'Black or Black British - African', 130 | 'Turkish Cypriot': 'Other Ethnic Groups - Any other ethnic group', 131 | 'Sinhalese': 'Asian or Asian British - Any other Asian background', 132 | 'White and Asian': 'Mixed - White and Asian', 133 | 'Other Mixed': 'Mixed - Any other mixed background', 134 | 'Mixed Asian': 'Mixed - Any other mixed background', 135 | 'Greek': 'White - Any other White background', 136 | 'Arab': 'Other Ethnic Groups - Any other ethnic group', 137 | 'MULTIPLE CODES': 'MULTIPLE CODES', 138 | 'Irish': 'White - Irish', 139 | 'Japanese': 'Other Ethnic Groups - Any other ethnic group', 140 | 'Middle East': 'Other Ethnic Groups - Any other ethnic group', 141 | 'Croatian': 'White - Any other White background', 142 | 'Black and Asian': 'Mixed - White and Asian', 143 | 'Black and White': 'Mixed - Any other mixed background'} 144 | --------------------------------------------------------------------------------