├── .gitattributes
├── .github
│   └── workflows
│       └── main.yml
├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── cogstack.py
├── credentials.py
├── data
│   ├── ReadMe.md
│   ├── cogstack_search_results
│   │   └── ReadMe.md
│   ├── medcattrainer_export
│   │   └── ReadMe.md
│   ├── media
│   │   ├── cogstack_logo.png
│   │   ├── foresight_logo_unofficial.png
│   │   ├── medcat_logo.png
│   │   ├── medcat_pipeline_summary.png
│   │   └── nhs_logo.png
│   ├── snomed
│   │   ├── ReadMe.md
│   │   ├── preprocessing_snomed_ct.ipynb
│   │   └── umls_enricher.py
│   └── umls
│       ├── NLM_umls_download.py
│       ├── ReadMe.md
│       └── working_with_umls.ipynb
├── medcat
│   ├── 1_create_model
│   │   ├── create_cdb
│   │   │   ├── create_cdb.py
│   │   │   └── create_umls_cdb.py
│   │   ├── create_modelpack
│   │   │   └── create_modelpack.py
│   │   └── create_vocab
│   │       └── create_vocab.py
│   ├── 2_train_model
│   │   ├── 1_unsupervised_training
│   │   │   ├── splitter.py
│   │   │   ├── unsupervised training.ipynb
│   │   │   ├── unsupervised_medcattraining.py
│   │   │   └── unsupervised_training.py
│   │   ├── 2_supervised_training
│   │   │   ├── meta_annotation_training.ipynb
│   │   │   ├── meta_annotation_training_advanced.ipynb
│   │   │   └── supervised training.ipynb
│   │   └── ReadMe.md
│   ├── 3_run_model
│   │   ├── ReadMe.md
│   │   ├── run_model.ipynb
│   │   └── run_model.py
│   ├── ReadMe.md
│   ├── compare_models
│   │   ├── cmp_utils.py
│   │   ├── comp_nbhelper.py
│   │   ├── compare.py
│   │   ├── compare_annotations.py
│   │   ├── compare_cdb.py
│   │   ├── data
│   │   │   ├── demo-physio-mobility
│   │   │   │   ├── cui_filter.csv
│   │   │   │   └── intechopen_2cols_3.csv
│   │   │   └── some_synthetic_data.csv
│   │   ├── model_comparison.ipynb
│   │   ├── output.py
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── resources
│   │   │   │   ├── docs
│   │   │   │   │   └── not_real.csv
│   │   │   │   ├── mct_export
│   │   │   │   │   ├── medcat_trainer_expoert2.json
│   │   │   │   │   └── medcat_trainer_export.json
│   │   │   │   └── model_pack
│   │   │   │       ├── cdb.dat
│   │   │   │       └── vocab.dat
│   │   │   ├── test_compare.py
│   │   │   ├── test_compare_annotations.py
│   │   │   ├── test_compare_cdb.py
│   │   │   └── test_output.py
│   │   └── validation.py
│   └── evaluate_mct_export
│       ├── __init__.py
│       ├── mct_analysis.py
│       └── mct_export_summary.ipynb
├── models
│   ├── ReadMe.md
│   ├── cdb
│   │   └── .keep
│   ├── modelpack
│   │   └── ReadMe.md
│   └── vocab
│       └── .keep
├── mypy.ini
├── projects
│   ├── ReadMe.md
│   └── demo_project_stucture
│       └── ReadMe.md
├── requirements-dev.txt
├── requirements.txt
├── search
│   ├── .gitattributes
│   ├── ReadMe.md
│   └── search_template.ipynb
├── tests
│   ├── __init__.py
│   └── medcat
│       ├── 1_create_model
│       │   ├── __init__.py
│       │   ├── create_cdb
│       │   │   ├── __init__.py
│       │   │   └── test_create_cdb.py
│       │   ├── create_modelpack
│       │   │   ├── __init__.py
│       │   │   └── test_create_modelpack.py
│       │   └── create_vocab
│       │       ├── __init__.py
│       │       └── test_create_vocab.py
│       ├── 2_train_model
│       │   ├── 1_unsupervised_training
│       │   │   ├── __init__.py
│       │   │   └── test_splitter.py
│       │   └── __init__.py
│       ├── __init__.py
│       ├── evaluate_mct_export
│       │   ├── __init__.py
│       │   ├── offline_test_mct_analysis.py
│       │   └── test_mct_analysis.py
│       └── resources
│           ├── MCT_export_example.json
│           ├── cdb.dat
│           ├── example_cdb_input_snomed.csv
│           ├── example_cdb_input_umls.csv
│           ├── example_file_to_split.csv
│           └── vocab.dat
├── update.py
└── utils
    ├── clinical_note_splitter.py
    └── ethnicity_map.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb filter=strip-notebook-output
2 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: build
2 |
3 | on:
4 | push:
5 | branches: [ main ]
6 | pull_request:
7 | branches: [ main ]
8 |
9 | jobs:
10 | native-py:
11 |
12 | runs-on: ubuntu-24.04
13 | strategy:
14 | matrix:
15 | python-version: [ '3.9', '3.10', '3.11', '3.12' ]
16 | max-parallel: 4
17 |
18 | steps:
19 | - uses: actions/checkout@v2
20 | - name: Set up Python ${{ matrix.python-version }}
21 | uses: actions/setup-python@v2
22 | with:
23 | python-version: ${{ matrix.python-version }}
24 | - name: Install dependencies
25 | run: |
26 | python -m pip install --upgrade pip
27 | pip install -r requirements.txt
28 | pip install -r requirements-dev.txt
29 | - name: Typing
30 | # run mypy on all tracked non-test python modules
31 | # and use explicit package base since the project
32 | # is not set up as a python package
33 | run: |
34 | python -m mypy `git ls-tree --full-tree --name-only -r HEAD | grep ".py$" | grep -v "tests/"` --explicit-package-bases --follow-imports=normal
35 | - name: Test
36 | run: |
37 | python -m unittest discover
38 | python -m unittest discover -s medcat/compare_models
39 | # TODO - in the future, we might want to add automated tests for notebooks as well
40 | # though it's not really possible right now since the notebooks are designed
41 | # in a way that assumes interaction (i.e specifying model pack names)
42 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore all credential files
2 | credentials.py
3 |
4 | # Ignore ipynotebook checkpoints
5 | *.ipynb_checkpoints
6 |
7 | # data folders
8 | data/snomed/
9 | data/medcattrainer_export/
10 | data/cogstack_search_results/
11 |
12 | # Default environments
13 | venv
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | # Working with CogStack
9 | This repository contains all tools relevant to interacting with an NHS deployment of CogStack.
10 |
11 | It contains:
12 | 1) Easy to follow templates and instructions to interact and search CogStack.
13 | 2) Recommended workflows to create, train, and run, MedCAT models.
14 |
15 | For further discussions or questions, please join our official [CogStack/MedCAT forum](https://discourse.cogstack.org/)!
16 |
17 | __NOTE:__ This section is currently in development. Let me know if there is anything
18 | else to add!
19 |
20 |
21 | ## Setup
22 |
23 | Users can follow these steps to quickly set up and deploy this repository on their machine.
24 |
25 | Any code to enter in these instructions will be represented as `code to enter`.
26 |
27 | Please replace any placeholder values with your own specific details.
28 |
29 | ### Step 1: Clone this repository locally
30 |
31 | 1. Enter the directory where you would like to store these files. `cd path/to/where/you/want/this/repository`
32 |
33 | 2. Clone the online repository: `git clone https://github.com/CogStack/working_with_cogstack.git`
34 |
35 | For further instructions and self-help with git and git clone, please visit this [link](https://github.com/git-guides/git-clone).
36 |
37 | If you choose to use GitHub Desktop rather than the terminal, please refer to the [official GitHub Desktop guides](https://docs.github.com/en/desktop).
38 |
39 | 3. Optional: To update to the latest release of this repository: `git pull`
40 |
41 | ### Step 2: Creating a virtual environment and required packages
42 | (Requires Python 3.7+)
43 |
44 | __Windows__
45 | 1. Create a new virtual env: `python3 -m venv venv`
46 | 2. Load the virtual environment: `.\venv\Scripts\activate`
47 | 3. Install relevant packages and libraries: `pip install -r requirements.txt`
48 |
49 |
50 | __Linux/MAC OS__
51 | 1. Create a new virtual env: `python3 -m venv venv`
52 | 2. Load the virtual environment: `source venv/bin/activate`
53 | 3. Install relevant packages and libraries: `pip install -r requirements.txt`
54 |
55 | *Optional: If no Jupyter instance is installed.*
56 | 1. In the main folder of this repository, activate your virtual environment using the Step 2 command for your OS.
57 | 2. Start JupyterLab: `jupyter-lab`
58 |
59 |
60 | ### Step 3: Enter credentials and Login details
61 | In the main folder of this repository you can populate the [credentials.py](credentials.py) file with your own CogStack hostnames, usernames and passwords.
62 |
63 | For an automatic authentication experience, the credentials.py contents can be prepopulated with your CogStack instance credentials:
64 | ```
65 | hosts = [] # This is a list of your cogstack elasticsearch instances.
66 |
67 | # These are your login details (either via http_auth or API)
68 | username = None
69 | password = None
70 | ```
71 | For shared machines it is recommended that you leave the passwords blank. This will trigger a prompt when accessing a CogStack instance.
72 |
73 | If you have any questions or issues obtaining these details, please contact your local CogStack administrator.
74 |
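As a quick check that your details work, you can open a Python session (or notebook) in the repository root and connect using the `CogStack` helper class from [cogstack.py](cogstack.py). This is a minimal sketch only; it assumes you have completed Steps 1–2 and filled in `credentials.py`:

```python
from credentials import hosts, username, password
from cogstack import CogStack

# Any credentials left as None in credentials.py will be prompted for.
cs = CogStack(hosts=hosts, username=username, password=password)
print(cs.elastic.info())  # prints basic cluster information if the connection works
```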
75 | ## Contents
76 |
77 | ## [How to search using CogStack](search)
78 | This directory contains the basic search templates.
79 |
80 | For further information on CogStack please visit their [github](https://github.com/CogStack)
81 | or [wiki page](https://cogstack.org/).
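As an illustration, a minimal search sketch is shown below; the index name `your_index` and the match-all query are placeholders, and the notebooks in the search folder cover real searches in more detail:

```python
from credentials import hosts, username, password
from cogstack import CogStack

cs = CogStack(hosts=hosts, username=username, password=password)

# Replace with your own Elasticsearch DSL query and index name.
query = {"query": {"match_all": {}}}
df = cs.cogstack2df(query=query, index="your_index")
print(df.head())
```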
82 |
83 | ## [How to create a watcher](watcher)
84 | This directory contains the basic watcher job templates.
85 |
86 | ## [MedCAT](medcat)
87 | An overview of this process is shown below.
88 |
89 |
90 |
91 |
92 | Further information about MedCAT can be found from their [github](https://github.com/CogStack/MedCAT)
93 | or via their official documentation [here](https://medcat.readthedocs.io/en/latest/).
94 |
95 | General MedCAT tutorials can be found [here](https://github.com/CogStack/MedCATtutorials).
96 |
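For a flavour of what a trained model produces, a minimal annotation sketch is shown below; the model pack path is a placeholder, and the steps to create and train a model are covered in the [medcat](medcat) folder:

```python
from medcat.cat import CAT

# Placeholder path - point this at a model pack you have created or been given.
cat = CAT.load_model_pack("models/modelpack/<your_modelpack>.zip")

text = "Patient admitted with chest pain and shortness of breath."
entities = cat.get_entities(text)
print(entities["entities"])  # detected concepts with their CUIs and metadata
```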
97 |
98 | ### Demo
99 | A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III to annotate
100 | SNOMED-CT concepts. __Note:__ No supervised training has been provided to this model, so it should be used for demonstration
101 | purposes only.
102 |
103 | ### MedCAT Citation
104 | ```
105 | @ARTICLE{Kraljevic2021-ln,
106 | title="Multi-domain clinical natural language processing with {MedCAT}: The Medical Concept Annotation Toolkit",
107 | author="Kraljevic, Zeljko and Searle, Thomas and Shek, Anthony and Roguski, Lukasz and Noor, Kawsar and Bean, Daniel and Mascio, Aurelie and Zhu, Leilei and Folarin, Amos A and Roberts, Angus and Bendayan, Rebecca and Richardson, Mark P and Stewart, Robert and Shah, Anoop D and Wong, Wai Keong and Ibrahim, Zina and Teo, James T and Dobson, Richard J B",
108 | journal="Artif. Intell. Med.",
109 | volume=117,
110 | pages="102083",
111 | month=jul,
112 | year=2021,
113 | issn="0933-3657",
114 | doi="10.1016/j.artmed.2021.102083"
115 | }
116 | ```
117 |
118 |
119 | # Foresight (Coming soon...)
120 | Demo is available [here](https://foresight.sites.er.kcl.ac.uk/)
121 |
122 |
123 |
124 |
125 |
126 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/__init__.py
--------------------------------------------------------------------------------
/cogstack.py:
--------------------------------------------------------------------------------
1 | import getpass
2 | from typing import Dict, List, Any, Optional, Iterable, Tuple
3 | import elasticsearch
4 | import elasticsearch.helpers
5 | import pandas as pd
6 | from tqdm.notebook import tqdm
7 | import eland as ed
8 |
9 | import warnings
10 | warnings.filterwarnings("ignore")
11 |
12 | from credentials import *
13 |
14 |
15 | class CogStack(object):
16 | """
17 | A class for interacting with Elasticsearch.
18 |
19 | Args:
20 | hosts (List[str]): A list of Elasticsearch host URLs.
21 | username (str, optional): The username to use when connecting to Elasticsearch. If not provided, the user will be prompted to enter a username.
22 | password (str, optional): The password to use when connecting to Elasticsearch. If not provided, the user will be prompted to enter a password.
23 | api (bool, optional): A boolean value indicating whether to use API keys or basic authentication to connect to Elasticsearch. Defaults to False (i.e., use basic authentication). Elasticsearch 7.17.
24 | api_key (str, optional): The API key to use when connecting to Elasticsearch.
25 | When provided along with `api=True`, this takes precedence over username/password. Only available when using Elasticsearch 8.17.
26 | """
27 | def __init__(self, hosts: List, username: Optional[str] = None, password: Optional[str] = None,
28 | api: bool = False, timeout: Optional[int]=60, api_key: Optional[str] = None):
29 |
30 | if api_key and api:
31 | self.elastic = elasticsearch.Elasticsearch(hosts=hosts,
32 | api_key=api_key,
33 | verify_certs=False,
34 | timeout=timeout)
35 |
36 |
37 | elif api:
38 | api_username, api_password = self._check_auth_details(username, password)
39 | self.elastic = elasticsearch.Elasticsearch(hosts=hosts,
40 | api_key=(api_username, api_password),
41 | verify_certs=False,
42 | timeout=timeout)
43 |
44 | else:
45 | username, password = self._check_auth_details(username, password)
46 | self.elastic = elasticsearch.Elasticsearch(hosts=hosts,
47 | basic_auth=(username, password),
48 | verify_certs=False,
49 | timeout=timeout)
50 |
51 |
52 | def _check_auth_details(self, username=None, password=None) -> Tuple[str, str]:
53 | """
54 | Prompt the user for a username and password if the values are not provided as function arguments.
55 |
56 | Args:
57 |             username (str, optional): The username. If not provided, the user will be prompted to enter a username.
58 |             password (str, optional): The password. If not provided, the user will be prompted to enter a password.
59 |
60 | Returns:
61 |             Tuple[str, str]: A tuple containing the username and password.
62 | """
63 | if username is None:
64 | username = input("Username: ")
65 | if password is None:
66 | password = getpass.getpass("Password: ")
67 | return username, password
68 |
69 | def get_docs_generator(self, index: List, query: Dict, es_gen_size: int=800, request_timeout: Optional[int] = 300):
70 | """
71 | Retrieve a generator object that can be used to iterate through documents in an Elasticsearch index.
72 |
73 | Args:
74 | index (List[str]): A list of Elasticsearch index names to search.
75 | query (Dict): A dictionary containing the search query parameters.
76 | es_gen_size (int, optional): The number of documents to retrieve per batch. Defaults to 800.
77 | request_timeout (int, optional): The time in seconds to wait for a response from Elasticsearch before timing out. Defaults to 300.
78 |
79 | Returns:
80 | generator: A generator object that can be used to iterate through the documents in the specified Elasticsearch index.
81 | """
82 | docs_generator = elasticsearch.helpers.scan(self.elastic,
83 | query=query,
84 | index=index,
85 | size=es_gen_size,
86 | request_timeout=request_timeout)
87 | return docs_generator
88 |
89 | def cogstack2df(self, query: Dict, index: str, column_headers=None, es_gen_size: int=800, request_timeout: int=300,
90 | show_progress: bool = True):
91 | """
92 | Retrieve documents from an Elasticsearch index and convert them to a Pandas DataFrame.
93 |
94 | Args:
95 | query (Dict): A dictionary containing the search query parameters.
96 | index (str): The name of the Elasticsearch index to search.
97 | column_headers (List[str], optional): A list of column headers to use for the DataFrame. If not provided, the DataFrame will have default column names.
98 | es_gen_size (int, optional): The number of documents to retrieve per batch. Defaults to 800.
99 | request_timeout (int, optional): The time in seconds to wait for a response from Elasticsearch before timing out. Defaults to 300.
100 | show_progress (bool, optional): Whether to show the progress in console. Defaults to true.
101 |
102 | Returns:
103 | pandas.DataFrame: A DataFrame containing the retrieved documents.
104 | """
105 | docs_generator = elasticsearch.helpers.scan(self.elastic,
106 | query=query,
107 | index=index,
108 | size=es_gen_size,
109 | request_timeout=request_timeout)
110 | temp_results = []
111 | results = self.elastic.count(index=index, query=query['query'], request_timeout=300) # type: ignore
112 | for hit in tqdm(docs_generator, total=results['count'], desc="CogStack retrieved...", disable=not show_progress):
113 | row = dict()
114 | row['_index'] = hit['_index']
115 | row['_id'] = hit['_id']
116 | row['_score'] = hit['_score']
117 | row.update(hit['_source'])
118 | temp_results.append(row)
119 | if column_headers:
120 | df_headers = ['_index', '_id', '_score']
121 | df_headers.extend(column_headers)
122 | df = pd.DataFrame(temp_results, columns=df_headers)
123 | else:
124 | df = pd.DataFrame(temp_results)
125 | return df
126 |
127 | def DataFrame(self, index: str, columns: Optional[List[str]] = None):
128 | """
129 | Fast method to return a pandas dataframe from a CogStack search.
130 |
131 | Args:
132 | index (str): A list of indices to search.
133 | columns (List[str], optional): A list of column names to include in the DataFrame. If not provided, all columns will be included.
134 |
135 | Returns:
136 | DataFrame: A pd.DataFrame like object containing the retrieved documents.
137 | """
138 | return ed.DataFrame(es_client=self.elastic, es_index_pattern=index, columns=columns)
139 |
140 |
141 | def list_chunker(user_list: List[Any], n: int) -> List[List[Any]]:
142 | """
143 | Divide a list into sublists of a specified size.
144 |
145 | Args:
146 | user_list (List[Any]): The list to be divided.
147 | n (int): The size of the sublists.
148 |
149 | Returns:
150 | List[List[Any]]: A list of sublists containing the elements of the input list.
151 | """
152 | n=max(1, n)
153 | return [user_list[i:i+n] for i in range(0, len(user_list), n)]
154 |
155 |
156 | def _no_progress_bar(iterable: Iterable, **kwargs):
157 | return iterable
158 |
159 |
--------------------------------------------------------------------------------
/credentials.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | # CogStack login details
3 | ## Any questions on what these details are please contact your local CogStack administrator.
4 |
5 | hosts: List[str] = [] # This is a list of your CogStack ElasticSearch instances.
6 |
7 | ## These are your login details (either via http_auth or API). These should be str values.
8 | username = None
9 | password = None
10 |
11 | api_key = None # Encoded api key issued by your cogstack administrator.
12 |
13 | # NLM authentication
14 | # The UMLS REST API requires a UMLS account for the authentication described below.
15 | # If you do not have a UMLS account, you may apply for a license on the UMLS Terminology Services (UTS) website.
16 | # https://documentation.uts.nlm.nih.gov/rest/authentication.html
17 |
18 | # UMLS api key auth
19 | umls_apikey = None
20 |
21 | # SNOMED authentication from NHS TRUD. International releases will require different API access creds.
22 | # api key auth from NHS TRUD
23 | # For more information please see: https://isd.digital.nhs.uk/trud/users/guest/filters/0/api
24 | snomed_apikey = None
25 |
--------------------------------------------------------------------------------
/data/ReadMe.md:
--------------------------------------------------------------------------------
1 | # Storage location for all data/models
2 |
3 | To keep the repository clean all data and models should be stored here in their appropriate folder.
4 |
5 | This directory has been organised to assist in the workflow of Working with [CogStack](https://github.com/CogStack/CogStack-NiFi)
6 | and creating/evaluating [MedCAT models](https://github.com/CogStack/MedCAT).
7 |
8 | ## Retrieval and Storage of Data
9 | All raw data relating to a CogStack request should be stored here.
10 |
11 |
12 | ## SNOMED
13 |
14 | Placeholder for all SNOMED-related content and downloads [here](/data/snomed).
15 | For other terminologies (UMLS/RxNORM etc...) please create a separate folder and store them within this directory.
16 |
17 | ## MedCAT Models
18 |
19 | All model components and model packs should be stored [here](/data/medcat_models)
20 |
21 |
22 | ## MedCATtrainer
23 |
24 | All MedCATtrainer JSON exports should be stored [here](data/medcattrainer_export).
25 | Scripts to produce export summaries of all annotations and work done can be found [here](TODO: ).
26 |
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/data/cogstack_search_results/ReadMe.md:
--------------------------------------------------------------------------------
1 | # CogStack search results/requests.
2 |
3 | Add subdirectories to more efficiently manage and collate search results relating to a single CogStack Search request.
4 |
5 |
6 | ## Standardise Workflows (Optional)
7 | 
8 | The following are guidelines/recommendations to standardise your workflow:
9 | 
10 | - Good practice is to name files with the following structure: *YYYYMMDD_filename*
11 |
12 |
13 | A recommended format for the directory structure to efficiently manage each request is as follows:
14 | Ideally the *project_name* should correspond to your CogStack request ID.
15 |
16 |
17 | ```
18 | project_name/
19 | --- input/ # raw data files
20 | --- ref/ # reference files
21 | --- result/ # final results
22 | --- src/ # functions to source
23 | --- work/ # intermediate data
24 | --- main.py
25 | --- analysis.py
26 |
27 | ```
28 |
29 | __[input/]__: Contains the original, or raw, data files. Contents in this folder should be treated as read-only.
30 |
31 | __[ref/]__: Contains reference files, i.e. from research.
32 |
33 | __[result/]__: Contains the final results and explanatory markdown files.
34 |
35 | __[src/]__: Contains functions that are sourced from the main console code.
36 |
37 | __[work/]__: The working directory; it should be used to store temporary data files.
38 | The final scripts (main.py and other analysis scripts) are held directly in the project folder, outside of the sub-folders.
39 | Any intermediate data that you may want to reference later should be stored in the work sub-folder.
40 |
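If useful, the layout above can be scaffolded with a few lines of Python (a sketch only; `my_request_id` is a placeholder for your CogStack request ID):

```python
from pathlib import Path

project = Path("my_request_id")  # placeholder project/request name
for sub in ("input", "ref", "result", "src", "work"):
    (project / sub).mkdir(parents=True, exist_ok=True)  # create the sub-folders
for script in ("main.py", "analysis.py"):
    (project / script).touch()  # create empty script files to fill in
```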
--------------------------------------------------------------------------------
/data/medcattrainer_export/ReadMe.md:
--------------------------------------------------------------------------------
1 | # Placeholder for MedCATtrainer exports
2 |
3 | All materials exported from MedCATtrainer should be stored here.
4 |
5 |
6 | MedCATtrainer exports should be placed [here](data/medcattrainer_export).
--------------------------------------------------------------------------------
/data/media/cogstack_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/data/media/cogstack_logo.png
--------------------------------------------------------------------------------
/data/media/foresight_logo_unofficial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/data/media/foresight_logo_unofficial.png
--------------------------------------------------------------------------------
/data/media/medcat_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/data/media/medcat_logo.png
--------------------------------------------------------------------------------
/data/media/medcat_pipeline_summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/data/media/medcat_pipeline_summary.png
--------------------------------------------------------------------------------
/data/media/nhs_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/data/media/nhs_logo.png
--------------------------------------------------------------------------------
/data/snomed/ReadMe.md:
--------------------------------------------------------------------------------
1 | # SNOMED
2 |
3 | Placeholder for all SNOMED-related content and downloads.
4 |
5 | --------
6 |
7 | ## MedCAT preprocessing of SNOMED
8 | 
9 | SNOMED CT release files can be preprocessed with the following scripts:
10 |
11 | from medcat.utils.preprocess_snomed import Snomed
12 |
13 | snomed = Snomed({path_to_unzipped_snomed})
14 | df = snomed.to_concept_df()
15 |
16 | SNOMED UK edition or drug extension special releases can be preprocessed via:
17 |
18 | from medcat.utils.preprocess_snomed import Snomed
19 |
20 | snomed = Snomed({path_to_unzipped_snomed}, uk_ext=True, uk_drug_ext=False)
21 | df = snomed.to_concept_df()
22 |
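The resulting DataFrame can then be saved as a pre-cdb CSV in this folder for the CDB creation step (a sketch only; `preprocessed_snomed.csv` matches the default output name referenced in create_cdb.py, but adjust the name and path to your release):

    # Inspect and save the concept table for later use with create_cdb.py
    print(df.head())
    df.to_csv("preprocessed_snomed.csv", index=False)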
23 | Further information can be found [here](https://github.com/CogStack/MedCAT/blob/master/medcat/utils/preprocess_snomed.py)
24 |
25 | ## About
26 | SNOMED CT is a standardised clinical terminology consisting of >350,000 unique concepts. It is owned, maintained and distributed by SNOMED International.
27 |
28 | Please visit and explore https://www.snomed.org/ to find out further information about the various SNOMED CT products and services which they offer.
29 |
30 |
31 | ## What is SNOMED CT?
32 |
33 | SNOMED CT is a clinical terminology containing concepts with unique meanings and formal logic based definitions organised into hierarchies. For further information please see: https://confluence.ihtsdotools.org/display/DOCSTART/4.+SNOMED+CT+Basics
34 |
35 | ## SNOMED CT Design
36 | SNOMED CT content is represented by 3 main types of components:
37 |
38 | - Concepts representing clinical meanings that are organised into hierarchies.
39 | - Descriptions which link appropriate human readable terms to concepts
40 | - Relationships which link each concept to other related concepts
41 |
42 | It also contains mappings to classification systems such as:
43 | - ICD (International classifications of diseases)
44 | - OPCS (Office of Population Censuses and Surveys) (SNOMED UK extension only)
45 |
46 | ---------
47 |
48 |
49 | ## Access to SNOMED CT release files
50 |
51 | You may download SNOMED CT at the Member country’s designated website. The use of SNOMED CT in Member countries is free. Follow this [link](https://www.snomed.org/our-stakeholders/members) to find out if your country is a member state and explore the website to find where your national SNOMED CT distribution is held.
52 |
53 | E.g.
54 | * UK -> [NHS TRUD](https://isd.digital.nhs.uk/trud3/user/guest/group/0/home)
55 |
56 | * US -> [NIH National Library of Medicine](https://www.nlm.nih.gov/healthit/snomedct/international.html). Alternative clinical terminologies such as UMLS can also be found there.
57 |
58 |
59 | The following steps use services provided by SNOMED International through which organisations and individuals in non-Member countries can request access to the International Release of SNOMED CT.
60 |
61 | __To access SNOMED CT files from non-member countries:__
62 |
63 | 1. Please visit the SNOMED [Member Licensing and Distribution Service.](https://mlds.ihtsdotools.org/#/landing) and read their terms and conditions for use.
64 |
65 | 2. Login or Register for an account and wait to be granted access.
66 |
67 | 3. Once you have been granted access, log in and visit the ["Release Packages"](https://mlds.ihtsdotools.org/#/viewReleases) tab and retrieve the release of SNOMED CT that you would like. Alternatively, for the international SNOMED release, simply visit the [International releases](https://mlds.ihtsdotools.org/#/viewReleases/viewRelease/167) page.
68 |
69 | ----------
--------------------------------------------------------------------------------
/data/snomed/umls_enricher.py:
--------------------------------------------------------------------------------
1 | import os
2 | import zipfile
3 | import pandas as pd
4 | from umls_downloader import download_umls
5 | from tqdm.autonotebook import tqdm
6 |
7 | api_key = ''  # please add your UMLS API key
8 | version = '2022AA'
9 | outfile = f'{version}_UMLS_english.csv'
10 |
11 | umls_rows = []
12 | path = download_umls(version=version, api_key=api_key)
13 | with zipfile.ZipFile(path) as zip_file:
14 | with zip_file.open("MRCONSO.RRF", mode='r') as file:
15 | with tqdm(total=sum(1 for _ in file), unit='line') as pbar:
16 |             file.seek(0) # reset file pointer to the beginning of the file
17 | for line in file:
18 | umls_rows.append(line.decode('UTF-8').split('|')[:-1])
19 | pbar.update(1)
20 | columns = [
21 | 'CUI',
22 | 'LAT',
23 | 'TS',
24 | 'LUI',
25 | 'STT',
26 | 'SUI',
27 | 'ISPREF',
28 | 'AUI',
29 | 'SAUI',
30 | 'SCUI',
31 | 'SDUI',
32 | 'SAB',
33 | 'TTY',
34 | 'CODE',
35 | 'STR',
36 | 'SRL',
37 | 'SUPPRESS',
38 | 'CVF',
39 | ]
40 |
41 | umls_df = pd.DataFrame(columns=columns, data=umls_rows)
42 | eng_umls = umls_df[umls_df['LAT'] == 'ENG']
43 | del umls_df
44 | outfile = f'{version}_UMLS_english.csv'
45 | eng_umls.to_csv(outfile, index=False)
46 | print(f'file saved as {outfile}')
47 |
48 | medcat_csv_mapper = {  # maps UMLS columns to MedCAT pre-cdb CSV columns (not applied in this script)
49 | 'CUI':'cui',
50 | 'STR':'name',
51 | 'SAB':'ontologies',
52 | 'ISPREF':'name_status',
53 | 'TUI':'type_ids',
54 | }
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/data/umls/NLM_umls_download.py:
--------------------------------------------------------------------------------
1 | """
2 | Automating UMLS Terminology Services (UTS) Downloads
3 | The following instructions will allow you to automate the download of RxNorm, UMLS, or SNOMED CT files.
4 |
5 |
6 | Step 1: Get your API key from your UTS profile.
7 | You can find the API key in the UTS ‘My Profile’ area after signing in. An API key remains active as long as
8 | the associated UTS account is active.
9 | https://uts.nlm.nih.gov/uts/?_gl=1*veo3ht*_ga*MTkwNzE1ODcyOC4xNjYyOTcxNDg3*_ga_P1FPTH9PL4*MTY2Mjk3MTQ4Ni4xLjEuMTY2Mjk3MzA0OS4wLjAuMA..
10 |
11 | """
12 | import requests
13 | import sys
14 |
15 | apikey = '' # please add apikey
16 | DOWNLOAD_URL = 'https://download.nlm.nih.gov/umls/kss/2022AA/umls-2022AA-full.zip' # Change this to service required
17 | PATH_TO_DOWNLOAD = '' # Default outfile path will be written to current working directory
18 |
19 | print(DOWNLOAD_URL)
20 | value = DOWNLOAD_URL.split('/')
21 |
22 | if not apikey:
23 |     sys.exit("Please enter your API key")
24 |
25 | if not DOWNLOAD_URL:
26 | print("Usage: curl-uts-downloads-apikey.sh download_url ")
27 | print(" For full UMLS:")
28 | print(" e.g. curl-uts-download-apikey.sh https://download.nlm.nih.gov/umls/kss/2022AA/umls-2022AA-full.zip")
29 | print(" For RxNorm:")
30 | print(" e.g. curl-uts-download-apikey.sh https://download.nlm.nih.gov/umls/kss/rxnorm/RxNorm_full_current.zip")
31 | print(" curl-uts-download-apikey.sh https://download.nlm.nih.gov/umls/kss/rxnorm/RxNorm_weekly_current.zip")
32 | sys.exit("Download_url is empty")
33 |
34 | url = 'https://utslogin.nlm.nih.gov/cas/v1/api-key'
35 | param = {'apikey': apikey}
36 | headers = {'Content-type': 'application/x-www-form-urlencoded'}
37 |
38 | TGTresponse = requests.post(url, headers=headers, data=param)
39 | first, second = TGTresponse.text.split('api-key/')
40 | TGTTicket, fourth = second.split('" method')
41 |
42 | print(TGTTicket)
43 |
44 | url = 'https://utslogin.nlm.nih.gov/cas/v1/tickets/'+TGTTicket
45 | param = {'service': DOWNLOAD_URL}
46 | headers = {'Content-type': 'application/x-www-form-urlencoded'}
47 |
48 | STResponse = requests.post(url, headers=headers, data=param)
49 |
50 | print(STResponse.text)
51 |
52 | url = DOWNLOAD_URL+'?ticket='+STResponse.text
53 | r = requests.get(url, allow_redirects=True)
54 |
55 | with open(PATH_TO_DOWNLOAD + value[len(value)-1], 'wb') as f:
56 | f.write(r.content)
57 |
58 | # Retrieve HTTP meta-data
59 | print(r.status_code)
60 | print(r.headers['content-type'])
61 | print(r.encoding)
62 |
63 | print(f'File saved to: {str(PATH_TO_DOWNLOAD + value[len(value)-1])}')
64 | print('Download completed')
65 |
--------------------------------------------------------------------------------
/data/umls/ReadMe.md:
--------------------------------------------------------------------------------
1 | # UMLS - The Unified Medical Language System®
2 |
3 | Placeholder for all UMLS-related content and downloads.
4 |
5 | --------
6 |
7 | ## About
8 | The UMLS integrates and distributes key terminology, classification and coding standards,
9 | and associated resources to promote creation of more effective and interoperable biomedical information systems and services,
10 | including electronic health records.
11 |
12 | The UMLS, or Unified Medical Language System, is a set of files and software that brings together many health and
13 | biomedical vocabularies and standards to enable interoperability between computer systems.
14 |
15 | ## Access
16 |
17 | [Request a license](https://uts.nlm.nih.gov/uts/?_gl=1*1791eyk*_ga*MTkwNzE1ODcyOC4xNjYyOTcxNDg3*_ga_P1FPTH9PL4*MTY2Mjk3ODA3OS4yLjEuMTY2Mjk3OTQ4Mi4wLjAuMA..)
18 | and sign up for a UMLS Terminology Services (UTS) account.
19 |
20 | - UMLS licenses are issued only to individuals and not to groups or organizations.
21 | - There is no charge for licensing the UMLS from NLM. NLM is a member of [SNOMED International](http://www.snomed.org/)
22 | (owner of SNOMED CT), and there is no charge for SNOMED CT use in the United States and other [member countries](http://www.snomed.org/our-customers/members).
23 | Some uses of the UMLS may require additional agreements with individual terminology vendors.
24 | - Your UTS account provides access to the Unified Medical Language System (UMLS), the Value Set Authority Center (VSAC),
25 | RxNorm downloads, SNOMED CT downloads and more.
26 | - For more, visit [how to license and access UMLS data](https://www.nlm.nih.gov/databases/umls.html)
27 |
28 |
29 | Further information can be found on the [nlm website](https://www.nlm.nih.gov/research/umls/index.html)
30 |
31 |
32 |
33 | ## API Home
34 |
35 | ### Authentication
36 | All users of this terminology must register with NLM to download UMLS data (warning: some restrictions may apply depending on country; see the UMLS licence and its SNOMED CT appendix):
37 |
38 | https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html
39 |
40 | Documentation for User Authentication can be found [here](https://documentation.uts.nlm.nih.gov/rest/authentication.html)
41 |
42 |
43 | Further information about the UMLS API technical documentation can be found [here](https://documentation.uts.nlm.nih.gov/rest/home.html).
44 |
45 |
46 | ### Downloading UMLS
47 |
48 | One can use the scripts found in [NLM_umls_download.py](/data/umls/NLM_umls_download.py) to download the entire UMLS
49 | Knowledge Source.
50 |
51 | Otherwise, one can access the UMLS Knowledge Sources directly: File Downloads can be found [here](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html)
52 |
53 | Alternatively, you can simply follow the steps in the [working with UMLS notebook](/data/umls/working_with_umls.ipynb). This notebook will download UMLS and convert the
54 | MRCONSO.RRF file to a DataFrame. You can then process this file to get ready to build a MedCAT Concept Database!
55 |
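A minimal sketch of the programmatic download, as used in the notebook and in [umls_enricher.py](/data/snomed/umls_enricher.py) (the API key and version below are placeholders):

```python
from umls_downloader import download_umls

api_key = ""        # your UTS API key from your UTS profile
version = "2022AA"  # change to the UMLS release you need

path = download_umls(version=version, api_key=api_key)
print(path)  # location of the downloaded UMLS archive
```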
56 | ## Citing the UMLS
57 | If you use UMLS in your work, please cite the original article:
58 |
59 | Bodenreider O. The Unified Medical Language System (UMLS): integrating biomedical terminology. Nucleic Acids Res. 2004 Jan 1;32(Database issue):D267-70. doi: 10.1093/nar/gkh061. PubMed PMID: 14681409; PubMed Central PMCID: PMC308795.
60 |
61 |
62 |
--------------------------------------------------------------------------------
/data/umls/working_with_umls.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Working with UMLS\n",
8 | "\n",
9 |     "This notebook will walk you through how to:\n",
10 | "1) Download a specific version of UMLS\n",
11 | "\n",
12 |     "2) Process the MRCONSO.RRF file into a pandas DataFrame which you can then manipulate\n",
13 | "\n",
14 | "__Note:__ Keep in mind that the UMLS file sets are very large!"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Part 1: Downloading UMLS"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "collapsed": true
29 | },
30 | "outputs": [],
31 | "source": [
32 | "import os\n",
33 | "from umls_downloader import download_umls"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# Get this from https://uts.nlm.nih.gov/uts/edit-profile\n",
43 | "api_key = ''\n",
44 | "version = '2022AA' # Change this to the UMLS version that you require"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "path = download_umls(version=version, api_key=api_key)\n",
54 | "print(path) # This is where the UMLS files are now saved"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "## Part 2: Working with UMLS\n",
62 | "\n",
63 |     "The part of UMLS that we require is stored in the MRCONSO.RRF file. The file layout is as follows:\n",
64 | "\n",
65 | "__Concept Names and Sources (File = MRCONSO.RRF)__\n",
66 | "\n",
67 | "|Col.|Description|\n",
68 | "|---|---|\n",
69 | "|CUI|\tUnique identifier for concept|\n",
70 | "|LAT|\tLanguage of term|\n",
71 | "|TS|\tTerm status|\n",
72 | "|LUI|\tUnique identifier for term|\n",
73 | "|STT|\tString type|\n",
74 | "|SUI|\tUnique identifier for string|\n",
75 | "|ISPREF|\tAtom status - preferred (Y) or not (N) for this string within this concept|\n",
76 | "|AUI|\tUnique identifier for atom - variable length field, 8 or 9 characters|\n",
77 | "|SAUI|\tSource asserted atom identifier [optional]|\n",
78 | "|SCUI|\tSource asserted concept identifier [optional]|\n",
79 | "|SDUI|\tSource asserted descriptor identifier [optional]|\n",
80 | "|SAB|\tAbbreviated source name (SAB). Maximum field length is 20 alphanumeric characters. Two source abbreviations are assigned: Root Source Abbreviation (RSAB) — short form, no version information, for example, AI/RHEUM, 1993, has an RSAB of \"AIR\" Versioned Source Abbreviation (VSAB) — includes version information, for example, AI/RHEUM, 1993, has an VSAB of \"AIR93\" Official source names, RSABs, and VSABs are included on the UMLS Source Vocabulary Documentation page.\n",
81 | "|TTY|\tAbbreviation for term type in source vocabulary, for example PN (Metathesaurus Preferred Name) or CD (Clinical Drug). Possible values are listed on the Abbreviations Used in Data Elements page.|\n",
82 | "CODE|\tMost useful source asserted identifier (if the source vocabulary has more than one identifier), or a Metathesaurus-generated source entry identifier (if the source vocabulary has none)|\n",
83 | "|STR|\tString|\n",
84 | "|SRL|\tSource restriction level|\n",
85 | "|SUPPRESS|\tSuppressible flag. Values = O, E, Y, or N O: All obsolete content, whether they are obsolesced by the source or by NLM. These will include all atoms having obsolete TTYs, and other atoms becoming obsolete that have not acquired an obsolete TTY (e.g. RxNorm SCDs no longer associated with current drugs, LNC atoms derived from obsolete LNC concepts). E: Non-obsolete content marked suppressible by an editor. These do not have a suppressible SAB/TTY combination. Y: Non-obsolete content deemed suppressible during inversion. These can be determined by a specific SAB/TTY combination explicitly listed in MRRANK. N: None of the above. Default suppressibility as determined by NLM (i.e., no changes at the Suppressibility tab in MetamorphoSys) should be used by most users, but may not be suitable in some specialized applications. See the MetamorphoSys Help page for information on how to change the SAB/TTY suppressibility to suit your requirements. NLM strongly recommends that users not alter editor-assigned suppressibility, and MetamorphoSys cannot be used for this purpose.|\n",
86 | "|CVF|\tContent View Flag. Bit field used to flag rows included in Content View. This field is a varchar field to maximize the number of bits available for use.|"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "import zipfile\n",
96 | "import pandas as pd"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "umls_rows = []\n",
106 | "with zipfile.ZipFile(path) as zip_file:\n",
107 | " with zip_file.open(\"MRCONSO.RRF\", mode=\"r\") as file:\n",
108 | " for line in file:\n",
109 | " umls_rows.append(line.decode('UTF-8').split('|')[:-1])"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "columns = [\n",
119 | " \"CUI\",\n",
120 | " \"LAT\",\n",
121 | " \"TS\",\n",
122 | " \"LUI\",\n",
123 | " \"STT\",\n",
124 | " \"SUI\",\n",
125 | " \"ISPREF\",\n",
126 | " \"AUI\",\n",
127 | " \"SAUI\",\n",
128 | " \"SCUI\",\n",
129 | " \"SDUI\",\n",
130 | " \"SAB\",\n",
131 | " \"TTY\",\n",
132 | " \"CODE\",\n",
133 | " \"STR\",\n",
134 | " \"SRL\",\n",
135 | " \"SUPPRESS\",\n",
136 | " \"CVF\", \n",
137 | "]"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "umls_df = pd.DataFrame(columns=columns, data=umls_rows)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "umls_df.head()"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 |     "Feel free to now manipulate the DataFrame as you would like!"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": []
171 | }
172 | ],
173 | "metadata": {
174 | "kernelspec": {
175 | "display_name": "Python 3",
176 | "language": "python",
177 | "name": "python3"
178 | },
179 | "language_info": {
180 | "codemirror_mode": {
181 | "name": "ipython",
182 | "version": 3
183 | },
184 | "file_extension": ".py",
185 | "mimetype": "text/x-python",
186 | "name": "python",
187 | "nbconvert_exporter": "python",
188 | "pygments_lexer": "ipython3",
189 | "version": "3.8.2"
190 | }
191 | },
192 | "nbformat": 4,
193 | "nbformat_minor": 1
194 | }
195 |
--------------------------------------------------------------------------------
/medcat/1_create_model/create_cdb/create_cdb.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | from medcat.config import Config
4 | from medcat.cdb_maker import CDBMaker
5 |
6 | pd.options.mode.chained_assignment = None # type: ignore
7 |
8 | # relative to file path
9 | _FILE_DIR = os.path.dirname(__file__)
10 | # relative path to working_with_cogstack folder
11 | _REL_PATH = os.path.join("..", "..", "..")
12 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH)
13 | # absolute path to working_with_cogstack folder
14 | BASE_PATH = os.path.abspath(_BASE_PATH)
15 |
16 | EXPECTED_CSV_PATH = os.path.join(_REL_PATH, "data", "snomed")
17 |
18 | csv_path = input(f"Enter specific SNOMED pre-cdb csv found in the path {EXPECTED_CSV_PATH}: ")
19 | # The preprocessed files for SNOMED can be found here: ../../../data/snomed/
20 | # The default output is ../../../data/snomed/preprocessed_snomed.csv
21 | release = csv_path[-12:-4]  # assumes the file name ends with an 8-character release identifier before '.csv'
22 | # doing it here so that it can later be used for CDBMaker
23 | csv_path = os.path.join(EXPECTED_CSV_PATH, csv_path)
24 |
25 | model_dir = os.path.join(BASE_PATH, "models", "cdb")
26 | output_cdb = os.path.join(model_dir, f"{release}_SNOMED_cdb.dat")
27 | csv = pd.read_csv(csv_path)
28 |
29 | # Remove null values
30 | sctid_null_index = csv[csv['name'].isnull()].index.copy()
31 | csv['name'].iloc[sctid_null_index] = "N/A"
32 |
33 | # Only filter acronyms for specific Semantic tags
34 | csv['acronym'] = csv[~csv['description_type_ids'].str.contains("assessment scale|"
35 | "core metadata concept|"
36 | "metadata|"
37 | "foundation metadata concept"
38 | "|OWL metadata concept")]['name'].str.\
39 | extract("([A-Z]{2,6}) - ", expand=True)
40 |
41 | print("Cleaning acronyms...")
42 | for i, row in csv[(~csv['acronym'].isnull()) & (csv['name_status'] == 'A')][['name', 'acronym']].iterrows():
43 | if row['name'][0:len(row['acronym'])] == row['acronym']:
44 | csv['name'].iloc[i] = row['acronym'] # type: ignore
45 |
46 | print("acronyms complete")
47 |
48 | csv = csv.drop_duplicates(keep='first').reset_index(drop=True)
49 | csv.pop('acronym')
50 |
51 | # Setup config
52 | config = Config()
53 | config.general['spacy_model'] = 'en_core_web_md'
54 | config.cdb_maker['remove_parenthesis'] = 1
55 | config.general['cdb_source_name'] = f'SNOMED_{release}'
56 |
57 | maker = CDBMaker(config)
58 |
59 | # Create your CDB
60 | # Add more cdbs to the list
61 | csv_paths = [csv_path]
62 |
63 | cdb = maker.prepare_csvs(csv_paths, full_build=True)
64 |
65 | # Add type_id pretty names to cdb
66 | cdb.addl_info['type_id2name'] = pd.Series(csv.description_type_ids.values, index=csv.type_ids.astype(str)).to_dict()
67 | cdb.config.linking['filters']['cuis'] = set(csv['cui'].tolist()) # Add all cuis to filter out legacy terms.
68 |
69 | # save model
70 | cdb.save(output_cdb)
71 | print(f"CDB Model saved successfully as: {output_cdb}")
72 |
--------------------------------------------------------------------------------
/medcat/1_create_model/create_cdb/create_umls_cdb.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | from medcat.config import Config
4 | from medcat.cdb_maker import CDBMaker
5 |
6 | pd.options.mode.chained_assignment = None # type: ignore
7 |
8 | # relative to file path
9 | _FILE_DIR = os.path.dirname(__file__)
10 | # relative path to working_with_cogstack folder
11 | _REL_PATH = os.path.join("..", "..", "..")
12 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH)
13 | # absolute path to working_with_cogstack folder
14 | BASE_PATH = os.path.abspath(_BASE_PATH)
15 |
16 | EXPECTED_CSV_PATH = os.path.join(_REL_PATH, "data", "umls")
17 |
18 | # this is expected to be output from medcat.utils.preprocess_umls
19 | # i.e not the raw UMLS files
20 | csv_path = input(f"Enter specific UMLS pre-cdb csv found in the path data/umls in {EXPECTED_CSV_PATH}: ")
21 | # doing it here so that it can later be used for CDBMaker
22 | csv_path = os.path.join(EXPECTED_CSV_PATH, csv_path)
23 | release = '2022AA' # or as appropriate
24 |
25 | model_dir = os.path.join(BASE_PATH, "models", "cdb")
26 | if not os.path.exists(model_dir):
27 |     os.makedirs(model_dir)
28 |     print(f"Creating '{model_dir}' folder to store the model")
29 | 
30 | output_cdb = os.path.join(model_dir, f"{release}_UMLS_cdb.dat")
31 | csv = pd.read_csv(csv_path)
32 |
33 | # Remove null values
34 | sctid_null_index = csv[csv['name'].isnull()].index.copy()
35 | csv['name'].iloc[sctid_null_index] = "N/A"
36 |
37 | csv = csv.drop_duplicates(keep='first').reset_index(drop=True)
38 |
39 |
40 | # Setup config
41 | config = Config()
42 | config.general['spacy_model'] = 'en_core_web_md'
43 | config.cdb_maker['remove_parenthesis'] = 1
44 | config.general['cdb_source_name'] = f'UMLS_{release}'
45 |
46 | maker = CDBMaker(config)
47 |
48 |
49 | # Create your CDB
50 | # Add more cdbs to the list
51 | csv_paths = [csv_path]
52 | cdb = maker.prepare_csvs(csv_paths, full_build=True)
53 |
54 | # Add a cui filter to the cdb config
55 | cdb.config.linking['filters']['cuis'] = set(csv['cui'].tolist()) # Add all cuis to filter out legacy terms.
56 |
57 | # save model
58 | cdb.save(output_cdb)
59 | print(f"CDB Model saved successfully as: {output_cdb}")
60 |
--------------------------------------------------------------------------------
/medcat/1_create_model/create_modelpack/create_modelpack.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from medcat.vocab import Vocab
4 | from medcat.cdb import CDB
5 | from medcat.cat import CAT
6 |
7 | # relative to file path
8 | _FILE_DIR = os.path.dirname(__file__)
9 | # relative path to working_with_cogstack folder
10 | _REL_PATH = os.path.join("..", "..", "..")
11 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH)
12 | # absolute path to working_with_cogstack folder
13 | BASE_PATH = os.path.abspath(_BASE_PATH)
14 |
15 | DEFAULT_CDB_FOLDER = os.path.join(BASE_PATH, "models", "cdb")
16 |
17 | DEFAULT_VOCAB_FOLDER = os.path.join(BASE_PATH, "models", "vocab")
18 | DEFAULT_VOCAB_PATH = os.path.join(DEFAULT_VOCAB_FOLDER, 'vocab.dat')
19 |
20 | DEFAULT_MODELPACK_FOLDER = os.path.join(BASE_PATH, "models", "modelpack")
21 |
22 | model_name = "" # Change to specific cdb of interest
23 | modelpack_name = ".dat" # Change to the name of your model
24 |
25 | def load_cdb_and_save_modelpack(cdb_path: str,
26 | modelpack_name: str,
27 | modelpack_path: str = DEFAULT_MODELPACK_FOLDER,
28 | vocab_path: str = DEFAULT_VOCAB_PATH) -> str:
29 | """Load a CDB and save it as a model pack along with the default Vocab.
30 |
31 | Args:
32 | cdb_path (str): The CDB path to load.
33 | modelpack_name (str): The model pack name to write to.
34 | modelpack_path (str): The folder to write the model pack to.
35 | Defaults to `DEFAULT_MODELPACK_FOLDER`.
36 | vocab_path (str): The vocab path. Defaults to `DEFAULT_VOCAB_PATH`.
37 |
38 | Returns:
39 | str: The model pack path.
40 | """
41 | # Load cdb
42 | cdb = CDB.load(cdb_path)
43 |
44 | # Set cdb configuration
45 | # technically we already created this during the cdb creation
46 | cdb.config.ner['min_name_len'] = 2
47 | cdb.config.ner['upper_case_limit_len'] = 3
48 | cdb.config.general['spell_check'] = True
49 | cdb.config.linking['train_count_threshold'] = 10
50 | cdb.config.linking['similarity_threshold'] = 0.3
51 | cdb.config.linking['train'] = True
52 | cdb.config.linking['disamb_length_limit'] = 4
53 | cdb.config.general['full_unlink'] = True
54 |
55 | # Load vocab
56 | vocab = Vocab.load(vocab_path)
57 |
58 | # Initialise the model
59 | cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
60 |
61 | # Create and save model pack
62 | return cat.create_model_pack(save_dir_path=modelpack_path, model_pack_name=modelpack_name)
63 |
64 |
65 | def load_cdb_and_save_modelpack_in_def_location(cdb_name: str,
66 | modelpack_name: str) -> str:
67 | cdb_path = os.path.join(DEFAULT_CDB_FOLDER, cdb_name)
68 | return load_cdb_and_save_modelpack(cdb_path, modelpack_name,
69 | DEFAULT_MODELPACK_FOLDER,
70 | DEFAULT_VOCAB_PATH)
71 |
72 | def main():
73 | load_cdb_and_save_modelpack_in_def_location(model_name, modelpack_name)
74 |
75 | if __name__ == "__main__":
76 | main()
77 |
--------------------------------------------------------------------------------
/medcat/1_create_model/create_vocab/create_vocab.py:
--------------------------------------------------------------------------------
1 | from medcat.vocab import Vocab
2 | import os
3 |
4 | vocab = Vocab()
5 |
6 | # relative to file path
7 | _FILE_DIR = os.path.dirname(__file__)
8 | # relative path to working_with_cogstack folder
9 | _REL_PATH = os.path.join("..", "..", "..")
10 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH)
11 | # absolute path to working_with_cogstack folder
12 | BASE_PATH = os.path.abspath(_BASE_PATH)
13 | vocab_dir = os.path.join(BASE_PATH, "models", "vocab")
14 |
15 | # the vocab_data.txt file needs to be in the tab-separated format: <word>\t<word_count>\t<vector_embedding_separated_by_spaces>
16 | # The current vocab uses pre-calculated vector embeddings from Word2Vec; future versions may use embeddings calculated from a BERT tokeniser
17 | # embeddings of 300 dimensions are standard
18 |
19 | vocab.add_words(os.path.join(vocab_dir, 'vocab_data.txt'), replace=True)
20 | vocab.make_unigram_table()
21 | vocab.save(os.path.join(vocab_dir, "vocab.dat"))
22 |
--------------------------------------------------------------------------------
/medcat/2_train_model/1_unsupervised_training/splitter.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 |
4 | import pydantic
5 |
6 |
7 | class SplitIdentifier:
8 |     start_line_start_pattern: re.Pattern = re.compile(r'(\d+),(\d+),')
9 |
10 | def is_first_line(self, line: str) -> bool:
11 | """Check if the line in question is a suitable first line for an entry.
12 |
13 | The schema:
14 | "subject_id","hadm_id","chartdate","charttime","text","category","description"
15 |
16 | However, "text" is often multiline.
17 | So an example first line could be:
18 | 24776,139951,"2154-11-15 00:00:00","2154-11-15 17:48:00","HPI:
19 | That is, "subject_id","hadm_id","chartdate","charttime" and the start of "text"
20 |
21 | So currently, I am checking that the line:
22 | a) Starts with 2 integers separated by a comma
23 | b) Has an uneven number of quotation marks (i.e ends with an open quote)
24 | c) The number of quotes is greater than 4
25 |
26 | Args:
27 | line (str): The line in question
28 |
29 | Returns:
30 | bool: True if it's a suitable first line
31 | """
32 | # check if starts with 2 integers separated by comma
33 | if not self.start_line_start_pattern.match(line):
34 | return False
35 | nr_of_quotes = line.count('"')
36 | return (nr_of_quotes // 2) != (nr_of_quotes / 2) and nr_of_quotes > 4
37 |
38 | def is_last_line(self, line: str) -> bool:
39 |         """Check if the line in question is a suitable last line for an entry.
40 |
41 | The schema:
42 | "subject_id","hadm_id","chartdate","charttime","text","category","description"
43 |
44 | However, "text" is often multiline.
45 | So an example last line could be:
46 | ","Physician ","Physician Resident Progress Note"
47 | That is, the end of "text" and then "category","description"
48 |
49 | So currently I am checking that the line:
50 | a) Has an uneven number of quotation marks (i.e starts with an open quote)
51 | b) Number of quotes is greater than 4
52 |
53 | Args:
54 | line (str): The line in question
55 |
56 | Returns:
57 | bool: True if it's a suitable last line
58 | """
59 | nr_of_quotes = line.count('"')
60 | return (nr_of_quotes // 2) != (nr_of_quotes / 2) and nr_of_quotes > 4
61 |
62 |
63 | class SplitOptions(pydantic.BaseModel):
64 | lines_at_a_time: int
65 | out_file_format: str
66 | header_length: int = 1
67 |
68 |
69 | class SplitBuffer:
70 |
71 | def __init__(self, file_nr: int, opts: SplitOptions, split_identifier: SplitIdentifier, header: str) -> None:
72 | self.file_nr = file_nr
73 | self.opts = opts
74 | self.split_identifier = split_identifier
75 | self.lines: list = [header]
76 | self.prev_line_is_last = False
77 | self._is_done = False
78 |
79 | def save(self) -> None:
80 | file_name = self.opts.out_file_format % self.file_nr
81 | print('Saving', len(self.lines), 'to file nr',
82 | self.file_nr, ':', file_name)
83 | with open(file_name, 'w') as fw:
84 | fw.writelines(self.lines)
85 |
86 | def process_or_write(self, line_nr: int, line: str) -> 'SplitBuffer':
87 | """Process line and write if needed.
88 |
89 | If processing a line results in saving the data into a file, a new SplitBuffer is returned.
90 | This new instance will have the first line added to it already.
91 | If processing did not result in saving the data, the same instance is returned.
92 |
93 | Args:
94 | line_nr (int): The number of the line in the original
95 | line (str): The line contents
96 |
97 | Returns:
98 | SplitBuffer: Returns an instance of the buffer that should be used
99 | """
100 | if self._is_done:
101 | raise ValueError('Cannot reuse a SplitBuffer - create a new one')
102 | # line = line.replace('\n', '')
103 | has_passed_req_line = line_nr >= self.opts.lines_at_a_time * self.file_nr
104 | cur_line_is_last = self.split_identifier.is_last_line(line)
105 | cur_line_is_first = self.split_identifier.is_first_line(line)
106 | if has_passed_req_line and self.prev_line_is_last and cur_line_is_first:
107 | print('Currently at line', line_nr)
108 | self.save()
109 | # print('Saving', len(self.lines), 'up until', line_nr, 'to file number', self.file_nr, ':', out_file)
110 | # print('PREV line:\n', self.lines[-1])
111 | # print('NEW line:\n', line)
112 | self._is_done = True
113 | buffer = SplitBuffer(
114 | self.file_nr + 1, self.opts, self.split_identifier, header=self.lines[0])
115 | return buffer.process_or_write(line_nr, line)
116 | if cur_line_is_last:
117 | self.prev_line_is_last = cur_line_is_last
118 | self.lines.append(line)
119 | return self
120 |
121 |
122 | class Splitter:
123 |
124 | def __init__(self, opts: SplitOptions, split_identifier: SplitIdentifier) -> None:
125 | self.opts = opts
126 | self.split_identifier = split_identifier
127 |
128 | def split(self, in_file: str):
129 | with open(in_file, 'r') as f:
130 | buffer = None
131 | for line_nr, line in enumerate(f):
132 | if buffer is None: # for the first line, just consider the header
133 | buffer = SplitBuffer(
134 | 1, self.opts, self.split_identifier, header=line)
135 | continue
136 | buffer = buffer.process_or_write(line_nr, line)
137 | if buffer and len(buffer.lines) > 1: # if there's more than just a header
138 |                 buffer.save()  # save the remaining lines
139 |
140 |
141 | def split_file(in_file: str, nr_of_lines: int, out_file_format: str) -> None:
142 | """Splits a file into multiple files of the specified number of lines (or close to it).
143 |
144 | PS! This splitting is currently only designed for a narrow type of CSV files.
145 | This was created to split the MIMIC-III notes into parts. It may work with
146 | later MIMIC releases but is unlikely to work for other datasets.
147 |
148 | Args:
149 |         in_file (str): Path to the input CSV file to split
150 |         nr_of_lines (int): The (approximate) number of lines per output file
151 |         out_file_format (str): The output file name format; must contain '%d' for the part number
152 | """
153 | opts = SplitOptions(lines_at_a_time=nr_of_lines,
154 | out_file_format=out_file_format)
155 | split_identifier = SplitIdentifier()
156 | splitter = Splitter(opts, split_identifier)
157 | splitter.split(in_file)
158 |
159 |
160 | if __name__ == '__main__':
161 | import sys
162 | if len(sys.argv) < 3:
163 |         print('Need to specify the original file name and the target file format')
164 | sys.exit(2)
165 | orig_file = sys.argv[1]
166 | target_format = sys.argv[2]
167 | if '%d' not in target_format:
168 | print('Target format needs to contain "%d" for including number in the file names')
169 | sys.exit(2)
170 | nr_of_lines = 300000
171 | if len(sys.argv) > 3:
172 | try:
173 | nr_of_lines = int(sys.argv[3])
174 | except ValueError:
175 | print(
176 |                 'Third argument needs to be numeric (the number of lines per split)')
177 | sys.exit(2)
178 | split_file(orig_file, nr_of_lines, target_format)
179 |
--------------------------------------------------------------------------------
/medcat/2_train_model/1_unsupervised_training/unsupervised training.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from medcat.cat import CAT\n",
10 | "import pandas as pd\n",
11 | "import os"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# relative path to working_with_cogstack folder\n",
21 | "_rel_path = os.path.join(\"..\", \"..\", \"..\")\n",
22 | "# absolute path to working_with_cogstack folder\n",
23 | "base_path = os.path.abspath(_rel_path)\n",
24 | "data_dir = os.path.join(base_path, \"data\")\n",
25 | "data_file = '' # file containing training material.\n",
26 | "\n",
27 | "model_dir = os.path.join(data_dir, \"medcat_models\", \"modelpack\")\n",
28 | "\n",
29 | "modelpack = ''\n",
30 | "model_pack_path = os.path.join(model_dir, modelpack)\n",
31 | "\n",
32 | "output_modelpack = '' # Save name for new model\n"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## Initialise model"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "# Load modelpack\n",
49 | "cat = CAT.load_model_pack(model_pack_path)"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "cat.cdb.print_stats()"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {
65 | "scrolled": true
66 | },
67 | "outputs": [],
68 | "source": [
69 | "# Load training data\n",
70 | "data = pd.read_csv(os.path.join(data_dir, data_file))\n"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "data.shape"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {
86 | "scrolled": true
87 | },
88 | "outputs": [],
89 | "source": [
90 | "# Print statistics on the CDB before training\n",
91 | "cat.cdb.print_stats()\n",
92 | "\n",
93 | "# Run the annotation procedure over all the documents we have,\n",
94 | "# given that we have a large number of documents this can take quite some time.\n",
95 | "\n",
96 | "for i, text in enumerate(data['text'].values):\n",
97 | " # This will now run the training in the background \n",
98 | " try:\n",
99 | " _ = cat(text, do_train=True)\n",
100 | " except TypeError:\n",
101 | " pass\n",
102 | " \n",
103 | " # So we know how things are moving\n",
104 | " if i % 10000 == 0:\n",
105 |     "        print(\"Finished {} text blocks\".format(i))\n"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "# Print statistics on the CDB after training\n",
115 | "cat.cdb.print_stats()"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "# save modelpack\n",
125 | "cat.create_model_pack(save_dir_path=model_dir, model_pack_name=output_modelpack)\n"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "End of script"
133 | ]
134 | }
135 | ],
136 | "metadata": {
137 | "kernelspec": {
138 | "display_name": "medcat",
139 | "language": "python",
140 | "name": "python3"
141 | },
142 | "language_info": {
143 | "codemirror_mode": {
144 | "name": "ipython",
145 | "version": 3
146 | },
147 | "file_extension": ".py",
148 | "mimetype": "text/x-python",
149 | "name": "python",
150 | "nbconvert_exporter": "python",
151 | "pygments_lexer": "ipython3",
152 | "version": "3.10.8 (main, Nov 24 2022, 08:08:27) [Clang 14.0.6 ]"
153 | },
154 | "vscode": {
155 | "interpreter": {
156 | "hash": "4e4ccc64ca47f932c34194843713e175cf3a19af3798844e4190152d16ba61ca"
157 | }
158 | }
159 | },
160 | "nbformat": 4,
161 | "nbformat_minor": 5
162 | }
163 |
--------------------------------------------------------------------------------
/medcat/2_train_model/1_unsupervised_training/unsupervised_medcattraining.py:
--------------------------------------------------------------------------------
1 | from medcat.cat import CAT
2 | import logging
3 | import sys
4 | import os
5 | sys.path.append('../../../')
6 | from cogstack import CogStack
7 | from credentials import *
8 |
9 | medcat_logger = logging.getLogger('medcat')
10 | fh = logging.FileHandler('medcat.log')
11 | medcat_logger.addHandler(fh)
12 |
13 | ###Change parameters here###
14 | cogstack_indices: list = [] # list of cogstack indexes here
15 | text_columns = ['body_analysed'] # list of all text containing fields
16 | # relative to file path
17 | _FILE_DIR = os.path.dirname(__file__)
18 | # relative path to working_with_cogstack folder
19 | _REL_PATH = os.path.join("..", "..", "..")
20 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH)
21 | # absolute path to working_with_cogstack folder
22 | BASE_PATH = os.path.abspath(_BASE_PATH)
23 | model_pack_path = os.path.join(BASE_PATH, 'data', 'medcat_models', 'modelpack')
24 | model_pack_name = ''
25 | output_modelpack_name = '' # name of modelpack to save
26 |
27 | cs = CogStack(hosts, username=username, password=password, api=True)
28 | df = cs.DataFrame(index=cogstack_indices, columns=text_columns) # type: ignore
29 |
30 | cat = CAT.load_model_pack(os.path.join(model_pack_path, model_pack_name))
31 | cat.cdb.print_stats()
32 | cat.train(data_iterator=df[text_columns].iterrows(),
33 | nepochs=1,
34 | fine_tune=True,
35 | progress_print=10000,
36 | is_resumed=False)
37 |
38 | cat.cdb.print_stats()
39 |
40 | cat.create_model_pack(save_dir_path=model_pack_path, model_pack_name=output_modelpack_name)
41 |
--------------------------------------------------------------------------------
/medcat/2_train_model/1_unsupervised_training/unsupervised_training.py:
--------------------------------------------------------------------------------
1 | from medcat.cat import CAT
2 | import pandas as pd
3 | import os
4 | import logging
5 |
6 | # python medcat/2_train_model/1_unsupervised_training/splitter.py
7 | # medcat/2_train_model/1_unsupervised_training/all_notes.csv medcat/2_train_model/1_unsupervised_training/split_notes_5M_%d.csv 5000000
8 | #
9 |
10 | all_notes_file = 'all_notes.csv' # CHANGE AS NEEDED
11 |
12 | # in my case, I needed to split the notes into parts. Otherwise, the work just crashed at some point
13 | # I chose to split into 19 parts, around 5000000 lines at a time.
14 | split_format = 'split_notes_5M_%d.csv'
15 | nr_of_lines = 5000000
16 |
17 | from splitter import split_file
18 |
19 | if not os.path.exists(split_format%1):
20 |     print(f'\n\nSplitting file into {nr_of_lines} lines at a time. This will probably take some time\n\n')
21 | split_file(all_notes_file, nr_of_lines, split_format)
22 | print('\n\nDone with the split!\n\n')
23 | else:
24 |     print('\n\nNB! Expecting the split files to already exist\n\n')
25 |
26 | data_dir = '.' # CHANGE AS NEEDED
27 |
28 |
29 | # relative to file path
30 | _FILE_DIR = os.path.dirname(__file__)
31 | # relative path to working_with_cogstack folder
32 | _REL_PATH = os.path.join("..", "..", "..")
33 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH)
34 | # absolute path to working_with_cogstack folder
35 | BASE_PATH = os.path.abspath(_BASE_PATH)
36 |
37 | model_dir = os.path.join(BASE_PATH, "data", "medcat_models", "modelpack")
38 |
39 | modelpack = 'umls_model2_zip_0d4ccc7b9ae1ecd2.zip' # CHANGE AS NEEDED
40 | model_pack_path = os.path.join(model_dir, modelpack)
41 |
42 | output_modelpack = 'umls_self_train_model' # Save name for new model
43 |
44 | # Load modelpack
45 | print('Loading modelpack')
46 | cat = CAT.load_model_pack(model_pack_path)
47 | cat.log.addHandler(logging.StreamHandler()) # add console output
48 |
49 | print('STATS:')
50 | cat.cdb.print_stats()
51 |
52 | # CHANGE AS NEEDED - if the number of split files is different
53 | all_data_files = [f'split_notes_5M_{nr}.csv' for nr in range(1, 20)] # file containing training material.
54 | for i, data_file in enumerate(all_data_files):
55 | # Load training data
56 | print('Load data for', i, 'from', data_file)
57 | data = pd.read_csv(os.path.join(data_dir, data_file))
58 | cat.train(data.text.values, progress_print=100)
59 |
60 | print('Stats now, after', i)
61 | cat.cdb.print_stats()
62 |
63 | # save modelpack
64 | cat.create_model_pack(save_dir_path=model_dir, model_pack_name=f"{output_modelpack}_{i}")
65 |
66 | # save modelpack - ALL
67 | cat.create_model_pack(save_dir_path=model_dir, model_pack_name=output_modelpack)
68 |
69 |
--------------------------------------------------------------------------------
/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "d58c720d",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import json\n",
11 | "import os\n",
12 | "from datetime import date\n",
13 | "from medcat.cat import CAT\n",
14 | "from medcat.meta_cat import MetaCAT\n",
15 | "from medcat.config_meta_cat import ConfigMetaCAT\n",
16 | "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "id": "ca80af0e",
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "# if you want to enable info level logging\n",
27 | "import logging\n",
28 | "logging.basicConfig(level=logging.INFO,force=True)"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "id": "b1c5b9b0",
34 | "metadata": {},
35 | "source": [
36 | "#### 💡 To understand the model loading and other functionalities, please refer to the 'meta_annotation_training.ipynb' notebook"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 3,
42 | "id": "a2c0431f",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "model_pack = '' # .zip model pack location\n",
47 | "mctrainer_export = \"\" # name of your mct export"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "id": "808c27c1",
53 | "metadata": {},
54 | "source": [
55 |     "We won't load the models at this stage as they need to be separately loaded later. Let's check for meta models in the directory"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "id": "675eab49",
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "# Iterate through the meta_models contained in the model\n",
66 | "meta_model_names = []\n",
67 | "for dirpath, dirnames, filenames in os.walk(model_pack):\n",
68 | " for dirname in dirnames:\n",
69 | " if dirname.startswith('meta_'):\n",
70 | " meta_model_names.append(dirname[5:])\n",
71 | "\n",
72 | "print(\"Meta models:\",meta_model_names)"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "id": "9e499198",
78 | "metadata": {},
79 | "source": [
80 | "# Class weights \n",
81 | "\n",
82 |     "Adjusting class weights gives more importance to specific classes. Generally, class weights are used in favour of minority classes (classes with fewer samples) to boost their performance.\n",
83 |     "To use class weights, we have 2 options:\n",
84 |     "1. calculate class weights based on the class distribution\n",
85 |     "2. use specified class weights\n",
86 |     "\n",
87 |     "\n",
88 |     "#option 1\n",
89 |     "metacat.config.train['class_weights'] = []\n",
90 |     "metacat.config.train['compute_class_weights'] = True\n",
91 |     "\n",
92 |     "#option 2\n",
93 |     "metacat.config.train['class_weights'] = [0.4,0.3,0.1]"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "id": "fc07f3e9",
99 | "metadata": {},
100 | "source": [
101 |     "NOTE: Make sure to correctly map the class weights to their corresponding class index. To check the index assigned to the classes, use: `print(mc.config.general['category_value2id'])`\n",
102 |     "This will print a dictionary where the class names and their corresponding IDs (indices) are displayed.\n",
103 |     "The first position in the class weight list corresponds to the class with ID 0 in the dictionary, and so on."
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "id": "6a92aa60",
109 | "metadata": {},
110 | "source": [
111 | "# 2 phase learning for training\n",
112 | "\n",
113 |     "2 phase learning is used to mitigate class imbalance. In 2 phase learning, the models are trained twice:\n",
114 |     "Phase 1: trains for minority class(es) by undersampling data so that there is no class imbalance\n",
115 |     "Phase 2: trains for all classes\n",
116 |     "\n",
117 |     "Phase 1 ensures that the model learns minority class(es) and captures the details correctly.\n",
118 |     "Phase 2 is when the model is expected to learn the majority class as it is trained on the entire dataset.\n",
119 |     "\n",
120 |     "Paper reference - https://ieeexplore.ieee.org/document/7533053\n",
121 |     "Make sure to use class weights in favour of minority classes with 2 phase learning"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 5,
127 | "id": "5a86b839",
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "#--------------------------------Phase 1--------------------------------\n",
132 | "def run_phase_1(meta_model,class_wt_phase1 = None):\n",
133 | " #Loading the pre-defined config for phase 1\n",
134 | " config_ph_1_path = os.path.join(model_pack,\"meta_\"+meta_model,\"config_ph1.json\")\n",
135 | " with open(config_ph_1_path) as f:\n",
136 | " config_ph1 = json.load(f)\n",
137 | " mc = MetaCAT.load(save_dir_path=os.path.join(model_pack,\"meta_\"+meta_model),config_dict = config_ph1)\n",
138 | "\n",
139 | " if class_wt_phase1:\n",
140 | " mc.config.train['class_weights'] = class_wt_phase1\n",
141 | "\n",
142 | " #You can change the number of epochs, remember to keep them higher for phase 1\n",
143 | " mc.config.train['nepochs'] = 40 \n",
144 | "\n",
145 | " results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n",
146 | " # Save results\n",
147 | " json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase1.json'), 'w'))\n",
148 | "\n",
149 | "#--------------------------------Phase 2--------------------------------\n",
150 | "def run_phase_2(meta_model,class_wt_phase2 = None): \n",
151 | " #Loading the pre-defined config for phase 2\n",
152 | " config_ph_2_path = os.path.join(model_pack,\"meta_\"+meta_model,\"config_ph2.json\")\n",
153 | " with open(config_ph_2_path) as f:\n",
154 | " config_ph2 = json.load(f)\n",
155 | "\n",
156 | " mc = MetaCAT.load(save_dir_path=os.path.join(model_pack,\"meta_\"+meta_model),config_dict = config_ph2)\n",
157 | "\n",
158 | " if class_wt_phase2:\n",
159 | " mc.config.train['class_weights'] = class_wt_phase2\n",
160 | "\n",
161 | " #You can change the number of epochs\n",
162 | " mc.config.train['nepochs'] = 20\n",
163 | "\n",
164 | " results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n",
165 | " # Save results\n",
166 | " json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase2.json'), 'w'))\n",
167 | "\n",
168 | "#--------------------------------Driver--------------------------------\n",
169 | "# Train the first meta cat model\n",
170 | "meta_model = meta_model_names[0]\n",
171 | "\n",
172 | "# to overwrite the existing model, resave the fine-tuned model with the same model pack dir\n",
173 | "meta_cat_task = meta_model\n",
174 | "save_dir_path = os.path.join(model_pack,\"meta_\"+ meta_cat_task)\n",
175 | "\n",
176 | "# To use your own class weights instead of the pre-defined ones for the 2 phases, put the weights in the lists below\n",
177 | "class_wt_phase1 = [] # Example [0.4,0.4,0.2]\n",
178 | "class_wt_phase2 = [] # Example [0.4,0.3,0.3]\n",
179 | "\n",
180 | "\n",
181 | "# Train 2 phase learning\n",
182 | "print(\"*** Training meta cat: \",meta_model)\n",
183 | "print(\"Beginning Phase 1...\")\n",
184 | "run_phase_1(meta_model,class_wt_phase1)\n",
185 | "print(\"Beginning Phase 2...\")\n",
186 | "run_phase_2(meta_model,class_wt_phase2)"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "id": "60f0e878",
192 | "metadata": {},
193 | "source": [
194 | "# Generating synthetic data\n",
195 | "\n",
196 |     "You can generate synthetic data to help mitigate class imbalance. Use this code to generate synthetic data using an LLM - [link](https://gist.github.com/shubham-s-agarwal/401ef8bf6cbbd66fa0c76a8fbfc1f6c4). NOTE: the generated data will require a manual quality check to ensure that high quality and relevant data is used for training."
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "id": "431e1002",
202 | "metadata": {},
203 | "source": [
204 |     "The data generated by the gist code is in a different format from the one MedCAT expects, so it currently requires manual formatting. We will update this module to include code that handles this conversion."
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "id": "4d07d437",
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "# To run the training with original + synthetic data\n",
215 |     "# Follow all the same steps as before, up to and including loading the model\n",
216 |     "\n",
217 |     "# the format expected is [[['text','of','the','document'], [index of medical entity], \"label\"],\n",
218 |     "#                        [['text','of','the','document'], [index of medical entity], \"label\"]]\n",
219 | "\n",
220 | "synthetic_data_export = [[],[],[]]\n",
221 | "\n",
222 | "results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path,data_oversampled=synthetic_data_export)\n",
223 | "\n",
224 | "# Save results\n",
225 | "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))"
226 | ]
227 | }
228 | ],
229 | "metadata": {
230 | "kernelspec": {
231 | "display_name": "pytorch_medcat_clean",
232 | "language": "python",
233 | "name": "python3"
234 | },
235 | "language_info": {
236 | "codemirror_mode": {
237 | "name": "ipython",
238 | "version": 3
239 | },
240 | "file_extension": ".py",
241 | "mimetype": "text/x-python",
242 | "name": "python",
243 | "nbconvert_exporter": "python",
244 | "pygments_lexer": "ipython3",
245 | "version": "3.10.14"
246 | }
247 | },
248 | "nbformat": 4,
249 | "nbformat_minor": 5
250 | }
251 |
--------------------------------------------------------------------------------
/medcat/2_train_model/2_supervised_training/supervised training.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import json\n",
11 | "import pandas as pd\n",
12 | "from datetime import date\n",
13 | "from medcat.cat import CAT"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# will be used to date the trained model\n",
23 | "today = str(date.today())\n",
24 | "today = today.replace(\"-\",\"\")"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "ann_dir = \"working_with_cogstack/data/annotated_docs/\"\n",
34 | "mctrainer_export_path = ann_dir + \"MedCAT_Export_With_Text_2021-08-25_19_55_45.json\" # name of your mct export\n",
35 | "\n",
36 | "model_dir = 'working_with_cogstack/models/modelpack'\n",
37 | "\n",
38 | "modelpack = '' # name of modelpack\n",
39 | "model_pack_path = os.path.join(model_dir, modelpack)\n",
40 | "\n",
41 |     "output_modelpack = f\"{today}_trained_model\"\n",
42 | "\n",
43 | "# Add training filter if needed\n",
44 | "snomed_filter_path = None # path to snomed filter"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Create CAT - the main class from medcat used for concept annotation\n",
54 | "cat = CAT.load_model_pack(model_pack_path)\n",
55 |     "cat.config.linking['filters'] = {'cuis':set()} # To remove existing filters"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "# Set filter\n",
63 | "\n",
64 |     "This will speed up training, as you will only train a select number of concepts at once."
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "# Add extra training filter if required.\n",
74 | "if snomed_filter_path:\n",
75 | " snomed_filter = set(json.load(open(snomed_filter_path)))\n",
76 | "else:\n",
77 | " snomed_filter = set(cat.cdb.cui2preferred_name.keys())\n"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "# Train"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "cat.train_supervised_from_json(data_path=mctrainer_export_path, \n",
94 | " nepochs=3,\n",
95 | " reset_cui_count=False,\n",
96 | " print_stats=True,\n",
97 | " use_filters=True,\n",
98 |     "                               extra_cui_filter=snomed_filter, # If no filter is set, remove this line\n",
99 | " )\n"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "# Stats"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "data = json.load(open(mctrainer_export_path))"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {
122 | "scrolled": true
123 | },
124 | "outputs": [],
125 | "source": [
126 | "fps, fns, tps, cui_prec, cui_rec, cui_f1, cui_counts, examples = cat._print_stats(data, use_filters=True)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "cui = \"22298006\" # Myocardial infarction\n",
136 | "print(cui_f1[cui], cui_prec[cui], cui_rec[cui])"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": []
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "# Save\n",
151 | "\n",
152 |     "Also remember that you can save specific components within the modelpack, rather than creating a new one."
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "# save modelpack\n",
162 | "cat.create_model_pack(os.path.join(model_dir, output_modelpack))"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "# Test"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 |     "text = \"The patient has hypertension and an MI\"\n",
179 | "doc = cat.get_entities(text)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "doc"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": []
197 | }
198 | ],
199 | "metadata": {
200 | "kernelspec": {
201 | "display_name": "Python 3",
202 | "language": "python",
203 | "name": "python3"
204 | },
205 | "language_info": {
206 | "codemirror_mode": {
207 | "name": "ipython",
208 | "version": 3
209 | },
210 | "file_extension": ".py",
211 | "mimetype": "text/x-python",
212 | "name": "python",
213 | "nbconvert_exporter": "python",
214 | "pygments_lexer": "ipython3",
215 | "version": "3.9.6 (default, Sep 26 2022, 11:37:49) \n[Clang 14.0.0 (clang-1400.0.29.202)]"
216 | },
217 | "vscode": {
218 | "interpreter": {
219 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
220 | }
221 | }
222 | },
223 | "nbformat": 4,
224 | "nbformat_minor": 1
225 | }
226 |
--------------------------------------------------------------------------------
/medcat/2_train_model/ReadMe.md:
--------------------------------------------------------------------------------
1 | # MedCAT Model Training
2 |
3 | A MedCAT model will undergo two steps of training.
4 | The first is an unsupervised step which should only be done once.
5 |
6 | The Supervised training step ("Human in the loop") should be done to fine-tune and evaluate models.
7 | This step can be an iterative process where models can be "further" fine-tuned to improve their understanding of concepts.
8 |
9 |
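10 | Below is a minimal sketch of both steps, using the same MedCAT calls as the scripts in this folder. The model pack, file names and column name are placeholders - adjust them to your own setup.
11 |
12 | ```python
13 | import pandas as pd
14 | from medcat.cat import CAT
15 |
16 | # Load an existing model pack (placeholder path)
17 | cat = CAT.load_model_pack("path/to/modelpack.zip")
18 |
19 | # Step 1: unsupervised training over a corpus of free text (done once)
20 | notes = pd.read_csv("path/to/notes.csv")  # placeholder; expects a 'text' column
21 | cat.train(notes.text.values, progress_print=10000)
22 |
23 | # Step 2: supervised fine-tuning from a MedCATtrainer export (can be repeated)
24 | cat.train_supervised_from_json(data_path="path/to/MedCAT_Export_With_Text.json", nepochs=3)
25 |
26 | # Save the fine-tuned model as a new model pack
27 | cat.create_model_pack(save_dir_path="path/to/models", model_pack_name="trained_model")
28 | ```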
--------------------------------------------------------------------------------
/medcat/3_run_model/ReadMe.md:
--------------------------------------------------------------------------------
1 | # Running a model to annotate text
2 |
3 |
4 |
5 | ## Command line tips
6 | To run run_model.py in the background:
7 |
8 | Run the script in the background with `nohup python3 run_model.py &`. The output is saved to `nohup.out`.
9 |
10 |
11 |
12 | You can find the process and its Process ID (PID): `ps ax | grep run_model.py`
13 |
14 | Or list all running Python processes:
15 | `ps -fA | grep python`
16 |
17 | If you want to stop the execution, you can kill the process with this command:
18 | `kill PID`
19 |
20 |
--------------------------------------------------------------------------------
/medcat/3_run_model/run_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "from medcat.cat import CAT\n",
11 | "from medcat import cat\n",
12 | "import pandas as pd\n",
13 | "import json\n",
14 | "from tqdm.notebook import tqdm\n",
15 | "import re\n",
16 | "import pickle"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "# Add file logger\n",
26 | "import logging\n",
27 | "medcat_logger = logging.getLogger('medcat')\n",
28 | "fh = logging.FileHandler('medcat.log')\n",
29 | "medcat_logger.addHandler(fh)"
30 | ]
31 | },
32 | {
33 | "attachments": {},
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "# Paths and Config"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "# relative path to working_with_cogstack folder\n",
47 | "_rel_path = os.path.join(\"..\", \"..\", \"..\")\n",
48 | "# absolute path to working_with_cogstack folder\n",
49 | "base_path = os.path.abspath(_rel_path)\n",
50 | "vocab_dir = os.path.join(base_path, \"models\", \"vocab\")"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 |     "# Change these according to your project\n",
60 | "project_name = 'test_project' # Name of your project. Annotated documents relating to this project will be stored here.\n",
61 | "documents_to_annotate = \"cogstack_search_results/example documents to annotate.csv\" # Add your data file here\n",
62 | "\n",
63 |     "modelpack = '' # enter your model here. Should be the output of the trained 'output_modelpack'.\n",
64 | "snomed_filter_path = None\n",
65 | "\n",
66 | "\n",
67 | "# Constants (nothing to change below)\n",
68 | "data_dir = 'working_with_cogstack/data'\n",
69 | "\n",
70 | "data_path = os.path.join(base_path, data_dir, documents_to_annotate)\n",
71 | "doc_id_column = \"id\"\n",
72 | "doc_text_column = \"description\"\n",
73 | "\n",
74 | "model_dir = 'working_with_cogstack/models/modelpack'\n",
75 | "model_pack_path = os.path.join(base_path, model_dir, modelpack)\n",
76 | "\n",
77 | "ann_folder_path = os.path.join(base_path, data_dir, f'annotated_docs', project_name)\n",
78 | "if not os.path.exists(ann_folder_path):\n",
79 | " os.makedirs(ann_folder_path)\n",
80 | " print(f'Created folder to store annotations here: {ann_folder_path}')\n",
81 | " \n",
82 | "save_path_annotations_per_doc = os.path.join(base_path, ann_folder_path, \".json\")\n"
83 | ]
84 | },
85 | {
86 | "attachments": {},
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "# Load MedCAT model"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "# Create CAT - the main class from medcat used for concept annotation\n",
100 | "cat = CAT.load_model_pack(model_pack_path)"
101 | ]
102 | },
103 | {
104 | "attachments": {},
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "# Annotate"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "# Set snomed filter if needed\n",
118 | "# This is a white list filter of concepts\n",
119 | "if snomed_filter_path:\n",
120 | " snomed_filter = set(json.load(open(snomed_filter_path)))\n",
121 | "else:\n",
122 | " print('There is no concept filter set')\n",
123 | " snomed_filter = set(cat.cdb.cui2preferred_name.keys())\n",
124 | "\n",
125 | "cat.config.linking['filters']['cuis'] = snomed_filter \n"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "df = pd.read_csv(data_path)[[doc_id_column, doc_text_column]] # Not necessary to filter at this step. But this loads only what is required\n"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {
141 | "scrolled": true
142 | },
143 | "outputs": [],
144 | "source": [
145 | "# Create generator object\n",
146 | "def data_iterator(data, doc_name, doc_text):\n",
147 | " for id, row in data.iterrows():\n",
148 | " yield (row[doc_name], row[doc_text])"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "batch_char_size = 50000 # Batch size (BS) in number of characters\n",
158 | "cat.multiprocessing_batch_char_size(data_iterator(df, doc_id_column, doc_text_column),\n",
159 | " batch_size_chars=batch_char_size,\n",
160 | " only_cui=False,\n",
161 | " nproc=8, # Number of processors\n",
162 | " out_split_size_chars=20*batch_char_size,\n",
163 | " save_dir_path=ann_folder_path,\n",
164 | " min_free_memory=0.1,\n",
165 | " )\n",
166 | "\n",
167 | "medcat_logger.warning(f'Annotation process complete!')\n"
168 | ]
169 | },
170 | {
171 | "attachments": {},
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "### Double check if everything has been annotated.\n",
176 | "\n",
177 | "This does not check meta-annotations"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "# Check if everything has run smoothly. If an error has been raised check the logs\n",
187 | "try:\n",
188 | " # Path to your pickle file\n",
189 | " pickle_file_path = os.path.join(ann_folder_path, \"annotated_ids.pickle\")\n",
190 | " # Open the pickle file in read mode\n",
191 | " with open(pickle_file_path, \"rb\") as pickle_file:\n",
192 | " loaded_data = pickle.load(pickle_file)\n",
193 | " assert len(df) == len(loaded_data[0])\n",
194 | "except AssertionError as e:\n",
195 |     "    print(\"Error:\", \"There are documents which haven't been annotated! Check 'medcat.log' for more info\")\n"
196 | ]
197 | },
198 | {
199 | "attachments": {},
200 | "cell_type": "markdown",
201 | "metadata": {},
202 | "source": [
203 | "END OF SCRIPT"
204 | ]
205 | },
206 | {
207 | "attachments": {},
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": []
211 | },
212 | {
213 | "attachments": {},
214 | "cell_type": "markdown",
215 | "metadata": {},
216 | "source": [
217 | "### Inspect the model"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "text = \"He was diagnosed with heart failure\"\n",
227 | "doc = cat(text)\n",
228 | "print(doc.ents)"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "# Display Snomed codes\n",
238 | "for ent in doc.ents:\n",
239 | " print(ent, \" - \", ent._.cui, \" - \", cat.cdb.cui2preferred_name[ent._.cui])"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "# To show semantic types for each entity\n",
249 | "for ent in doc.ents:\n",
250 | " print(ent, \" - \", cat.cdb.cui2type_ids.get(ent._.cui))"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": null,
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "# Display\n",
260 | "from spacy import displacy\n",
261 | "displacy.render(doc, style='ent', jupyter=True)"
262 | ]
263 | },
264 | {
265 | "attachments": {},
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "# Alternative approach"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {
276 | "scrolled": true
277 | },
278 | "outputs": [],
279 | "source": [
280 |     "# This approach does not use multiprocessing, but iterates line by line through your dataset.\n",
281 | "\n",
282 | "docs = {}\n",
283 | "print(f\"Len of df: {len(df)}\") \n",
284 | "\n",
285 | "for i, row in tqdm(df.iterrows(), total=df.shape[0]):\n",
286 | " text = str(row[doc_text_column])\n",
287 | " \n",
288 |     "    # Skip (do not annotate) texts of 10 characters or fewer\n",
289 | " if len(text) > 10:\n",
290 | " docs[row[doc_id_column]] = cat.get_entities(text)\n",
291 | " else:\n",
292 | " docs[row[doc_id_column]] = []"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "cat.cdb.print_stats()"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "metadata": {},
308 | "outputs": [],
309 | "source": [
310 | "# Save to file (docs is docs 2 annotations)\n",
311 | "json.dump(docs, open(save_path_annotations_per_doc, \"w\"))\n"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {},
318 | "outputs": [],
319 | "source": []
320 | }
321 | ],
322 | "metadata": {
323 | "kernelspec": {
324 | "display_name": "Python 3",
325 | "language": "python",
326 | "name": "python3"
327 | },
328 | "language_info": {
329 | "codemirror_mode": {
330 | "name": "ipython",
331 | "version": 3
332 | },
333 | "file_extension": ".py",
334 | "mimetype": "text/x-python",
335 | "name": "python",
336 | "nbconvert_exporter": "python",
337 | "pygments_lexer": "ipython3",
338 | "version": "3.10.8"
339 | },
340 | "vscode": {
341 | "interpreter": {
342 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
343 | }
344 | }
345 | },
346 | "nbformat": 4,
347 | "nbformat_minor": 4
348 | }
349 |
--------------------------------------------------------------------------------
/medcat/3_run_model/run_model.py:
--------------------------------------------------------------------------------
1 | from medcat.cat import CAT
2 | import os
3 | import pandas as pd
4 | import json
5 |
6 | import logging
7 | medcat_logger = logging.getLogger('medcat')
8 | fh = logging.FileHandler('medcat.log')
9 | medcat_logger.addHandler(fh)
10 |
11 | import sys
12 | sys.path.append(os.path.join('..', '..'))
13 | from credentials import *
14 | from cogstack import CogStack
15 |
16 |
17 | # relative to file path
18 | _FILE_DIR = os.path.dirname(__file__)
19 | # relative path to working_with_cogstack folder
20 | _REL_PATH = os.path.join("..", "..", "..")
21 | _BASE_PATH = os.path.join(_FILE_DIR, _REL_PATH)
22 | # absolute path to working_with_cogstack folder
23 | BASE_PATH = os.path.abspath(_BASE_PATH)
24 | vocab_dir = os.path.join(BASE_PATH, "models", "vocab")
25 |
26 | # Initialise search
27 | cs = CogStack(hosts=hosts, username=username, password=password, api=True)
28 |
29 | cogstack_indices = [''] # Enter your list of relevant cogstack indices here
30 |
31 | # log size of indices
32 | df = cs.DataFrame(index=cogstack_indices, columns=['body_analysed']) # type: ignore
33 | medcat_logger.warning(f'The index size is {df.shape[0]}!')
34 | del df
35 |
36 | # Initialise the model
37 | base_path = BASE_PATH
38 | model_dir = os.path.join('models', 'modelpack')
39 |
40 | modelpack = '' # enter your model here. Should be the output of trained 'output_modelpack' from step 2.
41 | model_pack_path = os.path.join(base_path, model_dir, modelpack)
42 |
43 | snomed_filter_path = None
44 |
45 | data_dir = 'data'
46 | ann_folder_path = os.path.join(base_path, data_dir, f'annotated_docs')
47 | if not os.path.exists(ann_folder_path):
48 | os.makedirs(ann_folder_path)
49 |
50 | medcat_logger.warning(f'Annotations will be saved here: {ann_folder_path}')
51 |
52 | # Load CAT - the main class from medcat used for concept annotation
53 | cat = CAT.load_model_pack(model_pack_path)
54 |
55 | # Set snomed filter if needed
56 | # This is a white list filter of concepts
57 | if snomed_filter_path:
58 | snomed_filter = set(json.load(open(snomed_filter_path)))
59 | else:
60 | snomed_filter = set(cat.cdb.cui2preferred_name.keys())
61 |
62 | cat.config.linking['filters']['cuis'] = snomed_filter
63 | del snomed_filter
64 |
65 | # build query, change as appropriate
66 | query = {
67 | "query": {
68 | "match_all": {}
69 | },
70 | "_source":["_id", "body_analysed"]
71 | }
72 |
73 | search_gen = cs.get_docs_generator(index=cogstack_indices, query=query, request_timeout=None)
74 |
75 | def relevant_text_gen(generator, doc_id = '_id', text_col='body_analysed'):
76 | for i in generator:
77 | try:
78 | yield (i[doc_id], i['_source'][text_col])
79 | except KeyError:
80 | # medcat_logger.warning(f'KeyError {text_col} not found')
81 | continue
82 |
83 | batch_char_size = 500000 # Batch size (BS) in number of characters
84 |
85 | cat.multiprocessing_batch_char_size(relevant_text_gen(search_gen),
86 | batch_size_chars=batch_char_size,
87 | only_cui=False,
88 | nproc=8, # Number of processors
89 | out_split_size_chars=20*batch_char_size,
90 | save_dir_path=ann_folder_path,
91 | min_free_memory=0.1,
92 | )
93 |
94 | medcat_logger.warning(f'Annotation process complete!')
95 |
96 |
--------------------------------------------------------------------------------
/medcat/ReadMe.md:
--------------------------------------------------------------------------------
1 | # Medical Concept Annotation Tool
2 |
3 | This directory contains information on retrieving data and creating models.
4 | All details regarding creating, building and running the NLP model are stored here.
5 |
6 | ## Locations for storing data:
7 |
8 | - The [data](/data) directory stores textual content.
9 | Methods for retrieving data should be stored in the [retrieve_data](/search) folder.
10 |
11 | - The [MedCAT models](/data/medcat_models) directory holds models.
12 |
13 | ## Order of processing steps
14 |
15 | #### [__Step 1__](/medcat/1_create_model): Create the model
16 |
17 | Each of the model components are found [here.](/medcat/1_create_model)
18 | This directory contains all the components required to initialise a model pack.
19 |
20 | All models should be stored [here.](/models)
21 |
22 |
23 | #### [__Step 2__](/medcat/2_train_model): Perform training
24 |
25 | - [__Step 2.1__](/medcat/2_train_model/1_unsupervised_training): Unsupervised training
26 |
27 | The unsupervised training steps can be found within unsupervised_training folder.
28 |
29 |
30 | - [__Step 2.2__](/medcat/2_train_model/2_supervised_training): Supervised training
31 |
32 | After providing supervised labels with MedCATtrainer,
33 | the supervised training steps can be found within the supervised_training folder.
34 |
35 | #### [__Step 3__](/medcat/3_run_model): Run model
36 |
37 | Run the model on your corpus of documents and write the results to a CSV/SQL DB.
38 | Instructions on how to do this can be found within [run_model](/medcat/3_run_model/run_model.ipynb). A minimal end-to-end sketch of steps 1-3 is also included at the bottom of this page.
39 |
40 |
41 | ## General guidance on how to run an NER annotation project
42 |
43 | 1. Establish your Aims, Hypothesis and Scope.
44 |
45 | 2. Define your cohort/dataset. How will you identify your cohort and relevant documents?
46 |
47 | 3. Select a standardised clinical terminology and version that best fits your use case.
48 |
49 | 4. Select an existing model or create your own.
50 |
51 | 5. Produce annotation guidelines. Create a “gold standard” by manually labelling a sample of your dataset through annotations. This sample must be as representative as possible to ensure optimal model performance.
52 |
53 | 6. Train and compare the model to your “gold standard”. These annotations can be used for supervised training or benchmarking model performance.
54 |
55 | 7. Calculate performance metrics against the annotation sample.
56 |
57 | 8. Run over your entire data set.
58 |
59 | 9. Random stratified subsample review of performance.
60 |
61 | 10. (Optional, for generalisability) Test the model at an external site/dataset by repeating the validation in steps 8 and 9.
62 |
63 |
64 |
65 |
66 |
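67 | ## Minimal end-to-end sketch
68 |
69 | A rough sketch of steps 1-3 above, using the MedCAT API as it appears in the scripts and notebooks of this repository. The model pack path, data file and column name below are placeholders - see the step-specific folders for full, working examples.
70 |
71 | ```python
72 | import pandas as pd
73 | from medcat.cat import CAT
74 |
75 | # Step 1: load an existing model pack (or create one - see 1_create_model)
76 | cat = CAT.load_model_pack("models/modelpack/your_modelpack.zip")  # placeholder name
77 |
78 | # Step 2: train - unsupervised on raw text, then supervised from a MedCATtrainer export
79 | notes = pd.read_csv("data/notes.csv")  # placeholder; expects a 'text' column
80 | cat.train(notes.text.values, progress_print=10000)
81 | cat.train_supervised_from_json(data_path="data/medcattrainer_export/MedCAT_Export_With_Text.json", nepochs=3)
82 |
83 | # Step 3: run the trained model over documents and collect annotations per document
84 | annotations = {i: cat.get_entities(text) for i, text in enumerate(notes.text.values)}
85 | ```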
--------------------------------------------------------------------------------
/medcat/compare_models/cmp_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Type, TypeVar, Generic, Iterable, Callable, Optional
2 |
3 | import sqlite3
4 | import re
5 | from pydantic import BaseModel
6 |
7 |
8 | T = TypeVar('T', bound=BaseModel)
9 |
10 |
11 | def sanitize_table_name(name, max_length=64):
12 | # Replace any characters not allowed in table names with underscores
13 | name = re.sub(r'[^a-zA-Z0-9_$]', '_', name)
14 | # Truncate the name if it's too long
15 | name = name[:max_length]
16 | return name
17 |
18 |
19 | class SaveOptions(BaseModel):
20 | use_db: bool = False
21 | db_file_name: Optional[str] = None
22 | clean_callback: Optional[Callable[[], None]] = None
23 |
24 |
25 | class DifferenceDatabase(Generic[T]):
26 |
27 | def __init__(self, db_file: str, part: str, model_type: Type[T],
28 | batch_size: int = 100):
29 | self.db_file = db_file
30 | self.part = sanitize_table_name(part)
31 | self.model_type = model_type
32 | self.conn = sqlite3.connect(self.db_file)
33 | self.cursor = self.conn.cursor()
34 | self._create_table()
35 | self._len = 0
36 | self._batch_size = batch_size
37 |
38 | def _create_table(self):
39 | self.cursor.execute(f'''CREATE TABLE IF NOT EXISTS differences_{self.part}
40 | (id INTEGER PRIMARY KEY, data TEXT)''')
41 | self.conn.commit()
42 |
43 | def append(self, difference: T):
44 | data = difference.json()
45 | self.cursor.execute(f"INSERT INTO differences_{self.part} (data) VALUES (?)", (data,))
46 | self.conn.commit()
47 | self._len += 1
48 |
49 | def __iter__(self) -> Iterable[T]:
50 | self.cursor.execute(f"SELECT data FROM differences_{self.part}")
51 | while True:
52 | rows = self.cursor.fetchmany(self._batch_size)
53 | if not rows:
54 | break
55 | for row in rows:
56 | yield self.model_type.parse_raw(row[0])
57 |
58 | def __len__(self) -> int:
59 | return self._len
60 |
61 | def __del__(self):
62 | self.conn.close()
63 |
--------------------------------------------------------------------------------
/medcat/compare_models/comp_nbhelper.py:
--------------------------------------------------------------------------------
1 | from ipyfilechooser import FileChooser
2 | from ipywidgets import widgets
3 | from IPython.display import display
4 | import os
5 | from typing import List, Optional
6 |
7 |
8 | from compare import get_diffs_for
9 | from output import parse_and_show, show_dict_deep, compare_dicts
10 |
11 |
12 | _def_path = '../../models/modelpack'
13 | _def_path = _def_path if os.path.exists(_def_path) else '.'
14 |
15 |
16 | class NBComparer:
17 |
18 | def __init__(self, model_path_1: str, model_path_2: str,
19 | documents_file: str, doc_limit: int, is_mct_export_compare: bool,
20 | cui_filter: str, filter_children: bool) -> None:
21 | self.model_path_1 = model_path_1
22 | self.model_path_2 = model_path_2
23 | self.documents_file = documents_file
24 | self.doc_limit = doc_limit
25 | self.is_mct_export_compare = is_mct_export_compare
26 | self.cui_filter = cui_filter
27 | self.filter_children = filter_children
28 | self._run_comparison()
29 |
30 | def _run_comparison(self):
31 | (self.cdb_comp, self.tally1, self.tally2, self.ann_diffs) = get_diffs_for(
32 | self.model_path_1, self.model_path_2, self.documents_file,
33 | cui_filter=self.cui_filter, include_children_in_filter=self.filter_children,
34 | supervised_train_comparison_model=self.is_mct_export_compare, doc_limit=self.doc_limit)
35 |
36 | def show_all(self):
37 | parse_and_show(self.cdb_comp, self.tally1, self.tally2, self.ann_diffs)
38 |
39 | def show_per_document(self, limit: int = -1, print_delimiter: bool = True,
40 | ignore_empty: bool = True):
41 | cnt = 0
42 | for key in self.ann_diffs.per_doc_results.keys():
43 | comp_dict = self.ann_diffs.per_doc_results[key].nr_of_comparisons
44 | if not ignore_empty or comp_dict: # ignore empty ones
45 | if print_delimiter:
46 | print('='*20,f'\n{key}', f'\n{"="*20}')
47 | show_dict_deep(self.ann_diffs.per_doc_results[key].nr_of_comparisons)
48 | cnt += 1
49 | if limit > -1 and cnt == limit:
50 | break
51 |
52 | def diffs_to_csv(self, file_path: str) -> None:
53 | self.ann_diffs.to_csv(file_path)
54 |
55 | def compare_for_cui(self, cui: str, include_children: int = 2) -> None:
56 | per_cui1 = self.tally1.get_for_cui(cui, include_children=include_children)
57 | per_cui2 = self.tally2.get_for_cui(cui, include_children=include_children)
58 | compare_dicts(per_cui1, per_cui2)
59 |
60 | def show_docs(self, docs: List[str], show_delimiter: bool = True,
61 | omit_identical: bool = True):
62 | for doc_name, pair in self.ann_diffs.iter_ann_pairs(docs=docs, omit_identical=omit_identical):
63 | if show_delimiter:
64 | print('='*20,f'\n{doc_name} ({pair.comparison_type})', f'\n{"="*20}')
65 | # NOTE: if only one of the two has an annotation, the other one will be None
66 | # the following will deal with that automatically, though
67 | compare_dicts(pair.one, pair.two)
68 |
69 |
70 | class NBInputter:
71 | models_overall_title = "Models and data"
72 | mc1_title = "Choose model 1"
73 | mc2_title = "Choose model 2 (or an MCT export)"
74 | docs_title = "Choose the documents file (.csv with 'text' field)"
75 | docs_limit_title = "Limit the number of documents to run (-1 to disable)"
76 | mct_export_title = "Is the 2nd path an MCT export (instead of a model)?"
77 | cui_filter_title_overall = "CUI Filter"
78 | cui_filter_title_file_chooser = "Choose file with comma-separated CUIs"
79 | cui_filter_title_text = "List comma-separated CUIs"
80 | cui_children_title = "How many layers of children of concepts to include?"
81 |
82 | def __init__(self) -> None:
83 | self.model1_chooser = FileChooser(_def_path)
84 | self.model2_chooser = FileChooser(_def_path)
85 | self.documents_chooser = FileChooser(".")
86 | self.doc_limit = widgets.IntText(-1)
87 | self.ckbox = widgets.Checkbox(description="MCT export compare")
88 |
89 | self.cui_filter_chooser = FileChooser(".", description="The CUI filter file")
90 | self.cui_filter_box = widgets.Textarea(description="CUI list")
91 | self.cui_children = widgets.IntText(description="Children", value=-1)
92 |
93 | def show_all(self):
94 | model_choosers = widgets.VBox([
95 |             widgets.HTML(f"{self.models_overall_title}"),
96 | widgets.VBox([widgets.Label(self.mc1_title), self.model1_chooser]),
97 | widgets.VBox([widgets.Label(self.mc2_title), self.model2_chooser]),
98 | widgets.VBox([widgets.Label(self.docs_title), self.documents_chooser]),
99 | widgets.VBox([widgets.Label(self.docs_limit_title), self.doc_limit]),
100 | widgets.VBox([widgets.Label(self.mct_export_title), self.ckbox])
101 | ])
102 |
103 | cui_filter = widgets.VBox([
104 |             widgets.HTML(f"{self.cui_filter_title_overall}"),
105 | widgets.VBox([widgets.Label(self.cui_filter_title_file_chooser), self.cui_filter_chooser]),
106 | widgets.VBox([widgets.Label(self.cui_filter_title_text), self.cui_filter_box]),
107 | widgets.VBox([widgets.Label(self.cui_children_title), self.cui_children])
108 | ])
109 |
110 | # Combine all sections into a main VBox
111 | main_box = widgets.VBox([
112 | model_choosers,
113 | cui_filter
114 | ])
115 | display(main_box)
116 |
117 |
118 | def _get_params(self):
119 | model_path_1 = self.model1_chooser.selected
120 | model_path_2 = self.model2_chooser.selected
121 | documents_file = self.documents_chooser.selected
122 | doc_limit = self.doc_limit.value
123 | is_mct_export_compare = self.ckbox.value
124 | if not is_mct_export_compare:
125 | print(f"For models, selected:\nModel1: {model_path_1}\nModel2: {model_path_2}"
126 | f"\nDocuments: {documents_file}")
127 | else:
128 | print(f"Selected:\nModel: {model_path_1}\nMCT export: {model_path_2}"
129 | f"\nDocuments: {documents_file}")
130 | # CUI filter
131 | cui_filter = None
132 | filter_children = None
133 | if self.cui_filter_chooser.selected:
134 | cui_filter = self.cui_filter_chooser.selected
135 | elif self.cui_filter_box.value:
136 | cui_filter = self.cui_filter_box.value
137 | if self.cui_children.value and self.cui_children.value > 0:
138 | filter_children = self.cui_children.value
139 | print(f"For CUI filter, selected:\nFilter: {cui_filter}\nChildren: {filter_children}")
140 | return (model_path_1, model_path_2, documents_file, doc_limit, is_mct_export_compare, cui_filter, filter_children)
141 |
142 | def get_comparison(self) -> NBComparer:
143 | return NBComparer(*self._get_params())
144 |
--------------------------------------------------------------------------------
/medcat/compare_models/compare.py:
--------------------------------------------------------------------------------
1 | from typing import List, Tuple, Dict, Set, Optional, Union, Iterator
2 | from functools import partial
3 | import glob
4 |
5 | from medcat.cat import CAT
6 |
7 | import pandas as pd
8 | import tqdm
9 | import tempfile
10 | from itertools import islice
11 |
12 | from compare_cdb import compare as compare_cdbs, CDBCompareResults
13 | from compare_annotations import ResultsTally, PerAnnotationDifferences
14 | from output import parse_and_show
15 | from cmp_utils import SaveOptions
16 | from validation import validate_input
17 |
18 |
19 |
20 | def load_documents(file_name: str, doc_limit: int = -1) -> Iterator[Tuple[str, str]]:
21 | with open(file_name) as f:
22 | df = pd.read_csv(f, names=["id", "text"])
23 | if df.iloc[0].id == "id" and df.iloc[0].text == "text":
24 | # removes the header
25 | # but also messes up the index a little
26 | df = df.iloc[1:, :]
27 | if doc_limit == -1:
28 | yield from df.itertuples(index=False)
29 | else:
30 | yield from islice(df.itertuples(index=False), doc_limit)
31 |
32 |
33 | def do_counting(cat1: CAT, cat2: CAT,
34 | ann_diffs: PerAnnotationDifferences,
35 |                 doc_limit: int = -1) -> Tuple[ResultsTally, ResultsTally]:
36 | def cui2name(cat, cui):
37 | if cui in cat.cdb.cui2preferred_name:
38 | return cat.cdb.cui2preferred_name[cui]
39 | all_names = cat.cdb.cui2names[cui]
40 |         # longest name
41 | return sorted(all_names, key=lambda name: len(name), reverse=True)[0]
42 | res1 = ResultsTally(pt2ch=_get_pt2ch(cat1), cat_data=cat1.cdb.make_stats(),
43 | cui2name=partial(cui2name, cat1))
44 | res2 = ResultsTally(pt2ch=_get_pt2ch(cat2), cat_data=cat2.cdb.make_stats(),
45 | cui2name=partial(cui2name, cat2))
46 | total = doc_limit if doc_limit != -1 else None
47 | for per_doc in tqdm.tqdm(ann_diffs.per_doc_results.values(), total=total):
48 | res1.count(per_doc.raw1)
49 | res2.count(per_doc.raw2)
50 | return res1, res2
51 |
52 |
53 | def _get_pt2ch(cat: CAT) -> Optional[Dict]:
54 | return cat.cdb.addl_info.get("pt2ch", None)
55 |
56 |
57 | def get_per_annotation_diffs(cat1: CAT, cat2: CAT, documents: Iterator[Tuple[str, str]],
58 | show_progress: bool = True,
59 | keep_raw: bool = True,
60 | doc_limit: int = -1
61 | ) -> PerAnnotationDifferences:
62 | pt2ch1: Optional[Dict] = _get_pt2ch(cat1)
63 | pt2ch2: Optional[Dict] = _get_pt2ch(cat2)
64 | temp_file = tempfile.NamedTemporaryFile()
65 | save_opts = SaveOptions(use_db=True, db_file_name=temp_file.name,
66 | clean_callback=temp_file.close)
67 | pad = PerAnnotationDifferences(pt2ch1=pt2ch1, pt2ch2=pt2ch2,
68 | model1_cuis=set(cat1.cdb.cui2names),
69 | model2_cuis=set(cat2.cdb.cui2names),
70 | keep_raw=keep_raw,
71 | save_options=save_opts)
72 | total = doc_limit if doc_limit != -1 else None
73 | for doc_id, doc in tqdm.tqdm(documents, disable=not show_progress, total=total):
74 | pad.look_at_doc(cat1.get_entities(doc), cat2.get_entities(doc), doc_id, doc)
75 | pad.finalise()
76 | return pad
77 |
78 |
79 | def load_cui_filter(filter_file: str) -> Set[str]:
80 | with open(filter_file) as f:
81 | str_list = f.read().split(',')
82 | return set(item.strip() for item in str_list)
83 |
84 |
85 | def _add_all_children(cat: CAT, cui_filter: Set[str], include_children: int) -> None:
86 | if include_children <= 0:
87 | return
88 | if "pt2ch" not in cat.cdb.addl_info:
89 | return
90 | pt2ch = cat.cdb.addl_info["pt2ch"]
91 | children = set(ch for cui in cui_filter for ch in pt2ch.get(cui, []))
92 | if include_children > 1:
93 | _add_all_children(cat, children, include_children=include_children-1)
94 | cui_filter.update(children)
95 |
96 |
97 | def load_and_train(model_pack_path: str, mct_export_path: str) -> CAT:
98 | cat = CAT.load_model_pack(model_pack_path)
99 |     # NOTE: Allowing mct_export_path to contain a wildcard ("*").
100 | # And in such a case, iterating over all matching files
101 | if "*" not in mct_export_path:
102 | cat.train_supervised_from_json(mct_export_path)
103 | else:
104 | for file in glob.glob(mct_export_path):
105 | cat.train_supervised_from_json(file)
106 | return cat
107 |
108 |
109 | def get_diffs_for(model_pack_path_1: str,
110 | model_pack_path_2: str,
111 | documents_file: str,
112 | cui_filter: Optional[Union[Set[str], str]] = None,
113 | show_progress: bool = True,
114 | include_children_in_filter: Optional[int] = None,
115 | supervised_train_comparison_model: bool = False,
116 | keep_raw: bool = True,
117 | doc_limit: int = -1,
118 | ) -> Tuple[CDBCompareResults, ResultsTally, ResultsTally, PerAnnotationDifferences]:
119 | validate_input(model_pack_path_1, model_pack_path_2, documents_file, cui_filter, supervised_train_comparison_model)
120 | documents = load_documents(documents_file, doc_limit=doc_limit)
121 | if show_progress:
122 | print("Loading [1]", model_pack_path_1)
123 | cat1 = CAT.load_model_pack(model_pack_path_1)
124 | if show_progress:
125 | print("Loading [2]", model_pack_path_2)
126 | if not supervised_train_comparison_model:
127 | cat2 = CAT.load_model_pack(model_pack_path_2)
128 | else:
129 | if show_progress:
130 | print("Reloading model pack 1", model_pack_path_1)
131 | print("And subsequently training on", model_pack_path_2)
132 |             print("This may take a while, depending on the amount of "
133 |                   "data being trained on")
134 | cat2 = load_and_train(model_pack_path_1, model_pack_path_2)
135 | if show_progress:
136 | print("Per annotations diff finding")
137 | if cui_filter:
138 | if isinstance(cui_filter, str):
139 | cui_filter = load_cui_filter(cui_filter)
140 | if show_progress:
141 | print("Applying filter to CATs:", len(cui_filter), 'CUIs')
142 | if include_children_in_filter:
143 | if show_progress:
144 |                 print("Adding all children up to", include_children_in_filter,
145 |                       "levels deep (starting with the first model)")
146 | _add_all_children(cat1, cui_filter, include_children_in_filter)
147 | if show_progress:
148 |                 print("After adding children from the 1st model, the filter has a total of",
149 |                       len(cui_filter), "CUIs")
150 | _add_all_children(cat2, cui_filter, include_children_in_filter)
151 | if show_progress:
152 |                 print("After adding children from the 2nd model, the filter has a total of",
153 |                       len(cui_filter), "CUIs")
154 | cat1.config.linking.filters.cuis = cui_filter
155 | cat2.config.linking.filters.cuis = cui_filter
156 | ann_diffs = get_per_annotation_diffs(cat1, cat2, documents, keep_raw=keep_raw,
157 | doc_limit=doc_limit)
158 | if show_progress:
159 | print("Counting [1&2]")
160 | res1, res2 = do_counting(cat1, cat2, ann_diffs, doc_limit=doc_limit)
161 | if show_progress:
162 | print("CDB compare")
163 | cdb_diff = compare_cdbs(cat1.cdb, cat2.cdb)
164 | return cdb_diff, res1, res2, ann_diffs
165 |
166 |
167 | def main(mpn1: str, mpn2: str, documents_file: str):
168 | cdb_diff, res1, res2, ann_diffs = get_diffs_for(mpn1, mpn2, documents_file, show_progress=False)
169 | print("Results:")
170 | parse_and_show(cdb_diff, res1, res2, ann_diffs)
171 |
172 |
173 | if __name__ == "__main__":
174 | import sys
175 | main(*sys.argv[1:])
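176 | 
177 | # Example invocation (a sketch; the model pack paths and CSV name are placeholders,
178 | # and the documents file is a CSV with "id" and "text" columns):
179 | #   python compare.py <model_pack_1>.zip <model_pack_2>.zip documents.csv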
--------------------------------------------------------------------------------
/medcat/compare_models/compare_cdb.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Set, Tuple
2 |
3 | from medcat.cdb import CDB
4 |
5 | import tqdm
6 | from itertools import chain
7 |
8 | from pydantic import BaseModel
9 |
10 |
11 | class DictCompareKeys(BaseModel):
12 | """This is based on the keys."""
13 | total1: int
14 | """The total number of keys in 1st dict"""
15 | total2: int
16 | """The total number of keys in 2nd dict"""
17 | joint: int
18 |     """The number of keys present in both dicts (intersection)"""
19 | not_in_1: int
20 | """The number of keys in 2nd but not in 1st dict"""
21 | not_in_2: int
22 | """The number of keys in 1st but not in 2nd dict"""
23 |
24 | @classmethod
25 | def get(cls, d1: dict, d2: dict) -> "DictCompareKeys":
26 | # helpers
27 | all1 = set(d1)
28 | all2 = set(d2)
29 | # total keys
30 | total1 = len(all1)
31 | total2 = len(all2)
32 | # non-common keys
33 | joint = len(all1 & all2)
34 | all_combined = len(all1 | all2)
35 | not_in_1 = all_combined - total1
36 | not_in_2 = all_combined - total2
37 | return cls(total1=total1, total2=total2, joint=joint,
38 | not_in_1=not_in_1, not_in_2=not_in_2)
39 |
40 |
41 | class DictCompareValues(BaseModel):
42 | """This is based on the notion of the values being sets.
43 |
44 | With respect to the difference between `not_in_1` and `unique_in_2`:
45 | - If we have {"1": {"a", "b"}} and {"2": {"a", "b"}}
46 | - The values are identical overall (`unique_in_1==unique_in_2==0`)
47 | - However, the values are under different keys
48 | - So `not_in_1==not_in_2==2` (since this is per key)
49 | """
50 | total1: int
51 | """The total number of values in 1st dict"""
52 | total2: int
53 | """The total number of values in 2nd dict"""
54 | not_in_1: int
55 | """The number of values in 2nd, but not in 1st (per key)"""
56 | not_in_2: int
57 | """The number of values in 1st, but not in 2nd (per key)"""
58 | joint: int
59 | """Total number of values in both 1st and 2nd dict (overall)"""
60 | unique_in_1: int
61 |     """The number of unique values in 1st (overall)"""
62 | unique_in_2: int
63 | """The number of unique values in 2nd (overall)"""
64 |
65 | @classmethod
66 | def get(cls, d1: dict, d2: dict, progress: bool = True) -> "DictCompareValues":
67 | # helpers
68 | all_keys = set(d1) | set(d2)
69 | vals_in_1 = set(chain.from_iterable(d1.values()))
70 | vals_in_2 = set(chain.from_iterable(d2.values()))
71 | # total names
72 | total1 = sum(len(v) for v in d1.values())
73 | total2 = sum(len(v) for v in d2.values())
74 | # names ...
75 | not_in_1 = 0
76 | not_in_2 = 0
77 | for key in tqdm.tqdm(all_keys, desc="keys", disable=not progress):
78 | n1 = d1.get(key, set())
79 | n2 = d2.get(key, set())
80 | all_vals4key = len(n1 | n2)
81 | not_in_1 += all_vals4key - len(n1)
82 | not_in_2 += all_vals4key - len(n2)
83 | # names in common
84 | joint = len(vals_in_1 & vals_in_2)
85 | # names unique to one of the two
86 | vals_in_one_but_not_both = vals_in_1 ^ vals_in_2
87 | unique_in_1 = len(vals_in_one_but_not_both & vals_in_1)
88 | unique_in_2 = len(vals_in_one_but_not_both & vals_in_2)
89 | return cls(total1=total1, total2=total2, not_in_1=not_in_1,
90 | not_in_2=not_in_2, joint=joint,
91 | unique_in_1=unique_in_1, unique_in_2=unique_in_2)
92 |
93 |
94 | class DictComparisonResults(BaseModel):
95 | keys: DictCompareKeys
96 | values: DictCompareValues
97 |
98 | @classmethod
99 | def get(cls, d1: dict, d2: dict, progress: bool = True) -> "DictComparisonResults":
100 | return cls(keys=DictCompareKeys.get(d1, d2),
101 | values=DictCompareValues.get(d1, d2, progress=progress))
102 |
103 |
104 | class CDBCompareResults(BaseModel):
105 | names: DictComparisonResults
106 | snames: DictComparisonResults
107 |
108 |
109 | def compare(cdb1: CDB,
110 | cdb2: CDB,
111 | show_progress: bool = True) -> CDBCompareResults:
112 |     """Compare two concept databases (CDBs) via their cui2names and cui2snames dicts.
113 | 
114 |     Args:
115 |         cdb1 (CDB): The first CDB.
116 |         cdb2 (CDB): The second CDB.
117 |         show_progress (bool, optional): Whether to show a progress bar. Defaults to True.
118 | 
119 |     Returns:
120 |         CDBCompareResults: The key and value comparisons for the names and snames dicts.
121 |     """
122 | reg = DictComparisonResults.get(cdb1.cui2names, cdb2.cui2names, progress=show_progress)
123 | snames = DictComparisonResults.get(cdb1.cui2snames, cdb2.cui2snames, progress=show_progress)
124 | return CDBCompareResults(names=reg, snames=snames)
125 |
--------------------------------------------------------------------------------
/medcat/compare_models/data/demo-physio-mobility/cui_filter.csv:
--------------------------------------------------------------------------------
1 | 289001005,289004002,226207007,165224005,129043005,129040008,129041007,704440004,704439001,704437004,129065005,129039006,129035000,165232002,1080000000000000,716422006,45850009,284908004,129045003,129062008,714887007,714916007,719024002,715127003,714915006,282882001,302040002,302043000,165243005,365112008,165255004,105504002,301563003,301497008,160680006,301589003,165248001,165249009,362000000000000,270469004,160729004,248000000000000,160734000,405807003,160685001,285035003,285038001,285034004,428483004,1912002,431188001,864000000000000,301563003,1070000000000000,301497008,78459008,284915007,307439001,229798009,229799001,229797004,31000000000000,818000000000000,763692001,404934007,863721000000101,1100000017,1100000016,1100000030,1100000030,1100000011,863721000000101,863721000000101,863721000000101,282884000,282884000,282884000,1100000027,361721000000103,1100000028,361721000000103,1100000028,361721000000103,1100000027,361721000000103,361721000000103,1100000027,1100000028,1100000027,361721000000103,361721000000103,1100000028,1100000031,31031000119102,310131003,895488007,895488007,394923006,394923006,394923006,394923006,394923006,248171000000108,248171000000108,248171000000108,248171000000108,1100000015,1100000015,1100000012,1100000012,1100000013,1100000012,302046008,25711000087100,165233007,1100000029,718705001,718360006,282871009,895486006,895486006,895486006,895486006,699650006,699650006,8510008,273302005,306171006,257301003,404930003,404930003,404930003,224221006,261001000,184156005,184156005,183376001,154091000119106,301627005,301627005,725594005,445414007,165803005,323701000000101,72042002,24029004,282971008,10610811000001107,161903000,979501000000100,301477003,282966001,1149222004,371153006,311925007,225602000,763264000,249902000,249902000,223600005,386323002,37013008,205511000000108,325831000000100,1073861000000108,273469003,129032002,286489001,761481000000107,129072006,1073311000000100,286489001,286490005,129026007,160689007,286493007,129031009,1069991000000102,1071641000000109,960681000000109
--------------------------------------------------------------------------------
/medcat/compare_models/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/medcat/compare_models/tests/__init__.py
--------------------------------------------------------------------------------
/medcat/compare_models/tests/resources/docs/not_real.csv:
--------------------------------------------------------------------------------
1 | "id","text"
2 | "-1","Not real text. Just Virus and Virus Z"
3 | "-2","Really not real Virus text"
--------------------------------------------------------------------------------
/medcat/compare_models/tests/resources/mct_export/medcat_trainer_expoert2.json:
--------------------------------------------------------------------------------
1 | {"projects":
2 | [
3 | {
4 | "name": "SAMPLE FAKE PROJECT",
5 | "id": -2,
6 | "cuis": "",
7 | "tuis": "",
8 | "documents": [
9 | {
10 | "id": -2,
11 | "name": "FAKE-TEXT",
12 | "text": "FAKE TEXT WITH fake concepts, i.e Virus Z, and Virus.",
13 | "annotations": [
14 | {"id": -3, "user": "fake", "cui": "C0000139", "value": "gastroesophageal reflux", "start": 34, "end": 41, "validated": true, "correct": true, "deleted": false, "alternative": false, "killed": false, "last_modified": "2024-04-16 11:54:00.00000+00:00", "manually_created": false, "acc": 1.0, "meta_anns": []},
15 | {"id": -4, "user": "fake", "cui": "C0000039", "value": "hypertension", "start": 47, "end": 52, "validated": true, "correct": true, "deleted": false, "alternative": false, "killed": false, "last_modified": "2020-04-01 22:06:30.394941+00:00", "manually_created": false, "acc": 1.0, "meta_anns": []}
16 | ]
17 | }
18 | ]
19 | }
20 | ]
21 | }
--------------------------------------------------------------------------------
/medcat/compare_models/tests/resources/mct_export/medcat_trainer_export.json:
--------------------------------------------------------------------------------
1 | {"projects":
2 | [
3 | {
4 | "name": "SAMPLE FAKE PROJECT",
5 | "id": -1,
6 | "cuis": "",
7 | "tuis": "",
8 | "documents": [
9 | {
10 | "id": -1,
11 | "name": "FAKE-TEXT",
12 | "text": "FAKE TEXT WITH fake concepts, i.e Virus, and Virus Z.",
13 | "annotations": [
14 | {"id": -1, "user": "fake", "cui": "C0000039", "value": "gastroesophageal reflux", "start": 34, "end": 39, "validated": true, "correct": true, "deleted": false, "alternative": false, "killed": false, "last_modified": "2024-04-16 11:54:00.00000+00:00", "manually_created": false, "acc": 1.0, "meta_anns": []},
15 | {"id": -2, "user": "fake", "cui": "C0000139", "value": "hypertension", "start": 45, "end": 52, "validated": true, "correct": true, "deleted": false, "alternative": false, "killed": false, "last_modified": "2020-04-01 22:06:30.394941+00:00", "manually_created": false, "acc": 1.0, "meta_anns": []}
16 | ]
17 | }
18 | ]
19 | }
20 | ]
21 | }
--------------------------------------------------------------------------------
/medcat/compare_models/tests/resources/model_pack/cdb.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/medcat/compare_models/tests/resources/model_pack/cdb.dat
--------------------------------------------------------------------------------
/medcat/compare_models/tests/resources/model_pack/vocab.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/medcat/compare_models/tests/resources/model_pack/vocab.dat
--------------------------------------------------------------------------------
/medcat/compare_models/tests/test_compare.py:
--------------------------------------------------------------------------------
1 | import unittest.mock
2 | from compare import _add_all_children
3 | from compare import get_diffs_for
4 | from compare import (CDBCompareResults, ResultsTally,
5 | ResultsTally, PerAnnotationDifferences)
6 | import unittest
7 | import os
8 |
9 | from medcat.cat import CAT
10 |
11 |
12 | class FakeCDBWithPt2Ch:
13 |
14 | def __init__(self, pt2ch: dict) -> None:
15 | self.pt2ch = pt2ch
16 | self.addl_info = {"pt2ch": self.pt2ch}
17 |
18 |
19 | class FakeCATWithCDBAndPt2Ch:
20 |
21 | def __init__(self, pt2ch: dict) -> None:
22 | self.cdb = FakeCDBWithPt2Ch(pt2ch)
23 |
24 |
25 | _PT2CH = {
26 | "C1": ["C11", "C12", "C13"],
27 | "C2": ["C21"],
28 | # grandchildren
29 | "C11": ["C111", "C112", "C113"],
30 | "C13": ["C131", "C132"],
31 | # great grandchildren
32 | "C132": ["C1321", "C1322"],
33 | }
34 |
35 |
36 | class AddAllChildrenTests(unittest.TestCase):
37 | pt2ch = _PT2CH
38 | fake_cat = FakeCATWithCDBAndPt2Ch(pt2ch)
39 |
40 | _cui_filter = set(['C1', 'C2'])
41 | a = [c for c in pt2ch.get("", [])]
42 | children_1st_order = set(ch for cui in _cui_filter for ch in _PT2CH.get(cui, []))
43 | children_2nd_order = set(gch for ch in children_1st_order for gch in _PT2CH.get(ch, []))
44 |
45 | @property
46 | def cui_filter(self) -> set:
47 | return set(self._cui_filter)
48 |
49 | def test_adds_no_children_with_0(self):
50 | f = self.cui_filter # copy
51 | _add_all_children(self.fake_cat, f, include_children=0)
52 | self.assertEqual(f, self.cui_filter)
53 |
54 | def test_add_first_children_with_1(self):
55 | f = self.cui_filter
56 | _add_all_children(self.fake_cat, f, include_children=1)
57 | self.assertGreater(f, self.cui_filter)
58 | self.assertEqual(f, self.cui_filter | self.children_1st_order)
59 | # no grandchildren
60 | self.assertFalse(f & self.children_2nd_order)
61 |
62 | def test_add_grandchildren_with_2(self):
63 | f = self.cui_filter
64 | _add_all_children(self.fake_cat, f, include_children=2)
65 | self.assertGreater(f, self.cui_filter)
66 | self.assertGreater(f, self.cui_filter | self.children_1st_order)
67 | self.assertEqual(f, self.cui_filter | self.children_1st_order | self.children_2nd_order)
68 |
69 |
70 | class TrainAndCompareTests(unittest.TestCase):
71 | _file_dir = os.path.dirname(__file__)
72 | _resources_path = os.path.join(_file_dir, "resources")
73 | cat_path = os.path.join(_resources_path, "model_pack")
74 | mct_export_path_1 = os.path.join(_resources_path, "mct_export", "medcat_trainer_export.json")
75 | mct_export_path_glob = os.path.join(_resources_path, "mct_export", "medcat_trainer_export*.json")
76 | docs_file = os.path.join(_resources_path, "docs", "not_real.csv")
77 |
78 | # this tests that the training is called
79 | @classmethod
80 | @unittest.mock.patch("medcat.cat.CAT.train_supervised_from_json")
81 | def _get_diffs(cls, mct_export_path: str, method):
82 | diffs = get_diffs_for(cls.cat_path, mct_export_path, cls.docs_file,
83 | supervised_train_comparison_model=True)
84 | cls.assertTrue(cls, method.called)
85 | return diffs
86 |
87 |
88 | @classmethod
89 | def setUpClass(cls) -> None:
90 | ann_diffs1 = cls._get_diffs(cls.mct_export_path_1)
91 | cls.cdb_comp1, cls.tally1_1, cls.tally1_2, cls.ann_diffs1 = ann_diffs1
92 | ann_diffs_many = cls._get_diffs(cls.mct_export_path_glob)
93 | cls.cdb_comp_many, cls.tally_many_1, cls.tally_many_2, cls.ann_diffs_many = ann_diffs_many
94 |
95 | def test_compares_with_one_file(self):
96 | self.assertIsInstance(self.cdb_comp1, CDBCompareResults)
97 | self.assertIsInstance(self.tally1_1, ResultsTally)
98 | self.assertIsInstance(self.tally1_2, ResultsTally)
99 | self.assertIsInstance(self.ann_diffs1, PerAnnotationDifferences)
100 |
101 | def test_compares_with_multiple_file(self):
102 | self.assertIsInstance(self.cdb_comp_many, CDBCompareResults)
103 | self.assertIsInstance(self.tally_many_1, ResultsTally)
104 | self.assertIsInstance(self.tally_many_2, ResultsTally)
105 | self.assertIsInstance(self.ann_diffs_many, PerAnnotationDifferences)
106 |
--------------------------------------------------------------------------------
/medcat/compare_models/tests/test_compare_cdb.py:
--------------------------------------------------------------------------------
1 | import compare_cdb
2 |
3 | import unittest
4 | EXAMPLE1 = {
5 | "C0": {"n01", "n02", "n03"}, # 1 non-unique (#2 CS)
6 | "C1": {"n11", "n12" },
7 |
8 | "C3": {"n31", "n33"}, # adds 1 CUI, 2 names
9 |
10 | "C5": { "n53"}, # adds 1 CUI, 1 name
11 | }
12 | EXAMPLE2 = {
13 | "C0": {"n01", "n02", "n03"}, # 1 non-unique (CS)
14 | "C1": {"n11", "n12", "n13"}, # adds 1 name
15 | "C2": {"n21", "n23"}, # adds 1 CUI, 2 names
16 |
17 | "C4": {"n41", "n42", "n43"}, # adds 1 CUI, 3 names; 1 non-unique (CS)
18 |
19 | "CS": {"n01", "n42", }, # adds 1 CUI, no names
20 | }
21 | # this should be equivalent to EXPECTED_VALUES (computed further below)
22 | EXPECTED_VALUES_MAN = compare_cdb.DictCompareValues(total1=8,
23 | total2=13,
24 | not_in_1=8, # n13, n21, n23, n41, n42, n43, "n01", "n42"
25 | not_in_2=3, # n31, n33, n53
26 | joint=5, # n01, n02, n03, n11, n12
27 | unique_in_1=3, # overall unique in 1st
28 | unique_in_2=6, # overall unique in 2nd
29 | )
30 |
31 | keys1 = set(EXAMPLE1.keys())
32 | keys2 = set(EXAMPLE2.keys())
33 | EXPECTED_KEYS = compare_cdb.DictCompareKeys(total1=len(keys1),
34 | total2=len(keys2),
35 | joint=len(keys1 & keys2),
36 | not_in_1=(len(keys1 | keys2)) - len(keys1),
37 | not_in_2=(len(keys1 | keys2)) - len(keys2),)
38 | # this should be equivalent to the above
39 | EXPECTED_KEYS_MAN = compare_cdb.DictCompareKeys(total1=4, # C0, C1, C3, C5
40 | total2=5, # C0, C1, C2, C4, CS
41 | joint=2, # C0, C1
42 | not_in_1=3, # C2, C4, CS
43 | not_in_2=2, # C3, C5
44 | )
45 | vals1 = set(e for v in EXAMPLE1.values() for e in v)
46 | total1 = sum(len(v) for v in EXAMPLE1.values())
47 | vals2 = set(e for v in EXAMPLE2.values() for e in v)
48 | total2 = sum(len(v) for v in EXAMPLE2.values())
49 | EXPECTED_VALUES = compare_cdb.DictCompareValues(total1=total1,
50 | total2=total2,
51 | not_in_1=8, # the new/misplaced CUIs in 2nd
52 | not_in_2=3, # the new/misplaced CUIs in 1st
53 | joint=len(vals1 & vals2),
54 | unique_in_1=3, # overall unique in 1st
55 | unique_in_2=6, # overall unique in 2nd
56 | )
57 |
58 |
59 | class CompareDictTests(unittest.TestCase):
60 |
61 | def test_compare_keys_works(self, d1=EXAMPLE1, d2=EXAMPLE2, exp=EXPECTED_KEYS, exp_man=EXPECTED_KEYS_MAN):
62 | res = compare_cdb.DictCompareKeys.get(d1, d2)
63 | self.assertEqual(res.dict(), exp.dict())
64 | self.assertEqual(res.dict(), exp_man.dict())
65 |
66 | def test_compare_values_works(self, d1=EXAMPLE1, d2=EXAMPLE2, exp=EXPECTED_VALUES, exp_man=EXPECTED_VALUES_MAN):
67 | res = compare_cdb.DictCompareValues.get(d1, d2, progress=False)
68 | self.assertEqual(res.dict(), exp.dict())
69 | self.assertEqual(res.dict(), exp_man.dict())
70 |
71 |
--------------------------------------------------------------------------------
/medcat/compare_models/tests/test_output.py:
--------------------------------------------------------------------------------
1 | import output
2 |
3 | import contextlib
4 | import io
5 | import sys
6 |
7 | import unittest
8 |
9 |
10 | @contextlib.contextmanager
11 | def nostdout():
12 | save_stdout = sys.stdout
13 | sys.stdout = io.StringIO()
14 | yield
15 | sys.stdout = save_stdout
16 |
17 |
18 | class CompareDictTests(unittest.TestCase):
19 | example_dict = {"k1": "v1",
20 | "k2": "v2",
21 | "k3": {"sk1": 1.0}}
22 | example_dict2 = {'pretty_name': 'Genus Quercus',
23 | 'cui': '53347009',
24 | 'type_ids': ['81102976'],
25 | 'types': [''],
26 | 'source_value': 'Oak',
27 | 'detected_name': 'oak',
28 | 'acc': 0.6368384509248382,
29 | 'context_similarity': 0.6368384509248382,
30 | 'start': 43,
31 | 'end': 46,
32 | 'icd10': [],
33 | 'ontologies':
34 | ['20220803_SNOMED_UK_CLINICAL_EXT'],
35 | 'snomed': [],
36 | 'id': 3,
37 | 'meta_anns': {
38 | 'Presence': {'value': 'True', 'confidence': 0.999996542930603, 'name': 'Presence'},
39 | 'Subject': {'value': 'Patient', 'confidence': 0.9396798014640808, 'name': 'Subject'},
40 | 'Time': {'value': 'Recent', 'confidence': 0.9999940395355225, 'name': 'Time'}
41 | }
42 | }
43 | expected_nulled_dict2 = {'pretty_name': '',
44 | 'cui': '',
45 | 'type_ids': '',
46 | 'types': '',
47 | 'source_value': '',
48 | 'detected_name': '',
49 | 'acc': '',
50 | 'context_similarity': '',
51 | 'start': '',
52 | 'end': '',
53 | 'icd10': '',
54 | 'ontologies': '',
55 | 'snomed': '',
56 | 'id': '',
57 | 'meta_anns': {}
58 | }
59 |
60 | def setUp(self) -> None:
61 | self.nulled = output._get_nulled_copy(self.example_dict)
62 | self.nulled2 = output._get_nulled_copy(self.example_dict2)
63 |
64 | def test_compare_dicts_works_1st_None(self):
65 | with nostdout():
66 | output.compare_dicts(None, self.example_dict)
67 |
68 | def test_compare_dicts_works_2nd_None(self):
69 | with nostdout():
70 | output.compare_dicts(self.example_dict, None)
71 |
72 | def test_expected_nulled_real(self):
73 | self.assertEqual(self.nulled2, self.expected_nulled_dict2)
74 |
75 | def test_compare_dicts_1st_only_real(self):
76 | with nostdout():
77 | output.compare_dicts(self.example_dict2, None)
78 |
--------------------------------------------------------------------------------
/medcat/compare_models/validation.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Union, Set
2 | import os
3 | import glob
4 |
5 |
6 | def _is_mct_export(file_path: str) -> bool:
7 | if "*" in file_path:
8 | nr_of_matching_files = len(list(glob.iglob(file_path)))
9 | print("GLOB w", nr_of_matching_files, nr_of_matching_files > 0)
10 | return nr_of_matching_files > 0
11 |     print("MCT EXPORT (no-glob)?", os.path.exists(file_path), file_path.endswith(".json"))
12 | return os.path.exists(file_path) and file_path.endswith(".json")
13 |
14 |
15 | def validate_input(model_path1: str, model_path2: str, documents_file: str,
16 | cui_filter: Optional[Union[Set[str], str]],
17 |                    supervised_train_comp: bool):
18 | if not os.path.exists(model_path1):
19 | raise ValueError(f"No model found at specified path (1st model): {model_path1}")
20 | if not is_medcat_model(model_path1):
21 | raise ValueError(f"Not a medcat model: {model_path1}")
22 | if not os.path.exists(model_path2):
23 |         if supervised_train_comp and not _is_mct_export(model_path2):
24 | raise ValueError(f"No matching MCT export found for: {model_path2}")
25 |         elif not supervised_train_comp:
26 | raise ValueError(f"No file found at specified path (2nd model): {model_path2}")
27 |     if supervised_train_comp:
28 | if not os.path.isfile(model_path2) and not _is_mct_export(model_path2):
29 | raise ValueError(f"MCT export provided should be a file not a folder: {model_path2}")
30 | if not model_path2.lower().endswith(".json"):
31 | raise ValueError(f"MCT export expected in .json format, Got: {model_path2}")
32 | elif not is_medcat_model(model_path2):
33 | raise ValueError(f"Not a medcat model: {model_path2}")
34 | if cui_filter is not None:
35 | if isinstance(cui_filter, str):
36 | if not os.path.exists(cui_filter):
37 | raise ValueError(f"File passed as CUI filter does not exist: {cui_filter}")
38 | if not os.path.exists(documents_file):
39 | raise ValueError(f"No documents file found: {documents_file}")
40 | if not documents_file.lower().endswith(".csv"):
41 |         raise ValueError(f"Expected a .csv file for documents, got: {documents_file}")
42 |
43 |
44 | def _is_medcat_model_folder(model_folder: str):
45 | # needs to have CDB and vocab
46 | cdb_path = os.path.join(model_folder, 'cdb.dat')
47 | vocab_path = os.path.join(model_folder, "vocab.dat")
48 | return ((os.path.exists(cdb_path) and os.path.isfile(cdb_path)) and
49 | (os.path.exists(vocab_path) and os.path.isfile(vocab_path)))
50 |
51 |
52 | def is_medcat_model(model_path: str) -> bool:
53 | if os.path.isdir(model_path):
54 | return _is_medcat_model_folder(model_path)
55 | model_folder = model_path[:-len(".zip")]
56 | if os.path.exists(model_folder):
57 | # NOTE: if the model folder doesn't exist, it will
58 | # be extracted upon loading the model
59 | return _is_medcat_model_folder(model_folder)
60 | # NOTE: this does not actually guarantee that it's a model pack
61 | # but it would be outside the scope of this method
62 | # to try and extract or list the contents
63 | return model_path.endswith(".zip")
64 |
--------------------------------------------------------------------------------
/medcat/evaluate_mct_export/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/medcat/evaluate_mct_export/__init__.py
--------------------------------------------------------------------------------
/medcat/evaluate_mct_export/mct_export_summary.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Evaluate a MedCATtrainer project export\n",
9 | "\n",
10 |     "Replace every <<>> placeholder with the relevant file name."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "from mct_analysis import MedcatTrainer_export"
20 | ]
21 | },
22 | {
23 | "attachments": {},
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "## Load MCT exports and MedCAT model"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {
34 | "scrolled": true
35 | },
36 | "outputs": [],
37 | "source": [
38 | "lst_mct_export=[\n",
39 | " '../../data/medcattrainer_export/<<>>', # mct_export .json here\n",
40 | " ] \n",
41 | "\n",
42 | "mct_model = \"../../models/modelpack/<<>>\" # Enter your medcat model here\n",
43 | "\n",
44 | "\n",
45 | "mct = MedcatTrainer_export(mct_export_paths=lst_mct_export, model_pack_path= mct_model)\n",
46 |     "# You can jump straight to the Generate report section. The following code is a breakdown of the intermediate steps"
47 | ]
48 | },
49 | {
50 | "attachments": {},
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "# Evaluate model card"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "scrolled": true
62 | },
63 | "outputs": [],
64 | "source": [
65 | "# Load the model card\n",
66 | "mct.cat.get_model_card(as_dict=True)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "# look to potentially remove any filters that exist in the model\n",
76 | "\"\"\"\n",
77 | "mct.cat.config.linking['filters']\n",
78 | "\"\"\""
79 | ]
80 | },
81 | {
82 | "attachments": {},
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "# Evaluate MCT export"
87 | ]
88 | },
89 | {
90 | "attachments": {},
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "### View all Annotations and Meta-annotations created"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {
101 | "scrolled": false
102 | },
103 | "outputs": [],
104 | "source": [
105 | "# Load all annotations created\n",
106 | "anns_df = mct.annotation_df()\n",
107 | "anns_df"
108 | ]
109 | },
110 | {
111 | "attachments": {},
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "### Summarise all Meta-annotations"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "# Meta_annotation summary\n",
125 | "for col in anns_df.loc[:,'acc':].iloc[:,1:]:\n",
126 | " print(anns_df[col].value_counts())\n",
127 | " print()"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "# Meta_annotation summary of combinations\n",
137 | "for k,v in anns_df.loc[:,'acc':].iloc[:,1:].value_counts().items():\n",
138 | " print(k,v)"
139 | ]
140 | },
141 | {
142 | "attachments": {},
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "### Overview of the entire MCT export\n",
147 | "This includes all names of all projects within the export and the document ids."
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {
154 | "scrolled": true
155 | },
156 | "outputs": [],
157 | "source": [
158 | "# projects\n",
159 | "anns_df['project'].unique()"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "# documents\n",
169 | "anns_df['document_name'].unique()"
170 | ]
171 | },
172 | {
173 | "attachments": {},
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "# Annotation Summary"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "performance_summary_df = mct.concept_summary()"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {
193 | "scrolled": true
194 | },
195 | "outputs": [],
196 | "source": [
197 | "performance_summary_df"
198 | ]
199 | },
200 | {
201 | "attachments": {},
202 | "cell_type": "markdown",
203 | "metadata": {},
204 | "source": [
205 | "# Annotator stats"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "# User Stats\n",
215 | "mct.user_stats()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "mct.plot_user_stats(save_fig=True, save_fig_filename='<<>>.html')"
225 | ]
226 | },
227 | {
228 | "attachments": {},
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "# Generate report\n",
233 |     "All of the above functions combined into a single Excel file report"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "# Example of function description and parameters\n",
243 | "help(mct.generate_report)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {
250 | "scrolled": true
251 | },
252 | "outputs": [],
253 | "source": [
254 | "mct.generate_report(path='<<>>.xlsx')"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": []
263 | },
264 | {
265 | "attachments": {},
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "# Meta Annotations\n",
270 | "\n",
271 |     "Helper function to rename meta annotation tasks and their values.\n",
272 | "\n",
273 | "__TODO:__ This Section is incomplete"
274 | ]
275 | },
276 | {
277 | "attachments": {},
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "### Rename meta annotation tasks"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "# select which meta tasks to rename\n",
291 | "rename_meta_anns = {'Subject/Experiencer':'Subject'}\n",
292 | "# select which meta values for the corresponding meta tasks.\n",
293 | "rename_meta_anns_values = {'Subject':{'Relative':'Other'}}\n",
294 | "# run the renaming\n",
295 | "mct.rename_meta_anns(meta_anns2rename=rename_meta_anns, meta_ann_values2rename=rename_meta_anns_values)"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "anns_df = mct.annotation_df()\n",
305 | "anns_df.head()"
306 | ]
307 | }
308 | ],
309 | "metadata": {
310 | "kernelspec": {
311 | "display_name": "Python 3 (ipykernel)",
312 | "language": "python",
313 | "name": "python3"
314 | },
315 | "language_info": {
316 | "codemirror_mode": {
317 | "name": "ipython",
318 | "version": 3
319 | },
320 | "file_extension": ".py",
321 | "mimetype": "text/x-python",
322 | "name": "python",
323 | "nbconvert_exporter": "python",
324 | "pygments_lexer": "ipython3",
325 | "version": "3.7.3"
326 | }
327 | },
328 | "nbformat": 4,
329 | "nbformat_minor": 5
330 | }
331 |
--------------------------------------------------------------------------------
/models/ReadMe.md:
--------------------------------------------------------------------------------
1 | # Space to store all components of a MedCAT model.
2 |
3 | MedCAT modelpacks generally comprise 3 components:
4 | 1) CDB
5 | 2) Vocab
6 | 3) Config
7 |
8 | Other components include preprocessing tools, such as the spaCy package.
9 |
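10 | Once all three components are in place, the resulting modelpack can be loaded and used in a single step. Below is a minimal sketch, assuming a packaged modelpack archive has been placed under `modelpack/` (the file name is a placeholder):
11 | 
12 | ```
13 | from medcat.cat import CAT
14 | 
15 | # Load a packaged MedCAT model (CDB, vocab and config bundled together)
16 | cat = CAT.load_model_pack("modelpack/<your_modelpack>.zip")
17 | 
18 | # Annotate a short piece of text to confirm the model loads correctly
19 | print(cat.get_entities("Patient has a history of hypertension."))
20 | ```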
--------------------------------------------------------------------------------
/models/cdb/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/models/cdb/.keep
--------------------------------------------------------------------------------
/models/modelpack/ReadMe.md:
--------------------------------------------------------------------------------
1 | All MedCAT modelpacks should be placed here.
2 |
3 | To create a modelpack please see [create_modelpack.py](/medcat/1_create_model/create_modelpack/create_modelpack.py) for further details.
4 |
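5 | As a quick orientation, here is a minimal sketch of what that script's helper does (the argument order mirrors how `load_cdb_and_save_modelpack` is called in this repository's tests; all paths are placeholders):
6 | 
7 | ```
8 | import create_modelpack  # from medcat/1_create_model/create_modelpack
9 | 
10 | # Combine an existing CDB and vocab into a named modelpack
11 | model_pack_name = create_modelpack.load_cdb_and_save_modelpack(
12 |     "../cdb/cdb.dat",      # path to the CDB
13 |     "my_modelpack",        # modelpack name
14 |     ".",                   # output folder (e.g. models/modelpack)
15 |     "../vocab/vocab.dat")  # path to the vocab
16 | ```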
--------------------------------------------------------------------------------
/models/vocab/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/models/vocab/.keep
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | # Global options:
2 |
3 | [mypy]
4 | ignore_missing_imports = True
5 | allow_redefinition = True
6 |
--------------------------------------------------------------------------------
/projects/ReadMe.md:
--------------------------------------------------------------------------------
1 | # Projects
2 |
3 | This directory is a placeholder where project workflows are organised and all information relevant to a particular use case is stored in a single location.
4 |
5 | The [demo project structure](./demo_project_stucture) is a template which you can copy and adapt to meet the requirements of your use case.
6 |
7 | ```
8 | $ cp -r projects/demo_project_stucture projects/<your_project_name>
9 | ```
10 |
11 | The majority of this information is also held throughout other sections of this repository; this section simply offers an alternative workflow which keeps all relevant data and files pertaining to a project together.
12 |
13 | The folder names should correspond to the project and project ID for easy reference.
14 |
15 | ## Standardise Workflows (Optional)
16 | The following are guidelines/recommendations to standardise workflows:
17 | 
18 | - Good practice is to name files with the following structure: *YYYYMMDD_filename*
19 |
20 |
21 |
22 | This working directory should be used to store temporary data files, with the final scripts (main.py and other analysis scripts) held directly in the project folder outside of the sub-folders. Any raw or intermediate data that one may want to reference later should be stored in its respective directory.
23 |
24 | A recommended format for the directory structure to efficiently manage each request is as follows:
25 | * Ideally the project_name should correspond to your CogStack request ID.
26 |
27 |
28 | ```
29 | ── project_name/
30 | ├── raw_data/ # raw data files
31 | │ └── cogstack_search_hits/ # search results
32 | ├── processed_data/ # intermediate reference files
33 | │ └── ann_folder_path/ # annotated documents
34 | ├── results/ # final results
35 | ├── 1_search.ipynb # search scripts
36 | ├── 2_run_model.ipynb # run model
37 | ├── 3_pipeline.ipynb # convert annotation to output pipeline
38 | ├── 4_evaluation.ipynb # evaluation of the output compared to a gold standard dataset
39 | ```
40 |
41 |
42 | __[raw_data/]__: Contains the original, or raw, data files. Contents in this folder should be treated as read-only.
43 |
44 | __[raw_data/cogstack_search_hits/]__: Contains the search results from CogStack. Once retrieved from CogStack, this dataset is static.
45 |
46 | __[processed_data/]__: Contains manipulated or partially processed files.
47 | 
48 | __[processed_data/ann_folder_path/]__: All direct annotation output from a MedCAT model should be stored here. Acts as a checkpoint from which analysis can be conducted.
49 |
50 | __[results/]__: Contains the final results and ideally explanatory markdown files.
51 |
52 |
--------------------------------------------------------------------------------
/projects/demo_project_stucture/ReadMe.md:
--------------------------------------------------------------------------------
1 | # \
2 |
3 | ## \
4 |
5 | Feel free to edit the meta data of the contents
6 | and write information about the project and objectives here.
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | mypy
2 | pandas-stubs
3 | types-tqdm
4 | types-requests
5 | types-regex
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | spacy>=3.6.0,<4.0
2 | medcat~=1.16.0
3 | plotly~=5.19.0
4 | eland==8.12.1
5 | en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl
6 | ipyfilechooser
7 | jupyter_contrib_nbextensions
8 |
--------------------------------------------------------------------------------
/search/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb filter=strip-notebook-output
2 |
--------------------------------------------------------------------------------
/search/ReadMe.md:
--------------------------------------------------------------------------------
1 |
2 | # Login and search
3 | This directory contains all the scripts necessary to log in and conduct a search.
4 |
5 | ## Login details
6 | 1. Create a [credentials.py](../credentials.py) file
7 | 2. Populate it with your CogStack instance and login details
8 | An example template can be seen below:
9 | ```
10 | hosts = [] # This is a list of your cogstack elasticsearch instances.
11 |
12 | # These are your login details (either via http_auth or API)
13 | username = None
14 | password = None
15 | ```
16 |
17 | __Note__: If these fields are left blank then the user will be prompted to enter the details themselves.
18 |
19 | If you are unsure about the above information please contact your CogStack system administrator.
20 |
21 | ## How to build a Search query
22 |
23 | A core component of CogStack is Elasticsearch, a search engine built on top of Apache Lucene.
24 |
25 | Lucene has a custom query syntax for querying its indexes (Lucene Query Syntax). This syntax supports features such as keyword matching, wildcard matching, regular expressions, proximity matching, and range searches.
26 |
27 | Full documentation for this syntax is available as part of Elasticsearch [query string syntax](https://www.elastic.co/guide/en/elasticsearch/reference/8.5/query-dsl-query-string-query.html#query-string-syntax).
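28 | 
29 | As a rough illustration of the syntax (the field names `body` and `age` below are placeholders and will differ between CogStack deployments):
30 | 
31 | - Keyword / phrase matching: `body:"myocardial infarction"`
32 | - Wildcard matching: `body:diabet*`
33 | - Regular expression: `body:/hyperten(sion|sive)/`
34 | - Proximity matching: `"chest pain"~5`
35 | - Range search: `age:[50 TO 70]`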
--------------------------------------------------------------------------------
/search/search_template.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "# Searching CogStack\n",
8 |     "\n",
9 |     "This script is designed to be a template for CogStack searches"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import sys\n",
19 | "sys.path.append('..')\n",
20 | "from credentials import *\n",
21 | "from cogstack import CogStack"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "# Login and Initialise"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "cs = CogStack(hosts, username=username, password=password, api=True)"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "# Check the list of Indices and columns"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "for i in cs.elastic.indices.get_mapping().keys():\n",
54 | " print(i)"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "# Check the list of columns in that index\n",
64 | "index = ''\n",
65 | "for col in cs.elastic.indices.get_mapping(index=index)[index]['mappings']['properties'].keys():\n",
66 | " print(col)"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "# Set parameters"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "pt_list = [] # example list of patients' patient_TrustNumber here"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "## Columns of interest\n",
90 | "\n",
91 | "Select your fields and list in order of output columns"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "columns = []"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "## Build query\n",
108 | "\n",
109 | "For further information on [how to build a query can be found here](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)\n",
110 | "\n",
111 | "Further information on [free text string queries can be found here](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html)\n"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "# Example query structure\n",
121 | "query = {\n",
122 | " \"from\": 0,\n",
123 | " \"size\": 10000,\n",
124 | " \"query\": {\n",
125 | " \"bool\": {\n",
126 | " \"filter\": {\n",
127 | " \"terms\": {\"patient_TrustNumber\": pt_list}\n",
128 | " },\n",
129 | " \"must\": [\n",
130 | " {\"query_string\": {\n",
131 | " \"query\": \"***YOUR LUCENE QUERY HERE***\"}\n",
132 | " }\n",
133 | " ]\n",
134 | " }\n",
135 | " },\n",
136 | " \"_source\": columns # This is a search column filter. remove if all columns are to be retrieved\n",
137 | "}"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {
143 | "tags": []
144 | },
145 | "source": [
146 | "# Search, Process, and Save"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "df = cs.cogstack2df(query=query, index=index, column_headers=columns)"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "## Process"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "# Whatever you want here\n",
172 | "df.head()"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "## Save"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "path_to_results = \"../data/cogstack_search_results\"\n",
189 | "file_name = \"file_name.csv\""
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": [
198 | "df.to_csv(path_to_results + '/' +file_name, index=False)"
199 | ]
200 | }
201 | ],
202 | "metadata": {
203 | "kernelspec": {
204 | "display_name": "Python 3",
205 | "language": "python",
206 | "name": "python3"
207 | },
208 | "language_info": {
209 | "codemirror_mode": {
210 | "name": "ipython",
211 | "version": 3
212 | },
213 | "file_extension": ".py",
214 | "mimetype": "text/x-python",
215 | "name": "python",
216 | "nbconvert_exporter": "python",
217 | "pygments_lexer": "ipython3",
218 | "version": "3.9.6 (default, Sep 26 2022, 11:37:49) \n[Clang 14.0.0 (clang-1400.0.29.202)]"
219 | },
220 | "vscode": {
221 | "interpreter": {
222 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
223 | }
224 | }
225 | },
226 | "nbformat": 4,
227 | "nbformat_minor": 4
228 | }
229 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/__init__.py
--------------------------------------------------------------------------------
/tests/medcat/1_create_model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/1_create_model/__init__.py
--------------------------------------------------------------------------------
/tests/medcat/1_create_model/create_cdb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/1_create_model/create_cdb/__init__.py
--------------------------------------------------------------------------------
/tests/medcat/1_create_model/create_cdb/test_create_cdb.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import medcat.cdb
4 |
5 | _FILE_DIR = os.path.dirname(__file__)
6 |
7 | # because this project isn't (at least as of writing this)
8 | # set up as a python project, there are no __init__.py
9 | # files in each folder
10 | # as such, in order to gain access to the relevant module,
11 | # I'll need to add the path manually
12 | _WWC_BASE_FOLDER = os.path.join(_FILE_DIR, "..", "..", "..", "..")
13 | MEDCAT_EVAL_MCT_EXPORT_FOLDER = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "medcat", "1_create_model", "create_cdb"))
14 | sys.path.append(MEDCAT_EVAL_MCT_EXPORT_FOLDER)
15 | # now we are able to import create_cdb and/or create_umls_cdb
16 |
17 | import unittest
18 | from unittest.mock import patch
19 |
20 | # SNOMED pre-cdb csv
21 | PRE_CDB_CSV_PATH_SNOMED = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources", "example_cdb_input_snomed.csv")
22 | PRE_CDB_CSV_PATH_UMLS = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources", "example_cdb_input_umls.csv")
23 |
24 |
25 | def get_mock_input(output: str):
26 | def mock_input(prompt: str):
27 | return output
28 | return mock_input
29 |
30 |
31 | class CreateCDBTest(unittest.TestCase):
32 |
33 | def setUp(self) -> None:
34 | self.output_cdb = None
35 |
36 | def tearDown(self) -> None:
37 | if self.output_cdb is not None and os.path.exists(self.output_cdb):
38 | os.remove(self.output_cdb)
39 |
40 | def assertHasCDB(self, path: str):
41 | self.assertTrue(os.path.exists(path))
42 | self.assertTrue(path.endswith(".dat"))
43 | cdb = medcat.cdb.CDB.load(path)
44 | self.assertIsInstance(cdb, medcat.cdb.CDB)
45 |
46 | def test_snomed_cdb_creation(self):
47 | # Replace the 'input' function with 'mock_input'
48 | with patch('builtins.input', side_effect=get_mock_input(PRE_CDB_CSV_PATH_SNOMED)):
49 | import create_cdb
50 | self.output_cdb = create_cdb.output_cdb
51 | self.assertHasCDB(self.output_cdb)
52 |
53 | def test_umls_cdb_creation(self):
54 | with patch('builtins.input', side_effect=get_mock_input(PRE_CDB_CSV_PATH_UMLS)):
55 | import create_umls_cdb
56 | self.output_cdb = create_umls_cdb.output_cdb
57 | self.assertHasCDB(self.output_cdb)
58 |
--------------------------------------------------------------------------------
/tests/medcat/1_create_model/create_modelpack/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/1_create_model/create_modelpack/__init__.py
--------------------------------------------------------------------------------
/tests/medcat/1_create_model/create_modelpack/test_create_modelpack.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import unittest
5 | # import unittest.mock
6 |
7 | import tempfile
8 |
9 |
10 | # relative to file path
11 | _FILE_DIR = os.path.dirname(__file__)
12 | # because this project isn't (at least as of writing this)
13 | # set up as a python project, there are no __init__.py
14 | # files in each folder
15 | # as such, in order to gain access to the relevant module,
16 | # I'll need to add the path manually
17 | _WWC_BASE_FOLDER = os.path.join(_FILE_DIR, "..", "..", "..", "..")
18 | MEDCAT_CREATE_MODELPACK_FOLDER = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "medcat", "1_create_model", "create_modelpack"))
19 | sys.path.append(MEDCAT_CREATE_MODELPACK_FOLDER)
20 | # now we are able to import create_modelpack
21 |
22 | import create_modelpack
23 |
24 | RESOURCES_FOLDER = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources")
25 | DEFAULT_CDB_PATH = os.path.join(RESOURCES_FOLDER, "cdb.dat")
26 | DEFAULT_VOCAB_PATH = os.path.join(RESOURCES_FOLDER, "vocab.dat")
27 |
28 |
29 | class CreateModelPackTests(unittest.TestCase):
30 |
31 | @classmethod
32 | def setUpClass(cls):
33 | cls.tempfolder = tempfile.TemporaryDirectory()
34 | cls.model_pack_name = "TEMP_MODEL_PACK"
35 | cls.partial_model_pack_path = os.path.join(cls.tempfolder.name, cls.model_pack_name)
36 |
37 | @classmethod
38 | def tearDownClass(cls):
39 | cls.tempfolder.cleanup()
40 |
41 | def test_a(self):
42 | model_pack_name = create_modelpack.load_cdb_and_save_modelpack(
43 | DEFAULT_CDB_PATH, self.model_pack_name,
44 | self.tempfolder.name, DEFAULT_VOCAB_PATH)
45 | self.assertTrue(model_pack_name.startswith(self.model_pack_name))
46 | model_pack_path = os.path.join(self.tempfolder.name, model_pack_name)
47 | self.assertTrue(os.path.exists(model_pack_path))
48 |
--------------------------------------------------------------------------------
/tests/medcat/1_create_model/create_vocab/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/1_create_model/create_vocab/__init__.py
--------------------------------------------------------------------------------
/tests/medcat/1_create_model/create_vocab/test_create_vocab.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import medcat.vocab
5 |
6 | _FILE_DIR = os.path.dirname(__file__)
7 |
8 | # because this project isn't (at least as of writing this)
9 | # set up as a python project, there are no __init__.py
10 | # files in each folder
11 | # as such, in order to gain access to the relevant module,
12 | # I'll need to add the path manually
13 | _WWC_BASE_FOLDER = os.path.join(_FILE_DIR, "..", "..", "..", "..")
14 | MEDCAT_EVAL_MCT_EXPORT_FOLDER = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "medcat", "1_create_model", "create_vocab"))
15 | sys.path.append(MEDCAT_EVAL_MCT_EXPORT_FOLDER)
16 | # now we are able to import create_vocab
17 |
18 | import unittest
19 | from unittest.mock import patch, mock_open
20 |
21 |
22 | VOCAB_INPUT_PATH = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "models", "vocab", "vocab_data.txt"))
23 | VOCAB_OUTPUT_PATH = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "models", "vocab", "vocab.dat"))
24 | VOCAB_INPUT = [
25 |     "house 34444 0.3232 0.123213 1.231231",
26 | "dog 14444 0.76762 0.76767 1.45454"
27 | ]
28 |
29 | orig_open = open
30 |
31 |
32 | def custom_open(file, mode="r", *args, **kwargs):
33 | if 'r' in mode:
34 | return mock_open(read_data="\n".join(VOCAB_INPUT))(file, mode, *args, **kwargs)
35 | return orig_open(file, mode, *args, **kwargs)
36 |
37 |
38 | class CreateVocabTest(unittest.TestCase):
39 | temp_vocab_path = "temp_vocab_for_test_create_vocab"
40 |
41 | def setUp(self) -> None:
42 | if os.path.exists(VOCAB_OUTPUT_PATH):
43 | os.rename(VOCAB_OUTPUT_PATH, self.temp_vocab_path)
44 | self.moved = True
45 | else:
46 | self.moved = False
47 |
48 | def tearDown(self) -> None:
49 | if os.path.exists(VOCAB_OUTPUT_PATH):
50 | os.remove(VOCAB_OUTPUT_PATH)
51 | if self.moved:
52 | os.rename(self.temp_vocab_path, VOCAB_OUTPUT_PATH)
53 |
54 | def test_creating_vocab(self):
55 | with patch('builtins.open', side_effect=custom_open):
56 | import create_vocab
57 | vocab_path = os.path.join(create_vocab.vocab_dir, "vocab.dat")
58 | self.assertEqual(os.path.abspath(vocab_path), VOCAB_OUTPUT_PATH)
59 | self.assertTrue(os.path.exists(vocab_path))
60 | vocab = medcat.vocab.Vocab.load(vocab_path)
61 | self.assertIsInstance(vocab, medcat.vocab.Vocab)
62 |
--------------------------------------------------------------------------------
/tests/medcat/2_train_model/1_unsupervised_training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/2_train_model/1_unsupervised_training/__init__.py
--------------------------------------------------------------------------------
/tests/medcat/2_train_model/1_unsupervised_training/test_splitter.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import tempfile
4 | import os
5 | import sys
6 | import pandas as pd
7 |
8 | _FILE_DIR = os.path.dirname(__file__)
9 |
10 | # because this project isn't (at least as of writing this)
11 | # set up as a python project, there are no __init__.py
12 | # files in each folder
13 | # as such, in order to gain access to the relevant module,
14 | # I'll need to add the path manually
15 | _WWC_BASE_FOLDER = os.path.join(_FILE_DIR, "..", "..", "..", "..")
16 | MEDCAT_EVAL_MCT_EXPORT_FOLDER = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "medcat", "2_train_model", "1_unsupervised_training"))
17 | sys.path.append(MEDCAT_EVAL_MCT_EXPORT_FOLDER)
18 | # now we are able to import splitter
19 |
20 | import splitter
21 |
22 | FILE_TO_SPLIT = os.path.join(_WWC_BASE_FOLDER, "tests", "medcat", "resources", "example_file_to_split.csv")
23 | NR_OF_LINES_IN_FILE = 125
24 | NR_OF_COLUMNS_IN_FILE = 20
25 |
26 |
27 | class SplitFileTests(unittest.TestCase):
28 | # lines per file - we want 4 rows, on average
29 | nr_of_lines = 4 * NR_OF_LINES_IN_FILE // NR_OF_COLUMNS_IN_FILE
30 |     # NOTE: if the total number of lines is not a multiple of the lines per file,
31 |     # the expected number of files needs to be one greater
32 | files_expected = NR_OF_LINES_IN_FILE // nr_of_lines
33 |
34 | @classmethod
35 | def setUpClass(cls):
36 | cls.temp_folder = tempfile.TemporaryDirectory()
37 | cls.save_format = os.path.join(cls.temp_folder.name, "split_%03d.csv")
38 | # do the splitting
39 | splitter.split_file(FILE_TO_SPLIT, cls.nr_of_lines, cls.save_format)
40 |
41 | @classmethod
42 | def tearDownClass(cls):
43 | cls.temp_folder.cleanup()
44 |
45 | def test_has_correct_number_of_files(self):
46 | files = list(os.listdir(self.temp_folder.name))
47 | found = len(files)
48 | self.assertEqual(found, self.files_expected)
49 |
50 | def test_contains_same_content(self):
51 | df_orig = pd.read_csv(FILE_TO_SPLIT)
52 | file_names = [os.path.join(self.temp_folder.name, fn) for fn in os.listdir(self.temp_folder.name)]
53 | # need to sort for order
54 | files_to_read = sorted(file_names)
55 | to_concat = [pd.read_csv(f) for f in files_to_read]
56 | df_split = pd.concat(to_concat, ignore_index=True)
57 | for nr, (lo, ls) in enumerate(zip(df_orig.iterrows(), df_split.iterrows())):
58 | for pnr, (p1, p2) in enumerate(zip(lo, ls)):
59 | with self.subTest(f"L-{nr}; P-{pnr}"):
60 | if isinstance(p1, pd.Series):
61 | self.assertTrue(p1.equals(p2))
62 | else:
63 | self.assertEqual(p1, p2)
64 |
--------------------------------------------------------------------------------
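The class attributes of SplitFileTests above encode a small calculation worth spelling out; assuming the resource CSV keeps its 125 data rows and 20 columns, the expectations work out as follows:

# Worked example of the SplitFileTests expectations (a check, not repo code).
NR_OF_LINES_IN_FILE = 125
NR_OF_COLUMNS_IN_FILE = 20

nr_of_lines = 4 * NR_OF_LINES_IN_FILE // NR_OF_COLUMNS_IN_FILE  # 4 * 125 // 20 = 25 rows per split file
files_expected = NR_OF_LINES_IN_FILE // nr_of_lines             # 125 // 25 = 5 files

assert nr_of_lines == 25
assert files_expected == 5
# If 125 were not an exact multiple of 25, a final shorter file would be written,
# so the expectation would become NR_OF_LINES_IN_FILE // nr_of_lines + 1.
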
/tests/medcat/2_train_model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/2_train_model/__init__.py
--------------------------------------------------------------------------------
/tests/medcat/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/__init__.py
--------------------------------------------------------------------------------
/tests/medcat/evaluate_mct_export/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/evaluate_mct_export/__init__.py
--------------------------------------------------------------------------------
/tests/medcat/evaluate_mct_export/offline_test_mct_analysis.py:
--------------------------------------------------------------------------------
1 | """This module is meant to be tested offline (i.e not in a GitHub actions settings).
2 | The main reason is the access to various models it requires.
3 | """
4 | import os
5 | import sys
6 |
7 |
8 | # because this project isn't (at least as of writing this)
9 | # set up as a python project, there are no __init__.py
10 | # files in each folder
11 | # as such, in order to gain access to the relevant module,
12 | # I'll need to add the path manually
13 | from .test_mct_analysis import (MEDCAT_EVAL_MCT_EXPORT_FOLDER, RESOURCE_DIR, MCT_EXPORT_JSON_PATH,
14 | BaseMCTExportTests)
15 | sys.path.append(MEDCAT_EVAL_MCT_EXPORT_FOLDER)
16 | # and now we can import from mct_analysis
17 | from mct_analysis import MedcatTrainer_export
18 |
19 |
20 | MODEL_PACK_PATH = os.path.join(RESOURCE_DIR, "offline",
21 | "medmen_wstatus_2021_oct.zip")
22 |
23 |
24 | class MCTExportBasicTests(BaseMCTExportTests):
25 | report_path = 'mct_report.xlsx'
26 |
27 | @classmethod
28 | def setUpClass(cls) -> None:
29 | cls.export = MedcatTrainer_export([MCT_EXPORT_JSON_PATH, ], MODEL_PACK_PATH)
30 |
31 | # these would need a CAT instance
32 | def test_can_full_annotation_df(self):
33 | full_ann_df = self.export.full_annotation_df()
34 | self.assertNonEmptyDataframe(full_ann_df)
35 |
36 | def test_can_meta_anns_concept_summary(self):
37 | meta_anns_summary_df = self.export.meta_anns_concept_summary()
38 | # this will be empty since I don't think I have anything
39 | # of note regarding meta annotations
40 | self.assertIsNotNone(meta_anns_summary_df)
41 |
42 | def test_generate_report(self):
43 | self.export.generate_report(path=self.report_path)
44 | self.assertTrue(os.path.exists(self.report_path))
45 |
--------------------------------------------------------------------------------
/tests/medcat/evaluate_mct_export/test_mct_analysis.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import pandas as pd
5 |
6 | import unittest
7 |
8 |
9 | _FILE_DIR = os.path.dirname(__file__)
10 |
11 | # because this project isn't (at least as of writing this)
12 | # set up as a python project, there are no __init__.py
13 | # files in each folder
14 | # as such, in order to gain access to the relevant module,
15 | # I'll need to add the path manually
16 | _WWC_BASE_FOLDER = os.path.join(_FILE_DIR, "..", "..", "..")
17 | MEDCAT_EVAL_MCT_EXPORT_FOLDER = os.path.abspath(os.path.join(_WWC_BASE_FOLDER, "medcat", "evaluate_mct_export"))
18 | sys.path.append(MEDCAT_EVAL_MCT_EXPORT_FOLDER)
19 | # and now we can import from mct_analysis
20 | from mct_analysis import MedcatTrainer_export
21 |
22 | # add path to MCT export
23 | RESOURCE_DIR = os.path.abspath(os.path.join(_FILE_DIR, "..", "resources"))
24 | MCT_EXPORT_JSON_PATH = os.path.join(RESOURCE_DIR, "MCT_export_example.json")
25 |
26 |
27 | class MCTExportInitTests(unittest.TestCase):
28 |
29 | def test_can_init(self):
30 | inst = MedcatTrainer_export([MCT_EXPORT_JSON_PATH, ], None)
31 | self.assertIsInstance(inst, MedcatTrainer_export)
32 |
33 |
34 | class BaseMCTExportTests(unittest.TestCase):
35 |
36 | @classmethod
37 | def setUpClass(cls) -> None:
38 | cls.export = MedcatTrainer_export([MCT_EXPORT_JSON_PATH, ], None)
39 |
40 | def assertNonEmptyDataframe(self, df):
41 | self.assertIsInstance(df, pd.DataFrame)
42 | self.assertFalse(df.empty)
43 |
44 |
45 | class MCTExportBasicTests(BaseMCTExportTests):
46 |
47 | def test_can_get_annotations(self):
48 | annotation_df = self.export.annotation_df()
49 | self.assertNonEmptyDataframe(annotation_df)
50 |
51 | def test_can_get_summary(self):
52 | summary_df = self.export.concept_summary()
53 | self.assertNonEmptyDataframe(summary_df)
54 |
55 | def test_can_get_user_stats(self):
56 | users_stats = self.export.user_stats()
57 | self.assertNonEmptyDataframe(users_stats)
58 |
59 | def test_can_rename_meta_anns_empty_no_change(self):
60 | ann_df1 = self.export.annotation_df()
61 | self.export.rename_meta_anns()
62 | ann_df2 = self.export.annotation_df()
63 | self.assertTrue(all(ann_df1 == ann_df2))
64 |
65 |
66 | class MCTExportUsageTests(BaseMCTExportTests):
67 |
68 | def assertDataFrameHasRowsColumns(self, df,
69 | exp_rows: int,
70 | exp_columns: int):
71 | self.assertEqual(len(df.index), exp_rows)
72 | self.assertEqual(len(df.columns), exp_columns)
73 |
74 | def test_has_correct_projects(self, exp_proj=['MartTestAnnotation']):
75 | got = self.export.project_names
76 | self.assertEqual(len(got), len(exp_proj))
77 | self.assertEqual(got, exp_proj)
78 |
79 | def test_has_correct_documents(self, exp_docs=['Doc 1', 'Doc 2', 'Doc 3', 'Doc 4', 'Doc 5']):
80 | got = self.export.document_names
81 | self.assertEqual(len(got), len(exp_docs))
82 | self.assertEqual(got, exp_docs)
83 |
84 | def test_rename_meta_anns_empty_does_not_add_project_and_doc_names(self):
85 | self.export.rename_meta_anns()
86 | self.test_has_correct_projects()
87 | self.test_has_correct_documents()
88 |
89 | def test_annotations_has_correct_rows_columns(self,
90 | exp_rows=362,
91 | exp_columns=19):
92 | ann_df = self.export.annotation_df()
93 | self.assertDataFrameHasRowsColumns(ann_df, exp_rows, exp_columns)
94 |
95 | def test_summary_has_correct_rows_columns(self,
96 | exp_rows=197,
97 | exp_columns=5):
98 | summary_df = self.export.concept_summary()
99 | self.assertDataFrameHasRowsColumns(summary_df, exp_rows, exp_columns)
100 |
101 |     def test_user_stats_has_correct_rows_columns(self,
102 | exp_rows=1,
103 | exp_columns=2):
104 | users_stats = self.export.user_stats()
105 | self.assertDataFrameHasRowsColumns(users_stats, exp_rows, exp_columns)
106 |
107 |     def test_user_stats_has_correct_user(self, expected="mart"):
108 | unique_users = self.export.user_stats()["user"].unique().tolist()
109 | self.assertEqual(len(unique_users), 1)
110 | self.assertEqual(unique_users[0], expected)
111 |
112 |
113 | class MCTExportMetaAnnRenameTests(unittest.TestCase):
114 | NAMES2RENAME = {"Status": "VERSION"}
115 | VALUES2RENAME = {"Status": {"Affirmed": "Got it!"}}
116 | # can only rename values if renaming names
117 | # so need a mapping from the same name to the same name
118 | # for each name used in values
119 | VALUES_RENAME_HELPER = dict((n, n) for n in VALUES2RENAME)
120 |
121 | def setUp(self) -> None:
122 | self.export = MedcatTrainer_export([MCT_EXPORT_JSON_PATH, ], None)
123 |
124 | def _get_all_meta_anns(self):
125 | for proj in self.export.mct_export['projects']:
126 | for doc in proj['documents']:
127 | for ann in doc['annotations']:
128 | for meta_ann in ann["meta_anns"].items():
129 | yield meta_ann
130 |
131 | def _check_names(self, prev_anns: list):
132 | for (meta_ann_name, _), (prev_name, _) in zip(self._get_all_meta_anns(), prev_anns):
133 | for name, replacement_name in self.NAMES2RENAME.items():
134 | with self.subTest(f"{name} -> {replacement_name} ({meta_ann_name})"):
135 | self.assertNotEqual(meta_ann_name, name)
136 | if prev_name == name:
137 | self.assertEqual(meta_ann_name, replacement_name)
138 |
139 | def test_meta_annotations_renamed_names(self):
140 | prev_anns = list(self._get_all_meta_anns())
141 | self.export.rename_meta_anns(meta_anns2rename=self.NAMES2RENAME)
142 | self._check_names(prev_anns)
143 |
144 | def _check_values(self, prev_anns: list, only_values: bool = True):
145 | for (name, ann), (prev_name, prev_ann) in zip(self._get_all_meta_anns(), prev_anns):
146 | with self.subTest(f"{prev_ann} -> {ann}"):
147 | if only_values:
148 | # if only changing values, not names themselves
149 | self.assertEqual(name, prev_name, "Names should not change")
150 | for target_name, value_map in self.VALUES2RENAME.items():
151 | # if correct target and has a value that can be remapped
152 | if name == target_name and prev_ann["value"] in value_map:
153 | with self.subTest(f"{target_name} with {value_map}"):
154 | start_value = prev_ann["value"]
155 | new_value = ann["value"]
156 | exp_value = value_map[start_value]
157 | self.assertEqual(new_value, exp_value)
158 |
159 | def test_meta_annotations_renamed_values(self):
160 | prev_anns = list(self._get_all_meta_anns())
161 | self.export.rename_meta_anns(meta_anns2rename=self.VALUES_RENAME_HELPER,
162 | meta_ann_values2rename=self.VALUES2RENAME)
163 | self._check_values(prev_anns)
164 |
165 | def test_meta_annotations_renamed_names_and_values(self):
166 | prev_anns = list(self._get_all_meta_anns())
167 | self.export.rename_meta_anns(meta_anns2rename=self.NAMES2RENAME,
168 | meta_ann_values2rename=self.VALUES2RENAME)
169 | self._check_names(prev_anns)
170 | self._check_values(prev_anns, only_values=False)
171 |
172 | def test_meta_annotations_renamed_values_only(self):
173 | prev_anns = list(self._get_all_meta_anns())
174 | self.export.rename_meta_anns(meta_ann_values2rename=self.VALUES2RENAME)
175 | self._check_values(prev_anns, only_values=True)
176 |
--------------------------------------------------------------------------------
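Taken together, the tests above act as a usage reference for mct_analysis.MedcatTrainer_export. A condensed sketch of the calls they exercise (the export path is a placeholder, and the second constructor argument is the optional model pack, left as None exactly as in these tests and set to a model pack path in the offline test):

from mct_analysis import MedcatTrainer_export  # assumes medcat/evaluate_mct_export is on sys.path

export = MedcatTrainer_export(["/path/to/MCT_export.json"], None)  # placeholder path

ann_df = export.annotation_df()        # pandas DataFrame of annotations across the export
summary_df = export.concept_summary()  # pandas DataFrame summarising concepts
user_df = export.user_stats()          # pandas DataFrame with a "user" column

print(export.project_names, export.document_names)

# Rename a meta annotation and one of its values (names taken from the rename tests above).
export.rename_meta_anns(meta_anns2rename={"Status": "VERSION"},
                        meta_ann_values2rename={"Status": {"Affirmed": "Got it!"}})
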
/tests/medcat/resources/cdb.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/resources/cdb.dat
--------------------------------------------------------------------------------
/tests/medcat/resources/example_cdb_input_snomed.csv:
--------------------------------------------------------------------------------
1 | cui,name,name_status,ontologies,description_type_ids,type_ids
2 | 101009,Quilonia ethiopica (organism),P,SNOMED-CT,organism,81102976
3 | 102002,Hemoglobin Okaloosa (substance),P,SNOMED-CT,substance,91187746
4 | 103007,Squirrel fibroma virus (organism),P,SNOMED-CT,organism,81102976
5 | 104001,Excision of lesion of patella (procedure),P,SNOMED-CT,procedure,28321150
6 | 106004,Structure of posterior carpal region (body structure),P,SNOMED-CT,body structure,37552161
7 | 107008,Structure of fetal part of placenta (body structure),P,SNOMED-CT,body structure,37552161
8 | 108003,Entire condylar emissary vein (body structure),P,SNOMED-CT,body structure,37552161
9 | 109006,Anxiety disorder of childhood OR adolescence (disorder),P,SNOMED-CT,disorder,9090192
10 | 110001,Structure of visceral layer of Bowman's capsule (body structure),P,SNOMED-CT,body structure,37552161
11 | 111002,Parathyroid structure (body structure),P,SNOMED-CT,body structure,37552161
12 | 112009,Bembrops anatirostris (organism),P,SNOMED-CT,organism,81102976
13 | 113004,Type-casting-machine operator (occupation),P,SNOMED-CT,occupation,16939031
14 | 114005,Feline calicivirus (organism),P,SNOMED-CT,organism,81102976
15 | 115006,Removable appliance therapy (procedure),P,SNOMED-CT,procedure,28321150
16 | 116007,Subcutaneous tissue structure of medial surface of index finger (body structure),P,SNOMED-CT,body structure,37552161
17 | 117003,Rhipicephalus sanguineus (organism),P,SNOMED-CT,organism,81102976
18 | 118008,Black buffalo weaver (organism),P,SNOMED-CT,organism,81102976
19 | 119000,Thoracoscopic partial lobectomy of lung (procedure),P,SNOMED-CT,procedure,28321150
20 | 120006,Ornithine racemase (substance),P,SNOMED-CT,substance,91187746
21 | 122003,Choroidal hemorrhage (disorder),P,SNOMED-CT,disorder,9090192
22 | 124002,Structure of coronoid process of mandible (body structure),P,SNOMED-CT,body structure,37552161
23 | 125001,Ferrous (59-Fe) sulfate (substance),P,SNOMED-CT,substance,91187746
24 | 126000,Galactosyl-N-acetylglucosaminylgalactosylglucosylceramide alpha-galactosyltransferase (substance),P,SNOMED-CT,substance,91187746
25 | 127009,Miscarriage with laceration of cervix (disorder),P,SNOMED-CT,disorder,9090192
26 | 128004,Hand microscope examination of skin (procedure),P,SNOMED-CT,procedure,28321150
27 | 129007,Homoiothermia (finding),P,SNOMED-CT,finding,67667581
28 | 130002,Hemoglobin Hopkins-II (substance),P,SNOMED-CT,substance,91187746
29 | 131003,Dolichyl-phosphate mannosyltransferase (substance),P,SNOMED-CT,substance,91187746
30 | 132005,Serraniculus pumilio (organism),P,SNOMED-CT,organism,81102976
31 | 133000,Percutaneous implantation of neurostimulator electrodes into neuromuscular component (procedure),P,SNOMED-CT,procedure,28321150
32 | 134006,Decreased hair growth (finding),P,SNOMED-CT,finding,67667581
33 | 135007,Arthrotomy of wrist joint with exploration and biopsy (procedure),P,SNOMED-CT,procedure,28321150
34 | 136008,Acacia erioloba (organism),P,SNOMED-CT,organism,81102976
35 | 138009,No past history of (contextual qualifier) (qualifier value),P,SNOMED-CT,qualifier value,7882689
36 | 139001,Felid herpesvirus 1 (organism),P,SNOMED-CT,organism,81102976
37 | 140004,Chronic pharyngitis (disorder),P,SNOMED-CT,disorder,9090192
38 | 142007,"Excision of tumor from shoulder area, deep, intramuscular (procedure)",P,SNOMED-CT,procedure,28321150
39 | 144008,Normal peripheral vision (finding),P,SNOMED-CT,finding,67667581
40 | 145009,Colloid milium (morphologic abnormality),P,SNOMED-CT,morphologic abnormality,33782986
41 | 146005,Repair of nonunion of metatarsal with bone graft (procedure),P,SNOMED-CT,procedure,28321150
42 | 148006,Preliminary diagnosis (contextual qualifier) (qualifier value),P,SNOMED-CT,qualifier value,7882689
43 | 149003,"Central pair of microtubules, cilium or flagellum, not bacterial (cell structure)",P,SNOMED-CT,cell structure,66527446
44 | 150003,Abnormal bladder continence (finding),P,SNOMED-CT,finding,67667581
45 | 151004,Gonococcal meningitis (disorder),P,SNOMED-CT,disorder,9090192
46 | 153001,Cystourethroscopy with resection of ureterocele (procedure),P,SNOMED-CT,procedure,28321150
47 | 154007,Rubber molding-press operator (occupation),P,SNOMED-CT,occupation,16939031
48 | 155008,Structure of deep circumflex iliac artery (body structure),P,SNOMED-CT,body structure,37552161
49 | 156009,"Spine board, device (physical object)",P,SNOMED-CT,physical object,32816260
50 | 158005,Salmonella Irumu (organism),P,SNOMED-CT,organism,81102976
51 | 159002,Ferrocyanide salt (substance),P,SNOMED-CT,substance,91187746
52 | 160007,Removal of foreign body of tendon and/or tendon sheath (procedure),P,SNOMED-CT,procedure,28321150
53 | 161006,Thermal injury (morphologic abnormality),P,SNOMED-CT,morphologic abnormality,33782986
54 | 162004,Severe manic bipolar I disorder without psychotic features (disorder),P,SNOMED-CT,disorder,9090192
55 | 163009,Bacteroides stercoris (organism),P,SNOMED-CT,organism,81102976
56 | 164003,Phosphoenolpyruvate-protein phosphotransferase (substance),P,SNOMED-CT,substance,91187746
57 | 165002,Accident prone (finding),P,SNOMED-CT,finding,67667581
58 | 166001,Behavioral therapy (regime/therapy),P,SNOMED-CT,regime/therapy,47503797
59 | 167005,Structure of supraclavicular part of brachial plexus (body structure),P,SNOMED-CT,body structure,37552161
60 | 168000,Typhlolithiasis (disorder),P,SNOMED-CT,disorder,9090192
61 | 169008,Product containing hypothalamic releasing factor (product),P,SNOMED-CT,product,91776366
62 | 170009,"Special potency disk identification, vancomycin test (procedure)",P,SNOMED-CT,procedure,28321150
63 | 171008,Injury of ascending right colon without open wound into abdominal cavity (disorder),P,SNOMED-CT,disorder,9090192
64 | 172001,Endometritis following molar AND/OR ectopic pregnancy (disorder),P,SNOMED-CT,disorder,9090192
65 | 173006,Micrognathus crinitus (organism),P,SNOMED-CT,organism,81102976
66 | 174000,Harrison-Richardson operation on vagina (procedure),P,SNOMED-CT,procedure,28321150
67 | 175004,Supraorbital neuralgia (finding),P,SNOMED-CT,finding,67667581
68 | 176003,Anastomosis of rectum (procedure),P,SNOMED-CT,procedure,28321150
69 | 177007,Poisoning by sawfly larvae (disorder),P,SNOMED-CT,disorder,9090192
70 | 178002,Uridine diphosphate galactose (substance),P,SNOMED-CT,substance,91187746
71 | 179005,Apraxia of dressing (finding),P,SNOMED-CT,finding,67667581
72 | 180008,Genus Fijivirus (organism),P,SNOMED-CT,organism,81102976
73 | 181007,Hemorrhagic bronchopneumonia (disorder),P,SNOMED-CT,disorder,9090192
74 | 182000,Canalization (morphologic abnormality),P,SNOMED-CT,morphologic abnormality,33782986
75 | 183005,Autoimmune pancytopenia (disorder),P,SNOMED-CT,disorder,9090192
76 | 184004,Withdrawal arrhythmia (disorder),P,SNOMED-CT,disorder,9090192
77 | 186002,Human leukocyte antigen Cw9 (substance),P,SNOMED-CT,substance,91187746
78 | 187006,Cyanocobalamin (57-Co) (substance),P,SNOMED-CT,substance,91187746
79 | 188001,Injury of intercostal artery (disorder),P,SNOMED-CT,disorder,9090192
80 | 189009,Excision of lesion of artery (procedure),P,SNOMED-CT,procedure,28321150
81 | 191001,Lednice virus (organism),P,SNOMED-CT,organism,81102976
82 | 192008,Congenital syphilitic hepatomegaly (disorder),P,SNOMED-CT,disorder,9090192
83 | 193003,Benign hypertensive renal disease (disorder),P,SNOMED-CT,disorder,9090192
84 | 194009,Notropis whipplei (organism),P,SNOMED-CT,organism,81102976
85 | 196006,Concave shape (qualifier value),P,SNOMED-CT,qualifier value,7882689
86 | 197002,Mold to yeast conversion test (procedure),P,SNOMED-CT,procedure,28321150
87 | 198007,Disease caused by Filoviridae (disorder),P,SNOMED-CT,disorder,9090192
88 | 199004,Decreased lactation (finding),P,SNOMED-CT,finding,67667581
89 | 200001,Berberine (substance),P,SNOMED-CT,substance,91187746
90 | 201002,Oligopus claudei (organism),P,SNOMED-CT,organism,81102976
91 | 202009,Structure of anterior division of renal artery (body structure),P,SNOMED-CT,body structure,37552161
92 | 205006,Entire left commissure of aortic valve (body structure),P,SNOMED-CT,body structure,37552161
93 | 206007,Structure of gluteus maximus muscle (body structure),P,SNOMED-CT,body structure,37552161
94 | 207003,European edible frog (organism),P,SNOMED-CT,organism,81102976
95 | 209000,Plover (organism),P,SNOMED-CT,organism,81102976
96 | 210005,"Arrow, device (physical object)",P,SNOMED-CT,physical object,32816260
97 | 211009,Product containing norethandrolone (medicinal product),P,SNOMED-CT,medicinal product,37785117
98 | 213007,Simian enterovirus 7 (organism),P,SNOMED-CT,organism,81102976
99 | 214001,Streptococcus mutans (organism),P,SNOMED-CT,organism,81102976
100 | 216004,Delusion of persecution (finding),P,SNOMED-CT,finding,67667581
101 |
--------------------------------------------------------------------------------
/tests/medcat/resources/example_cdb_input_umls.csv:
--------------------------------------------------------------------------------
1 | cui,name_status,ontologies,name,type_ids
2 | C0000005,Y,MSH,(131)I-Macroaggregated Albumin,T116
3 | C0000005,Y,MSH,(131)I-Macroaggregated Albumin,T121
4 | C0000005,Y,MSH,(131)I-Macroaggregated Albumin,T130
5 | C0000005,Y,MSH,(131)I-MAA,T116
6 | C0000005,Y,MSH,(131)I-MAA,T121
7 | C0000005,Y,MSH,(131)I-MAA,T130
8 | C0000039,N,RXNORM,"1,2-dipalmitoylphosphatidylcholine",T109
9 | C0000039,N,RXNORM,"1,2-dipalmitoylphosphatidylcholine",T121
10 | C0000039,Y,MTH,"1,2-dipalmitoylphosphatidylcholine",T109
11 | C0000039,Y,MTH,"1,2-dipalmitoylphosphatidylcholine",T121
12 | C0000039,N,SNMI,Dipalmitoylphosphatidylcholine,T109
13 | C0000039,N,SNMI,Dipalmitoylphosphatidylcholine,T121
14 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T109
15 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T121
16 | C0000039,N,SNOMEDCT_US,Dipalmitoylphosphatidylcholine,T109
17 | C0000039,N,SNOMEDCT_US,Dipalmitoylphosphatidylcholine,T121
18 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T109
19 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T121
20 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T109
21 | C0000039,N,LNC,Dipalmitoylphosphatidylcholine,T121
22 | C0000039,Y,MSH,Dipalmitoylphosphatidylcholine,T109
23 | C0000039,Y,MSH,Dipalmitoylphosphatidylcholine,T121
24 | C0000039,Y,MSH,Dipalmitoylglycerophosphocholine,T109
25 | C0000039,Y,MSH,Dipalmitoylglycerophosphocholine,T121
26 | C0000039,Y,MSH,Dipalmitoyllecithin,T109
27 | C0000039,Y,MSH,Dipalmitoyllecithin,T121
28 | C0000039,Y,MSH,"Phosphatidylcholine, Dipalmitoyl",T109
29 | C0000039,Y,MSH,"Phosphatidylcholine, Dipalmitoyl",T121
30 | C0000052,N,MSH,"1,4-alpha-Glucan Branching Enzyme",T116
31 | C0000052,N,MSH,"1,4-alpha-Glucan Branching Enzyme",T126
32 | C0000052,Y,MTH,"1,4-alpha-Glucan Branching Enzyme",T116
33 | C0000052,Y,MTH,"1,4-alpha-Glucan Branching Enzyme",T126
34 | C0000052,N,SNMI,Branching enzyme,T116
35 | C0000052,N,SNMI,Branching enzyme,T126
36 | C0000052,Y,SNOMEDCT_US,Branching enzyme,T116
37 | C0000052,Y,SNOMEDCT_US,Branching enzyme,T126
38 | C0000052,Y,MSH,"Enzyme, Branching",T116
39 | C0000052,Y,MSH,"Enzyme, Branching",T126
40 | C0000052,Y,MSH,"Glycosyltransferase, Branching",T116
41 | C0000052,Y,MSH,"Glycosyltransferase, Branching",T126
42 | C0000052,Y,MSH,Starch Branching Enzyme,T116
43 | C0000052,Y,MSH,Starch Branching Enzyme,T126
44 | C0000052,Y,SNM,alpha-Glucan-branching glycosyltransferase,T116
45 | C0000052,Y,SNM,alpha-Glucan-branching glycosyltransferase,T126
46 | C0000074,Y,MSH,1-Alkyl-2-Acylphosphatidates,T109
47 | C0000074,Y,MSH,1 Alkyl 2 Acylphosphatidates,T109
48 | C0000084,Y,MSH,1-Carboxyglutamic Acid,T116
49 | C0000084,Y,MSH,1-Carboxyglutamic Acid,T123
50 | C0000084,Y,MSH,gamma-Carboxyglutamic Acid,T116
51 | C0000084,Y,MSH,gamma-Carboxyglutamic Acid,T123
52 | C0000096,Y,MSH,1-Methyl-3-isobutylxanthine,T109
53 | C0000096,Y,MSH,1-Methyl-3-isobutylxanthine,T121
54 | C0000096,Y,MSH,3-Isobutyl-1-methylxanthine,T109
55 | C0000096,Y,MSH,3-Isobutyl-1-methylxanthine,T121
56 | C0000096,Y,MSH,IBMX,T109
57 | C0000096,Y,MSH,IBMX,T121
58 | C0000096,Y,MSH,Isobutyltheophylline,T109
59 | C0000096,Y,MSH,Isobutyltheophylline,T121
60 | C0000097,Y,MSH,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",T109
61 | C0000097,Y,MSH,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",T131
62 | C0000097,N,CSP,MPTP,T109
63 | C0000097,N,CSP,MPTP,T131
64 | C0000097,N,PSY,MPTP,T109
65 | C0000097,N,PSY,MPTP,T131
66 | C0000097,Y,MSH,MPTP,T109
67 | C0000097,Y,MSH,MPTP,T131
68 | C0000097,Y,CHV,mptp,T109
69 | C0000097,Y,CHV,mptp,T131
70 | C0000097,N,RCD,Methylphenyltetrahydropyridine,T109
71 | C0000097,N,RCD,Methylphenyltetrahydropyridine,T131
72 | C0000097,N,LCH_NW,Methylphenyltetrahydropyridine,T109
73 | C0000097,N,LCH_NW,Methylphenyltetrahydropyridine,T131
74 | C0000097,N,PSY,Methylphenyltetrahydropyridine,T109
75 | C0000097,N,PSY,Methylphenyltetrahydropyridine,T131
76 | C0000097,Y,SNOMEDCT_US,Methylphenyltetrahydropyridine,T109
77 | C0000097,Y,SNOMEDCT_US,Methylphenyltetrahydropyridine,T131
78 | C0000097,Y,CSP,methylphenyltetrahydropyridine,T109
79 | C0000097,Y,CSP,methylphenyltetrahydropyridine,T131
80 | C0000098,Y,MSH,1-Methyl-4-phenylpyridinium,T109
81 | C0000098,Y,MSH,1-Methyl-4-phenylpyridinium,T131
82 | C0000098,Y,MSH,1-Methyl-4-phenylpyridinium Ion,T109
83 | C0000098,Y,MSH,1-Methyl-4-phenylpyridinium Ion,T131
84 | C0000098,Y,MSH,Cyperquat,T109
85 | C0000098,Y,MSH,Cyperquat,T131
86 | C0000098,Y,CSP,MPP+,T109
87 | C0000098,Y,CSP,MPP+,T131
88 | C0000098,Y,MSH,N-Methyl-4-phenylpyridine,T109
89 | C0000098,Y,MSH,N-Methyl-4-phenylpyridine,T131
90 | C0000098,Y,MSH,1-Methyl-4-phenylpyridine,T109
91 | C0000098,Y,MSH,1-Methyl-4-phenylpyridine,T131
92 | C0000098,Y,MSH,N METHYL 4 PHENYLPYRIDINIUM,T109
93 | C0000098,Y,MSH,N METHYL 4 PHENYLPYRIDINIUM,T131
94 | C0000098,Y,MSH,"Pyridinium, 1-methyl-4-phenyl-",T109
95 | C0000098,Y,MSH,"Pyridinium, 1-methyl-4-phenyl-",T131
96 | C0000102,Y,MSH,1-Naphthylamine,T109
97 | C0000102,Y,MSH,1-Naphthylamine,T131
98 | C0000102,Y,CHV,1-naphthylamine,T109
99 | C0000102,Y,CHV,1-naphthylamine,T131
100 | C0000102,Y,MSH,alpha-Naphthylamine,T109
101 |
--------------------------------------------------------------------------------
/tests/medcat/resources/example_file_to_split.csv:
--------------------------------------------------------------------------------
1 | "subject_id","hadm_id","chartdate","charttime","text","category","description"
2 | 12345,67890,"2024-01-18","12:30:00","EPR:
3 | Patient, a 55-year-old male, was admitted with complaints of severe chest pain radiating to the left arm and associated shortness of breath.
4 | On examination, the patient appeared diaphoretic with blood pressure elevated at 160/90 mmHg, heart rate of 110 beats per minute, and respiratory rate of 22 breaths per minute.
5 | Initial ECG revealed sinus tachycardia with ST-segment elevation in leads II, III, and aVF. Troponin levels were elevated, suggestive of acute myocardial infarction.
6 | The patient was promptly started on aspirin, clopidogrel, and intravenous nitroglycerin. Cardiology consultation requested for further management and possible catheterization.
7 | ","Admission Note","Acute Myocardial Infarction"
8 | 56789,54321,"2024-01-18","08:45:00","EPR:
9 | Follow-up note for a 65-year-old female with a history of type 2 diabetes mellitus.
10 | Blood glucose levels have been well-controlled with recent HbA1c within the target range.
11 | Medication reconciliation performed, and adjustments made to the insulin regimen to optimize glycemic control.
12 | Patient educated on the importance of regular blood glucose monitoring, proper diet, and exercise.
13 | Follow-up appointment scheduled in 3 months for continued management.
14 | ","Follow-up Note","Diabetes Mellitus Management"
15 | 98765,43210,"2024-01-18","15:20:00","EPR:
16 | Emergency department note for a 40-year-old male involved in a motor vehicle accident.
17 | The patient was brought in by ambulance with complaints of severe right leg pain.
18 | Physical examination revealed tenderness and swelling over the right femur.
19 | CT scan of the pelvis and femur confirmed a displaced fracture of the right femoral shaft.
20 | Orthopedic surgery consulted for further evaluation and management.
21 | The patient was given analgesia and placed in traction pending surgical intervention.
22 | ","Emergency Note","Trauma and Fracture Evaluation"
23 | 23456,78901,"2024-01-18","10:10:00","EPR:
24 | Psychiatric evaluation for a 30-year-old female presenting with symptoms of depression.
25 | The patient reports a persistent low mood, loss of interest in activities, poor appetite, and difficulty sleeping.
26 | No significant suicidal ideation reported.
27 | Past psychiatric history includes a previous episode of major depressive disorder.
28 | Started the patient on sertraline and provided psychoeducation on coping strategies.
29 | Referral made to a therapist for ongoing support.
30 | ","Psychiatric Note","Major Depressive Disorder"
31 | 87654,21098,"2024-01-18","14:00:00","EPR:
32 | Consultation note for a 25-year-old male admitted with severe right lower quadrant abdominal pain.
33 | Physical examination consistent with suspected appendicitis.
34 | Laboratory results showed an elevated white blood cell count.
35 | CT abdomen and pelvis ordered, revealing acute appendicitis with localized abscess formation.
36 | The patient placed on NPO status, started on broad-spectrum antibiotics, and surgical intervention scheduled for appendectomy.","Consultation Note","Appendicitis Evaluation and Management"
37 | 34567,87654,"2024-01-19","09:15:00","EPR:
38 | Admission note for a 45-year-old female presenting with acute respiratory distress.
39 | History reveals a recent upper respiratory tract infection.
40 | On examination, the patient is tachypneic with bilateral crackles on auscultation.
41 | Chest X-ray shows diffuse infiltrates consistent with viral pneumonia.
42 | Oxygen supplementation initiated, and antiviral therapy prescribed.
43 | Close monitoring for respiratory status ongoing.
44 | ","Admission Note","Viral Pneumonia"
45 | 78901,23456,"2024-01-19","13:45:00","EPR:
46 | Follow-up note for a 60-year-old male with a history of hypertension.
47 | Blood pressure well-controlled on current medication regimen.
48 | Discussion on lifestyle modifications, including a low-sodium diet and regular exercise.
49 | Patient advised on the importance of regular follow-up appointments for ongoing blood pressure management.
50 | ","Follow-up Note","Hypertension Management"
51 | 21098,65432,"2024-01-19","16:30:00","EPR:
52 | Emergency department note for a 35-year-old female involved in a fall from a height.
53 | Complaints of back pain and numbness in both lower extremities.
54 | Neurological examination indicates sensory deficits.
55 | CT spine ordered, revealing a thoracic spine fracture.
56 | Neurosurgery consultation requested for further evaluation and management.
57 | ","Emergency Note","Traumatic Spinal Injury"
58 | 54321,78909,"2024-01-19","11:00:00","EPR:
59 | Psychiatric evaluation for a 28-year-old male with symptoms of anxiety and panic attacks.
60 | The patient reports palpitations, sweating, and a sense of impending doom during episodes.
61 | No significant past psychiatric history.
62 | Started on selective serotonin reuptake inhibitors (SSRIs) and referred for cognitive-behavioral therapy.
63 | ","Psychiatric Note","Generalized Anxiety Disorder"
64 | 67890,12345,"2024-01-19","14:20:00","EPR:
65 | Consultation note for a 50-year-old female with abdominal pain and distension.
66 | Physical examination consistent with ascites.
67 | Paracentesis performed, revealing elevated white cell count and protein levels.
68 | Further workup initiated for the underlying cause of ascites.
69 | Gastroenterology consultation requested for comprehensive evaluation.
70 | ","Consultation Note","Ascites Evaluation"
71 | 12312,67845,"2024-01-20","10:45:00","EPR:
72 | Admission note for a 22-year-old male presenting with a seizure episode.
73 | No prior history of seizures reported. Neurological examination unremarkable.
74 | CT scan of the brain performed, showing no acute abnormalities.
75 | The patient started on antiepileptic medication, and an electroencephalogram (EEG) scheduled for further evaluation.
76 | ","Admission Note","First Seizure Evaluation"
77 | 45678,34567,"2024-01-20","14:15:00","EPR:
78 | Follow-up note for a 70-year-old female with a history of congestive heart failure.
79 | Recent exacerbation managed with diuretic adjustment and oxygen therapy.
80 | Patient educated on sodium restriction and fluid management.
81 | Close outpatient follow-up scheduled to monitor symptoms and optimize heart failure management.
82 | ","Follow-up Note","Congestive Heart Failure Management"
83 | 78909,45678,"2024-01-20","16:50:00","EPR:
84 | Emergency department note for a 40-year-old male with a laceration to the right hand from a work-related injury.
85 | Wound cleaned and sutured. Tetanus prophylaxis administered.
86 | Occupational health referral made for further assessment and follow-up.
87 | ","Emergency Note","Hand Laceration Management"
88 | 23456,56789,"2024-01-20","12:30:00","EPR:
89 | Psychiatric evaluation for a 25-year-old female presenting with symptoms of post-traumatic stress disorder (PTSD) following a recent traumatic event.
90 | The patient experiences intrusive thoughts and nightmares.
91 | Started on a selective serotonin-norepinephrine reuptake inhibitor (SNRI) and referred for trauma-focused therapy.
92 | ","Psychiatric Note","Post-Traumatic Stress Disorder"
93 | 67890,89012,"2024-01-20","09:00:00","EPR:
94 | Consultation note for a 55-year-old male with persistent epigastric pain.
95 | Upper endoscopy performed, revealing erosive gastritis.
96 | Proton pump inhibitor prescribed, and lifestyle modifications discussed.
97 | Gastroenterology follow-up recommended for ongoing management.
98 | ","Consultation Note","Gastritis Evaluation and Management"
99 | 54321,98765,"2024-01-21","11:30:00","EPR:
100 | Admission note for a 38-year-old female presenting with a mysterious neurological syndrome.
101 | The patient experiences sudden and transient episodes of total paralysis, lasting a few minutes.
102 | Extensive neurological workup initiated, including genetic testing for a rare hereditary paralysis disorder.
103 | Neurology and genetics consultations requested for further evaluation.
104 | ","Admission Note","Familial Transient Paralysis Syndrome"
105 | 87654,23456,"2024-01-21","14:20:00","EPR:
106 | Follow-up note for a 45-year-old male with a history of unexplained fevers and skin lesions resembling butterfly wings.
107 | Extensive infectious disease and rheumatological workup inconclusive.
108 | Immunology and dermatology consultations ongoing to explore the possibility of a novel autoimmune disorder.
109 | ","Follow-up Note","Butterfly Wing Syndrome"
110 | 21098,76543,"2024-01-21","09:45:00","EPR:
111 | Emergency department note for a 28-year-old male presenting with acute respiratory distress and bizarre neuropsychiatric symptoms.
112 | Preliminary investigations inconclusive.
113 | Suspected rare autoimmune encephalitis with respiratory involvement.
114 | Immunotherapy initiated, and neurology and pulmonology consulted for collaborative management.
115 | ","Emergency Note","Autoimmune Encephalitis with Respiratory Distress"
116 | 67890,32109,"2024-01-21","16:10:00","EPR:
117 | Psychiatric evaluation for a 32-year-old female with sudden-onset obsessive-compulsive behaviors, including a compulsion to count objects in prime numbers.
118 | No history of psychiatric illness.
119 | Neurology and psychiatry consultations in progress for consideration of a rare neurodevelopmental disorder.
120 | ","Psychiatric Note","Prime Number Obsessive-Compulsive Disorder"
121 | 12345,65432,"2024-01-21","13:00:00","EPR:
122 | Consultation note for a 50-year-old male with chronic abdominal pain and gastrointestinal bleeding.
123 | Initial investigations inconclusive.
124 | Gastroenterology and hematology consulted for further evaluation of a suspected rare vascular malformation disorder affecting the gastrointestinal tract.
125 | ","Consultation Note","Gastrointestinal Vascular Malformation Syndrome"
--------------------------------------------------------------------------------
/tests/medcat/resources/vocab.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/working_with_cogstack/3e5ec5d43f72f01bcf42c90ae6fc6fbe74c965d1/tests/medcat/resources/vocab.dat
--------------------------------------------------------------------------------
/update.py:
--------------------------------------------------------------------------------
1 | # Script to update the working_with_cogstack repo from the GitHub main branch
2 | import subprocess
3 |
4 | # Command to stash local changes
5 | stash_command = ["git", "stash", "save", "Stashing local changes"]
6 |
7 | # Command to list changes in the stash
8 | list_changes_command = ["git", "stash", "show", "-p", "--name-only"]
9 |
10 | # Command to perform a git pull
11 | pull_command = ["git", "pull"]
12 |
13 | # Replace this with the actual paths of the files you want to restore
14 | files_to_restore = ["credentials.py"]
15 |
16 | try:
17 | # Run the command to stash local changes
18 | subprocess.run(stash_command, check=True)
19 |
20 | print("Local changes stashed.")
21 |
22 |     # List the changes in the stash.
23 |     # NOTE: check_output raises CalledProcessError (handled below) when there
24 |     # are no stash entries, which is how an empty stash is detected here.
25 | 
26 |     changes_output = subprocess.check_output(list_changes_command, text=True)
27 |
28 | print("Changes in the stash:")
29 | print(changes_output)
30 |     if input(f"Only {', '.join(files_to_restore)} will be preserved.\nAre you sure you want to continue? (y/n)") == 'y':
31 | # Run the command to perform a git pull
32 | subprocess.run(pull_command, check=True)
33 |
34 | print("Pull complete.")
35 |
36 | # Run the command to restore the specific file
37 | for file_to_restore in files_to_restore:
38 | # Command to restore a specific file from the stash
39 | subprocess.run(["git", "checkout", "stash@{0}", "--", file_to_restore], check=True)
40 |
41 | print(f"File {file_to_restore} restored from stash.")
42 | else:
43 | print("Operation cancelled.")
44 |
45 | except subprocess.CalledProcessError as e:
46 | if e.returncode == 1:
47 | print("No stash entries found. Continuing with git pull.")
48 | # Run the command to perform a git pull
49 | subprocess.run(pull_command, check=True)
50 | else:
51 | print("An error occurred:")
52 | print(e)
53 |
54 |
--------------------------------------------------------------------------------
/utils/clinical_note_splitter.py:
--------------------------------------------------------------------------------
1 | # This script is specific to certain hospital sites and is not part of the main repository.
2 | import regex
3 | import logging
4 |
5 |
6 | def normalize_date(date, id_, start, end):
7 | """Normalizes different dates encountered in the clinical notes.
8 | Current accepted formats:
9 |         28 Feb 2013 04:50
10 | Thu 28 Feb 2013 04:50
11 | 28-Feb-2013 04:50
12 | Output:
13 | 28 Feb 2013 04:50
14 | """
15 |
16 | if '-' in date:
17 | date = date.replace("-", " ").strip()
18 | elif date.strip()[0].isalpha():
19 | date = date[date.index(' '):].strip()
20 | elif date.strip()[0].isnumeric():
21 | # all good
22 | date = date.strip()
23 | else:
24 | logging.warning("Unsupported date format: %s for id: %s with start: %s, end: %s", date, id_, start, end)
25 | return None
26 |
27 | return date
28 |
29 |
30 | def split_one_note(id_, text):
31 | """Splits the text of one note by date.
32 |
33 | Returns:
34 | List[Dict]:
35 |             Returns a list of dictionaries in the format:
36 |                 {'start': <start>,
37 |                  'end': <end>,
38 |                  'text': <text>, 'date': <date>}
39 | """
40 | r = r'\n\w{0,5}\s*\d{1,2}(\s|-)[a-zA-Z]{3,5}(\s|-)\d{4}\s+\d{2}\:\d{2}'
41 | dates = regex.finditer(r, text)
42 | start = 0
43 | end = -1
44 | split_note = []
45 | previous_date = None
46 |
47 | for date in dates:
48 | if start == 0:
49 | start = date.span()[0]
50 | previous_date = date.captures()[0]
51 | elif previous_date is None or date.captures()[0] != previous_date:
52 | end = date.span()[0]
53 | note_text = text[start:end]
54 | if 'entered on -' in note_text.lower():
55 | if len(regex.findall(r'entered on -', note_text)) > 1:
56 | logging.warning("Possible problems for span with start: %s and end: %s for note with id: %s", start, end, id_)
57 | split_note.append({'start': start, 'end': end, 'text': note_text, 'date': normalize_date(previous_date, id_, start, end)})
58 | start = end
59 | previous_date = date.captures()[0]
60 | # Add the last note
61 | if previous_date is not None and 'entered on -' in text[start:].lower():
62 | split_note.append({'start': start, 'end': len(text), 'text': text[start:], 'date': normalize_date(previous_date, id_, start, len(text))})
63 | else:
64 | logging.warning("No date/entered-on detected for id: %s wth start: %s, end: %s and text:\n%s...", id_, start, end, text[0:300])
65 |
66 | return split_note
67 |
68 |
69 | def split_clinical_notes(clinical_notes):
70 | """Splits clinical notes.
71 |
72 | Args:
73 | clinical_notes(dict):
74 |                 Dictionary in the form {<id>: <text>, ...}.
75 |
76 | Returns:
77 | Dict:
78 | The split notes.
79 | """
80 | split_notes = {}
81 | for id_text, text in clinical_notes.items():
82 | split_notes[id_text] = split_one_note(id_text, text)
83 | return split_notes
84 |
--------------------------------------------------------------------------------
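A small usage sketch for the splitter above; the note text is synthetic, shaped so the date lines and the 'entered on -' marker match what split_one_note looks for, and utils/ is assumed to be importable:

from clinical_note_splitter import split_clinical_notes

notes = {
    "note-1": (
        "\n28 Feb 2013 04:50\n"
        "Patient reviewed on the ward. entered on - SYSTEM\n"
        "\n01-Mar-2013 09:15\n"
        "Stable overnight, plan unchanged. entered on - SYSTEM\n"
    ),
}

for part in split_clinical_notes(notes)["note-1"]:
    # each part is {'start': <start>, 'end': <end>, 'text': <text>, 'date': <date>}
    print(part["date"], part["start"], part["end"])

The second entry's date comes back as "01 Mar 2013 09:15", illustrating the hyphen handling in normalize_date.
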
/utils/ethnicity_map.py:
--------------------------------------------------------------------------------
1 | # Mapped to the top level of the 2001 NHS Data Dictionary; https://datadictionary.nhs.uk/data_elements/ethnic_category.html
2 | ethnicity_map = {'Algerian': 'Black',
3 | 'Any Other Group': 'Other',
4 | 'Asian and Chinese': 'Asian',
5 | 'Bangladeshi': 'Asian',
6 | 'Black African': 'Black',
7 | 'Black British': 'Black',
8 | 'British': 'White',
9 | 'Caribbean': 'Black',
10 | 'Chinese': 'Asian',
11 | 'Cypriot (Part nt st)': 'White',
12 | 'Ecuadorian': 'Other',
13 | 'English': 'White',
14 | 'Ethiopian': 'Black',
15 | 'Filipino': 'Asian',
16 | 'Ghanaian': 'Black',
17 | 'Greek Cypriot': 'White',
18 | 'Indian/British India': 'Asian',
19 | 'Iranian': 'Other',
20 | 'Italian': 'White',
21 | 'Mixed Black': 'Black',
22 | 'Mixed Caribbean': 'Black',
23 | 'Nigerian': 'Black',
24 | 'Not Given': 'Unknown',
25 | 'Not Specified': 'Unknown',
26 | 'Not Stated': 'Unknown',
27 | 'OTHER ASIAN BACKGROU': 'Asian',
28 | 'Other Asian Unspecif': 'Asian',
29 | 'OTHER BLACK BACKGROU': 'Black',
30 | 'Other Black Unspecif': 'Black',
31 | 'Other Ethnic Group': 'Other',
32 | 'Other Latin American': 'Other',
33 | 'OTHER WHITE BACK GRO': 'White',
34 | 'Other White Unspecif': 'White',
35 | 'Other White/Mixed Eu': 'White',
36 | 'Pakistani/British Pa': 'Asian',
37 | 'Portuguese': 'White',
38 | 'Somali': 'Black',
39 | 'Spanish': 'White',
40 | 'Sri Lankan': 'Asian',
41 | 'Sudanese': 'Black',
42 | 'Turkish': 'Other',
43 | 'Ugandan': 'Black',
44 | 'Vietnamese': 'Asian',
45 | 'White Irish': 'White',
46 | 'Former USSR Rep': 'White',
47 | 'POLISH': 'White',
48 | 'Iraqi': 'Other',
49 | 'Albanian': 'Other',
50 | 'Columbian': 'Other',
51 | 'Scottish': 'White',
52 | 'Not stated': 'Unknown',
53 | 'OTHER MIXED BACKGROU': 'Mixed',
54 | 'Welsh': 'White',
55 | 'British Asian': 'Asian',
56 | 'Caribbean Asian': 'Asian',
57 | 'Eritrean': 'Black',
58 | 'Turkish Cypriot': 'Other',
59 | 'Sinhalese': 'Asian',
60 | 'White and Asian': 'Asian',
61 | 'Other Mixed': 'Mixed',
62 | 'Mixed Asian': 'Asian',
63 | 'Greek': 'White',
64 | 'Arab': 'Other',
65 | 'MULTIPLE CODES': 'MULTIPLE CODES',
66 | 'Irish': 'White',
67 | 'Japanese': 'Asian',
68 | 'Middle East': 'Other',
69 | 'Croatian': 'White',
70 | 'Black and Asian': 'Mixed',
71 | 'Black and White': 'Mixed'}
72 |
73 | # Mapped to the bottom level of the 2001 NHS Data Dictionary; https://datadictionary.nhs.uk/data_elements/ethnic_category.html
74 | ethnicity_map_detail = {'Algerian': 'Black or Black British - African',
75 | 'Any Other Group': 'Other Ethnic Groups - Any other ethnic group',
76 | 'Asian and Chinese': 'Other Ethnic Groups - Chinese',
77 |                         'Bangladeshi': 'Asian or Asian British - Bangladeshi',
78 | 'Black African': 'Black or Black British - African',
79 | 'Black British': 'Black or Black British - Any Other Black background',
80 | 'British': 'White - British',
81 | 'Caribbean': 'Black or Black British - Caribbean',
82 | 'Chinese': 'Other Ethnic Groups - Chinese',
83 | 'Cypriot (Part nt st)': 'White - Any other White background',
84 | 'Ecuadorian': 'Other Ethnic Groups - Any other ethnic group',
85 | 'English': 'White - British',
86 | 'Ethiopian': 'Black or Black British - African',
87 | 'Filipino': 'Asian or Asian British - Any other Asian background',
88 | 'Ghanaian': 'Black or Black British - African',
89 | 'Greek Cypriot': 'White - Any other White background',
90 | 'Indian/British India': 'Asian or Asian British - Indian',
91 | 'Iranian': 'Other Ethnic Groups - Any other ethnic group',
92 | 'Italian': 'White - Any other White background',
93 | 'Mixed Black': 'Black or Black British - Any other Black background',
94 | 'Mixed Caribbean': 'Black or Black British - Caribbean',
95 | 'Nigerian': 'Black or Black British - African',
96 | 'Not Given': 'Not stated',
97 | 'Not Specified': 'Not stated',
98 | 'Not Stated': 'Not stated',
99 | 'OTHER ASIAN BACKGROU': 'Asian or Asian British - Any other Asian background',
100 | 'Other Asian Unspecif': 'Asian or Asian British - Any other Asian background',
101 | 'OTHER BLACK BACKGROU': 'Black or Black British - Any Other Black background',
102 | 'Other Black Unspecif': 'Black or Black British - Any Other Black background',
103 | 'Other Ethnic Group': 'Other Ethnic Groups - Any other ethnic group',
104 | 'Other Latin American': 'Other Ethnic Groups - Any other ethnic group',
105 | 'OTHER WHITE BACK GRO': 'White - Any other White background',
106 | 'Other White Unspecif': 'White - Any other White background',
107 | 'Other White/Mixed Eu': 'White - Any other White background',
108 | 'Pakistani/British Pa': 'Asian or Asian British - Pakistani',
109 | 'Portuguese': 'White - Any other White background',
110 | 'Somali': 'Black or Black British - African',
111 | 'Spanish': 'White - Any other White background',
112 | 'Sri Lankan': 'Asian or Asian British - Any other Asian background',
113 | 'Sudanese': 'Black or Black British - African',
114 | 'Turkish': 'Other Ethnic Groups - Any other ethnic group',
115 | 'Ugandan': 'Black or Black British - African',
116 | 'Vietnamese': 'Other Ethnic Groups - Any other ethnic group',
117 | 'White Irish': 'White - Irish',
118 | 'Former USSR Rep': 'White - Any other White background',
119 | 'POLISH': 'White - Any other White background',
120 | 'Iraqi': 'Other Ethnic Groups - Any other ethnic group',
121 | 'Albanian': 'White - Any other White background',
122 | 'Columbian': 'Other Ethnic Groups - Any other ethnic group',
123 | 'Scottish': 'White - British',
124 | 'Not stated': 'Not stated',
125 | 'OTHER MIXED BACKGROU': 'Mixed - Any other mixed background',
126 | 'Welsh': 'White - British',
127 | 'British Asian': 'Asian or Asian British - Any other Asian background',
128 | 'Caribbean Asian': 'Mixed - Any other mixed background',
129 | 'Eritrean': 'Black or Black British - African',
130 | 'Turkish Cypriot': 'Other Ethnic Groups - Any other ethnic group',
131 | 'Sinhalese': 'Asian or Asian British - Any other Asian background',
132 | 'White and Asian': 'Mixed - White and Asian',
133 | 'Other Mixed': 'Mixed - Any other mixed background',
134 | 'Mixed Asian': 'Mixed - Any other mixed background',
135 | 'Greek': 'White - Any other White background',
136 | 'Arab': 'Other Ethnic Groups - Any other ethnic group',
137 | 'MULTIPLE CODES': 'MULTIPLE CODES',
138 | 'Irish': 'White - Irish',
139 | 'Japanese': 'Other Ethnic Groups - Any other ethnic group',
140 | 'Middle East': 'Other Ethnic Groups - Any other ethnic group',
141 | 'Croatian': 'White - Any other White background',
142 | 'Black and Asian': 'Mixed - White and Asian',
143 | 'Black and White': 'Mixed - Any other mixed background'}
144 |
--------------------------------------------------------------------------------
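For completeness, a hypothetical example of applying the two lookup tables above to raw ethnicity strings; the 'Unknown' fallback for unmapped values is an assumption, not something ethnicity_map.py defines:

from ethnicity_map import ethnicity_map, ethnicity_map_detail  # assumes utils/ is importable

raw_values = ["Black African", "POLISH", "Not Given", "Unmapped value"]  # made-up sample
for raw in raw_values:
    top_level = ethnicity_map.get(raw, "Unknown")      # e.g. 'Black', 'White', 'Unknown'
    detail = ethnicity_map_detail.get(raw, "Unknown")  # e.g. 'Black or Black British - African'
    print(f"{raw}: {top_level} / {detail}")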