├── .dockerignore ├── LICENSE ├── README.md ├── compute_stats.py ├── dataset ├── README.md └── original │ ├── celiac │ └── README.md │ ├── cervix │ └── README.md │ ├── colon │ └── README.md │ └── lung │ └── README.md ├── docker-compose.yml ├── docker-sket_server-config ├── docker_sket_cpu │ └── Dockerfile ├── docker_sket_gpu │ └── Dockerfile └── requirements.txt ├── evaluate_sket.py ├── examples ├── test.xlsx ├── test_multiple_reports.json └── test_single_report.json ├── ground_truth └── README.md ├── manage.py ├── outputs └── README.md ├── requirements.txt ├── run_med_sket.py ├── run_sket.py ├── sket ├── __init__.py ├── negex │ ├── __init__.py │ ├── negation.py │ ├── termsets.py │ └── test.py ├── nerd │ ├── __init__.py │ ├── nerd.py │ ├── normalizer.py │ └── rules │ │ ├── cin_mappings.txt │ │ ├── dysplasia_mappings.txt │ │ └── rules.txt ├── ont_proc │ ├── __init__.py │ ├── ontology │ │ └── examode.owl │ ├── ontology_processing.py │ └── rules │ │ └── hierarchy_relations.txt ├── rdf_proc │ ├── __init__.py │ └── rdf_processing.py ├── rep_proc │ ├── __init__.py │ ├── report_processing.py │ └── rules │ │ └── report_fields.txt ├── sket.py └── utils │ ├── __init__.py │ └── utils.py └── sket_server ├── sket_rest_app ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── admin.cpython-38.pyc │ ├── admin.cpython-39.pyc │ ├── apps.cpython-38.pyc │ ├── apps.cpython-39.pyc │ ├── models.cpython-38.pyc │ ├── models.cpython-39.pyc │ ├── urls.cpython-38.pyc │ ├── urls.cpython-39.pyc │ ├── views.cpython-38.pyc │ └── views.cpython-39.pyc ├── admin.py ├── apps.py ├── migrations │ ├── __init__.py │ └── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── __init__.cpython-39.pyc ├── models.py ├── tests.py ├── urls.py └── views.py └── sket_rest_config ├── __init__.py ├── __pycache__ ├── __init__.cpython-38.pyc ├── __init__.cpython-39.pyc ├── settings.cpython-38.pyc ├── settings.cpython-39.pyc ├── urls.cpython-38.pyc ├── urls.cpython-39.pyc ├── wsgi.cpython-38.pyc └── wsgi.cpython-39.pyc ├── asgi.py ├── config.json ├── settings.py ├── urls.py └── wsgi.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # ignore outputs and dataset (i.e., volume) when building image 2 | outputs 3 | dataset 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ExaNLP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SKET 2 | This repository contains the source code for the Semantic Knowledge Extractor Tool (SKET).
SKET is an unsupervised hybrid knowledge extraction system that combines a rule-based expert system with pre-trained machine learning models to extract cancer-related information from pathology reports. 3 | 4 | ## Installation 5 | 6 | CAVEAT: the package has been tested using Python 3.7 and 3.8 on unix-based and win64 systems. There is no guarantee that it works with different configurations. 7 | 8 | Clone this repository: 9 | 10 | ```bash 11 | git clone https://github.com/ExaNLP/sket.git 12 | ``` 13 | 14 | Install all the requirements: 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | Then install any ```core``` model from scispacy v0.3.0 (default is ```en_core_sci_sm```), e.g.: 21 | 22 | ```bash 23 | pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz 24 | ``` 25 | 26 | The required scispacy models are available at: https://github.com/allenai/scispacy/tree/v0.3.0 27 | 28 | ## Datasets 29 | 30 | Users can go into the ```dataset``` folder and place their datasets within the corresponding use case folders. Use cases are: Celiac Disease (celiac), Colon Cancer (colon), Cervix Uterine Cancer (cervix), and Lung Cancer (lung). 31 | 32 | Datasets can be provided in two formats: 33 | 34 | ### XLS Format 35 | 36 | Users can provide ```.xls``` or ```.xlsx``` files where the first row consists of column headers (i.e., fields) and the remaining rows contain the data. 37 | 38 | ### JSON Format 39 | 40 | Users can provide ```.json``` files structured in two ways:<br>
41 | 42 | As a dict containing a ```reports``` field consisting of multiple key-value reports; 43 | 44 | ```bash 45 | {'reports': [{k: v, ...}, ...]} 46 | ``` 47 | 48 | As a dict containing a single key-value report. 49 | 50 | ```bash 51 | {k: v, ...} 52 | ``` 53 | 54 | SKET concatenates data from all the fields before translation. Users can alter this behavior by filling ```./sket/rep_proc/rules/report_fields.txt``` with target fields, one per line. Users can also provide a custom file to SKET, as long as it contains one field per line (more on this below). 55 | 56 | Users can provide special headers that are treated differently from regular text by SKET. These fields are:<br>
57 | ```id```: when specified, the ```id``` field is used to identify the corresponding report. Otherwise, a ```uuid``` is used. 58 | ```gender```: when specified, the ```gender``` field is used to provide the patient's information within RDF graphs. Otherwise, ```gender``` is set to None. 59 | ```age```: when specified, the ```age``` field is used to provide the patient's information within RDF graphs. Otherwise, ```age``` is set to None. See ```./examples/test_single_report.json``` for a sample report using these fields. 60 | 61 | ## Dataset Statistics 62 | 63 | Users can compute dataset statistics to understand the distribution of concepts extracted by SKET for each use case. For instance, if a user wants to compute statistics for Colon Cancer, they can run: 64 | 65 | ```bash 66 | python compute_stats.py --outputs ./outputs/concepts/refined/colon/*.json --use_case colon 67 | ``` 68 | 69 | ## Pretrain 70 | 71 | SKET can be deployed with different pretrained models, i.e., fastText and BERT. In our experiments, we employed the [BioWordVec](https://github.com/ncbi-nlp/BioSentVec) fastText model and the [Bio + Clinical BERT model](https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT).<br>
72 | BioWordVec can be downloaded from https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.bin
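For instance, on unix-based systems the model can be fetched from the command line (a sketch assuming ```wget``` is available; note that the file is several gigabytes):

```bash
wget https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.bin
```

The downloaded file path can then be passed to SKET via the fastText parameter (```biofast```/```--fasttext_model```, described below).<br>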
73 | The Bio + Clinical BERT model can be automatically downloaded at run time by setting the ```biobert``` SKET parameter to 'emilyalsentzer/Bio_ClinicalBERT'. 74 | 75 | Users can pass different pretrained models depending on their preferences. 76 | 77 | 78 | ## Usage 79 | 80 | Users can deploy SKET using ```run_med_sket.py```. We release within ```./examples``` three sample datasets that can be used as toy examples to play with SKET. SKET can be deployed with different configurations and using different combinations of matching models. 81 | 82 | Furthermore, SKET provides a ```threshold``` parameter that controls the strictness of the entity linking component. The higher the ```threshold```, the more precise the model -- at the expense of recall -- and vice versa. Users can tune this parameter to obtain the desired trade-off between precision and recall. Note that ```threshold``` must always be lower than or equal to the number of considered matching models; otherwise, the entity linking component does not return any concept. For instance, the second example below uses three matching models, so its threshold of 2.5 satisfies this constraint. 83 | 84 | The available matching models, in the form of SKET parameters, are:<br>
85 | ```biow2v```: the ScispaCy pretrained word embeddings. Set this parameter to ```True``` to use them.
86 | ```biofast```: the fastText model. Set this parameter to ```/path/to/fastText/file``` to use fastText.
87 | ```biobert```: the BERT model. Set this parameter to ```bert-name``` to use BERT (see https://huggingface.co/transformers/pretrained_models.html for model IDs).
88 | ```str_match```: the Gestalt Pattern Matching (GPM) model. Set this parameter to ```True``` to use GPM. 89 | 90 | When using BERT, users can also set the ```gpu``` parameter to the corresponding GPU number to speed up SKET execution. 91 | 92 | For instance, a user can run the following script to obtain concepts, labels, and RDF graphs on the test.xlsx sample dataset: 93 | 94 | ```bash 95 | python run_med_sket.py 96 | --src_lang it 97 | --use_case colon 98 | --spacy_model en_core_sci_sm 99 | --w2v_model 100 | --string_model 101 | --thr 2.0 102 | --store 103 | --dataset ./examples/test.xlsx 104 | ``` 105 | 106 | or, if a user also wants to use BERT with GPU support, they can run the following script: 107 | 108 | ```bash 109 | python run_med_sket.py 110 | --src_lang it 111 | --use_case colon 112 | --spacy_model en_core_sci_sm 113 | --w2v_model 114 | --string_model 115 | --bert_model emilyalsentzer/Bio_ClinicalBERT 116 | --gpu 0 117 | --thr 2.5 118 | --store 119 | --dataset ./examples/test.xlsx 120 | ``` 121 | 122 | In both cases, we set ```src_lang``` to ```it```, as the source language of the reports is Italian. Therefore, SKET needs to translate reports from Italian to English before performing information extraction. 123 | 124 | ## Docker 125 | 126 | SKET can also be deployed as a Docker container -- thus avoiding the need to install its dependencies directly on the host machine. Two Docker images can be built: sket_cpu and sket_gpu.<br>
127 | For ```sket_gpu```, NVIDIA drivers have to be already installed within the host machine. Users can refer to the NVIDIA [user-guide](https://docs.nvidia.com/deeplearning/frameworks/user-guide/#nvcontainers) for more information. 128 | 129 | Instructions on how to build and run sket images are reported below. If you already have [docker](https://docs.docker.com/engine/reference/commandline/docker/) installed on your machine, you can skip the first step. 130 | 131 | 1) Install Docker. In this regard, check out the correct [installation procedure](https://docs.docker.com/get-docker/) for your platform. 132 | 133 | 2) Install docker-compose. In this regard, check the correct [installation procedure](https://docs.docker.com/compose/install/) for your platform. 134 | 135 | 3) Check that the Docker daemon (i.e., ```dockerd```) is up and running. 136 | 137 | 4) Download or clone the [sket](https://github.com/ExaNLP/sket) repository. 138 | 139 | 5) In ```sket_server/sket_rest_config```, the ```config.json``` file allows you to configure the sket instance. Edit this file to set the following parameters: ```w2v_model```, ```fasttext_model```, ```bert_model```, ```string_model```, ```gpu```, and ```thr```, where ```thr``` stands for *similarity threshold* and its default value is 0.9 (a sample configuration is shown after step 6 below). 140 | 141 | 6) Depending on the Docker image of interest, follow one of the two procedures below:<br>
142 | 6a) SKET CPU-only: from the [sket](https://github.com/ExaNLP/sket/) repository folder, type: ```docker-compose run --service-ports sket_cpu```<br>
143 | 6b) SKET GPU-enabled: from the [sket](https://github.com/ExaNLP/sket/) repository folder, type: ```docker-compose run --service-ports sket_gpu```<br>
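As a reference for step 5, a minimal ```config.json``` might look as follows. The values shown are illustrative assumptions mirroring the SKET parameters described in the Usage section, not mandated defaults (only the 0.9 default for ```thr``` is stated above):

```bash
{
  "w2v_model": true,
  "fasttext_model": null,
  "bert_model": "emilyalsentzer/Bio_ClinicalBERT",
  "string_model": true,
  "gpu": null,
  "thr": 0.9
}
```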
144 | 145 | 7) When the image is ready, the sket server runs at http://0.0.0.0:8000 if you run ```sket_cpu```, and at http://0.0.0.0:8001 if you run ```sket_gpu```. 146 | 147 | 8) The annotation of medical reports can be performed with two types of POST requests:<br>
148 | 8a) If you want to store the annotations in the ```outputs``` directory, the URL to make the request to is: ```http://0.0.0.0:8000/annotate/<use_case>/<language>```, where ```<use_case>``` and ```<language>``` are the use case and the language (identified using an [ISO 639-1 Code](https://www.loc.gov/standards/iso639-2/php/code_list.php)) of your reports, respectively.<br>
149 |
Request example: 150 | ```bash 151 | curl -H "Content-Type: multipart/form-data" -F "data=@path/to/examples/test.xlsx" http://0.0.0.0:8000/annotate/colon/it 152 | ``` 153 | 154 | where ```path/to/examples``` is the path to the examples folder. With this type of request, labels and concepts are stored in ```.json``` files, while graphs are stored in ```.json```, ```.n3```, ```.ttl```, and ```.trig``` files.<br>
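For reference, the stored files end up under the ```outputs``` volume, which SKET populates at run time with one directory per output type (see ```outputs/README.md```). An indicative sketch of the layout; the exact sub-folders (e.g., ```./outputs/concepts/refined/colon/``` in the statistics example above) may vary:

```
outputs/
├── concepts/
├── labels/
└── graphs/
```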
155 | If you want to store graphs exclusively in one file format among ```.n3```, ```.ttl```, and ```.trig```, append the desired format to the URL after the language: ```/n3``` to store graphs in ```.n3``` format, ```/turtle``` for the ```.ttl``` format, and ```/trig``` for the ```.trig``` format.<br>
156 |
Request example: 157 | ```bash 158 | curl -H "Content-Type: multipart/form-data" -F "data=@path/to/examples/test.xlsx" http://0.0.0.0:8000/annotate/colon/it/turtle 159 | ``` 160 | 161 | where ```path/to/examples``` is the path to the examples folder.

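The cURL requests above can also be issued programmatically. Below is a minimal sketch using the Python ```requests``` package; the package is an assumption on our side, as it is not among SKET's requirements and must be installed separately:

```python
import requests  # assumption: third-party package, install via pip

# POST a sample dataset and ask for graphs in Turtle format,
# mirroring the cURL example above
url = "http://0.0.0.0:8000/annotate/colon/it/turtle"
with open("path/to/examples/test.xlsx", "rb") as f:
    # the multipart field must be named "data", as in -F "data=@..."
    response = requests.post(url, files={"data": f})
print(response.status_code)
```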
162 | 8b) If you want to use the labels, the concepts, or the graphs returned by sket without saving them, the URL to make the request to is: ```http://0.0.0.0:8000/annotate/<use_case>/<language>/<output>```, where ```<use_case>``` and ```<language>``` are the use case and the language (identified using an [ISO 639-1 Code](https://www.loc.gov/standards/iso639-2/php/code_list.php)) of your reports, respectively, and ```<output>``` is ```labels```, ```concepts```, or ```graphs```.<br>
163 |
Request example: 164 | ```bash 165 | curl -H "Content-Type: multipart/form-data" -F "data=@path/to/examples/test.xlsx" http://0.0.0.0:8000/annotate/colon/it/labels 166 | ``` 167 | where ```path/to/examples``` is the path to the examples folder.
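The exact response schema is not documented in this README. As an illustration only, consistent with the colon class names used in ```evaluate_sket.py``` (```cancer```, ```hgd```, ```lgd```, ```hyperplastic```, ```ni```), a labels response for a single report might resemble:

```
{"test": {"cancer": 0, "hgd": 0, "lgd": 1, "hyperplastic": 0, "ni": 0}}
```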
168 | If you want your request to return a graph, the request must also include the graph format. Hence, the URL becomes: ```http://0.0.0.0:8000/annotate/<use_case>/<language>/graphs/<format>```, where ```<format>``` is one format among ```turtle```, ```n3```, and ```trig```.<br>
169 | ```bash 170 | curl -H "Content-Type: multipart/form-data" -F "data=@path/to/examples/test.xlsx" http://0.0.0.0:8000/annotate/colon/it/graphs/turtle 171 | ``` 172 | where ```path/to/examples``` is the path to the examples folder.

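Reports can also be embedded directly in the request body instead of being uploaded as a file, as described in step 9 below. A sketch with a single inline report, whose payload mirrors ```./examples/test_single_report.json```:

```bash
curl -H "Content-Type: application/json" -d '{"diagnosis": "adenocarcinoma con displasia lieve, focalmente severa.", "materials": "biopsia polipo.", "id": "test"}' http://0.0.0.0:8000/annotate/colon/it/labels
```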
173 | 174 | 9) If you want to embed your medical reports in the request, change the application type and set ```-H "Content-Type: application/json"```; then, instead of ```-F "data=@..."```, put ```-d '{"reports":[{},...,{}]}'``` if you have multiple reports, or ```-d '{"k":"v",...}'``` if you have a single report (see ```./examples/test_multiple_reports.json``` and ```./examples/test_single_report.json``` for reference payloads). 175 | 176 | 10) If you want to build the images again, from the project folder type ```docker-compose down --rmi local```. Pay attention: this command removes all the created images (both CPU and GPU). If you want to remove only one image between CPU and GPU, see the [docker image documentation](https://docs.docker.com/engine/reference/commandline/image/). Finally, repeat steps 5-8. 177 | 178 | Regarding SKET GPU-enabled, the corresponding Dockerfile (located at ```docker-sket_server-config/docker_sket_gpu/Dockerfile```) is based on the ```nvidia/cuda:11.0-devel``` image. Users are encouraged to change the NVIDIA/CUDA image within the Dockerfile depending on the NVIDIA drivers installed in their host machine. NVIDIA images can be found [here](https://hub.docker.com/r/nvidia/cuda/tags?page=1&ordering=last_updated). 179 | 180 | ## Cite 181 | 182 | If you use or extend our work, please cite the following: 183 | 184 | ``` 185 | @article{jpi_sket-2022, 186 | title = "Empowering Digital Pathology Applications through Explainable Knowledge Extraction Tools", 187 | author = "S. Marchesin and F. Giachelle and N. Marini and M. Atzori and S. Boytcheva and G. Buttafuoco and F. Ciompi and G. M. Di Nunzio and F. Fraggetta and O. Irrera and H. Müller and T. Primov and S. Vatrano and G. Silvello", 188 | journal = "Journal of Pathology Informatics", 189 | year = "2022", 190 | url = "https://www.sciencedirect.com/science/article/pii/S2153353922007337", 191 | doi = "10.1016/j.jpi.2022.100139", 192 | pages = "100139" 193 | } 194 | ``` 195 | 196 | 197 | ``` 198 | @article{npj_dig_med-2022, 199 | title = "Unleashing the potential of digital pathology data by training computer-aided diagnosis models without human annotations", 200 | author = "N. Marini and S. Marchesin and S. Otálora and M. Wodzinski and A. Caputo and M. van Rijthoven and W. Aswolinskiy and J. M. Bokhorst and D. Podareanu and E. Petters and S. Boytcheva and G. Buttafuoco and S. Vatrano and F. Fraggetta and J. van der Laak and M. Agosti and F. Ciompi and G. Silvello and H. Müller and M. 
Atzori", 201 | journal = "npj Digital Medicine", 202 | year = "2022", 203 | url = "http://dx.doi.org/10.1038/s41746-022-00635-4", 204 | doi = "10.1038/s41746-022-00635-4", 205 | volume = "5", 206 | number = "1", 207 | pages = "1--18" 208 | } 209 | ``` 210 | -------------------------------------------------------------------------------- /compute_stats.py: -------------------------------------------------------------------------------- 1 | import json 2 | import glob 3 | import argparse 4 | import numpy as np 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--outputs', default='./outputs/concepts/refined/aoec/colon/*.json', type=str, help='SKET results file.') 8 | parser.add_argument('--use_case', default='colon', choices=['colon', 'cervix', 'lung', 'celiac'], help='Considered use-case.') 9 | args = parser.parse_args() 10 | 11 | 12 | def main(): 13 | # read SKET results 14 | if '*.json' == args.outputs.split('/')[-1]: # read files 15 | # read file paths 16 | rsfps = glob.glob(args.outputs) 17 | # set dict 18 | rs = {} 19 | for rsfp in rsfps: 20 | with open(rsfp, 'r') as rsf: 21 | rs.update(json.load(rsf)) 22 | else: # read file 23 | with open(args.outputs, 'r') as rsf: 24 | rs = json.load(rsf) 25 | 26 | stats = [] 27 | # loop over reports and store size 28 | for rid, rdata in rs.items(): 29 | stats.append(sum([len(sem_data) for sem_cat, sem_data in rdata.items()])) 30 | # convert into numpy 31 | stats = np.array(stats) 32 | print('size: {}'.format(np.size(stats))) 33 | print('max: {}'.format(np.max(stats))) 34 | print('min: {}'.format(np.min(stats))) 35 | print('mean: {}'.format(np.mean(stats))) 36 | print('std: {}'.format(np.std(stats))) 37 | print('tot: {}'.format(np.sum(stats))) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | 3 | Please use this folder to store the datasets to process with SKET. 4 | -------------------------------------------------------------------------------- /dataset/original/celiac/README.md: -------------------------------------------------------------------------------- 1 | # Celiac datasets 2 | 3 | Put here datasets containing Celiac Disease pathology reports. 4 | -------------------------------------------------------------------------------- /dataset/original/cervix/README.md: -------------------------------------------------------------------------------- 1 | # Cervix datasets 2 | 3 | Put here datasets containing Cervix Uterine Cancer pathology reports. 4 | -------------------------------------------------------------------------------- /dataset/original/colon/README.md: -------------------------------------------------------------------------------- 1 | # Colon datasets 2 | 3 | Put here datasets containing Colon Cancer pathology reports. 4 | -------------------------------------------------------------------------------- /dataset/original/lung/README.md: -------------------------------------------------------------------------------- 1 | # Lung datasets 2 | 3 | Put here datasets containing Lung Cancer pathology reports. 4 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2.3" 2 | 3 | services: 4 | sket_cpu: 5 | 6 | build: 7 | context: . 
8 | dockerfile: ./docker-sket_server-config/docker_sket_cpu/Dockerfile 9 | 10 | volumes: 11 | - .:/code 12 | ports: 13 | - "8000:8000" 14 | command: bash -c 'python manage.py runserver 0.0.0.0:8000' 15 | 16 | 17 | sket_gpu: 18 | runtime: nvidia 19 | environment: 20 | - NVIDIA_VISIBLE_DEVICES=all 21 | 22 | build: 23 | context: . 24 | dockerfile: ./docker-sket_server-config/docker_sket_gpu/Dockerfile 25 | 26 | volumes: 27 | - .:/code 28 | ports: 29 | - "8001:8001" 30 | command: bash -c 'python3 manage.py runserver 0.0.0.0:8001' -------------------------------------------------------------------------------- /docker-sket_server-config/docker_sket_cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-buster 2 | ENV PYTHONUNBUFFERED=1 3 | WORKDIR /code 4 | COPY ./docker-sket_server-config/requirements.txt /code/ 5 | RUN pip install --no-cache-dir -r requirements.txt 6 | RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz 7 | COPY . /code/ 8 | 9 | -------------------------------------------------------------------------------- /docker-sket_server-config/docker_sket_gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | # set nvidia version 2 | FROM nvidia/cuda:11.0-devel 3 | 4 | #set up environment 5 | RUN apt-get update && apt-get install --no-install-recommends --no-install-suggests -y curl 6 | RUN apt-get install unzip 7 | RUN apt-get -y install python3.8 8 | RUN apt-get -y install python3-pip 9 | 10 | # set work directory 11 | WORKDIR /code 12 | # copy requirements in work directory 13 | COPY ./docker-sket_server-config/requirements.txt /code/ 14 | # install requirements and scispacy model 15 | RUN pip install --no-cache-dir -r requirements.txt 16 | RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz 17 | 18 | # copy code and config files within work directory 19 | COPY . 
/code/ 20 | # run sket 21 | 22 | -------------------------------------------------------------------------------- /docker-sket_server-config/requirements.txt: -------------------------------------------------------------------------------- 1 | Django>=3.0,<4.0 2 | Owlready2==0.26 3 | negspacy==0.1.9 4 | pandas 5 | torch==1.7.1 6 | numpy 7 | tqdm==4.55.0 8 | rdflib==5.0.0 9 | spacy==2.3.5 10 | textdistance==4.2.0 11 | transformers==4.2.2 12 | roman==3.3 13 | fasttext==0.9.2 14 | pytest==6.2.4 15 | scikit_learn==0.24.2 16 | sentencepiece 17 | openpyxl 18 | djangorestframework 19 | pyparsing==2.4.7 20 | -------------------------------------------------------------------------------- /evaluate_sket.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import glob 4 | import os 5 | import argparse 6 | 7 | from sklearn.metrics import hamming_loss, accuracy_score, classification_report 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--gt', default='./ground_truth/celiac/aoec/celiac_labels_allDS.json', type=str, help='Ground truth file.') 12 | parser.add_argument('--outputs', default='./outputs/labels/aoec/celiac/*.json', type=str, help='SKET results file.') 13 | parser.add_argument('--use_case', default='celiac', choices=['colon', 'cervix', 'lung', 'celiac'], help='Considered use-case.') 14 | parser.add_argument('--hospital', default='aoec', choices=['aoec', 'radboud'], help='Considered hospital.') 15 | parser.add_argument('--debug', default=False, action='store_true', help='Whether to use evaluation for debugging purposes.') 16 | args = parser.parse_args() 17 | 18 | label2class = { 19 | 'cervix': { 20 | 'Normal glands': 'glands_norm', 21 | 'Normal squamous': 'squamous_norm', 22 | 'Cancer - squamous cell carcinoma in situ': 'cancer_scc_insitu', 23 | 'Low grade dysplasia': 'lgd', 24 | 'Cancer - squamous cell carcinoma invasive': 'cancer_scc_inv', 25 | 'High grade dysplasia': 'hgd', 26 | 'Koilocytes': 'koilocytes', 27 | 'Cancer - adenocarcinoma invasive': 'cancer_adeno_inv', 28 | 'Cancer - adenocarcinoma in situ': 'cancer_adeno_insitu', 29 | 'HPV infection present': 'hpv' 30 | }, 31 | 'colon': { 32 | 'Hyperplastic polyp': 'hyperplastic', 33 | 'Cancer': 'cancer', 34 | 'Adenomatous polyp - high grade dysplasia': 'hgd', 35 | 'Adenomatous polyp - low grade dysplasia': 'lgd', 36 | 'Non-informative': 'ni' 37 | }, 38 | 'lung': { 39 | 'No cancer': 'no_cancer', 40 | 'Cancer - non-small cell cancer, adenocarcinoma': 'cancer_nscc_adeno', 41 | 'Cancer - non-small cell cancer, squamous cell carcinoma': 'cancer_nscc_squamous', 42 | 'Cancer - small cell cancer': 'cancer_scc', 43 | 'Cancer - non-small cell cancer, large cell carcinoma': 'cancer_nscc_large' 44 | }, 45 | 'celiac': { 46 | 'Normal': 'normal', 47 | 'Celiac disease': 'celiac_disease', 48 | 'Non-specific duodenitis': 'duodenitis', 49 | } 50 | } 51 | 52 | 53 | def main(): 54 | # create path for debugging 55 | debug_path = './logs/debug/' + args.hospital + '/' + args.use_case + '/' 56 | os.makedirs(os.path.dirname(debug_path), exist_ok=True) 57 | 58 | # read ground-truth 59 | with open(args.gt, 'r') as gtf: 60 | ground_truth = json.load(gtf) 61 | 62 | gt = {} 63 | # prepare ground-truth for evaluation 64 | if args.hospital == 'aoec' or args.use_case == 'celiac': 65 | ground_truth = ground_truth['groundtruths'] 66 | else: 67 | ground_truth = ground_truth['ground_truth'] 68 | for data in ground_truth: 69 | if args.hospital == 'aoec' or args.use_case == 
'celiac': 70 | rid = data['id_report'] 71 | else: 72 | rid = data['report_id_not_hashed'] 73 | 74 | if len(rid.split('_')) == 3 and args.hospital == 'aoec': # contains codeint info not present within new processed reports 75 | rid = rid.split('_') 76 | rid = rid[0] + '_' + rid[2] 77 | 78 | gt[rid] = {label2class[args.use_case][label]: 0 for label in label2class[args.use_case].keys()} 79 | for datum in data['labels']: 80 | label = label2class[args.use_case][datum['label']] 81 | if label in gt[rid]: 82 | gt[rid][label] = 1 83 | # gt name 84 | gt_name = args.gt.split('/')[-1].split('.')[0] 85 | 86 | # read SKET results 87 | if '*.json' == args.outputs.split('/')[-1]: # read files 88 | # read file paths 89 | rsfps = glob.glob(args.outputs) 90 | # set dict 91 | rs = {} 92 | for rsfp in rsfps: 93 | with open(rsfp, 'r') as rsf: 94 | rs.update(json.load(rsf)) 95 | else: # read file 96 | with open(args.outputs, 'r') as rsf: 97 | rs = json.load(rsf) 98 | 99 | sket = {} 100 | # prepare SKET results for evaluation 101 | for rid, rdata in rs.items(): 102 | if args.use_case == 'colon' and args.hospital == 'aoec' and '2ndDS' in args.gt: 103 | rid = rid.split('_')[0] 104 | if args.hospital == 'radboud': 105 | sket[rid] = rdata['labels'] 106 | else: 107 | sket[rid] = rdata 108 | 109 | # fix class order to avoid inconsistencies 110 | rids = list(sket.keys()) 111 | classes = list(sket[rids[0]].keys()) 112 | if args.use_case == 'celiac': 113 | classes.remove('inconclusive') 114 | 115 | # obtain ground-truth and SKET scores 116 | gt_scores = [] 117 | sket_scores = [] 118 | 119 | if args.debug: # open output for debugging 120 | debugf = open(debug_path + gt_name + '.txt', 'w+') 121 | 122 | for rid in gt.keys(): 123 | gt_rscores = [] 124 | sket_rscores = [] 125 | if rid not in sket: 126 | print('skipped gt record: {}'.format(rid)) 127 | continue 128 | if args.debug: 129 | first = True 130 | for c in classes: 131 | #if c != 'inconclusive': 132 | gt_rscores.append(gt[rid][c]) 133 | sket_rscores.append(sket[rid][c]) 134 | if args.debug: # perform debugging 135 | if gt[rid][c] != sket[rid][c]: # store info for debugging purposes 136 | if first: # first occurence 137 | debugf.write('\nReport ID: {}\n'.format(rid)) 138 | first = False 139 | debugf.write(c + ': gt = {}, sket = {}\n'.format(gt[rid][c], sket[rid][c])) 140 | gt_scores.append(gt_rscores) 141 | sket_scores.append(sket_rscores) 142 | 143 | if args.debug: # close output for debugging 144 | debugf.close() 145 | 146 | # convert to numpy 147 | gt_scores = np.array(gt_scores) 148 | sket_scores = np.array(sket_scores) 149 | 150 | # compute evaluation measures 151 | print('Compute evaluation measures') 152 | 153 | # exact match accuracy & hamming loss 154 | print("Accuracy (exact match): {}".format(accuracy_score(gt_scores, sket_scores))) 155 | print("Hamming loss: {}\n".format(hamming_loss(gt_scores, sket_scores))) 156 | 157 | # compute classification report 158 | print("Classification report:") 159 | print(classification_report(y_true=gt_scores, y_pred=sket_scores, target_names=classes)) 160 | 161 | 162 | if __name__ == "__main__": 163 | main() 164 | -------------------------------------------------------------------------------- /examples/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ExaNLP/sket/d9a3fcc42d5f3671dcb2ac6597ea663b9b259433/examples/test.xlsx -------------------------------------------------------------------------------- /examples/test_multiple_reports.json: 
-------------------------------------------------------------------------------- 1 | {"reports": [{"diagnosis": "adenocarcinoma con displasia lieve, focalmente severa. Risultati ottenuti con biopsia al colon.", "materials": "biopsia polipo."}, {"diagnosis": "polipo iperplastico con displasia focalmente severa", "materials": "biopsia retto."}]} -------------------------------------------------------------------------------- /examples/test_single_report.json: -------------------------------------------------------------------------------- 1 | {"diagnosis": "adenocarcinoma con displasia lieve, focalmente severa. Risultati ottenuti con biopsia al colon.", "materials": "biopsia polipo.", "age": 5, "gender": "M", "id": "test"} -------------------------------------------------------------------------------- /ground_truth/README.md: -------------------------------------------------------------------------------- 1 | # Ground truth 2 | 3 | Please put here the ground truth used to evaluate SKET. 4 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | 7 | def main(): 8 | """Run administrative tasks.""" 9 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'sket_server.sket_rest_config.settings') 10 | try: 11 | from django.core.management import execute_from_command_line 12 | except ImportError as exc: 13 | raise ImportError( 14 | "Couldn't import Django. Are you sure it's installed and " 15 | "available on your PYTHONPATH environment variable? Did you " 16 | "forget to activate a virtual environment?" 17 | ) from exc 18 | execute_from_command_line(sys.argv) 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /outputs/README.md: -------------------------------------------------------------------------------- 1 | # Outputs 2 | 3 | This directory contains the different outputs generated by SKET, namely: 4 | 1. Concepts 5 | 2. Labels 6 | 3. Graphs 7 | 8 | Each directory is generated at run-time during the first execution of SKET.
9 | Each output is contained within the corresponding directory. 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Owlready2==0.26 2 | negspacy==0.1.9 3 | pandas==1.0.4 4 | torch==1.7.1 5 | numpy==1.19.5 6 | tqdm==4.55.0 7 | rdflib==5.0.0 8 | spacy==2.3.5 9 | textdistance==4.2.0 10 | transformers==4.2.2 11 | roman==3.3 12 | fasttext==0.9.2 13 | pytest==6.2.4 14 | scikit_learn==0.24.2 15 | sentencepiece==0.1.91 16 | openpyxl==3.0.7 17 | xlrd==2.0.1 18 | pyparsing==2.4.7 19 | -------------------------------------------------------------------------------- /run_med_sket.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import warnings 3 | 4 | from sket.sket import SKET 5 | 6 | warnings.filterwarnings("ignore", message=r"\[W008\]", category=UserWarning) 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--src_lang', default='en', type=str, help='Considered source language.') 10 | parser.add_argument('--use_case', default='celiac', choices=['colon', 'cervix', 'lung', 'celiac'], help='Considered use-case.') 11 | parser.add_argument('--spacy_model', default='en_core_sci_sm', type=str, help='Considered NLP spacy model.') 12 | parser.add_argument('--w2v_model', default=True, action='store_true', help='Considered word2vec model.') 13 | parser.add_argument('--fasttext_model', default=None, type=str, help='File path for FastText model.') 14 | parser.add_argument('--bert_model', default=None, type=str, help='Considered BERT model.') 15 | parser.add_argument('--string_model', default=True, action='store_true', help='Considered string matching model.') 16 | parser.add_argument('--gpu', default=None, type=int, help='Considered GPU device. If not specified (default to None), use CPU instead.') 17 | parser.add_argument('--thr', default=1.8, type=float, help='Similarity threshold.') 18 | parser.add_argument('--store', default=True, action='store_true', help='Whether to store concepts, labels, and graphs.') 19 | parser.add_argument('--rdf_format', default='all', choices=['n3', 'trig', 'turtle', 'all'], help='Whether to specify the rdf format for graph serialization. 
If "all" is specified, serialize w/ the three different formats') 20 | parser.add_argument('--raw', default=False, action='store_true', help='Whether to consider full pipeline or not.') 21 | parser.add_argument('--debug', default=False, action='store_true', help='Whether to use flags for debugging.') 22 | parser.add_argument('--preprocess', default=True, action='store_true', help='Whether to preprocess input data or not.') 23 | parser.add_argument('--dataset', default=None, type=str, help='Dataset file path.') 24 | args = parser.parse_args() 25 | 26 | 27 | def main(): 28 | # set SKET 29 | sket = SKET(args.use_case, args.src_lang, args.spacy_model, args.w2v_model, args.fasttext_model, args.bert_model, args.string_model, args.gpu) 30 | 31 | if args.dataset: # use dataset from file path 32 | dataset = args.dataset 33 | else: # use sample "stream" dataset 34 | dataset = { 35 | "text": "polyp 40 cm: tubular adenoma with moderate dysplasia.", 36 | 'gender': 'F', 37 | 'age': 56, 38 | 'id': 'test_colon' 39 | } 40 | 41 | # use SKET pipeline to extract concepts, labels, and graphs from dataset 42 | sket.med_pipeline(dataset, args.preprocess, args.src_lang, args.use_case, args.thr, args.store, args.rdf_format, args.raw, args.debug) 43 | 44 | if args.raw: 45 | print('processed data up to concepts.') 46 | else: 47 | print('full pipeline.') 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /run_sket.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import warnings 3 | 4 | from sket.sket import SKET 5 | 6 | warnings.filterwarnings("ignore", message=r"\[W008\]", category=UserWarning) 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--dataset', default='./dataset/original/celiac/aoec/ExaMode_0thDS_AOEC_Celiac.xlsx', type=str, help='Dataset file.') 10 | parser.add_argument('--sheet', default='Sheet 1', type=str, help='Considered dataset sheet.') 11 | parser.add_argument('--header', default=0, type=str, help='Header row within dataset.') 12 | parser.add_argument('--ver', default=2, type=str, help='Considered versioning for operations.') 13 | parser.add_argument('--use_case', default='celiac', choices=['colon', 'cervix', 'lung', 'celiac'], help='Considered use-case.') 14 | parser.add_argument('--hospital', default='aoec', choices=['aoec', 'radboud'], help='Considered hospital.') 15 | parser.add_argument('--spacy_model', default='en_core_sci_sm', type=str, help='Considered NLP spacy model.') 16 | parser.add_argument('--w2v_model', default=True, action='store_true', help='Considered word2vec model.') 17 | parser.add_argument('--fasttext_model', default=None, type=str, help='File path for FastText model.') 18 | parser.add_argument('--bert_model', default=None, type=str, help='Considered BERT model.') 19 | parser.add_argument('--string_model', default=True, action='store_true', help='Considered string matching model.') 20 | parser.add_argument('--gpu', default=None, type=int, help='Considered GPU device. 
If not specified (default to None), use CPU instead.') 21 | parser.add_argument('--thr', default=1.8, type=float, help='Similarity threshold.') 22 | parser.add_argument('--raw', default=False, action='store_true', help='Whether to return concepts within semantic areas (deployment) or mentions+concepts (debugging)') 23 | parser.add_argument('--debug', default=False, action='store_true', help='Whether to use flags for debugging.') 24 | args = parser.parse_args() 25 | 26 | 27 | def main(): 28 | # set source language based on hospital 29 | if args.hospital == 'aoec': 30 | src_lang = 'it' 31 | elif args.hospital == 'radboud': 32 | src_lang = 'nl' 33 | else: # raise exception 34 | print('Input hospital does not belong to available ones.\nPlease consider either "aoec" or "radboud" as hospital.') 35 | raise Exception 36 | # set SKET 37 | sket = SKET(args.use_case, src_lang, args.spacy_model, args.w2v_model, args.fasttext_model, args.bert_model, args.string_model, args.gpu) 38 | 39 | # use SKET pipeline to extract concepts, labels, and graphs from args.dataset 40 | sket.exa_pipeline(args.dataset, args.sheet, args.header, args.ver, args.use_case, args.hospital, args.thr, args.raw, args.debug) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /sket/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sket/negex/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sket/negex/negation.py: -------------------------------------------------------------------------------- 1 | from spacy.tokens import Token, Doc, Span 2 | from spacy.matcher import PhraseMatcher 3 | import logging 4 | 5 | from negspacy.termsets import LANGUAGES 6 | 7 | 8 | class Negex: 9 | """ 10 | A spaCy pipeline component which identifies negated tokens in text. 11 | 12 | Based on: NegEx - A Simple Algorithm for Identifying Negated Findings and Diseases in Discharge Summaries 13 | Chapman, Bridewell, Hanbury, Cooper, Buchanan 14 | 15 | Parameters 16 | ---------- 17 | nlp: object 18 | spaCy language object 19 | ent_types: list 20 | list of entity types to negate 21 | language: str 22 | language code, if using default termsets (e.g. "en" for english) 23 | extension_name: str 24 | defaults to "negex"; whether entity is negated is then available as ent._.negex 25 | pseudo_negations: list 26 | list of phrases that cancel out a negation, if empty, defaults are used 27 | preceding_negations: list 28 | negations that appear before an entity, if empty, defaults are used 29 | following_negations: list 30 | negations that appear after an entity, if empty, defaults are used 31 | termination: list 32 | phrases that "terminate" a sentence for processing purposes such as "but". If empty, defaults are used 33 | 34 | """ 35 | 36 | def __init__( 37 | self, 38 | nlp, 39 | language="en_clinical", 40 | ent_types=list(), 41 | extension_name="negex", 42 | pseudo_negations=list(), 43 | preceding_negations=list(), 44 | following_negations=list(), 45 | termination=list(), 46 | chunk_prefix=list(), 47 | ): 48 | if not language in LANGUAGES: 49 | raise KeyError( 50 | f"{language} not found in languages termset. " 51 | "Ensure this is a supported language or specify " 52 | "your own termsets when initializing Negex." 
53 | ) 54 | termsets = LANGUAGES[language] 55 | if not Span.has_extension(extension_name): 56 | Span.set_extension(extension_name, default=False, force=True) 57 | 58 | if not pseudo_negations: 59 | if not "pseudo_negations" in termsets: 60 | raise KeyError("pseudo_negations not specified for this language.") 61 | self.pseudo_negations = termsets["pseudo_negations"] 62 | else: 63 | self.pseudo_negations = pseudo_negations 64 | 65 | if not preceding_negations: 66 | if not "preceding_negations" in termsets: 67 | raise KeyError("preceding_negations not specified for this language.") 68 | self.preceding_negations = termsets["preceding_negations"] 69 | else: 70 | self.preceding_negations = preceding_negations 71 | 72 | if not following_negations: 73 | if not "following_negations" in termsets: 74 | raise KeyError("following_negations not specified for this language.") 75 | self.following_negations = termsets["following_negations"] 76 | else: 77 | self.following_negations = following_negations 78 | 79 | if not termination: 80 | if not "termination" in termsets: 81 | raise KeyError("termination not specified for this language.") 82 | self.termination = termsets["termination"] 83 | else: 84 | self.termination = termination 85 | 86 | self.nlp = nlp 87 | self.ent_types = ent_types 88 | self.extension_name = extension_name 89 | self.build_patterns() 90 | self.chunk_prefix = list(nlp.tokenizer.pipe(chunk_prefix)) 91 | 92 | def build_patterns(self): 93 | # efficiently build spaCy matcher patterns 94 | self.matcher = PhraseMatcher(self.nlp.vocab, attr="LOWER") 95 | 96 | self.pseudo_patterns = list(self.nlp.tokenizer.pipe(self.pseudo_negations)) 97 | self.matcher.add("pseudo", None, *self.pseudo_patterns) 98 | 99 | self.preceding_patterns = list( 100 | self.nlp.tokenizer.pipe(self.preceding_negations) 101 | ) 102 | self.matcher.add("Preceding", None, *self.preceding_patterns) 103 | 104 | self.following_patterns = list( 105 | self.nlp.tokenizer.pipe(self.following_negations) 106 | ) 107 | self.matcher.add("Following", None, *self.following_patterns) 108 | 109 | self.termination_patterns = list(self.nlp.tokenizer.pipe(self.termination)) 110 | self.matcher.add("Termination", None, *self.termination_patterns) 111 | 112 | def remove_patterns( 113 | self, 114 | pseudo_negations=None, 115 | preceding_negations=None, 116 | following_negations=None, 117 | termination=None, 118 | ): 119 | if pseudo_negations: 120 | if isinstance(pseudo_negations, list): 121 | for p in pseudo_negations: 122 | self.pseudo_negations.remove(p) 123 | else: 124 | self.pseudo_negations.remove(pseudo_negations) 125 | if preceding_negations: 126 | if isinstance(preceding_negations, list): 127 | for p in preceding_negations: 128 | self.preceding_negations.remove(p) 129 | else: 130 | self.preceding_negations.remove(preceding_negations) 131 | if following_negations: 132 | if isinstance(following_negations, list): 133 | for p in following_negations: 134 | self.following_negations.remove(p) 135 | else: 136 | self.following_negations.remove(following_negations) # fix: was extend(), which added the phrase instead of removing it 137 | if termination: 138 | if isinstance(termination, list): 139 | for p in termination: 140 | self.termination.remove(p) 141 | else: 142 | self.termination.remove(termination) 143 | self.build_patterns() 144 | 145 | def add_patterns( 146 | self, 147 | pseudo_negations=None, 148 | preceding_negations=None, 149 | following_negations=None, 150 | termination=None, 151 | ): 152 | if pseudo_negations: 153 | if not isinstance(pseudo_negations, list): 154 | raise ValueError("A list of phrases 
expected when adding patterns") 155 | self.pseudo_negations.extend(pseudo_negations) 156 | if preceding_negations: 157 | if not isinstance(preceding_negations, list): 158 | raise ValueError("A list of phrases expected when adding patterns") 159 | self.preceding_negations.extend(preceding_negations) 160 | if following_negations: 161 | if not isinstance(following_negations, list): 162 | raise ValueError("A list of phrases expected when adding patterns") 163 | self.following_negations.extend(following_negations) 164 | if termination: 165 | if not isinstance(termination, list): 166 | raise ValueError("A list of phrases expected when adding patterns") 167 | self.termination.extend(termination) 168 | self.build_patterns() 169 | 170 | def get_patterns(self): 171 | """ 172 | returns phrase patterns used for various negation dictionaries 173 | 174 | Returns 175 | ------- 176 | patterns: dict 177 | pattern_type: [patterns] 178 | 179 | """ 180 | patterns = { 181 | "pseudo_patterns": self.pseudo_patterns, 182 | "preceding_patterns": self.preceding_patterns, 183 | "following_patterns": self.following_patterns, 184 | "termination_patterns": self.termination_patterns, 185 | } 186 | for pattern in patterns: 187 | logging.info(pattern) 188 | return patterns 189 | 190 | def process_negations(self, doc): 191 | """ 192 | Find negations in doc and clean candidate negations to remove pseudo negations 193 | 194 | Parameters 195 | ---------- 196 | doc: object 197 | spaCy Doc object 198 | 199 | Returns 200 | ------- 201 | preceding: list 202 | list of tuples for preceding negations 203 | following: list 204 | list of tuples for following negations 205 | terminating: list 206 | list of tuples of terminating phrases 207 | 208 | """ 209 | ### 210 | # does not work properly in spacy 2.1.8. Will incorporate after 2.2. 211 | # Relying on user to use NER in meantime 212 | # see https://github.com/jenojp/negspacy/issues/7 213 | ### 214 | # if not doc.is_nered: 215 | # raise ValueError( 216 | # "Negations are evaluated for Named Entities found in text. " 217 | # "Your SpaCy pipeline does not include Named Entity resolution. " 218 | # "Please ensure it is enabled or choose a different language model that includes it." 219 | # ) 220 | preceding = list() 221 | following = list() 222 | terminating = list() 223 | 224 | matches = self.matcher(doc) 225 | pseudo = [ 226 | (match_id, start, end) 227 | for match_id, start, end in matches 228 | if self.nlp.vocab.strings[match_id] == "pseudo" 229 | ] 230 | 231 | for match_id, start, end in matches: 232 | if self.nlp.vocab.strings[match_id] == "pseudo": 233 | continue 234 | pseudo_flag = False 235 | for p in pseudo: 236 | if start >= p[1] and start <= p[2]: 237 | pseudo_flag = True 238 | continue 239 | if not pseudo_flag: 240 | if self.nlp.vocab.strings[match_id] == "Preceding": 241 | preceding.append((match_id, start, end)) 242 | elif self.nlp.vocab.strings[match_id] == "Following": 243 | following.append((match_id, start, end)) 244 | elif self.nlp.vocab.strings[match_id] == "Termination": 245 | terminating.append((match_id, start, end)) 246 | else: 247 | logging.warning( # fix: the logging module has no "warnings" function 248 | f"phrase {doc[start:end].text} not in one of the expected matcher types." 249 | ) 250 | return preceding, following, terminating 251 | 252 | def termination_boundaries(self, doc, terminating): 253 | """ 254 | Create sub sentences based on terminations found in text. 
255 | 256 | Parameters 257 | ---------- 258 | doc: object 259 | spaCy Doc object 260 | terminating: list 261 | list of tuples with (match_id, start, end) 262 | 263 | returns 264 | ------- 265 | boundaries: list 266 | list of tuples with (start, end) of spans 267 | 268 | """ 269 | sent_starts = [sent.start for sent in doc.sents] 270 | terminating_starts = [t[1] for t in terminating] 271 | starts = sent_starts + terminating_starts + [len(doc)] 272 | starts.sort() 273 | boundaries = list() 274 | index = 0 275 | for i, start in enumerate(starts): 276 | if not i == 0: 277 | boundaries.append((index, start)) 278 | index = start 279 | return boundaries 280 | 281 | def negex(self, doc): 282 | """ 283 | Negates entities of interest 284 | 285 | Parameters 286 | ---------- 287 | doc: object 288 | spaCy Doc object 289 | 290 | """ 291 | preceding, following, terminating = self.process_negations(doc) 292 | boundaries = self.termination_boundaries(doc, terminating) 293 | for b in boundaries: 294 | sub_preceding = [i for i in preceding if b[0] <= i[1] < b[1]] 295 | sub_following = [i for i in following if b[0] <= i[1] < b[1]] 296 | 297 | for e in doc[b[0] : b[1]].ents: 298 | if self.ent_types: 299 | if e.label_ not in self.ent_types: 300 | continue 301 | if any(pre < e.start for pre in [i[1] for i in sub_preceding]): 302 | e._.set(self.extension_name, True) 303 | continue 304 | if any(fol > e.end for fol in [i[2] for i in sub_following]): 305 | e._.set(self.extension_name, True) 306 | continue 307 | if self.chunk_prefix: 308 | if any( 309 | c.text.lower() == doc[e.start:e.start+len(c)].text.lower() 310 | for c in self.chunk_prefix 311 | ): 312 | e._.set(self.extension_name, True) 313 | return doc 314 | 315 | def __call__(self, doc): 316 | return self.negex(doc) 317 | -------------------------------------------------------------------------------- /sket/negex/termsets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Default termsets for various languages 3 | """ 4 | 5 | LANGUAGES = dict() 6 | 7 | # english termset dictionary 8 | en = dict() 9 | pseudo = [ 10 | "no further", 11 | "not able to be", 12 | "not certain if", 13 | "not certain whether", 14 | "not necessarily", 15 | "without any further", 16 | "without difficulty", 17 | "without further", 18 | "might not", 19 | "not only", 20 | "no increase", 21 | "no significant change", 22 | "no change", 23 | "no definite change", 24 | "not extend", 25 | "not cause", 26 | "not certain if", 27 | "not certain whether", 28 | ] 29 | en["pseudo_negations"] = pseudo 30 | 31 | preceding = [ 32 | "absence of", 33 | "declined", 34 | "denied", 35 | "denies", 36 | "denying", 37 | "no sign of", 38 | "no signs of", 39 | "not", 40 | "not demonstrate", 41 | "symptoms atypical", 42 | "doubt", 43 | "negative for", 44 | "no", 45 | "versus", 46 | "without", 47 | "doesn't", 48 | "doesnt", 49 | "don't", 50 | "dont", 51 | "didn't", 52 | "didnt", 53 | "wasn't", 54 | "wasnt", 55 | "weren't", 56 | "werent", 57 | "isn't", 58 | "isnt", 59 | "aren't", 60 | "arent", 61 | "cannot", 62 | "can't", 63 | "cant", 64 | "couldn't", 65 | "couldnt", 66 | "never", 67 | ] 68 | en["preceding_negations"] = preceding 69 | 70 | following = [ 71 | "declined", 72 | "unlikely", 73 | "was not", 74 | "were not", 75 | "wasn't", 76 | "wasnt", 77 | "weren't", 78 | "werent", 79 | ] 80 | en["following_negations"] = following 81 | 82 | termination = [ 83 | "although", 84 | "apart from", 85 | "as there are", 86 | "aside from", 87 | "but", 88 | "except", 89 | "however", 
90 | "involving", 91 | "nevertheless", 92 | "still", 93 | "though", 94 | "which", 95 | "yet", 96 | "still", 97 | ] 98 | en["termination"] = termination 99 | 100 | LANGUAGES["en"] = en 101 | 102 | # en_clinical builds upon en 103 | en_clinical = dict() 104 | pseudo_clinical = pseudo + [ 105 | "gram negative", 106 | "not rule out", 107 | "not ruled out", 108 | "not been ruled out", 109 | "not drain", 110 | "no suspicious change", 111 | "no interval change", 112 | "no significant interval change", 113 | ] 114 | en_clinical["pseudo_negations"] = pseudo_clinical 115 | 116 | preceding_clinical = preceding + [ 117 | "patient was not", 118 | "without indication of", 119 | "without sign of", 120 | "without signs of", 121 | "without any reactions or signs of", 122 | "no complaints of", 123 | "no evidence of", 124 | "no cause of", 125 | "evaluate for", 126 | "fails to reveal", 127 | "free of", 128 | "never developed", 129 | "never had", 130 | "did not exhibit", 131 | "rules out", 132 | "rule out", 133 | "rule him out", 134 | "rule her out", 135 | "rule patient out", 136 | "rule the patient out", 137 | "ruled out", 138 | "ruled him out" "ruled her out", 139 | "ruled patient out", 140 | "ruled the patient out", 141 | "r/o", 142 | "ro", 143 | ] 144 | en_clinical["preceding_negations"] = preceding_clinical 145 | 146 | following_clinical = following + ["was ruled out", "were ruled out", "free"] 147 | en_clinical["following_negations"] = following_clinical 148 | 149 | termination_clinical = termination + [ 150 | "cause for", 151 | "cause of", 152 | "causes for", 153 | "causes of", 154 | "etiology for", 155 | "etiology of", 156 | "origin for", 157 | "origin of", 158 | "origins for", 159 | "origins of", 160 | "other possibilities of", 161 | "reason for", 162 | "reason of", 163 | "reasons for", 164 | "reasons of", 165 | "secondary to", 166 | "source for", 167 | "source of", 168 | "sources for", 169 | "sources of", 170 | "trigger event for", 171 | ] 172 | en_clinical["termination"] = termination_clinical 173 | LANGUAGES["en_clinical"] = en_clinical 174 | 175 | en_clinical_sensitive = dict() 176 | 177 | preceding_clinical_sensitive = preceding_clinical + [ 178 | "concern for", 179 | "supposed", 180 | "which causes", 181 | "leads to", 182 | "h/o", 183 | "history of", 184 | "instead of", 185 | "if you experience", 186 | "if you get", 187 | "teaching the patient", 188 | "taught the patient", 189 | "teach the patient", 190 | "educated the patient", 191 | "educate the patient", 192 | "educating the patient", 193 | "monitored for", 194 | "monitor for", 195 | "test for", 196 | "tested for", 197 | ] 198 | en_clinical_sensitive["pseudo_negations"] = pseudo_clinical 199 | en_clinical_sensitive["preceding_negations"] = preceding_clinical_sensitive 200 | en_clinical_sensitive["following_negations"] = following_clinical 201 | en_clinical_sensitive["termination"] = termination_clinical 202 | 203 | LANGUAGES["en_clinical_sensitive"] = en_clinical_sensitive 204 | -------------------------------------------------------------------------------- /sket/negex/test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import spacy 3 | from negation import Negex 4 | from spacy.pipeline import EntityRuler 5 | 6 | 7 | def build_docs(): 8 | docs = list() 9 | docs.append( 10 | ( 11 | "Patient denies Apple Computers but has Steve Jobs. 
He likes USA.", 12 | [("Apple Computers", True), ("Steve Jobs", False), ("USA", False)], 13 | ) 14 | ) 15 | docs.append( 16 | ( 17 | "No history of USA, Germany, Italy, Canada, or Brazil", 18 | [ 19 | ("USA", True), 20 | ("Germany", True), 21 | ("Italy", True), 22 | ("Canada", True), 23 | ("Brazil", True), 24 | ], 25 | ) 26 | ) 27 | 28 | docs.append(("That might not be Barack Obama.", [("Barack Obama", False)])) 29 | 30 | return docs 31 | 32 | 33 | def build_med_docs(): 34 | docs = list() 35 | docs.append( 36 | ( 37 | "Patient denies cardiovascular disease but has headaches. No history of smoking. Alcoholism unlikely. Smoking not ruled out.", 38 | [ 39 | ("Patient denies", False), 40 | ("cardiovascular disease", True), 41 | ("headaches", False), 42 | ("No history", True), 43 | ("smoking", True), 44 | ("Alcoholism", True), 45 | ("Smoking", False), 46 | ], 47 | ) 48 | ) 49 | docs.append( 50 | ( 51 | "No history of headaches, prbc, smoking, acid reflux, or GERD.", 52 | [ 53 | ("No history", True), 54 | ("headaches", True), 55 | ("prbc", True), 56 | ("smoking", True), 57 | ("acid reflux", True), 58 | ("GERD", True), 59 | ], 60 | ) 61 | ) 62 | 63 | docs.append( 64 | ( 65 | "Alcoholism was not the cause of liver disease.", 66 | [("Alcoholism", True), ("cause", False), ("liver disease", False)], 67 | ) 68 | ) 69 | 70 | docs.append( 71 | ( 72 | "There was no headache for this patient.", 73 | [("no headache", True), ("patient", True)], 74 | ) 75 | ) 76 | return docs 77 | 78 | 79 | def test(): 80 | nlp = spacy.load("en_core_web_sm") 81 | negex = Negex(nlp) 82 | nlp.add_pipe(negex, last=True) 83 | docs = build_docs() 84 | for d in docs: 85 | doc = nlp(d[0]) 86 | for i, e in enumerate(doc.ents): 87 | print(e.text, e._.negex) 88 | assert (e.text, e._.negex) == d[1][i] 89 | 90 | 91 | def test_en(): 92 | nlp = spacy.load("en_core_web_sm") 93 | negex = Negex(nlp, language="en") 94 | nlp.add_pipe(negex, last=True) 95 | docs = build_docs() 96 | for d in docs: 97 | doc = nlp(d[0]) 98 | for i, e in enumerate(doc.ents): 99 | print(e.text, e._.negex) 100 | assert (e.text, e._.negex) == d[1][i] 101 | 102 | 103 | def test_umls(): 104 | nlp = spacy.load("en_core_sci_sm") 105 | negex = Negex( 106 | nlp, language="en_clinical", ent_types=["ENTITY"], chunk_prefix=["no"] 107 | ) 108 | nlp.add_pipe(negex, last=True) 109 | docs = build_med_docs() 110 | for d in docs: 111 | doc = nlp(d[0]) 112 | for i, e in enumerate(doc.ents): 113 | print(e.text, e._.negex) 114 | assert (e.text, e._.negex) == d[1][i] 115 | 116 | 117 | def test_umls2(): 118 | nlp = spacy.load("en_core_sci_sm") 119 | negex = Negex( 120 | nlp, language="en_clinical_sensitive", ent_types=["ENTITY"], chunk_prefix=["no"] 121 | ) 122 | nlp.add_pipe(negex, last=True) 123 | docs = build_med_docs() 124 | for d in docs: 125 | doc = nlp(d[0]) 126 | for i, e in enumerate(doc.ents): 127 | print(e.text, e._.negex) 128 | assert (e.text, e._.negex) == d[1][i] 129 | 130 | 131 | # blocked by spacy 2.1.8 issue. Adding back after spacy 2.2. 
132 | # def test_no_ner():
133 | #     nlp = spacy.load("en_core_web_sm", disable=["ner"])
134 | #     negex = Negex(nlp)
135 | #     nlp.add_pipe(negex, last=True)
136 | #     with pytest.raises(ValueError):
137 | #         doc = nlp("this doc has not been NERed")
138 | 
139 | 
140 | def test_own_terminology():
141 |     nlp = spacy.load("en_core_web_sm")
142 |     negex = Negex(nlp, termination=["whatever"])
143 |     nlp.add_pipe(negex, last=True)
144 |     doc = nlp("He does not like Steve Jobs whatever he says about Barack Obama.")
145 |     assert doc.ents[1]._.negex == False
146 | 
147 | 
148 | def test_get_patterns():
149 |     nlp = spacy.load("en_core_web_sm")
150 |     negex = Negex(nlp)
151 |     patterns = negex.get_patterns()
152 |     assert type(patterns) == dict
153 |     assert len(patterns) == 4
154 | 
155 | 
156 | def test_issue7():
157 |     nlp = spacy.load("en_core_web_sm")
158 |     negex = Negex(nlp)
159 |     nlp.add_pipe(negex, last=True)
160 |     ruler = EntityRuler(nlp)
161 |     ruler.add_patterns([{"label": "SOFTWARE", "pattern": "spacy"}])
162 |     nlp.add_pipe(ruler, before="ner")
163 |     doc = nlp("fgfgdghgdh")  # entity-less text: negex must not fail
164 | 
165 | def test_add_remove_patterns():
166 |     nlp = spacy.load("en_core_web_sm")
167 |     negex = Negex(nlp)
168 |     patterns = negex.get_patterns()
169 |     negex.add_patterns(
170 |         pseudo_negations=["my favorite pattern"],
171 |         termination=["these are", "great patterns"],
172 |         preceding_negations=["wow a negation"],
173 |         following_negations=["extra negation"],
174 |     )
175 |     patterns_after = negex.get_patterns()
176 |     print(patterns_after)
177 |     print(len(patterns_after["pseudo_patterns"]))
178 |     assert len(patterns_after["pseudo_patterns"]) - 1 == len(
179 |         patterns["pseudo_patterns"]
180 |     )
181 |     assert len(patterns_after["termination_patterns"]) - 2 == len(
182 |         patterns["termination_patterns"]
183 |     )
184 |     assert len(patterns_after["preceding_patterns"]) - 1 == len(
185 |         patterns["preceding_patterns"]
186 |     )
187 |     assert len(patterns_after["following_patterns"]) - 1 == len(
188 |         patterns["following_patterns"]
189 |     )
190 | 
191 |     negex.remove_patterns(
192 |         termination=["these are", "great patterns"],
193 |         pseudo_negations=["my favorite pattern"],
194 |         preceding_negations="denied",
195 |         following_negations=["unlikely"],
196 |     )
197 |     negex.remove_patterns(termination="but")
198 |     negex.remove_patterns(
199 |         preceding_negations="wow a negation", following_negations=["extra negation"]
200 |     )
201 |     patterns_after = negex.get_patterns()
202 |     assert (
203 |         len(patterns_after["termination_patterns"])
204 |         == len(patterns["termination_patterns"]) - 1
205 |     )
206 |     assert (
207 |         len(patterns_after["following_patterns"])
208 |         == len(patterns["following_patterns"]) - 1
209 |     )
210 |     assert (
211 |         len(patterns_after["preceding_patterns"])
212 |         == len(patterns["preceding_patterns"]) - 1
213 |     )
214 |     assert len(patterns_after["pseudo_patterns"]) == len(patterns["pseudo_patterns"])
215 | 
216 | 
217 | if __name__ == "__main__":
218 |     test()
219 |     test_umls()
220 |     test_umls2()
221 |     test_own_terminology()
222 |     test_get_patterns()
223 |     test_issue7()
224 |     test_add_remove_patterns()
225 | 
--------------------------------------------------------------------------------
/sket/nerd/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/sket/nerd/normalizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
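# Usage sketch for the normalizers below (illustrative): each normalizer is
# fitted on a score array at construction time and is then applied as a
# callable, e.g.
#
#   scores = np.array([0.2, 0.5, 0.9])
#   normalize = MinMaxNormalizer(scores)
#   normalize(scores)  # -> array([0.        , 0.42857143, 1.        ])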
3 | 
4 | class StandardizationNormalizer(object):
5 |     # apply standard deviation (z-score) normalization
6 |     def __init__(self, scores):
7 |         self.mean = np.mean(scores)
8 |         self.std = np.std(scores)
9 | 
10 |     def __call__(self, scores):
11 |         if self.std > 0:
12 |             return (scores - self.mean) / self.std
13 |         else:
14 |             return np.zeros(scores.size)
15 | 
16 | 
17 | class MinMaxNormalizer(object):
18 |     # apply min-max normalization
19 |     def __init__(self, scores):
20 |         self.min = np.min(scores)
21 |         self.max = np.max(scores)
22 | 
23 |     def __call__(self, scores):
24 |         if (self.max - self.min) > 0:
25 |             return (scores - self.min) / (self.max - self.min)
26 |         else:
27 |             return np.zeros(scores.size)
28 | 
29 | 
30 | class IdentityNormalizer(object):
31 |     # apply identity normalization (i.e., return scores unchanged)
32 |     def __init__(self):
33 |         pass
34 | 
35 |     def __call__(self, scores):
36 |         return scores
37 | 
--------------------------------------------------------------------------------
/sket/nerd/rules/cin_mappings.txt:
--------------------------------------------------------------------------------
1 | cin1 low grade cervical squamous intraepithelial neoplasia
2 | cin2 cervical squamous intraepithelial neoplasia 2
3 | cin3 squamous carcinoma in situ
4 | cin23 cervical intraepithelial neoplasia grade 2/3
5 | lsil low grade cervical squamous intraepithelial neoplasia
6 | hsil cervical intraepithelial neoplasia grade 2/3
--------------------------------------------------------------------------------
/sket/nerd/rules/dysplasia_mappings.txt:
--------------------------------------------------------------------------------
1 | mild mild colon dysplasia colon
2 | moderate moderate colon dysplasia colon
3 | severe severe colon dysplasia colon
4 | low-grade mild colon dysplasia colon
5 | low grade mild colon dysplasia colon
6 | low-degree mild colon dysplasia colon
7 | low degree mild colon dysplasia colon
8 | low mild colon dysplasia colon
9 | high-grade severe colon dysplasia colon
10 | high grade severe colon dysplasia colon
11 | high-degree severe colon dysplasia colon
12 | high degree severe colon dysplasia colon
13 | high severe colon dysplasia colon
14 | strong severe colon dysplasia colon
15 | mild-to-moderate mild colon dysplasia,moderate colon dysplasia colon
16 | mild to moderate mild colon dysplasia,moderate colon dysplasia colon
17 | mild and moderate mild colon dysplasia,moderate colon dysplasia colon
18 | moderate-to-severe moderate colon dysplasia,severe colon dysplasia colon
19 | moderate to severe moderate colon dysplasia,severe colon dysplasia colon
20 | moderate and severe moderate colon dysplasia,severe colon dysplasia colon
21 | focally severe severe colon dysplasia colon
22 | severe focally severe colon dysplasia colon
23 | severe focal severe colon dysplasia colon
24 | moderate-severe moderate colon dysplasia,severe colon dysplasia colon
25 | mild-moderate mild colon dysplasia,moderate colon dysplasia colon
26 | mild-severe mild colon dysplasia,severe colon dysplasia colon
27 | mild to severe mild colon dysplasia,severe colon dysplasia colon
28 | mild and severe mild colon dysplasia,severe colon dysplasia colon
29 | mild-to-severe mild colon dysplasia,severe colon dysplasia colon
30 | mild low grade cervical squamous intraepithelial neoplasia cervix
31 | moderate cervical squamous intraepithelial neoplasia 2 cervix
32 | severe squamous carcinoma in situ cervix
33 | low-grade low grade cervical squamous intraepithelial neoplasia cervix
34 | low grade low grade cervical squamous intraepithelial neoplasia cervix
35 | low low grade cervical squamous intraepithelial neoplasia cervix
36 | high-grade cervical 
intraepithelial neoplasia grade 2/3 cervix 37 | high grade cervical intraepithelial neoplasia grade 2/3 cervix 38 | high cervical intraepithelial neoplasia grade 2/3 cervix 39 | strong squamous carcinoma in situ cervix 40 | mild-to-moderate low grade cervical squamous intraepithelial neoplasia,cervical squamous intraepithelial neoplasia 2 cervix 41 | mild to moderate low grade cervical squamous intraepithelial neoplasia,cervical squamous intraepithelial neoplasia 2 cervix 42 | mild and moderate low grade cervical squamous intraepithelial neoplasia,cervical squamous intraepithelial neoplasia 2 cervix 43 | moderate-to-severe cervical squamous intraepithelial neoplasia 2,squamous carcinoma in situ cervix 44 | moderate to severe cervical squamous intraepithelial neoplasia 2,squamous carcinoma in situ cervix 45 | moderate and severe cervical squamous intraepithelial neoplasia 2,squamous carcinoma in situ cervix 46 | focally severe squamous carcinoma in situ cervix 47 | severe focally squamous carcinoma in situ cervix 48 | severe focal squamous carcinoma in situ cervix 49 | moderate-severe cervical squamous intraepithelial neoplasia 2,squamous carcinoma in situ cervix 50 | mild-moderate low grade cervical squamous intraepithelial neoplasia,cervical squamous intraepithelial neoplasia 2 cervix 51 | mild-severe low grade cervical squamous intraepithelial neoplasia,squamous carcinoma in situ cervix 52 | mild to severe low grade cervical squamous intraepithelial neoplasia,squamous carcinoma in situ cervix 53 | mild and severe low grade cervical squamous intraepithelial neoplasia,squamous carcinoma in situ cervix 54 | mild-to-severe low grade cervical squamous intraepithelial neoplasia,squamous carcinoma in situ cervix -------------------------------------------------------------------------------- /sket/nerd/rules/rules.txt: -------------------------------------------------------------------------------- 1 | dysplasia mild,moderate,severe,low-grade,low-degree,low grade,low degree,low,high-grade,high-degree,high grade,high degree,high,strong,mild-to-moderate,mild to moderate,mild and moderate,moderate-to-severe,moderate to severe,moderate and severe,focally severe,severe focally,severe focal,moderate-severe,mild-moderate,mild-severe,mild to severe,mild and severe,mild-to-severe BOTH LOOSE colon,cervix 2 | hyperplastic polyp,polyp-adenomatous,adenomatous polyp,polyp adenomatous type,adenomatous polyp-type,polyp-type,polyp-focal adenomatous,polyp focal adenomatous,polyp-inflammatory BOTH EXACT colon 3 | hyperplastic polyp adenomatous,adenomatous type,focal adenomatous,inflammatory BOTH EXACT colon 4 | transverse colon POST EXACT colon 5 | descending colon POST EXACT colon 6 | rectal mucous membrane POST EXACT colon 7 | ascending colon POST EXACT colon 8 | sigmoid colon POST EXACT colon 9 | right colon POST EXACT colon 10 | left colon POST EXACT colon 11 | rectum nos POST EXACT colon 12 | colon nos POST EXACT colon 13 | uterus nos POST EXACT cervix 14 | carcinoma in situ POST EXACT cervix 15 | squamous cell carcinoma in situ POST EXACT cervix 16 | squamous carcinoma in situ POST EXACT cervix 17 | cervical adenocarcinoma in situ POST EXACT cervix 18 | uterine cervix carcinoma in situ POST EXACT cervix 19 | leep cervical BOTH EXACT cervix 20 | epithelium exocervical,endocervical BOTH EXACT cervix 21 | squamous intraepithelial lesion low-grade,low grade,low,high-grade,high grade,high BOTH LOOSE cervix 22 | neuroendocrine large-cell,large cell,large,non-small cell,non small cell,small-cell,small cell,small 
PRE EXACT lung 23 | cell non-small,non small,small,large,clear PRE EXACT lung 24 | duodenal bulb biopsy,biopsy POST EXACT celiac 25 | biopsy 2nd duodenum,ii duodenal,according to duodenum,according to duodenal, duodenum ii BOTH EXACT celiac 26 | duodenal mucosa POST LOOSE celiac 27 | intraepithelial lymphocytes,lymphocytic quota (iel,lymphocytic quota (iel:,lymphocyte infiltrate (iel,lymphocyte infiltrate (iel POST EXACT celiac 28 | hyperplasia of the brunner glands,brunner gland,of glandular crypts,of the glands of brunner BOTH EXACT celiac 29 | celiac disease type POST LOOSE celiac 30 | gluten hypersensitivity type POST LOOSE celiac 31 | phlogosis chronic,chronic active,chronic acute,active,acute,with marked activity,with activity BOTH EXACT celiac 32 | inflammatory chronic,acute BOTH EXACT celiac 33 | chronic gastritis,phlogosis,inflammation POST LOOSE celiac 34 | active gastritis,phlogosis,inflammation POST LOOSE celiac 35 | acute gastritis,phlogosis,inflammation POST LOOSE celiac 36 | chronic duodenitis POST EXACT celiac 37 | brunner glands,glands of BOTH EXACT celiac 38 | normal morphology,within the limits of,within the limit of,devoid of,appearance BOTH EXACT celiac 39 | antral type mucous membranes of PRE EXACT celiac 40 | atrophy glandular,of the crypts,crypt,villi,of the villi,villial BOTH EXACT celiac 41 | atrophic villi,crypt PRE LOOSE celiac 42 | flattened villuses,villi BOTH LOOSE celiac 43 | flattening of the villi,of the villus POST LOOSE celiac 44 | lymphocyte (iel(iel:,infiltrate (iel,infiltrate (iel: POST EXACT celiac 45 | infiltration lymphocytic,(iel,(iel: BOTH EXACT celiac 46 | villi free of PRE EXACT celiac 47 | villi atrophy BOTH EXACT celiac 48 | villi height,length BOTH LOOSE celiac 49 | height of the villi POST EXACT celiac 50 | height villi,villus PRE LOOSE celiac 51 | mitosis number of,proportion of,proportion of cryptic,number of cryptic,share of,share of cryptic PRE EXACT celiac 52 | duodenitis chronic,moderate,mild,active,erosive,mild-activity,acute,ulcerative,chronic active ulcerative,moderate chronic,chronic mild,chronic active,chronic moderate,mild chronic,chronic active and erosive,chronic severe,chronic erosive,chronic active erosive,erosive chronic,acute erosiva BOTH EXACT celiac 53 | intraepithelial lymphocyte POST EXACT celiac 54 | celiac no indications of,no more signs of,no evidence of PRE EXACT celiac 55 | abnormalities without,no PRE EXACT celiac -------------------------------------------------------------------------------- /sket/ont_proc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sket/ont_proc/ontology_processing.py: -------------------------------------------------------------------------------- 1 | import owlready2 2 | import itertools 3 | import pandas as pd 4 | import rdflib 5 | 6 | from collections import defaultdict 7 | from copy import deepcopy 8 | from rdflib import URIRef 9 | from rdflib.namespace import RDFS 10 | from owlready2 import IRIS 11 | 12 | from ..utils import utils 13 | 14 | 15 | class OntoProc(object): 16 | 17 | def __init__(self, ontology_path=None, hierarchies_path=None): 18 | """ 19 | Load ontology and set use-case variable 20 | 21 | Params: 22 | ontology_path (str): ontology.owl file path 23 | hierarchies_path (str): hierarchy relations file path 24 | 25 | Returns: None 26 | """ 27 | 28 | self.ontology = rdflib.Graph() 29 | if ontology_path: # custom ontology path 30 | 
30 |             # self.ontology = owlready2.get_ontology(ontology_path).load()
31 |             self.ontology.parse(ontology_path)
32 |         else:  # default ontology path
33 |             self.ontology.parse('./sket/ont_proc/ontology/examode.owl')
34 |         if hierarchies_path:  # custom hierarchy relations path
35 |             self.hrels = utils.read_hierarchies(hierarchies_path)
36 |         else:  # default hierarchy relations path
37 |             self.hrels = utils.read_hierarchies('./sket/ont_proc/rules/hierarchy_relations.txt')
38 |         self.disease = {'colon': '0002032', 'lung': '0008903', 'cervix': '0002974', 'celiac': '0005130'}
39 | 
40 |     def restrict2use_case(self, use_case, limit=1000):
41 |         """
42 |         Restrict ontology to the considered use-case and return DataFrame containing concepts from restricted ontology
43 | 
44 |         Params:
45 |             use_case (str): use case considered (colon, lung, cervix, celiac)
46 |             limit (int): max number of returned elements
47 | 
48 |         Returns: a pandas DataFrame containing concepts information
49 |         """
50 | 
51 |         disease = self.disease[use_case]
52 |         sparql = "PREFIX exa: <https://w3id.org/examode/ontology/> " \
53 |                  "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " \
54 |                  "PREFIX mondo: <http://purl.obolibrary.org/obo/MONDO_> " \
55 |                  "PREFIX dcterms: <http://purl.org/dc/terms/> " \
56 |                  "select ?iri ?iri_label ?iri_SNOMED_code ?iri_UMLS_code ?semantic_area ?semantic_area_label where { " \
57 |                  "?iri rdfs:label ?iri_label ; exa:associatedDisease mondo:" + disease + ". " \
58 |                  "filter (langMatches( lang(?iri_label), 'en')). " \
59 |                  "OPTIONAL {?iri exa:hasSNOMEDCode ?iri_SNOMED_code .} " \
60 |                  "OPTIONAL {?iri dcterms:conformsTo ?iri_UMLS_code .} " \
61 |                  "OPTIONAL {?iri exa:hasSemanticArea ?semantic_area . " \
62 |                  "?semantic_area rdfs:label ?semantic_area_label . " \
63 |                  "filter (langMatches( lang(?semantic_area_label), 'en')).} " \
64 |                  "} " \
65 |                  "limit " + str(limit)
66 |         # issue sparql query
67 |         resultSet = self.ontology.query(query_object=sparql)
68 |         # convert query output to DataFrame
69 |         ontology_dict = defaultdict(list)
70 |         for row in resultSet:
71 |             # store entity as IRI
72 |             ontology_dict['iri'].append(str(row.iri))
73 |             # store additional information associated w/ entity
74 |             ontology_dict['label'].append(str(row.iri_label))
75 |             ontology_dict['SNOMED'].append(str(row.iri_SNOMED_code) if row.iri_SNOMED_code is not None else None)
76 |             ontology_dict['UMLS'].append(str(row.iri_UMLS_code) if row.iri_UMLS_code is not None else None)
77 |             ontology_dict['semantic_area'].append(str(row.semantic_area))
78 |             ontology_dict['semantic_area_label'].append(str(row.semantic_area_label))
79 |         if use_case == 'celiac':
80 |             # Add negative result
81 |             # store entity as IRI
82 |             ontology_dict['iri'].append('https://w3id.org/examode/ontology/NegativeResult')
83 |             # store additional information associated w/ entity
84 |             ontology_dict['label'].append('Negative Result')
85 |             ontology_dict['SNOMED'].append('M-010100')
86 |             ontology_dict['UMLS'].append(None)
87 |             ontology_dict['semantic_area'].append('http://purl.obolibrary.org/obo/NCIT_C15220')
88 |             ontology_dict['semantic_area_label'].append('Diagnosis')
89 |             # Add inconclusive result
90 |             # store entity as IRI
91 |             ontology_dict['iri'].append('https://w3id.org/examode/ontology/InconclusiveOutcome')
92 |             # store additional information associated w/ entity
93 |             ontology_dict['label'].append('Inconclusive Outcome')
94 |             ontology_dict['SNOMED'].append(None)
95 |             ontology_dict['UMLS'].append(None)
96 |             ontology_dict['semantic_area'].append('http://purl.obolibrary.org/obo/NCIT_C15220')
97 |             ontology_dict['semantic_area_label'].append('Diagnosis')
98 |         return pd.DataFrame(ontology_dict)
99 | 
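    # Illustrative usage of the restriction step (hypothetical snippet):
    #
    #   onto_proc = OntoProc()
    #   colon_onto = onto_proc.restrict2use_case('colon')
    #
    # 'colon_onto' is a pandas DataFrame with one row per ontology concept and
    # columns: iri, label, SNOMED, UMLS, semantic_area, semantic_area_label.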
100 |     @staticmethod
101 |     def lookup_semantic_areas(semantic_areas, use_case_ontology):
102 |         """
103 |         Lookup for ontology concepts associated to target semantic areas
104 | 
105 |         Params:
106 |             semantic_areas (list(str)/str): target semantic areas
107 |             use_case_ontology (pandas DataFrame): reference ontology restricted to the use case considered
108 | 
109 |         Returns: a list of rows matching semantic areas
110 |         """
111 | 
112 |         if type(semantic_areas) == list:  # search for list of semantic areas
113 |             rows = use_case_ontology.loc[use_case_ontology['semantic_area_label'].isin(semantic_areas)][['iri', 'label', 'semantic_area_label']]
114 |         else:  # search for single semantic area
115 |             rows = use_case_ontology.loc[use_case_ontology['semantic_area_label'] == semantic_areas][['iri', 'label', 'semantic_area_label']]
116 |         if rows.empty:  # no match found within ontology
117 |             return []
118 |         else:  # match found
119 |             return rows.values.tolist()
120 | 
121 |     def get_ancestors(self, concepts, include_self=False):
122 |         """
123 |         Returns the list of ancestor concepts given target concept and hierarchical relations
124 | 
125 |         Params:
126 |             concepts (list(str)): list of concepts from ontology
127 |             include_self (bool): whether to include current concept in the list of ancestors
128 | 
129 |         Returns: the list of ancestors for target concept
130 |         """
131 | 
132 |         assert type(concepts) == list
133 | 
134 |         # get latest concept within concepts
135 |         concept = concepts[-1]
136 | 
137 |         # Query to return ancestors (both for classes and individuals)
138 |         txtQuery = "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " \
139 |                    "PREFIX skos: <http://www.w3.org/2004/02/skos/core#> " \
140 |                    "select ?ancestor where { " \
141 |                    "<" + concept + "> (rdfs:subClassOf|skos:broaderTransitive)+ ?ancestor. " \
142 |                    "}"
143 | 
144 |         # issue sparql query
145 |         resultSet = self.ontology.query(query_object=txtQuery)
146 |         ancestors = []
147 |         for r in resultSet:
148 |             ancestors.append(str(r.ancestor))
149 |         # if include_self include concept
150 |         if include_self:
151 |             ancestors.append(concept)
152 | 
153 |         return ancestors
154 | 
155 |     def check_individual_type(self, individual, classURI):
156 |         """
157 |         Checks if an individual belongs to a specified class.
158 | 
159 |         Params:
160 |             individual (str): URI of the individual.
161 |             classURI (str): URI of the class.
162 | 
163 |         Returns: boolean value asserting whether individual belongs to classURI.
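
        Example (illustrative; the URIs depend on the loaded ontology):
            onto_proc.check_individual_type(
                'https://w3id.org/examode/ontology/NegativeResult',
                'http://purl.obolibrary.org/obo/NCIT_C15220')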
164 | """ 165 | 166 | # Query to return classes of individual 167 | txtQuery = "select ?type { <" + individual + "> a ?type.}" 168 | 169 | # issue sparql query 170 | resultSet = self.ontology.query(query_object=txtQuery) 171 | classes = [] 172 | for r in resultSet: 173 | classes.append(str(r.type)) 174 | # if include_self include concept 175 | if classURI in classes: 176 | return True 177 | else: 178 | return False 179 | 180 | def get_higher_concept(self, iri1, iris, include_self=False): 181 | """ 182 | Return the ontology concept that is more general (hierarchically higher) 183 | 184 | Params: 185 | iri1 (str): the first iri considered 186 | iris (list(str)): list of the second iris considered 187 | include_self (bool): whether to include current concept in the list of ancestors 188 | 189 | Returns: the hierarchically higher concept's iri 190 | """ 191 | # get ancestors for both concepts 192 | ancestors1 = self.get_ancestors([iri1], include_self) 193 | ancestors2 = self.get_ancestors([iris[0]], include_self) 194 | if iris[0] in ancestors1: # concept1 is a descendant of concept2 195 | return iris[0] 196 | elif iri1 in ancestors2: # concept1 is an ancestor of concept2 197 | return iri1 198 | else: # concept1 and concept2 are not hierarchically related, check if there is another concept 199 | if len(iris) == 2: 200 | poType = self.check_individual_type(iri1, iris[1]) 201 | if poType: # concept1 is an individual of type concept3 202 | return iris[1] 203 | return None 204 | 205 | 206 | def merge_nlp_and_struct(self, nlp_concepts, struct_concepts): 207 | """ 208 | Merge the information extracted from 'nlp' and 'struct' sections 209 | 210 | Params: 211 | nlp_concepts (dict): the dictionary of linked concepts from 'nlp' section 212 | struct_concepts (dict): the dictionary of linked concepts from 'struct' section 213 | 214 | Returns: a dict containing the linked concepts w/o distinction between 'nlp' and 'struct' concepts 215 | """ 216 | 217 | cconcepts = dict() 218 | # merge linked concepts from 'nlp' and 'struct' sections 219 | for sem_area in nlp_concepts.keys(): 220 | if nlp_concepts[sem_area] and struct_concepts[sem_area]: # semantic area is not empty for both 'nlp' and 'struct' sections 221 | # get all the possible combinations of 'nlp' and 'struct' concepts 222 | combinations = list(itertools.product(nlp_concepts[sem_area], struct_concepts[sem_area])) 223 | # return IRIs to be removed (hierarchically higher) 224 | IRIs = {self.get_higher_concept(combination[0][0], combination[1][0]) for combination in combinations} - {None} 225 | # remove under-specified concepts and store remaining concepts 226 | cconcepts[sem_area] = deepcopy(nlp_concepts[sem_area]) 227 | cconcepts[sem_area].extend([concept for concept in struct_concepts[sem_area] if concept[0] not in [concept[0] for concept in nlp_concepts[sem_area]]]) 228 | # remove IRIs from cconcepts 229 | cconcepts[sem_area] = [concept for concept in cconcepts[sem_area] if concept[0] not in IRIs] 230 | elif nlp_concepts[sem_area]: # semantic area is not empty only for the 'nlp' section 231 | cconcepts[sem_area] = deepcopy(nlp_concepts[sem_area]) 232 | elif struct_concepts[sem_area]: # semantic area is not empty only for 'struct' section 233 | cconcepts[sem_area] = deepcopy(struct_concepts[sem_area]) 234 | else: # semantic area is empty for both sections 235 | cconcepts[sem_area] = list() 236 | # return combined concepts 237 | return cconcepts 238 | -------------------------------------------------------------------------------- 
/sket/ont_proc/rules/hierarchy_relations.txt: -------------------------------------------------------------------------------- 1 | hasBroaderTransitive -------------------------------------------------------------------------------- /sket/rdf_proc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sket/rep_proc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sket/rep_proc/report_processing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import math 3 | import string 4 | import re 5 | import json 6 | import uuid 7 | import copy 8 | import roman 9 | 10 | from tqdm import tqdm 11 | from copy import deepcopy 12 | from collections import defaultdict 13 | from transformers import MarianMTModel, MarianTokenizer 14 | from datetime import datetime 15 | 16 | from ..utils import utils 17 | 18 | 19 | class ReportProc(object): 20 | 21 | def __init__(self, src_lang, use_case, fields_path=None): 22 | """ 23 | Set translator and build regular expression to split text based on bullets 24 | 25 | Params: 26 | src_lang (str): considered source language 27 | use_case (str): considered use case 28 | fields_path (str): report fields file path 29 | 30 | Returns: None 31 | """ 32 | 33 | self.use_case = use_case 34 | 35 | if fields_path: # read report fields file 36 | self.fields = utils.read_report_fields(fields_path) 37 | else: # no report fields file provided 38 | self.fields = utils.read_report_fields('./sket/rep_proc/rules/report_fields.txt') 39 | 40 | if src_lang != 'en': # set NMT model 41 | self.nmt_name = 'Helsinki-NLP/opus-mt-' + src_lang + '-en' 42 | self.tokenizer = MarianTokenizer.from_pretrained(self.nmt_name) 43 | self.nmt = MarianMTModel.from_pretrained(self.nmt_name) 44 | else: # no NMT model required 45 | self.nmt_name = None 46 | self.tokenizer = None 47 | self.nmt = None 48 | 49 | # build regex for bullet patterns 50 | self.en_roman_regex = re.compile('((?<=(^i-ii(\s|:|\.)))|(?<=(^i-iii(\s|:|\.)))|(?<=(^ii-iii(\s|:|\.)))|(?<=(^i-iv(\s|:|\.)))|(?<=(^ii-iv(\s|:|\.)))|(?<=(^iii-iv(\s|:|\.)))|(?<=(^i and ii(\s|:|\.)))|(?<=(^i and iii(\s|:|\.)))|(?<=(^ii and iii(\s|:|\.)))|(?<=(^i and iv(\s|:|\.)))|(?<=(^ii and iv(\s|:|\.)))|(?<=(^iii and iv(\s|:|\.)))|(?<=(^i(\s|:|\.)))|(?<=(^ii(\s|:|\.)))|(?<=(^iii(\s|:|\.)))|(?<=(^iv(\s|:|\.)))|(?<=(\si-ii(\s|:|\.)))|(?<=(\si-iii(\s|:|\.)))|(?<=(\sii-iii(\s|:|\.)))|(?<=(\si-iv(\s|:|\.)))|(?<=(\sii-iv(\s|:|\.)))|(?<=(\siii-iv(\s|:|\.)))|(?<=(\si and ii(\s|:|\.)))|(?<=(\si and iii(\s|:|\.)))|(?<=(\sii and iii(\s|:|\.)))|(?<=(\si and iv(\s|:|\.)))|(?<=(\sii and iv(\s|:|\.)))|(?<=(\siii and iv(\s|:|\.)))|(?<=(\si(\s|:|\.)))|(?<=(\sii(\s|:|\.)))|(?<=(\siii(\s|:|\.)))|(?<=(\siv(\s|:|\.))))(.*?)((?=(\si+(\s|:|\.|-)))|(?=(\siv(\s|:|\.|-)))|(?=($)))') 51 | self.nl_roman_regex = re.compile('((?<=(^i-ii(\s|:|\.)))|(?<=(^i-iii(\s|:|\.)))|(?<=(^ii-iii(\s|:|\.)))|(?<=(^i-iv(\s|:|\.)))|(?<=(^ii-iv(\s|:|\.)))|(?<=(^iii-iv(\s|:|\.)))|(?<=(^i en ii(\s|:|\.)))|(?<=(^i en iii(\s|:|\.)))|(?<=(^ii en iii(\s|:|\.)))|(?<=(^i en iv(\s|:|\.)))|(?<=(^ii en iv(\s|:|\.)))|(?<=(^iii en 
iv(\s|:|\.)))|(?<=(^i(\s|:|\.)))|(?<=(^ii(\s|:|\.)))|(?<=(^iii(\s|:|\.)))|(?<=(^iv(\s|:|\.)))|(?<=(\si-ii(\s|:|\.)))|(?<=(\si-iii(\s|:|\.)))|(?<=(\sii-iii(\s|:|\.)))|(?<=(\si-iv(\s|:|\.)))|(?<=(\sii-iv(\s|:|\.)))|(?<=(\siii-iv(\s|:|\.)))|(?<=(\si en ii(\s|:|\.)))|(?<=(\si en iii(\s|:|\.)))|(?<=(\sii en iii(\s|:|\.)))|(?<=(\si en iv(\s|:|\.)))|(?<=(\sii en iv(\s|:|\.)))|(?<=(\siii en iv(\s|:|\.)))|(?<=(\si(\s|:|\.)))|(?<=(\sii(\s|:|\.)))|(?<=(\siii(\s|:|\.)))|(?<=(\siv(\s|:|\.))))(.*?)((?=(\si+(\s|:|\.|-)))|(?=(\siv(\s|:|\.|-)))|(?=($)))') 52 | self.bullet_regex = re.compile("^[-(]?\s*[\d,]+\s*[:)-]?") 53 | self.ranges_regex = re.compile("^\(?\s*(\d\s*-\s*\d|\d\s*\.\s*\d)\s*\)?") 54 | 55 | # COMMON FUNCTIONS 56 | 57 | def is_empty(self, var): 58 | """ 59 | Check whether a var is empty (i.e., NULL or nan) 60 | 61 | Params: 62 | var (any): considered variable 63 | 64 | Returns: bool 65 | """ 66 | 67 | if type(var) == float: 68 | return math.isnan(var) 69 | else: 70 | return var is None 71 | 72 | def update_usecase(self, use_case): 73 | """ 74 | Update use case 75 | 76 | Params: 77 | use_case (str): considered use case 78 | 79 | Returns: None 80 | """ 81 | 82 | self.use_case = use_case 83 | 84 | def update_nmt(self, src_lang): 85 | """ 86 | Update NMT model changing source language 87 | 88 | Params: 89 | src_lang (str): considered source language 90 | 91 | Returns: None 92 | """ 93 | 94 | if src_lang != 'en': # update NMT model 95 | self.nmt_name = 'Helsinki-NLP/opus-mt-' + src_lang + '-en' 96 | self.tokenizer = MarianTokenizer.from_pretrained(self.nmt_name) 97 | self.nmt = MarianMTModel.from_pretrained(self.nmt_name) 98 | else: # no NMT model required 99 | self.nmt_name = None 100 | self.tokenizer = None 101 | self.nmt = None 102 | 103 | def update_report_fields(self, fields_path): 104 | """ 105 | Update report fields changing current ones 106 | 107 | Params: 108 | fields_path (str): report fields file 109 | 110 | Returns: None 111 | """ 112 | 113 | self.fields = utils.read_report_fields(fields_path) 114 | 115 | def load_dataset(self, reports_path, sheet, header): 116 | """ 117 | Load reports dataset 118 | 119 | Params: 120 | reports_path (str): reports.xlsx fpath 121 | sheet (str): name of the excel sheet to use 122 | header (int): row index used as header 123 | 124 | Returns: the loaded dataset 125 | """ 126 | 127 | if reports_path.split('.')[-1] == 'xlsx': # requires openpyxl engine 128 | dataset = pd.read_excel(io=reports_path, sheet_name=sheet, header=header, engine='openpyxl') 129 | else: 130 | dataset = pd.read_excel(io=reports_path, sheet_name=sheet, header=header) 131 | # remove rows w/ na 132 | dataset.dropna(axis=0, how='all', inplace=True) 133 | # dataset.dropna(axis=0, how='all', subset=dataset.columns[1:], inplace=True) 134 | 135 | return dataset 136 | 137 | def translate_text(self, text): 138 | """ 139 | Translate text from source to destination -- text is lower-cased before and after translation 140 | 141 | Params: 142 | text (str): target text 143 | 144 | Returns: translated text 145 | """ 146 | 147 | if type(text) == str: 148 | trans_text = self.nmt.generate(**self.tokenizer(text.lower(), return_tensors="pt", padding=True))[0] 149 | trans_text = self.tokenizer.decode(trans_text, skip_special_tokens=True) 150 | else: 151 | trans_text = '' 152 | return trans_text.lower() 153 | 154 | # AOEC SPECIFIC FUNCTIONS 155 | 156 | def aoec_process_data(self, dataset): 157 | """ 158 | Read AOEC reports and extract the required fields 159 | 160 | Params: 161 | dataset (pandas DataFrame): 
target dataset 162 | 163 | Returns: a dict containing the required reports fields 164 | """ 165 | 166 | reports = dict() 167 | print('acquire data') 168 | # acquire data and translate text 169 | for report in tqdm(dataset.itertuples()): 170 | reports[str(report._1).strip()] = { 171 | 'diagnosis_nlp': report.Diagnosi, 172 | 'materials': report.Materiali, 173 | 'procedure': report.Procedura if type(report.Procedura) == str else '', 174 | 'topography': report.Topografia if type(report.Topografia) == str else '', 175 | 'diagnosis_struct': report._5 if type(report._5) == str else '', 176 | 'age': int(report.Età) if not math.isnan(report.Età) else None, 177 | 'gender': report.Sesso if type(report.Sesso) == str else '' 178 | } 179 | return reports 180 | 181 | def aoec_split_diagnoses(self, diagnoses, int_id, debug=False): 182 | """ 183 | Split the section 'diagnoses' within AOEC reports relying on bullets (i.e. '1', '2', etc.) 184 | 185 | Params: 186 | diagnoses (str): the 'diagnoses' section of AOEC reports 187 | int_id (int): the internal id specifying the current diagnosis 188 | debug (bool): whether to keep flags for debugging 189 | 190 | Returns: the part of the 'diagnoses' section related to the current internalid 191 | """ 192 | 193 | current_iids = [] 194 | dgnss = {} 195 | # split diagnosis on new lines 196 | dlines = diagnoses.split('\n') 197 | # loop over lines 198 | for line in dlines: 199 | line = line.strip() 200 | if line: # line contains text 201 | # look for range first 202 | rtext = self.ranges_regex.findall(line) 203 | if rtext: # range found 204 | bullets = re.findall('\d+', rtext[0]) 205 | bullets = list(map(int, bullets)) 206 | bullets = range(bullets[0], bullets[1]+1) 207 | current_iids = deepcopy(bullets) 208 | else: # ranges not found 209 | # look for bullets 210 | btext = self.bullet_regex.findall(line) 211 | if btext: # bullets found 212 | bullets = re.findall('\d+', btext[0]) 213 | bullets = list(map(int, bullets)) 214 | current_iids = deepcopy(bullets) 215 | # associate current line to the corresponding ids 216 | for iid in current_iids: 217 | if iid in dgnss: # iid assigned before 218 | dgnss[iid] += ' ' + line 219 | else: # new idd 220 | dgnss[iid] = line 221 | if int_id in dgnss: # return the corresponding diagnosis 222 | return dgnss[int_id] 223 | elif not current_iids: # no bullet found -- return the whole diagnoses field (w/o \n to avoid problems w/ FastText) 224 | return diagnoses.replace('\n', ' ') 225 | else: # return the whole diagnoses field (w/o \n to avoid problems w/ FastText) -- something went wrong 226 | if debug: 227 | print('\n\nSomething went wrong -- return the whole diagnoses field but print data:') 228 | print('Internal ID: {}'.format(int_id)) 229 | print('Raw Field: {}'.format(diagnoses)) 230 | print('Processed Field: {}\n\n'.format(dgnss)) 231 | return diagnoses.replace('\n', ' ') 232 | 233 | def aoec_process_data_v2(self, dataset, debug=False): 234 | """ 235 | Read AOEC reports and extract the required fields (v2 used for batches from 2nd onwards) 236 | 237 | Params: 238 | dataset (pandas DataFrame): target dataset 239 | debug (bool): whether to keep flags for debugging 240 | 241 | Returns: a dict containing the required report fields 242 | """ 243 | 244 | reports = dict() 245 | print('acquire data and split it based on diagnoses') 246 | # acquire data and split it based on diagnoses 247 | for report in tqdm(dataset.itertuples()): 248 | if 'IDINTERNO' in dataset.columns: 249 | rid = str(report.FILENAME).strip() + '_' + 
str(report.IDINTERNO).strip() 250 | if type(report.TESTODIAGNOSI) == str: 251 | reports[rid] = { 252 | 'diagnosis_nlp': self.aoec_split_diagnoses(report.TESTODIAGNOSI, report.IDINTERNO, debug=debug) 253 | if type(report.IDINTERNO) == str else report.TESTODIAGNOSI, 254 | 'materials': report.MATERIALE, 255 | 'procedure': report.SNOMEDPROCEDURA if type(report.SNOMEDPROCEDURA) == str else '', 256 | 'topography': report.SNOMEDTOPOGRAFIA if type(report.SNOMEDTOPOGRAFIA) == str else '', 257 | 'diagnosis_struct': report.SNOMEDDIAGNOSI if type(report.SNOMEDDIAGNOSI) == str else '', 258 | 'birth_date': report.NATOIL if report.NATOIL else '', 259 | 'visit_date': report.DATAORAFINEVALIDAZIONE if report.DATAORAFINEVALIDAZIONE else '', 260 | 'gender': report.SESSO if type(report.SESSO) == str else '', 261 | 'image': report.FILENAME, 262 | 'internalid': report.IDINTERNO 263 | } 264 | else: 265 | # process_data_v3, no IDINTERNO and MATERIALI 266 | rid = str(int(report.FILENAME)).strip() 267 | if type(report.TESTODIAGNOSI) == str: 268 | reports[rid] = { 269 | 'diagnosis_nlp': report.TESTODIAGNOSI, 270 | 'procedure': report.SNOMEDPROCEDURA if type(report.SNOMEDPROCEDURA) == str else '', 271 | 'topography': report.SNOMEDTOPOGRAFIA if type(report.SNOMEDTOPOGRAFIA) == str else '', 272 | 'diagnosis_struct': report.SNOMEDDIAGNOSI if type(report.SNOMEDDIAGNOSI) == str else '', 273 | 'birth_date': report.NATOIL.to_pydatetime().strftime("%Y%m%d")+"000000" if report.NATOIL else '', 274 | 'visit_date': report.DATAORAFINEVALIDAZIONE.to_pydatetime().strftime("%Y%m%d")+"000000" if report.DATAORAFINEVALIDAZIONE else '', 275 | 'gender': report.SESSO if type(report.SESSO) == str else '', 276 | 'image': int(report.FILENAME) 277 | } 278 | 279 | return reports 280 | 281 | @staticmethod 282 | def date_formatter(raw_date): 283 | """ 284 | Returns date in the correct format. 285 | 286 | Params: 287 | raw_date (timestamp): date to format 288 | Return string with correctly formatted date. 289 | """ 290 | date = datetime.strptime(raw_date, '%d/%m/%Y') 291 | return date.strftime("%Y%m%d")+"000000" 292 | 293 | def aoec_translate_reports(self, reports): 294 | """ 295 | Translate processed reports 296 | 297 | Params: 298 | reports (dict): processed reports 299 | 300 | Returns: translated reports 301 | """ 302 | 303 | trans_reports = copy.deepcopy(reports) 304 | print('translate text') 305 | # translate text 306 | for rid, report in tqdm(trans_reports.items()): 307 | trans_reports[rid]['diagnosis_nlp'] = self.translate_text(report['diagnosis_nlp']) 308 | if 'materials' in report: 309 | trans_reports[rid]['materials'] = self.translate_text(report['materials']) if report['materials'] != '' else '' 310 | return trans_reports 311 | 312 | # RADBOUD SPECIFIC FUNCTIONS 313 | 314 | def radboud_split_conclusions(self, conclusions): 315 | """ 316 | Split the section 'conclusions' within reports relying on bullets (i.e. 'i', 'ii', etc.) 
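
        For instance (illustrative), a conclusion like
        'i biopt colon: no dysplasia ii biopt rectum: tubular adenoma'
        is returned as a dict keyed by uppercased roman numerals, roughly
        {'I': 'biopt colon: no dysplasia', 'II': 'biopt rectum: tubular adenoma'};
        when no bullets are found, the whole text is stored under the key 'whole'.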
317 | 318 | Params: 319 | conclusions (str): the 'conclusions' section of radboud reports 320 | 321 | Returns: a dict containing the 'conclusions' section divided as a bullet list 322 | """ 323 | 324 | sections = defaultdict(str) 325 | # use regex to identify bullet-divided sections within 'conclusions' 326 | for groups in self.nl_roman_regex.findall(conclusions): 327 | # identify the target bullet for the given section 328 | bullet = [group for group in groups[:65] if group and any(char.isalpha() or char.isdigit() for char in group)][0].strip() 329 | if 'en' in bullet: # composite bullet 330 | bullets = bullet.split(' en ') 331 | elif '-' in bullet: # composite bullet 332 | bullets = bullet.split('-') 333 | else: # single bullet 334 | bullets = [bullet] 335 | # loop over bullets and concatenate corresponding sections 336 | for bullet in bullets: 337 | if groups[65] != 'en': # the section is not a conjunction between two bullets (e.g., 'i and ii') 338 | sections[bullet.translate(str.maketrans('', '', string.punctuation)).upper()] += ' ' + groups[65] # store them using uppercased roman numbers as keys - required to make Python 'roman' library working 339 | if bool(sections): # 'sections' contains split sections 340 | return sections 341 | else: # 'sections' is empty - assign the whole 'conclusions' to 'sections' 342 | sections['whole'] = conclusions 343 | return sections 344 | 345 | def radboud_process_data(self, dataset, debug=False): 346 | """ 347 | Read Radboud reports and extract the required fields 348 | 349 | Params: 350 | dataset (pandas DataFrame): target dataset 351 | debug (bool): whether to keep flags for debugging 352 | 353 | 354 | Returns: a dict containing the required report fields 355 | """ 356 | 357 | proc_reports = dict() 358 | skipped_reports = [] 359 | unsplitted_reports = 0 360 | misplitted_reports = 0 361 | report_conc_keys = {report.Studynumber: report.Conclusion for report in dataset.itertuples()} 362 | for report in tqdm(dataset.itertuples()): 363 | rid = str(report.Studynumber).strip() 364 | if type(report.Conclusion) == str: # split conclusions and associate to each block the corresponding conclusion 365 | # deepcopy rdata to avoid removing elements from input reports 366 | raw_conclusions = report.Conclusion 367 | # split conclusions into sections 368 | conclusions = self.radboud_split_conclusions(utils.nl_sanitize_record(raw_conclusions.lower(), self.use_case)) 369 | pid = '_'.join(rid.split('_')[:-1]) # remove block and slide ids from report id - keep patient id 370 | related_ids = [rel_id for rel_id in report_conc_keys.keys() if pid in rel_id] # get all the ids related to the current patient 371 | # get block ids from related_ids 372 | block_ids = [] 373 | for rel_id in related_ids: 374 | if 'B' not in rel_id: # skip report as it does not contain block ID 375 | skipped_reports.append(rel_id) 376 | continue 377 | if 'v' not in rel_id.lower() and '-' not in rel_id: # report does not contain special characters 378 | block_part = rel_id.split('_')[-1] 379 | if len(block_part) < 4: # slide ID not available 380 | block_ids.append(rel_id) 381 | else: # slide ID available 382 | block_ids.append(rel_id[:-2]) 383 | elif 'v' in rel_id.lower(): # report contains slide ID first variant (i.e., _V0*) 384 | block_part = rel_id.split('_')[-2] 385 | if len(block_part) < 4: # slide ID not available 386 | block_ids.append('_'.join(rel_id.split('_')[:-1])) 387 | else: # slide ID available 388 | block_ids.append('_'.join(rel_id.split('_')[:-1])[:-2]) 389 | elif '-' in rel_id: # 
report contains slide ID second variant (i.e., -*) 390 | block_part = rel_id.split('_')[-1].split('-')[0] 391 | if len(block_part) < 4: # slide ID not available 392 | block_ids.append(rel_id.split('-')[0]) 393 | else: # slide ID available 394 | block_ids.append(rel_id.split('-')[0][:-2]) 395 | else: 396 | print('something went wrong w/ current report') 397 | print(rel_id) 398 | 399 | if not block_ids: # Block IDs not found -- skip it 400 | continue 401 | 402 | if 'whole' in conclusions: # unable to split conclusions - either single conclusion or not appropriately specified 403 | if len(block_ids) > 1: # conclusions splits not appropriately specified or wrong 404 | unsplitted_reports += 1 405 | for bid in block_ids: 406 | # create dict to store block diagnosis and slide ids 407 | proc_reports[bid] = dict() 408 | # store conclusion - i.e., the final diagnosis 409 | proc_reports[bid]['diagnosis'] = conclusions['whole'] 410 | # store slide ids associated to the current block diagnosis 411 | slide_ids = [] 412 | for sid in report_conc_keys.keys(): 413 | if bid in sid: # Block ID found within report ID 414 | if 'v' not in sid.lower() and '-' not in sid: # report does not contain special characters 415 | block_part = sid.split('_')[-1] 416 | if len(block_part) < 4: # slide ID not available 417 | continue 418 | else: # slide ID available 419 | slide_ids.append(sid[-2:]) 420 | elif 'v' in sid.lower(): # report contains slide ID first variant (i.e., _V0*) 421 | block_part = sid.split('_')[-2] 422 | if len(block_part) < 4: # slide ID not available 423 | slide_ids.append(sid.split('_')[-1]) 424 | else: # slide ID available 425 | slide_ids.append(sid.split('_')[-2][-2:] + '_' + sid.split('_')[-1]) 426 | elif '-' in sid: # report contains slide ID second variant (i.e., -*) 427 | block_part = sid.split('_')[-1].split('-')[0] 428 | if len(block_part) < 4: # slide ID not available 429 | slide_ids.append(sid.split('-')[1]) 430 | else: # slide ID available 431 | slide_ids.append(sid.split('-')[0][-2:] + '-' + sid.split('-')[1]) 432 | proc_reports[bid]['slide_ids'] = slide_ids 433 | else: 434 | block_ix2id = {int(block_id[-1]): block_id for block_id in block_ids} 435 | if len(conclusions) < len(block_ids): # fewer conclusions have been identified than the actual number of blocks - store and fix later 436 | misplitted_reports += 1 437 | # get conclusions IDs 438 | cix2id = {roman.fromRoman(cid): cid for cid in conclusions.keys()} 439 | # loop over Block IDs and associate the given conclusions to the corresponding blocks when available 440 | for bix, bid in block_ix2id.items(): 441 | # create dict to store block diagnosis and slide ids 442 | proc_reports[bid] = dict() 443 | if bix in cix2id: # conclusion associated with the corresponding block 444 | # store conclusion - i.e., the final diagnosis 445 | proc_reports[bid]['diagnosis'] = conclusions[cix2id[bix]] 446 | # store slide ids associated to the current block diagnosis 447 | slide_ids = [] 448 | for sid in report_conc_keys.keys(): 449 | if bid in sid: # Block ID found within report ID 450 | if 'v' not in sid.lower() and '-' not in sid: # report does not contain special characters 451 | block_part = sid.split('_')[-1] 452 | if len(block_part) < 4: # slide ID not available 453 | continue 454 | else: # slide ID available 455 | slide_ids.append(sid[-2:]) 456 | elif 'v' in sid.lower(): # report contains slide ID first variant (i.e., _V0*) 457 | block_part = sid.split('_')[-2] 458 | if len(block_part) < 4: # slide ID not available 459 | 
slide_ids.append(sid.split('_')[-1]) 460 | else: # slide ID available 461 | slide_ids.append(sid.split('_')[-2][-2:] + '_' + sid.split('_')[-1]) 462 | elif '-' in sid: # report contains slide ID second variant (i.e., -*) 463 | block_part = sid.split('_')[-1].split('-')[0] 464 | if len(block_part) < 4: # slide ID not available 465 | slide_ids.append(sid.split('-')[1]) 466 | else: # slide ID available 467 | slide_ids.append(sid.split('-')[0][-2:] + '-' + sid.split('-')[1]) 468 | proc_reports[bid]['slide_ids'] = slide_ids 469 | else: # unable to associate diagnosis with the corresponding block -- associate the entire conclusion 470 | # store slide ids associated to the current block diagnosis 471 | slide_ids = [] 472 | # get patient ID to store conclusions field 473 | pid = '_'.join(bid.split('_')[:3]) 474 | wconc = [report_conc_keys[sid] for sid in report_conc_keys.keys() if pid in sid and type(report_conc_keys[sid]) == str] 475 | # store the whole 'conclusions' field 476 | proc_reports[bid]['diagnosis'] = wconc[0] 477 | for sid in report_conc_keys.keys(): 478 | if bid in sid: # Block ID found within report ID 479 | if 'v' not in sid.lower() and '-' not in sid: # report does not contain special characters 480 | block_part = sid.split('_')[-1] 481 | if len(block_part) < 4: # slide ID not available 482 | continue 483 | else: # slide ID available 484 | slide_ids.append(sid[-2:]) 485 | elif 'v' in sid.lower(): # report contains slide ID first variant (i.e., _V0*) 486 | block_part = sid.split('_')[-2] 487 | if len(block_part) < 4: # slide ID not available 488 | slide_ids.append(sid.split('_')[-1]) 489 | else: # slide ID available 490 | slide_ids.append(sid.split('_')[-2][-2:] + '_' + sid.split('_')[-1]) 491 | elif '-' in sid: # report contains slide ID second variant (i.e., -*) 492 | block_part = sid.split('_')[-1].split('-')[0] 493 | if len(block_part) < 4: # slide ID not available 494 | slide_ids.append(sid.split('-')[1]) 495 | else: # slide ID available 496 | slide_ids.append(sid.split('-')[0][-2:] + '-' + sid.split('-')[1]) 497 | proc_reports[bid]['slide_ids'] = slide_ids 498 | else: # associate the given conclusions to the corresponding blocks 499 | # loop over conclusions and fill proc_reports 500 | for cid, cdata in conclusions.items(): 501 | block_ix = roman.fromRoman(cid) # convert conclusion id (roman number) into corresponding arabic number (i.e., block index) 502 | if block_ix in block_ix2id: # block with bloc_ix present within dataset 503 | # create dict to store block diagnosis and slide ids 504 | proc_reports[block_ix2id[block_ix]] = dict() 505 | # store conclusion - i.e., the final diagnosis 506 | proc_reports[block_ix2id[block_ix]]['diagnosis'] = cdata 507 | # store slide ids associated to the current block diagnosis 508 | slide_ids = [] 509 | for sid in report_conc_keys.keys(): 510 | if block_ix2id[block_ix] in sid: # Block ID found within report ID 511 | if 'v' not in sid.lower() and '-' not in sid: # report does not contain special characters 512 | block_part = sid.split('_')[-1] 513 | if len(block_part) < 4: # slide ID not available 514 | continue 515 | else: # slide ID available 516 | slide_ids.append(sid[-2:]) 517 | elif 'v' in sid.lower(): # report contains slide ID first variant (i.e., _V0*) 518 | block_part = sid.split('_')[-2] 519 | if len(block_part) < 4: # slide ID not available 520 | slide_ids.append(sid.split('_')[-1]) 521 | else: # slide ID available 522 | slide_ids.append(sid.split('_')[-2][-2:] + '_' + sid.split('_')[-1]) 523 | elif '-' in sid: # report contains 
slide ID second variant (i.e., -*) 524 | block_part = sid.split('_')[-1].split('-')[0] 525 | if len(block_part) < 4: # slide ID not available 526 | slide_ids.append(sid.split('-')[1]) 527 | else: # slide ID available 528 | slide_ids.append(sid.split('-')[0][-2:] + '-' + sid.split('-')[1]) 529 | proc_reports[block_ix2id[block_ix]]['slide_ids'] = slide_ids 530 | if debug: 531 | print('number of missplitted reports: {}'.format(misplitted_reports)) 532 | print('number of unsplitted reports: {}'.format(unsplitted_reports)) 533 | print('skipped reports:') 534 | print(skipped_reports) 535 | return proc_reports 536 | 537 | def radboud_process_data_v2(self, dataset): 538 | """ 539 | Read Radboud reports and extract the required fields (v2 used for anonymized datasets) 540 | 541 | Params: 542 | dataset (pandas DataFrame): target dataset 543 | 544 | Returns: a dict containing the required report fields 545 | """ 546 | 547 | proc_reports = dict() 548 | for report in tqdm(dataset.itertuples()): 549 | if 'Microscopy' in report._fields: # first batch of Radboud reports 550 | rid = str(report.Studynumber).strip() 551 | else: # subsequent anonymized batches of Radboud reports 552 | rid = str(report._3).strip() + '_A' # '_A' stands for anonymized report 553 | if report.Conclusion: # split conclusions and associate to each block the corresponding conclusion 554 | # split conclusions into sections 555 | conclusions = self.radboud_split_conclusions(utils.nl_sanitize_record(report.Conclusion.lower(), self.use_case)) 556 | 557 | if 'whole' in conclusions: # unable to split conclusions - either single conclusion or not appropriately specified 558 | # create block id 559 | bid = rid + '_1' 560 | # create dict to store block diagnosis 561 | proc_reports[bid] = dict() 562 | # store conclusion - i.e., the final diagnosis 563 | proc_reports[bid]['diagnosis'] = conclusions['whole'] 564 | 565 | else: 566 | # get conclusions IDs 567 | cid2ix = {cid: roman.fromRoman(cid) for cid in conclusions.keys()} 568 | for cid, cix in cid2ix.items(): 569 | # create block id 570 | bid = rid + '_' + str(cix) 571 | # create dict to store block diagnosis 572 | proc_reports[bid] = dict() 573 | # store conclusion - i.e., the final diagnosis 574 | proc_reports[bid]['diagnosis'] = conclusions[cid] 575 | return proc_reports 576 | 577 | def radboud_process_celiac_data(self, dataset): 578 | """ 579 | Read Radboud reports and extract the required fields (used for celiac datasets) 580 | 581 | Params: 582 | dataset (pandas DataFrame): target dataset 583 | 584 | Returns: a dict containing the required report fields 585 | """ 586 | 587 | proc_reports = dict() 588 | for report in tqdm(dataset.itertuples()): 589 | rid = str(report.Studynumber).strip() 590 | if type(report._7) == str: 591 | if any(report._6.strip() == k for k in ['alle', 'aIle', 'aI', 'al', 'a', '?']): # single conclusion 592 | 593 | # create dict to store block diagnosis 594 | proc_reports[rid] = dict() 595 | # store conclusion - i.e., the final diagnosis 596 | proc_reports[rid]['diagnosis'] = utils.nl_sanitize_record(report._7.lower(), self.use_case) 597 | # Add other fields 598 | proc_reports[rid]['tissue'] = report._8 if type(report._8) == str else '' 599 | proc_reports[rid]['procedure'] = report._9 if type(report._9) == str else '' 600 | proc_reports[rid]['short'] = [] 601 | for short in [report.short1, report.short2, report.short3]: 602 | if type(short) == str: 603 | proc_reports[rid]['short'].append(short) 604 | proc_reports[rid]['slide_ids'] = 
[rid.lstrip(report.block).split('_')[1].lstrip(report.block.split('_')[3])] 605 | else: # split conclusions and associate to each block the corresponding conclusion 606 | # split conclusions into sections 607 | conclusions = self.radboud_split_conclusions(utils.nl_sanitize_record(report._7.lower(), self.use_case)) 608 | 609 | if 'whole' in conclusions: # unable to split conclusions - either single conclusion or not appropriately specified 610 | # create dict to store block diagnosis 611 | proc_reports[rid] = dict() 612 | # store conclusion - i.e., the final diagnosis 613 | proc_reports[rid]['diagnosis'] = conclusions['whole'] 614 | # Add other fields 615 | proc_reports[rid]['tissue'] = report._8 if type(report._8) == str else '' 616 | proc_reports[rid]['procedure'] = report._9 if type(report._9) == str else '' 617 | proc_reports[rid]['short'] = [] 618 | for short in [report.short1, report.short2, report.short3]: 619 | if type(short) == str: 620 | proc_reports[rid]['short'].append(short) 621 | proc_reports[rid]['slide_ids'] = [rid.lstrip(report.block).split('_')[1].lstrip( 622 | report.block.split('_')[3])] 623 | 624 | else: 625 | # get conclusions IDs 626 | cid2ix = {cid: roman.fromRoman(cid) for cid in conclusions.keys()} 627 | for cid, cix in cid2ix.items(): 628 | numbers = [n.strip() for n in report._6.split('&')] 629 | if cid in numbers: 630 | # create block id 631 | bid = rid + '_' + str(cix) 632 | # create dict to store block diagnosis 633 | proc_reports[bid] = dict() 634 | # store conclusion - i.e., the final diagnosis 635 | proc_reports[bid]['diagnosis'] = conclusions[cid] 636 | # Add other fields 637 | proc_reports[bid]['tissue'] = report._8 if type(report._8) == str else '' 638 | proc_reports[bid]['procedure'] = report._9 if type(report._9) == str else '' 639 | proc_reports[bid]['short'] = [] 640 | for short in [report.short1, report.short2, report.short3]: 641 | if type(short) == str: 642 | proc_reports[bid]['short'].append(short) 643 | proc_reports[bid]['slide_ids'] = [rid.lstrip(report.block).split('_')[1].lstrip( 644 | report.block.split('_')[3])] 645 | 646 | return proc_reports 647 | 648 | def radboud_translate_celiac_reports(self, reports): 649 | """ 650 | Translate processed reports for celiac use-case 651 | 652 | Params: 653 | reports (dict): processed reports 654 | 655 | Returns: translated reports 656 | """ 657 | 658 | trans_reports = copy.deepcopy(reports) 659 | print('translate text') 660 | # translate text 661 | for rid, report in tqdm(trans_reports.items()): 662 | trans_reports[rid]['diagnosis'] = self.translate_text(report['diagnosis']) 663 | if report['tissue'] != '': 664 | trans_reports[rid]['tissue'] = self.translate_text(report['tissue']) 665 | if report['procedure'] != '': 666 | trans_reports[rid]['procedure'] = self.translate_text(report['procedure']) 667 | # List of translated shorts 668 | tmp = [] 669 | for short in report['short']: 670 | tmp.append(self.translate_text(short)) 671 | trans_reports[rid]['short'] = tmp 672 | return trans_reports 673 | 674 | def radboud_translate_reports(self, reports): 675 | """ 676 | Translate processed reports 677 | 678 | Params: 679 | reports (dict): processed reports 680 | 681 | Returns: translated reports 682 | """ 683 | 684 | trans_reports = copy.deepcopy(reports) 685 | print('translate text') 686 | # translate text 687 | for rid, report in tqdm(trans_reports.items()): 688 | trans_reports[rid]['diagnosis'] = self.translate_text(report['diagnosis']) 689 | return trans_reports 690 | 691 | # GENERAL-PURPOSE FUNCTIONS 692 | 
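    # The readers below each return a list of plain report dicts. A minimal
    # JSON input for read_json_reports (illustrative) looks like:
    #
    #   {"reports": [{"id": "r1", "text": "colon biopsy: tubular adenoma.",
    #                 "age": 64, "gender": "M"}]}
    #
    # process_data() then pops 'id', 'age', and 'gender' and joins the remaining
    # fields (restricted to self.fields when provided) into the report text.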
693 | def read_xls_reports(self, dataset): 694 | """ 695 | Read reports from xls file 696 | 697 | Params: 698 | dataset (str): target dataset 699 | 700 | Returns: a list containing dataset report(s) 701 | """ 702 | 703 | if dataset.split('.')[-1] == 'xlsx': # read input file as xlsx object 704 | ds = pd.read_excel(io=dataset, header=0, engine='openpyxl') 705 | else: # read input file as xls object 706 | ds = pd.read_excel(io=dataset, header=0) 707 | 708 | reports = [] 709 | for report in tqdm(ds.itertuples(index=False)): # convert raw dataset into list containing report(s) 710 | reports.append({field: report[ix] for ix, field in enumerate(report._fields)}) 711 | # return report(s) 712 | return reports 713 | 714 | def read_csv_reports(self, dataset): 715 | """ 716 | Read reports from csv file 717 | 718 | Params: 719 | dataset (str): target dataset 720 | 721 | Returns: a list containing dataset report(s) 722 | """ 723 | # read input file as csv object 724 | ds = pd.read_csv(filepath_or_buffer=dataset, sep=' ', header=0) 725 | 726 | reports = [] 727 | for report in tqdm(ds.itertuples(index=False)): # convert raw dataset into list containing report(s) 728 | reports.append({field: report[ix] for ix, field in enumerate(report._fields)}) 729 | # return report(s) 730 | return reports 731 | 732 | def read_json_reports(self, dataset): 733 | """ 734 | Read reports from JSON file 735 | 736 | Params: 737 | dataset (str): target dataset 738 | 739 | Returns: a list containing dataset report(s) 740 | """ 741 | 742 | with open(dataset, 'r') as dsf: 743 | ds = json.load(dsf) 744 | 745 | if 'reports' in ds: # dataset consists of several reports 746 | reports = ds['reports'] 747 | else: # dataset consists of single report 748 | reports = [ds] 749 | # return report(s) 750 | return reports 751 | 752 | def read_stream_reports(self, dataset): 753 | """ 754 | Read reports from stream input 755 | 756 | Params: 757 | dataset (dict): target dataset 758 | 759 | Returns: a list containing dataset report(s) 760 | """ 761 | 762 | if 'reports' in dataset: # dataset consists of several reports 763 | reports = dataset['reports'] 764 | else: # dataset consists of single report 765 | reports = [dataset] 766 | # return report(s) 767 | return reports 768 | 769 | def process_data(self, dataset, debug=False): 770 | """ 771 | Read reports and extract the required fields 772 | 773 | Params: 774 | dataset (dict): target dataset 775 | debug (bool): whether to keep flags for debugging 776 | 777 | Returns: a dict containing the required report fields 778 | """ 779 | 780 | if type(dataset) == str: # dataset passed as input file 781 | if dataset.split('.')[-1] == 'json': # read input file as JSON object 782 | reports = self.read_json_reports(dataset) 783 | elif dataset.split('.')[-1] == 'xlsx' or dataset.split('.')[-1] == 'xls': # read input file as xlsx or xls object 784 | reports = self.read_xls_reports(dataset) 785 | elif dataset.split('.')[-1] == 'csv': # read input file as csv or csv object 786 | reports = self.read_csv_reports(dataset) 787 | else: # raise exception 788 | print('Format required for input: JSON, xls, xlsx or csv.') 789 | raise Exception 790 | else: # dataset passed as stream dict 791 | reports = self.read_stream_reports(dataset) 792 | 793 | proc_reports = {} 794 | # process reports and concat fields 795 | for report in reports: 796 | if 'id' in report: 797 | rid = report.pop('id') # use provided id 798 | else: 799 | rid = str(uuid.uuid4()) # generate uuid 800 | 801 | if 'age' in report: # get age from report 802 | if 
self.is_empty(report['age']): 803 | age = None 804 | else: 805 | age = report.pop('age') 806 | else: # set age to None 807 | age = None 808 | 809 | if 'gender' in report: # get gender from report 810 | if self.is_empty(report['gender']): 811 | gender = None 812 | else: 813 | gender = report.pop('gender') 814 | else: # set gender to None 815 | gender = None 816 | 817 | if self.fields: # report fields specified -- restrict to self.fields 818 | fields = [field for field in report.keys() if field in self.fields] 819 | else: # report fields not specified -- keep report fields 820 | fields = [field for field in report.keys()] 821 | report_fields = [report[field] if report[field].endswith('.') else report[field] + '.' for field in fields] 822 | text = ' '.join(report_fields) 823 | 824 | # prepare processed report 825 | proc_reports[rid] = {'text': text, 'age': age, 'gender': gender} 826 | return proc_reports 827 | 828 | def translate_reports(self, reports): 829 | """ 830 | Translate reports 831 | 832 | Params: 833 | reports (dict): reports 834 | 835 | Returns: translated reports 836 | """ 837 | 838 | trans_reports = copy.deepcopy(reports) 839 | print('translate text') 840 | # translate text 841 | for rid, report in tqdm(trans_reports.items()): 842 | trans_reports[rid]['text'] = self.translate_text(report['text']) 843 | return trans_reports 844 | -------------------------------------------------------------------------------- /sket/rep_proc/rules/report_fields.txt: -------------------------------------------------------------------------------- 1 | text -------------------------------------------------------------------------------- /sket/sket.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | import json 4 | 5 | from .rep_proc.report_processing import ReportProc 6 | from .ont_proc.ontology_processing import OntoProc 7 | from .nerd.nerd import NERD 8 | from .rdf_proc.rdf_processing import RDFProc 9 | 10 | from .utils import utils 11 | 12 | 13 | class SKET(object): 14 | 15 | def __init__( 16 | self, 17 | use_case, src_lang, 18 | biospacy="en_core_sci_sm", biow2v=True, biofast=None, biobert=None, str_match=False, gpu=None, rules=None, dysplasia_mappings=None, cin_mappings=None, 19 | ontology_path=None, hierarchies_path=None, 20 | fields_path=None 21 | ): 22 | """ 23 | Load SKET components 24 | 25 | Params: 26 | SKET: 27 | use_case (str): considered use case 28 | src_lang (str): considered language 29 | NERD: 30 | biospacy (str): full spaCy pipeline for biomedical data 31 | biow2v (bool): whether to use biospacy to perform semantic matching or not 32 | biofast (str): biomedical fasttext model 33 | biobert (str): biomedical bert model 34 | str_match (bool): string matching 35 | gpu (int): use gpu when using BERT 36 | rules (str): hand-crafted rules file path 37 | dysplasia_mappings (str): dysplasia mappings file path 38 | cin_mappings (str): cin mappings file path 39 | OntoProc: 40 | ontology_path (str): ontology.owl file path 41 | hierarchies_path (str): hierarchy relations file path 42 | ReportProc: 43 | fields_path (str): report fields file path 44 | 45 | Returns: None 46 | """ 47 | 48 | # load Named Entity Recognition and Disambiguation (NERD) 49 | self.nerd = NERD(biospacy, biow2v, str_match, biofast, biobert, rules, dysplasia_mappings, cin_mappings, gpu) 50 | # load Ontology Processing (OntoProc) 51 | self.onto_proc = OntoProc(ontology_path, hierarchies_path) 52 | # load Report Processing (ReportProc) 53 | self.rep_proc = 
ReportProc(src_lang, use_case, fields_path)
54 |         # load RDF Processing (RDFProc)
55 |         self.rdf_proc = RDFProc()
56 | 
57 |         # define set of ad hoc labeling operations @smarchesin TODO: add 'custom' to lung too if required
58 |         self.ad_hoc_exa_labeling = {
59 |             'aoec': {
60 |                 'colon': {
61 |                     'original': utils.aoec_colon_concepts2labels,
62 |                     'custom': utils.aoec_colon_labels2binary},
63 |                 'cervix': {
64 |                     'original': utils.aoec_cervix_concepts2labels,
65 |                     'custom': utils.aoec_cervix_labels2aggregates},
66 |                 'lung': {
67 |                     'original': utils.aoec_lung_concepts2labels},
68 |                 'celiac': {
69 |                     'original': utils.aoec_celiac_concepts2labels}
70 |             },
71 |             'radboud': {
72 |                 'colon': {
73 |                     'original': utils.radboud_colon_concepts2labels,
74 |                     'custom': utils.radboud_colon_labels2binary},
75 |                 'cervix': {
76 |                     'original': utils.radboud_cervix_concepts2labels,
77 |                     'custom': utils.radboud_cervix_labels2aggregates},
78 |                 'celiac': {
79 |                     'original': utils.radboud_celiac_concepts2labels}
80 |             }
81 |         }
82 | 
83 |         self.ad_hoc_med_labeling = {
84 |             'colon': {
85 |                 'original': utils.colon_concepts2labels,
86 |                 'custom': utils.colon_labels2binary
87 |             },
88 |             'cervix': {
89 |                 'original': utils.cervix_concepts2labels,
90 |                 'custom': utils.cervix_labels2aggregates
91 |             },
92 |             'lung': {
93 |                 'original': utils.lung_concepts2labels
94 |             },
95 |             'celiac': {
96 |                 'original': utils.celiac_concepts2labels
97 |             }
98 |         }
99 | 
100 |         # set use case
101 |         self.use_case = use_case
102 |         # restrict hand-crafted rules and mappings based on use case
103 |         self.nerd.restrict2use_case(use_case)
104 |         # restrict onto concepts to the given use case
105 |         self.onto = self.onto_proc.restrict2use_case(use_case)
106 |         # restrict concept preferred terms (i.e., labels) given the use case
107 |         self.onto_terms = self.nerd.process_ontology_concepts([term.lower() for term in self.onto['label'].tolist()])
108 | 
109 |     def update_nerd(
110 |             self,
111 |             biospacy="en_core_sci_lg", biofast=None, biobert=None, str_match=False, rules=None, dysplasia_mappings=None, cin_mappings=None, gpu=None):
112 |         """
113 |         Update NERD model w/ input parameters
114 | 
115 |         Params:
116 |             biospacy (str): full spaCy pipeline for biomedical data
117 |             biofast (str): biomedical fasttext model
118 |             biobert (str): biomedical bert model
119 |             str_match (bool): string matching
120 |             rules (str): hand-crafted rules file path
121 |             dysplasia_mappings (str): dysplasia mappings file path
122 |             cin_mappings (str): cin mappings file path
123 |             gpu (int): use gpu when using BERT
124 | 
125 |         Returns: None
126 |         """
127 | 
128 |         # update nerd model -- keep the same argument order used in __init__ (biow2v stays at its default True)
129 |         self.nerd = NERD(biospacy, True, str_match, biofast, biobert, rules, dysplasia_mappings, cin_mappings, gpu)
130 |         # restrict hand-crafted rules and mappings based on current use case
131 |         self.nerd.restrict2use_case(self.use_case)
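132 | 

# A minimal usage sketch for the class above (illustrative only; the fastText
# model path is a hypothetical placeholder):
#
#   from sket.sket import SKET
#
#   sket = SKET('colon', 'en')                       # spaCy-based semantic matching by default
#   sket.update_nerd(biofast='./models/bio_ft.bin')  # e.g., swap the NERD matcher for a fastText model

133 |     def update_usecase(self, use_case):
134 |         """
135 |         Update use case and dependent functions
136 | 
137 |         Params:
138 |             use_case (str): considered use case
139 | 
140 |         Returns: None
141 |         """
142 | 
143 |         if use_case not in ['colon', 'cervix', 'lung', 'celiac']: # raise exception
144 |             print('current supported use cases are: "colon", "cervix", "lung" and "celiac"')
145 |             raise Exception
146 |         # set use case
147 |         self.use_case = use_case
148 |         # update report processing
149 |         self.rep_proc.update_usecase(self.use_case)
150 |         # restrict hand-crafted rules and mappings based on use case
151 |         self.nerd.restrict2use_case(use_case)
152 |         # restrict onto concepts to the given use case
153 |         self.onto = 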
self.onto_proc.restrict2use_case(use_case) 154 | # restrict concept preferred terms (i.e., labels) given the use case 155 | self.onto_terms = self.nerd.process_ontology_concepts([term.lower() for term in self.onto['label'].tolist()]) 156 | 157 | def update_nmt(self, src_lang): 158 | """ 159 | Update NMT model changing source language 160 | 161 | Params: 162 | src_lang (str): considered source language 163 | 164 | Returns: None 165 | """ 166 | 167 | # update NMT model 168 | self.rep_proc.update_nmt(src_lang) 169 | 170 | def update_report_fields(self, fields): 171 | """ 172 | Update report fields changing current ones 173 | 174 | Params: 175 | fields (list): report fields 176 | 177 | Returns: None 178 | """ 179 | 180 | # update report fields 181 | self.rep_proc.fields = fields 182 | 183 | @staticmethod 184 | def store_reports(reports, r_path): 185 | """ 186 | Store reports 187 | 188 | Params: 189 | reports (dict): reports 190 | r_path (str): reports file path 191 | 192 | Returns: None 193 | """ 194 | 195 | with open(r_path, 'w') as out: 196 | json.dump(reports, out, indent=4) 197 | 198 | @staticmethod 199 | def load_reports(r_fpath): 200 | """ 201 | Load reports 202 | 203 | Params: 204 | r_fpath (str): reports file path 205 | 206 | Returns: reports 207 | """ 208 | 209 | with open(r_fpath, 'r') as rfp: 210 | reports = json.load(rfp) 211 | return reports 212 | 213 | @staticmethod 214 | def store_concepts(concepts, c_fpath): 215 | """ 216 | Store extracted concepts as JSON dict 217 | 218 | Params: 219 | concepts (dict): dict containing concepts extracted from reports 220 | c_fpath (str): concepts file path 221 | 222 | Returns: None 223 | """ 224 | 225 | utils.store_concepts(concepts, c_fpath) 226 | 227 | @staticmethod 228 | def store_labels(labels, l_fpath): 229 | """ 230 | Store mapped labels as JSON dict 231 | 232 | Params: 233 | labels (dict): dict containing labels mapped from extracted concepts 234 | l_fpath (str): labels file path 235 | 236 | Returns: None 237 | """ 238 | 239 | utils.store_labels(labels, l_fpath) 240 | 241 | def store_rdf_graphs(self, graphs, g_fpath, rdf_format='turtle'): 242 | """ 243 | Store RDF graphs w/ RDF serialization format 244 | 245 | Params: 246 | graphs (list): list containing (s,p,o) triples representing ExaMode report(s) 247 | g_fpath (str): graphs file path 248 | rdf_format (str): RDF format used to serialize graphs 249 | 250 | Returns: serialized report graph when g_fpath == 'stream' or boolean when g_fpath != 'stream' 251 | """ 252 | 253 | if rdf_format not in ['turtle', 'n3', 'trig']: # raise exception 254 | print('provide correct format: "turtle", "n3", or "trig".') 255 | raise Exception 256 | 257 | if g_fpath != 'stream': # check that file type and rdf format coincide 258 | ftype = g_fpath.split('.')[-1] 259 | ftype = 'turtle' if ftype == 'ttl' else ftype 260 | assert ftype == rdf_format 261 | 262 | return self.rdf_proc.serialize_report_graphs(graphs, output=g_fpath, rdf_format=rdf_format) 263 | 264 | @staticmethod 265 | def store_json_graphs(graphs, g_fpath): 266 | """ 267 | Store RDF graphs w/ JSON serialization format 268 | 269 | Params: 270 | graphs (dict): dict containing (s,p,o) triples representing ExaMode report(s) 271 | g_fpath (str): graphs file path 272 | 273 | Returns: None 274 | """ 275 | 276 | os.makedirs(os.path.dirname(g_fpath), exist_ok=True) 277 | 278 | with open(g_fpath, 'w') as out: 279 | json.dump(graphs, out, indent=4) 280 | 281 | # EXAMODE RELATED FUNCTIONS 282 | 283 | def prepare_exa_dataset(self, ds_fpath, sheet, header, 
hospital, ver, ds_name=None, debug=False):
284 |         """
285 |         Prepare ExaMode batch data to perform NERD
286 | 
287 |         Params:
288 |             ds_fpath (str): examode dataset file path
289 |             sheet (str): name of the excel sheet to use
290 |             header (int): row index used as header
291 |             hospital (str): considered hospital
292 |             ver (int): data format version
293 |             ds_name (str): dataset name
294 |             debug (bool): whether to keep flags for debugging
295 | 
296 | 
297 |         Returns: translated, split, and prepared dataset
298 |         """
299 | 
300 |         # get dataset name from file path if not provided
301 |         if not ds_name:
302 |             ds_name = ds_fpath.split('/')[-1].split('.')[0] # ./dataset/raw/aoec/####.csv
303 |         # set output directories
304 |         proc_out = './dataset/processed/' + hospital + '/' + self.use_case + '/'
305 |         trans_out = './dataset/translated/' + hospital + '/' + self.use_case + '/'
306 | 
307 |         if os.path.isfile(trans_out + ds_name + '.json'): # translated reports file already exists
308 |             print('translated reports file already exists -- remove it before running "exa_pipeline" to reprocess it')
309 |             trans_reports = self.load_reports(trans_out + ds_name + '.json')
310 |             return trans_reports
311 |         elif os.path.isfile(proc_out + ds_name + '.json'): # processed reports file already exists
312 |             print('processed reports file already exists -- remove it before running "exa_pipeline" to reprocess it')
313 |             proc_reports = self.load_reports(proc_out + ds_name + '.json')
314 |             if hospital == 'aoec':
315 |                 # translate reports
316 |                 trans_reports = self.rep_proc.aoec_translate_reports(proc_reports)
317 |             elif hospital == 'radboud':
318 |                 if self.use_case == 'celiac':
319 |                     # translate celiac reports
320 |                     trans_reports = self.rep_proc.radboud_translate_celiac_reports(proc_reports)
321 |                 else:
322 |                     # translate reports
323 |                     trans_reports = self.rep_proc.radboud_translate_reports(proc_reports)
324 |             else: # raise exception
325 |                 print('provide correct hospital info: "aoec" or "radboud"')
326 |                 raise Exception
327 | 
328 |             if not os.path.exists(trans_out): # dir not exists -- make it
329 |                 os.makedirs(trans_out)
330 |             # store translated reports
331 |             self.store_reports(trans_reports, trans_out + ds_name + '.json')
332 | 
333 |             return trans_reports
334 |         else: # neither processed nor translated reports files exist
335 |             # load dataset
336 |             dataset = self.rep_proc.load_dataset(ds_fpath, sheet, header)
337 | 
338 |             if hospital == 'aoec':
339 |                 if ver == 1: # process data using method v1
340 |                     proc_reports = self.rep_proc.aoec_process_data(dataset)
341 |                 else: # process data using method v2
342 |                     proc_reports = self.rep_proc.aoec_process_data_v2(dataset, debug=debug)
343 | 
344 |                 # translate reports
345 |                 trans_reports = self.rep_proc.aoec_translate_reports(proc_reports)
346 |             elif hospital == 'radboud':
347 |                 if self.use_case == 'celiac':
348 |                     proc_reports = self.rep_proc.radboud_process_celiac_data(dataset)
349 |                 elif ver == 1: # process data using method v1
350 |                     proc_reports = self.rep_proc.radboud_process_data(dataset, debug=debug)
351 |                 else: # process data using method v2
352 |                     proc_reports = self.rep_proc.radboud_process_data_v2(dataset)
353 |                 if self.use_case == 'celiac':
354 |                     # translate reports
355 |                     trans_reports = self.rep_proc.radboud_translate_celiac_reports(proc_reports)
356 |                 else:
357 |                     # translate reports
358 |                     trans_reports = self.rep_proc.radboud_translate_reports(proc_reports)
359 |             else: # raise exception
360 |                 print('provide correct hospital info: "aoec" or "radboud"')
361 | 
raise Exception 362 | 363 | if not os.path.exists(proc_out): # dir not exists -- make it 364 | os.makedirs(proc_out) 365 | # store processed reports 366 | self.store_reports(proc_reports, proc_out + ds_name + '.json') 367 | if not os.path.exists(trans_out): # dir not exists -- make it 368 | os.makedirs(trans_out) 369 | # store translated reports 370 | self.store_reports(trans_reports, trans_out + ds_name + '.json') 371 | 372 | return trans_reports 373 | 374 | def exa_entity_linking(self, reports, hospital, sim_thr=0.7, raw=False, debug=False): 375 | """ 376 | Perform entity linking based on ExaMode reports structure and data 377 | 378 | Params: 379 | reports (dict): dict containing reports -- can be either one or many 380 | hospital (str): considered hospital 381 | sim_thr (float): keep candidates with sim score greater than or equal to sim_thr 382 | raw (bool): whether to return concepts within semantic areas or mentions+concepts 383 | debug (bool): whether to keep flags for debugging 384 | 385 | Returns: a dict containing concepts from input reports 386 | """ 387 | 388 | # perform entity linking 389 | if hospital == 'aoec': # AOEC data 390 | concepts = self.nerd.aoec_entity_linking(reports, self.onto_proc, self.onto, self.onto_terms, self.use_case, sim_thr, raw, debug=debug) 391 | elif hospital == 'radboud': # Radboud data 392 | concepts = self.nerd.radboud_entity_linking(reports, self.onto, self.onto_terms, self.use_case, sim_thr, raw, debug=debug) 393 | else: # raise exception 394 | print('provide correct hospital info: "aoec" or "radboud"') 395 | raise Exception 396 | return concepts 397 | 398 | def exa_labeling(self, concepts, hospital): 399 | """ 400 | Map extracted concepts to pre-defined labels 401 | 402 | Params: 403 | concepts (dict): dict containing concepts extracted from report(s) 404 | hospital (str): considered hospital 405 | 406 | Returns: a dict containing labels from input report(s) 407 | """ 408 | 409 | if hospital not in ['aoec', 'radboud']: 410 | print('provide correct hospital info: "aoec" or "radboud"') 411 | raise Exception 412 | labels = self.ad_hoc_exa_labeling[hospital][self.use_case]['original'](concepts) 413 | return labels 414 | 415 | def create_exa_graphs(self, reports, concepts, hospital, struct=False, debug=False): 416 | """ 417 | Create report graphs in RDF format 418 | 419 | Params: 420 | reports (dict): dict containing reports -- can be either one or many 421 | concepts (dict): dict containing concepts extracted from report(s) 422 | hospital (str): considered hospital 423 | struct (bool): whether to return graphs structured as dict 424 | debug (bool): whether to keep flags for debugging 425 | 426 | Returns: list of (s,p,o) triples representing report graphs and dict structuring report graphs (if struct==True) 427 | """ 428 | 429 | if hospital == 'aoec': # AOEC data 430 | create_graph = self.rdf_proc.aoec_create_graph 431 | elif hospital == 'radboud': # Radboud data 432 | create_graph = self.rdf_proc.radboud_create_graph 433 | else: # raise exception 434 | print('provide correct hospital info: "aoec" or "radboud"') 435 | raise Exception 436 | 437 | rdf_graphs = [] 438 | struct_graphs = [] 439 | # convert report data into (s,p,o) triples 440 | for rid in reports.keys(): 441 | rdf_graph, struct_graph = create_graph(rid, reports[rid], concepts[rid], self.onto_proc, self.use_case, debug=debug) 442 | rdf_graphs.append(rdf_graph) 443 | struct_graphs.append(struct_graph) 444 | if struct: # return both rdf and dict graphs 445 | return rdf_graphs, struct_graphs 
446 |         else:
447 |             return rdf_graphs
448 | 
449 |     def exa_pipeline(self, ds_fpath, sheet, header, ver, use_case=None, hosp=None, sim_thr=0.7, raw=False, debug=False):
450 |         """
451 |         Perform the complete SKET pipeline over ExaMode data:
452 |             - (i) Load dataset
453 |             - (ii) Process dataset
454 |             - (iii) Translate dataset
455 |             - (iv) Perform entity linking and store concepts
456 |             - (v) Perform labeling and store labels
457 |             - (vi) Create RDF graphs and store graphs
458 | 
459 |         Params:
460 |             ds_fpath (str): dataset file path
461 |             sheet (str): name of the excel sheet to use
462 |             header (int): row index used as header
463 |             ver (int): data format version
464 |             use_case (str): considered use case
465 |             hosp (str): considered hospital
466 |             sim_thr (float): keep candidates with sim score greater than or equal to sim_thr
467 |             raw (bool): whether to return concepts within semantic areas or mentions+concepts
468 |             debug (bool): whether to keep flags for debugging
469 | 
470 |         Returns: extracted concepts when raw == True, None otherwise
471 |         """
472 | 
473 |         if use_case: # update to input use case
474 |             self.update_usecase(use_case)
475 | 
476 |         # get dataset name
477 |         ds_name = ds_fpath.split('/')[-1].split('.')[0] # ./dataset/raw/aoec/####.csv
478 | 
479 |         if hosp: # update to input hospital
480 |             if hosp not in ['aoec', 'radboud']:
481 |                 print('provide correct hospital info: "aoec" or "radboud"')
482 |                 raise Exception
483 |             else:
484 |                 hospital = hosp
485 |         else:
486 |             # get hospital name
487 |             hospital = ds_fpath.split('/')[-2] # ./dataset/raw/ --> aoec <-- /####.csv
488 | 
489 |         # set output directories
490 |         if raw: # return mentions+concepts (used for EXATAG)
491 |             concepts_out = './outputs/concepts/raw/' + hospital + '/' + self.use_case + '/'
492 |         else: # perform complete pipeline (used for SKET/CERT/EXANET)
493 |             concepts_out = './outputs/concepts/refined/' + hospital + '/' + self.use_case + '/'
494 |             labels_out = './outputs/labels/' + hospital + '/' + self.use_case + '/'
495 |             rdf_graphs_out = './outputs/graphs/rdf/' + hospital + '/' + self.use_case + '/'
496 |             struct_graphs_out = './outputs/graphs/json/' + hospital + '/' + self.use_case + '/'
497 | 
498 |         # prepare dataset
499 |         reports = self.prepare_exa_dataset(ds_fpath, sheet, header, hospital, ver, ds_name, debug=debug)
500 | 
501 |         # perform entity linking
502 |         concepts = self.exa_entity_linking(reports, hospital, sim_thr, raw, debug=debug)
503 |         # store concepts
504 |         self.store_concepts(concepts, concepts_out + 'concepts_' + ds_name + '.json')
505 |         if raw: # return mentions+concepts
506 |             return concepts
507 | 
508 |         # perform labeling
509 |         labels = self.exa_labeling(concepts, hospital)
510 |         # store labels
511 |         self.store_labels(labels, labels_out + 'labels_' + ds_name + '.json')
512 |         # create RDF graphs
513 |         rdf_graphs, struct_graphs = self.create_exa_graphs(reports, concepts, hospital, struct=True, debug=debug)
514 |         # store RDF graphs
515 |         self.store_rdf_graphs(rdf_graphs, rdf_graphs_out + 'graphs_' + ds_name + '.n3', 'n3')
516 |         self.store_rdf_graphs(rdf_graphs, rdf_graphs_out + 'graphs_' + ds_name + '.trig', 'trig')
517 |         self.store_rdf_graphs(rdf_graphs, rdf_graphs_out + 'graphs_' + ds_name + '.ttl', 'turtle')
518 |         # store JSON graphs
519 |         self.store_json_graphs(struct_graphs, struct_graphs_out + 'graphs_' + ds_name + '.json')
520 | 
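# A minimal usage sketch for the pipeline above (illustrative only; the file
# path, sheet name, and header row are hypothetical placeholders):
#
#   sket = SKET('colon', 'it')
#   sket.exa_pipeline('./dataset/raw/aoec/colon_batch.xlsx', sheet='Sheet1', header=0, ver=2, hosp='aoec')
#
# Concepts, labels, and graphs then land under ./outputs/ following the
# directory scheme set above.

521 |     # GENERAL-PURPOSE FUNCTIONS
522 | 
523 |     def prepare_med_dataset(self, ds, ds_name, src_lang=None, store=False, debug=False):
524 |         """
525 |         Prepare dataset to perform NERD
526 | 
527 |         Params:
528 |             ds (dict): dataset
529 |             ds_name (str): dataset name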
530 |             src_lang (str): considered language
531 |             store (bool): whether to store the processed and translated reports
532 |             debug (bool): whether to keep flags for debugging
533 | 
534 |         Returns: translated, split, and prepared dataset
535 |         """
536 | 
537 |         # set output directories
538 |         proc_out = './dataset/processed/' + self.use_case + '/'
539 |         trans_out = './dataset/translated/' + self.use_case + '/'
540 | 
541 |         # process reports
542 |         proc_reports = self.rep_proc.process_data(ds, debug=debug)
543 |         if store: # store processed reports
544 |             os.makedirs(proc_out, exist_ok=True)
545 |             self.store_reports(proc_reports, proc_out + ds_name + '.json')
546 | 
547 |         if src_lang != 'en': # translate reports
548 |             trans_reports = self.rep_proc.translate_reports(proc_reports)
549 |         else: # keep processed reports
550 |             trans_reports = proc_reports
551 |         if store: # store translated reports
552 |             os.makedirs(trans_out, exist_ok=True)
553 |             self.store_reports(trans_reports, trans_out + ds_name + '.json')
554 | 
555 |         return trans_reports
556 | 
557 |     def med_entity_linking(self, reports, sim_thr=0.7, raw=False, debug=False):
558 |         """
559 |         Perform entity linking on input reports
560 | 
561 |         Params:
562 |             reports (dict): dict containing reports -- can be either one or many
563 |             sim_thr (float): keep candidates with sim score greater than or equal to sim_thr
564 |             raw (bool): whether to return concepts within semantic areas or mentions+concepts
565 |             debug (bool): whether to keep flags for debugging
566 | 
567 |         Returns: a dict containing concepts from input reports
568 |         """
569 | 
570 |         # perform entity linking
571 |         concepts = self.nerd.entity_linking(reports, self.onto, self.onto_terms, self.use_case, sim_thr, raw, debug=debug)
572 | 
573 |         return concepts
574 | 
575 |     def med_labeling(self, concepts):
576 |         """
577 |         Map extracted concepts to pre-defined labels
578 | 
579 |         Params:
580 |             concepts (dict): dict containing concepts extracted from report(s)
581 | 
582 |         Returns: a dict containing labels from input report(s)
583 |         """
584 | 
585 |         labels = self.ad_hoc_med_labeling[self.use_case]['original'](concepts)
586 |         return labels
587 | 
588 |     def create_med_graphs(self, reports, concepts, struct=False, debug=False):
589 |         """
590 |         Create report graphs in RDF format
591 | 
592 |         Params:
593 |             reports (dict): dict containing reports -- can be either one or many
594 |             concepts (dict): dict containing concepts extracted from report(s)
595 |             struct (bool): whether to return graphs structured as dict
596 |             debug (bool): whether to keep flags for debugging
597 | 
598 |         Returns: list of (s,p,o) triples representing report graphs and dict structuring report graphs (if struct==True)
599 |         """
600 | 
601 |         rdf_graphs = []
602 |         struct_graphs = []
603 |         # convert report data into (s,p,o) triples
604 |         for rid in reports.keys():
605 |             rdf_graph, struct_graph = self.rdf_proc.create_graph(rid, reports[rid], concepts[rid], self.onto_proc, self.use_case, debug=debug)
606 |             rdf_graphs.append(rdf_graph)
607 |             struct_graphs.append(struct_graph)
608 |         if struct: # return both rdf and dict graphs
609 |             return rdf_graphs, struct_graphs
610 |         else:
611 |             return rdf_graphs
612 | 
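# The four methods above compose into med_pipeline below; run step by step they
# look like this (a minimal sketch, assuming a constructed SKET instance named
# sket -- the input dict and dataset name are made up):
#
#   reports = sket.prepare_med_dataset({'reports': [{'text': 'hyperplastic polyp.'}]}, 'demo', src_lang='en')
#   concepts = sket.med_entity_linking(reports, sim_thr=0.7)
#   labels = sket.med_labeling(concepts)
#   rdf_graphs = sket.create_med_graphs(reports, concepts)

613 |     def med_pipeline(self, ds, preprocess, src_lang=None, use_case=None, sim_thr=0.7, store=False, rdf_format='all', raw=False, debug=False):
614 |         """
615 |         Perform the complete SKET pipeline over generic data:
616 |             - (i) Process dataset
617 |             - (ii) Translate dataset
618 |             - (iii) Perform entity linking (and store concepts)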
619 |             - (iv) Perform labeling (and store labels)
620 |             - (v) Create RDF graphs (and store graphs)
621 |             - (vi) Return concepts, labels, and RDF graphs
622 | 
623 |         When raw == True: perform steps i-iii and return mentions+concepts
624 |         When store == True: store concepts, labels, and RDF graphs
625 | 
626 |         Params:
627 |             ds (str or dict): dataset
628 |             preprocess (bool): whether to preprocess data or not
629 |             src_lang (str): considered language
630 |             use_case (str): considered use case
631 |             sim_thr (float): keep candidates with sim score greater than or equal to sim_thr
632 |             store (bool): whether to store concepts, labels, and RDF graphs
633 |             rdf_format (str): RDF format used to serialize graphs
634 |             raw (bool): whether to return concepts within semantic areas or mentions+concepts
635 |             debug (bool): whether to keep flags for debugging
636 | 
637 | 
638 |         Returns: concepts when raw == True; concepts, labels, and serialized RDF graphs when store == False; None otherwise
639 |         """
640 | 
641 |         if use_case: # update to input use case
642 |             self.update_usecase(use_case)
643 | 
644 |         if src_lang: # update to input source language
645 |             self.update_nmt(src_lang)
646 | 
647 |         # set output directories
648 |         if raw: # return mentions+concepts (used for EXATAG)
649 |             concepts_out = './outputs/concepts/raw/' + self.use_case + '/'
650 |         else: # perform complete pipeline (used for SKET/CERT/EXANET)
651 |             concepts_out = './outputs/concepts/refined/' + self.use_case + '/'
652 |             labels_out = './outputs/labels/' + self.use_case + '/'
653 |             rdf_graphs_out = './outputs/graphs/rdf/' + self.use_case + '/'
654 |             struct_graphs_out = './outputs/graphs/json/' + self.use_case + '/'
655 | 
656 |         if preprocess:
657 |             if type(ds) == str:
658 |                 # set dataset name as input file name
659 |                 ds_name = ds.split('/')[-1].split('.')[0]
660 |             else:
661 |                 # set dataset name with random identifier
662 |                 ds_name = str(uuid.uuid4())
663 |             # prepare dataset
664 |             reports = self.prepare_med_dataset(ds, ds_name, src_lang, store, debug=debug)
665 |         else:
666 |             if type(ds) == str and ds.split('.')[-1] == 'json':
667 |                 # set dataset name as input file name
668 |                 ds_name = ds.split('/')[-1].split('.')[0]
669 |                 reports = self.load_reports(ds)
670 |             else: # raise exception
671 |                 print('Format required for input without preprocess step: JSON.')
672 |                 raise Exception
673 | 
674 |         # perform entity linking
675 |         concepts = self.med_entity_linking(reports, sim_thr, raw, debug=debug)
676 |         if store: # store concepts
677 |             self.store_concepts(concepts, concepts_out + 'concepts_' + ds_name + '.json')
678 |         if raw: # return mentions+concepts
679 |             return concepts
680 | 
681 |         # perform labeling
682 |         labels = self.med_labeling(concepts)
683 |         if store: # store labels
684 |             self.store_labels(labels, labels_out + 'labels_' + ds_name + '.json')
685 |         # create RDF graphs
686 |         rdf_graphs, struct_graphs = self.create_med_graphs(reports, concepts, struct=True, debug=debug)
687 |         if store: # store graphs
688 |             # RDF graphs
689 |             if rdf_format in ['all', 'n3']:
690 |                 self.store_rdf_graphs(rdf_graphs, rdf_graphs_out + 'graphs_' + ds_name + '.n3', 'n3')
691 |             if rdf_format in ['all', 'trig']:
692 |                 self.store_rdf_graphs(rdf_graphs, rdf_graphs_out + 'graphs_' + ds_name + '.trig', 'trig')
693 |             if rdf_format in ['all', 'turtle']:
694 |                 self.store_rdf_graphs(rdf_graphs, rdf_graphs_out + 'graphs_' + ds_name + '.ttl', 'turtle')
695 |             # JSON graphs
696 |             self.store_json_graphs(struct_graphs, struct_graphs_out + 'graphs_' + ds_name + '.json')
697 |         else: # return serialized graphs as stream
698 |             if rdf_format == 'all':
699 |                 print('"all" is not 
supported for standard (stream) output.\nSupported RDF serialization formats for stream output are: "n3", "trig", and "turtle".') 700 | raise Exception 701 | else: 702 | rdf_graphs = self.store_rdf_graphs(rdf_graphs, 'stream', rdf_format) 703 | return concepts, labels, rdf_graphs 704 | -------------------------------------------------------------------------------- /sket/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sket/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | 5 | def assign_gpu(tknz_out, gpu): 6 | """ 7 | Assign tokenizer tensors to GPU(s) 8 | 9 | Params: 10 | tknz_out (dict): dict containing tokenizer tensors within CPU 11 | gpu (int): gpu device 12 | 13 | Returns: the dict containing tokenizer tensors within GPU(s) 14 | """ 15 | 16 | if type(gpu) == int: 17 | device = 'cuda:' + str(gpu) 18 | else: 19 | device = 'cpu' 20 | tokens_tensor = tknz_out['input_ids'].to(device) 21 | token_type_ids = tknz_out['token_type_ids'].to(device) 22 | attention_mask = tknz_out['attention_mask'].to(device) 23 | # assign GPU(s) tokenizer tensors to output dict 24 | output = { 25 | 'input_ids': tokens_tensor, 26 | 'token_type_ids': token_type_ids, 27 | 'attention_mask': attention_mask 28 | } 29 | return output 30 | 31 | 32 | def en_sanitize_record(record, use_case): # @smarchesin TODO: define sanitize use-case functions that read replacements from file 33 | """ 34 | Sanitize record to avoid translation errors 35 | 36 | Params: 37 | record (str): target record 38 | 39 | Returns: the sanitized record 40 | """ 41 | 42 | if record: 43 | if use_case == 'colon': 44 | record = record.replace('octopus', 'polyp') 45 | record = record.replace('hairy', 'villous') 46 | record = record.replace('villous adenoma-tubule', 'tubulo-villous adenoma') 47 | record = record.replace('villous adenomas-tubule', 'tubulo-villous adenoma') 48 | record = record.replace('villous adenomas tubule', 'tubulo-villous adenoma') 49 | record = record.replace('tubule adenoma-villous', 'tubulo-villous adenoma') 50 | record = record.replace('tubular adenoma-villous', 'tubulo-villous adenoma') 51 | record = record.replace('villous adenoma tubule-', 'tubulo-villous adenoma ') 52 | record = record.replace('villous adenoma tubule', 'tubulo-villous adenoma') 53 | record = record.replace('tubulovilloso adenoma', 'tubulo-villous adenoma') 54 | record = record.replace('blind', 'caecum') 55 | record = record.replace('cecal', 'caecum') 56 | record = record.replace('rectal', 'rectum') 57 | record = record.replace('sigma', 'sigmoid') 58 | record = record.replace('hyperplasia', 'hyperplastic') # MarianMT translates 'iperplastico' as 'hyperplasia' instead of 'hyperplastic' 59 | record = record.replace('proximal colon', 'right colon') 60 | if use_case == 'cervix': 61 | record = record.replace('octopus', 'polyp') 62 | record = record.replace('his cassock', 'lamina propria') 63 | record = record.replace('tunica propria', 'lamina propria') 64 | record = record.replace('l-sil', 'lsil') 65 | record = record.replace('h-sil', 'hsil') 66 | record = record.replace('cin ii / iii', 'cin23') 67 | record = record.replace('cin iii', 'cin3') 68 | record = record.replace('cin ii', 'cin2') 69 | record = record.replace('cin i', 'cin1') 70 | record = record.replace('cin-iii', 'cin3') 71 | record = record.replace('cin-ii', 'cin2') 72 | record = 
record.replace('cin-i', 'cin1') 73 | record = record.replace('cin1-2', 'cin1 cin2') 74 | record = record.replace('cin2-3', 'cin2 cin3') 75 | record = record.replace('cin-1', 'cin1') 76 | record = record.replace('cin-2', 'cin2') 77 | record = record.replace('cin-3', 'cin3') 78 | record = record.replace('cin 2 / 3', 'cin23') 79 | record = record.replace('cin 2/3', 'cin23') 80 | record = record.replace('cin 1-2', 'cin1 cin2') 81 | record = record.replace('cin 2-3', 'cin2 cin3') 82 | record = record.replace('cin 1', 'cin1') 83 | record = record.replace('cin 2', 'cin2') 84 | record = record.replace('cin 3', 'cin3') 85 | record = record.replace('ii-iii cin', 'cin2 cin3') 86 | record = record.replace('i-ii cin', 'cin1 cin2') 87 | record = record.replace('iii cin', 'cin3') 88 | record = record.replace('ii cin', 'cin2') 89 | record = record.replace('i cin', 'cin1') 90 | record = record.replace('port biopsy', 'portio biopsy') 91 | if use_case == 'celiac': 92 | record = record.replace('villas', 'villi') 93 | record = record.replace('duodonitis', 'duodenitis') 94 | record = record.replace('duodoneitis', 'duodenitis') 95 | record = record.replace('duodonia', 'duodenitis') 96 | record = record.replace('duedenitis', 'duodenitis') 97 | record = record.replace('mucosae', 'mucosa') 98 | record = record.replace('mucous', 'mucosa') 99 | record = record.replace('oedema', 'edema') 100 | record = record.replace('leucocyte', 'leukocyte') 101 | record = record.replace('granulocytes', 'granulocyte') 102 | record = record.replace('eosinophiles', 'eosinophil') 103 | record = record.replace('neutrophiles', 'neutrophil') 104 | record = record.replace('leukocytes', 'leukocyte') 105 | record = record.replace('lymphocytes', 'lymphocyte') 106 | record = record.replace('lymphocytosis', 'lymphocyte') 107 | record = record.replace('enterocytes', 'enterocyte') 108 | record = record.replace('vills', 'villi') 109 | record = record.replace('villous', 'villi') 110 | record = record.replace('villuse', 'villi') 111 | record = record.replace('villus', 'villi') 112 | record = record.replace('cryptes', 'crypts') 113 | record = record.replace('hyperaemia', 'hyperemia') 114 | record = record.replace('antro', 'antrum') 115 | record = record.replace('biopt', 'biopsy') 116 | record = record.replace('biopsys', 'biopsy') 117 | record = record.replace('geen afwijking', 'no abnormalities') 118 | record = record.replace('no deviation', 'no abnormalities') 119 | record = record.replace('no abnormality', 'no abnormalities') 120 | record = record.replace('bioptes', 'biopsy') 121 | record = record.replace('biopsie', 'biopsy') 122 | record = record.replace('duedenum', 'duodenum') 123 | record = record.replace('duodenium', 'duodenum') 124 | record = record.replace('biopsies', 'biopsy') 125 | record = record.replace('coeliac', 'celiac') 126 | record = record.replace('coeliakie', 'celiac disease') 127 | record = record.replace('ontsteking', 'inflammation') 128 | record = record.replace('anthrum', 'antrum') 129 | record = record.replace('corpusbiopts', 'corpus biopsy') 130 | record = record.replace('flokatrophy', 'villi atrophy') 131 | record = record.replace('flocatrophy', 'villi atrophy') 132 | record = record.replace('flake', 'villi') 133 | record = record.replace('bulbus duodeni', 'duodenal bulb') 134 | record = record.replace('eosinophilia', 'eosinophil') 135 | record = record.replace('theduodenum', 'the duodenum') 136 | 137 | return record 138 | 139 | 140 | def nl_sanitize_record(record, use_case): 141 | """ 142 | Sanitize record to avoid 
translation errors
143 |     Params:
144 |         record (str): target record
145 |     Returns: the sanitized record
146 |     """
147 | 
148 |     if record:
149 |         if use_case == 'cervix':
150 |             record = record.replace('cin ii - iii', 'cin2 cin3')
151 |             record = record.replace('cin ii-iii', 'cin2 cin3')
152 |             record = record.replace('cin ii en iii', 'cin2 cin3')
153 |             record = record.replace('cin i - iii', 'cin1 cin3')
154 |             record = record.replace('cin i-iii', 'cin1 cin3')
155 |             record = record.replace('cin i en iii', 'cin1 cin3')
156 |             record = record.replace('cin i - ii', 'cin1 cin2')
157 |             record = record.replace('cin i-ii', 'cin1 cin2')
158 |             record = record.replace('cin i en ii', 'cin1 cin2')
159 |             record = record.replace('cin ii / iii', 'cin23')
160 |             record = record.replace('cin iii', 'cin3')
161 |             record = record.replace('cin ii', 'cin2')
162 |             record = record.replace('cin i', 'cin1')
163 |             record = record.replace('cin-iii', 'cin3')
164 |             record = record.replace('cin-ii', 'cin2')
165 |             record = record.replace('cin-i', 'cin1')
166 |             record = record.replace('ii-iii cin', 'cin2 cin3')
167 |             record = record.replace('i-ii cin', 'cin1 cin2')
168 |             record = record.replace('iii cin', 'cin3')
169 |             record = record.replace('ii cin', 'cin2')
170 |             record = record.replace('i cin', 'cin1')
171 |             record = record.replace('kin ii - iii', 'kin2 kin3')
172 |             record = record.replace('kin ii-iii', 'kin2 kin3')
173 |             record = record.replace('kin ii en iii', 'kin2 kin3')
174 |             record = record.replace('kin i - iii', 'kin1 kin3')
175 |             record = record.replace('kin i-iii', 'kin1 kin3')
176 |             record = record.replace('kin i en iii', 'kin1 kin3')
177 |             record = record.replace('kin i - ii', 'kin1 kin2')
178 |             record = record.replace('kin i-ii', 'kin1 kin2')
179 |             record = record.replace('kin i en ii', 'kin1 kin2')
180 |             record = record.replace('kin ii / iii', 'kin2 kin3')
181 |             record = record.replace('kin iii', 'kin3')
182 |             record = record.replace('kin ii', 'kin2')
183 |             record = record.replace('kin i', 'kin1')
184 |             record = record.replace('kin-iii', 'kin3')
185 |             record = record.replace('kin-ii', 'kin2')
186 |             record = record.replace('kin-i', 'kin1')
187 |             record = record.replace('ii-iii kin', 'kin2 kin3')
188 |             record = record.replace('i-ii kin', 'kin1 kin2')
189 |             record = record.replace('iii kin', 'kin3')
190 |             record = record.replace('ii kin', 'kin2')
191 |             record = record.replace('i kin', 'kin1')
192 |     return record
193 | 
194 | 
195 | def sanitize_code(code):
196 |     """
197 |     Sanitize code removing unnecessary characters
198 | 
199 |     Params:
200 |         code (str): target code
201 | 
202 |     Returns: the sanitized code
203 |     """
204 | 
205 |     if code:
206 |         code = code.replace('-', '')
207 |         code = code.ljust(7, '0')
208 |     return code
209 | 
210 | 
211 | def sanitize_codes(codes):
212 |     """
213 |     Sanitize codes by splitting and removing unnecessary characters
214 | 
215 |     Params:
216 |         codes (str): target codes separated by ';'
217 | 
218 |     Returns: the sanitized codes
219 |     """
220 | 
221 |     codes = codes.split(';')
222 |     codes = [sanitize_code(code) for code in codes]
223 |     return codes
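224 | 
225 | 

# A minimal sketch of the two helpers above (the code values are made up):
#
#   sanitize_code('T67-000')         # -> 'T670000' (dash dropped, right-padded to 7 chars)
#   sanitize_codes('T67000;84003')   # -> ['T670000', '8400300']

226 | def read_rules(rules):
227 |     """
228 |     Read rules stored within file
229 | 
230 |     Params:
231 |         rules (str): path to rules file
232 | 
233 |     Returns: a dict of trigger: [candidates] representing rules for each use-case
234 |     """
235 | 
236 |     with open(rules, 'r') as file:
237 |         lines = file.readlines()
238 | 
239 |     rules = {'colon': {}, 'cervix': {}, 'celiac': {}, 'lung': {}}
240 |     for line in lines:
241 |         trigger, candidates, position, mode, use_cases = line.strip().split('\t')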
242 |         use_cases = use_cases.split(',')
243 |         for use_case in use_cases:
244 |             rules[use_case][trigger] = (candidates.split(','), position, mode)
245 |     return rules
246 | 
247 | 
248 | def read_dysplasia_mappings(mappings):
249 |     """
250 |     Read dysplasia mappings stored within file
251 | 
252 |     Params:
253 |         mappings (str): path to dysplasia mappings file
254 | 
255 |     Returns: a dict of {trigger: grade} representing mappings for each use-case
256 |     """
257 | 
258 |     with open(mappings, 'r') as file:
259 |         lines = file.readlines()
260 | 
261 |     mappings = {'colon': {}, 'cervix': {}, 'celiac': {}, 'lung': {}}
262 |     for line in lines:
263 |         trigger, grade, use_cases = line.strip().split('\t')
264 |         use_cases = use_cases.split(',')
265 |         for use_case in use_cases:
266 |             mappings[use_case][trigger] = grade.split(',')
267 |     return mappings
268 | 
269 | 
270 | def read_cin_mappings(mappings):
271 |     """
272 |     Read cin mappings stored within file
273 | 
274 |     Params:
275 |         mappings (str): path to cin mappings file
276 | 
277 |     Returns: a dict of {trigger: grade} representing mappings for cervical intraepithelial neoplasia
278 |     """
279 | 
280 |     with open(mappings, 'r') as file:
281 |         lines = file.readlines()
282 | 
283 |     mappings = {}
284 |     for line in lines:
285 |         trigger, grade = line.strip().split('\t')
286 |         mappings[trigger] = grade
287 |     return mappings
288 | 
289 | 
290 | def read_hierarchies(hrels):
291 |     """
292 |     Read hierarchy relations stored within file
293 | 
294 |     Params:
295 |         hrels (str): hierarchy relations file path
296 | 
297 |     Returns: the list of hierarchical relations
298 |     """
299 | 
300 |     with open(hrels, 'r') as f:
301 |         rels = f.readlines()
302 |     return [rel.strip() for rel in rels]
303 | 
304 | 
305 | def read_report_fields(rfields):
306 |     """
307 |     Read considered report fields stored within file
308 | 
309 |     Params:
310 |         rfields (str): report fields file path
311 | 
312 |     Returns: the list of report fields
313 |     """
314 | 
315 |     with open(rfields, 'r') as f:
316 |         fields = f.read().splitlines()
317 |     return [field.strip() for field in fields if field]
318 | 
319 | 
320 | def store_concepts(concepts, out_path, indent=4, sort_keys=False):
321 |     """
322 |     Store report concepts
323 | 
324 |     Params:
325 |         concepts (dict): report concepts
326 |         out_path (str): output file path
327 |         indent (int): indentation level
328 |         sort_keys (bool): sort keys
329 | 
330 |     Returns: True
331 |     """
332 | 
333 |     os.makedirs(os.path.dirname(out_path), exist_ok=True)
334 | 
335 |     with open(out_path, 'w') as out:
336 |         json.dump(concepts, out, indent=indent, sort_keys=sort_keys)
337 |     return True
338 | 
339 | 
340 | def load_concepts(concept_fpath):
341 |     """
342 |     Load stored concepts
343 | 
344 |     Params:
345 |         concept_fpath (str): file-path to stored concepts
346 | 
347 |     Returns: the dict containing the report (stored) concepts
348 |     """
349 | 
350 |     with open(concept_fpath, 'r') as f:
351 |         concepts = json.load(f)
352 |     return concepts
353 | 
354 | 
355 | def store_labels(labels, out_path, indent=4, sort_keys=False):
356 |     """
357 |     Store report labels
358 | 
359 |     Params:
360 |         labels (dict): report labels
361 |         out_path (str): output file path
362 |         indent (int): indentation level
363 |         sort_keys (bool): sort keys
364 | 
365 |     Returns: True
366 |     """
367 | 
368 |     os.makedirs(os.path.dirname(out_path), exist_ok=True)
369 | 
370 |     with open(out_path, 'w') as out:
371 |         json.dump(labels, out, indent=indent, sort_keys=sort_keys)
372 |     return True
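373 | 
374 | 

# A minimal sketch: store_labels and load_labels (just below) form a plain JSON
# round trip (the path is a made-up example):
#
#   store_labels({'r1': {'cancer': 0, 'ni': 1}}, './outputs/labels/demo.json')
#   labels = load_labels('./outputs/labels/demo.json')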
375 | def load_labels(label_fpath):
376 |     """
377 |     Load stored labels
378 | 
379 |     Params:
380 |         label_fpath (str): file-path to stored labels
381 | 
382 |     Returns: the dict containing the report (stored) labels
383 |     """
384 | 
385 |     with open(label_fpath, 'r') as f:
386 |         labels = json.load(f)
387 |     return labels
388 | 
389 | 
390 | # AOEC RELATED FUNCTIONS
391 | 
392 | def aoec_colon_concepts2labels(report_concepts):
393 |     """
394 |     Convert the concepts extracted from colon reports to the set of pre-defined labels used for classification
395 | 
396 |     Params:
397 |         report_concepts (dict(list)): the dict containing for each colon report the extracted concepts
398 | 
399 |     Returns: a dict containing for each colon report the set of pre-defined labels where 0 = absence and 1 = presence
400 |     """
401 | 
402 |     report_labels = dict()
403 |     # loop over reports
404 |     for rid, rconcepts in report_concepts.items():
405 |         # assign pre-defined set of labels to current report
406 |         report_labels[rid] = {'cancer': 0, 'hgd': 0, 'lgd': 0, 'hyperplastic': 0, 'ni': 0}
407 |         # textify diagnosis section
408 |         diagnosis = ' '.join([concept[1].lower() for concept in rconcepts['Diagnosis']])
409 |         # update pre-defined labels w/ 1 in case of label presence
410 |         if 'colon adenocarcinoma' in diagnosis: # update cancer
411 |             report_labels[rid]['cancer'] = 1
412 |         if 'dysplasia' in diagnosis: # diagnosis contains dysplasia
413 |             if 'mild' in diagnosis: # update lgd
414 |                 report_labels[rid]['lgd'] = 1
415 |             if 'moderate' in diagnosis: # update lgd
416 |                 report_labels[rid]['lgd'] = 1
417 |             if 'severe' in diagnosis: # update hgd
418 |                 report_labels[rid]['hgd'] = 1
419 |         if 'hyperplastic polyp' in diagnosis: # update hyperplastic
420 |             report_labels[rid]['hyperplastic'] = 1
421 |         if sum(report_labels[rid].values()) == 0: # update ni
422 |             report_labels[rid]['ni'] = 1
423 |     return report_labels
424 | 
425 | 
426 | def aoec_colon_labels2binary(report_labels):
427 |     """
428 |     Convert the pre-defined labels extracted from colon reports to binary labels used for classification
429 | 
430 |     Params:
431 |         report_labels (dict(list)): the dict containing for each colon report the pre-defined labels
432 | 
433 |     Returns: a dict containing for each colon report the set of binary labels where 0 = absence and 1 = presence
434 |     """
435 | 
436 |     binary_labels = dict()
437 |     # loop over reports
438 |     for rid, rlabels in report_labels.items():
439 |         # assign binary labels to current report
440 |         binary_labels[rid] = {'cancer_or_dysplasia': 0, 'other': 0}
441 |         # update binary labels w/ 1 in case of label presence
442 |         if rlabels['cancer'] == 1 or rlabels['lgd'] == 1 or rlabels['hgd'] == 1: # update cancer_or_dysplasia label
443 |             binary_labels[rid]['cancer_or_dysplasia'] = 1
444 |         else: # update other label
445 |             binary_labels[rid]['other'] = 1
446 |     return binary_labels
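447 | 
448 | 

# A toy example for the two functions above (a minimal sketch: concept tuples
# are shortened to (IRI, label) pairs with a made-up IRI):
#
#   concepts = {'r1': {'Diagnosis': [('<iri>', 'Colon Adenocarcinoma'), ('<iri>', 'Severe Dysplasia')]}}
#   aoec_colon_concepts2labels(concepts)
#   # -> {'r1': {'cancer': 1, 'hgd': 1, 'lgd': 0, 'hyperplastic': 0, 'ni': 0}}
#   aoec_colon_labels2binary(aoec_colon_concepts2labels(concepts))
#   # -> {'r1': {'cancer_or_dysplasia': 1, 'other': 0}}

449 | def aoec_cervix_concepts2labels(report_concepts):
450 |     """
451 |     Convert the concepts extracted from cervix reports to the set of pre-defined labels used for classification
452 | 
453 |     Params:
454 |         report_concepts (dict(list)): the dict containing for each cervix report the extracted concepts
455 | 
456 |     Returns: a dict containing for each cervix report the set of pre-defined labels where 0 = absence and 1 = presence
457 |     """
458 | 
459 |     report_labels = dict()
460 |     # loop over reports
461 |     for rid, rconcepts in report_concepts.items():
462 |         # assign pre-defined set of labels to current report
463 |         report_labels[rid] = {
464 |             'cancer_scc_inv': 0, 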
'cancer_scc_insitu': 0, 'cancer_adeno_inv': 0, 'cancer_adeno_insitu': 0, 465 | 'lgd': 0, 'hgd': 0, 466 | 'hpv': 0, 'koilocytes': 0, 467 | 'glands_norm': 0, 'squamous_norm': 0 468 | } 469 | # make diagnosis section a set 470 | diagnosis = set([concept[1].lower() for concept in rconcepts['Diagnosis']]) 471 | # update pre-defined labels w/ 1 in case of label presence 472 | for d in diagnosis: 473 | if 'cervical squamous cell carcinoma' == d: 474 | report_labels[rid]['cancer_scc_inv'] = 1 475 | if 'squamous carcinoma in situ' == d or 'squamous intraepithelial neoplasia' == d: 476 | report_labels[rid]['cancer_scc_insitu'] = 1 477 | if 'cervical adenocarcinoma' in d: 478 | if 'cervical adenocarcinoma in situ' == d: 479 | report_labels[rid]['cancer_adeno_insitu'] = 1 480 | else: 481 | report_labels[rid]['cancer_adeno_inv'] = 1 482 | if 'low grade cervical squamous intraepithelial neoplasia' == d: 483 | report_labels[rid]['lgd'] = 1 484 | if 'squamous carcinoma in situ' == d or \ 485 | 'squamous intraepithelial neoplasia' == d or \ 486 | 'cervical squamous intraepithelial neoplasia 2' == d or \ 487 | 'cervical intraepithelial neoplasia grade 2/3' == d: 488 | report_labels[rid]['hgd'] = 1 489 | if 'human papilloma virus infection' == d: 490 | report_labels[rid]['hpv'] = 1 491 | if 'koilocytotic squamous cell' == d: 492 | report_labels[rid]['koilocytes'] = 1 493 | # update when no label has been set to 1 494 | if sum(report_labels[rid].values()) == 0: 495 | report_labels[rid]['glands_norm'] = 1 496 | report_labels[rid]['squamous_norm'] = 1 497 | return report_labels 498 | 499 | 500 | def aoec_cervix_labels2aggregates(report_labels): 501 | """ 502 | Convert the pre-defined labels extracted from cervix reports to coarse- and fine-grained aggregated labels 503 | Params: 504 | report_labels (dict(list)): the dict containing for each cervix report the pre-defined labels 505 | Returns: two dicts containing for each cervix report the set of aggregated labels where 0 = absence and 1 = presence 506 | """ 507 | 508 | coarse_labels = dict() 509 | fine_labels = dict() 510 | # loop over reports 511 | for rid, rlabels in report_labels.items(): 512 | # assign aggregated labels to current report 513 | coarse_labels[rid] = {'cancer': 0, 'dysplasia': 0, 'normal': 0} 514 | fine_labels[rid] = {'cancer_adeno': 0, 'cancer_scc': 0, 'dysplasia': 0, 'glands_norm': 0, 'squamous_norm': 0} 515 | # update aggregated labels w/ 1 in case of label presence 516 | if rlabels['cancer_adeno_inv'] == 1 or rlabels['cancer_adeno_insitu'] == 1: 517 | coarse_labels[rid]['cancer'] = 1 518 | fine_labels[rid]['cancer_adeno'] = 1 519 | if rlabels['cancer_scc_inv'] == 1 or rlabels['cancer_scc_insitu'] == 1: 520 | coarse_labels[rid]['cancer'] = 1 521 | fine_labels[rid]['cancer_scc'] = 1 522 | if rlabels['lgd'] == 1 or rlabels['hgd'] == 1: 523 | coarse_labels[rid]['dysplasia'] = 1 524 | fine_labels[rid]['dysplasia'] = 1 525 | if rlabels['glands_norm'] == 1: 526 | coarse_labels[rid]['normal'] = 1 527 | fine_labels[rid]['glands_norm'] = 1 528 | if rlabels['squamous_norm'] == 1: 529 | coarse_labels[rid]['normal'] = 1 530 | fine_labels[rid]['squamous_norm'] = 1 531 | return coarse_labels, fine_labels 532 | 533 | 534 | def aoec_lung_concepts2labels(report_concepts): 535 | """ 536 | Convert the concepts extracted from lung reports to the set of pre-defined labels used for classification 537 | 538 | Params: 539 | report_concepts (dict(list)): the dict containing for each lung report the extracted concepts 540 | 541 | Returns: a dict containing for each 
lung report the set of pre-defined labels where 0 = absence and 1 = presence
542 |     """
543 | 
544 |     report_labels = dict()
545 |     # loop over reports
546 |     for rid, rconcepts in report_concepts.items():
547 |         # assign pre-defined set of labels to current report
548 |         report_labels[rid] = {
549 |             'cancer_scc': 0, 'cancer_nscc_adeno': 0, 'cancer_nscc_squamous': 0, 'cancer_nscc_large': 0, 'no_cancer': 0}
550 |         # make diagnosis section a set
551 |         diagnosis = set([concept[1].lower() for concept in rconcepts['Diagnosis']])
552 |         # update pre-defined labels w/ 1 in case of label presence
553 |         for d in diagnosis:
554 |             if 'small cell lung carcinoma' == d:
555 |                 report_labels[rid]['cancer_scc'] = 1
556 |             if 'lung adenocarcinoma' == d or 'clear cell adenocarcinoma' == d or 'metastatic neoplasm' == d:
557 |                 report_labels[rid]['cancer_nscc_adeno'] = 1
558 |             if 'non-small cell squamous lung carcinoma' == d:
559 |                 report_labels[rid]['cancer_nscc_squamous'] = 1
560 |             if 'lung large cell carcinoma' == d:
561 |                 report_labels[rid]['cancer_nscc_large'] = 1
562 |         # update when no label has been set to 1
563 |         if sum(report_labels[rid].values()) == 0:
564 |             report_labels[rid]['no_cancer'] = 1
565 |     return report_labels
566 | 
567 | 
568 | def aoec_celiac_concepts2labels(report_concepts):
569 |     """
570 |     Convert the concepts extracted from celiac reports to the set of pre-defined labels used for classification
571 | 
572 |     Params:
573 |         report_concepts (dict(list)): the dict containing for each celiac report the extracted concepts
574 | 
575 |     Returns: a dict containing for each celiac report the set of pre-defined labels where 0 = absence and 1 = presence
576 |     """
577 | 
578 |     report_labels = dict()
579 |     # loop over reports
580 |     for rid, rconcepts in report_concepts.items():
581 |         # assign pre-defined set of labels to current report
582 |         report_labels[rid] = {
583 |             'celiac_disease': 0, 'duodenitis': 0, 'inconclusive': 0, 'normal': 0}
584 |         # make diagnosis section a set
585 |         diagnosis = set([concept[1].lower() for concept in rconcepts['Diagnosis']])
586 |         # update pre-defined labels w/ 1 in case of label presence
587 |         for d in diagnosis:
588 |             if 'positive to celiac disease' == d:
589 |                 report_labels[rid]['celiac_disease'] = 1
590 |             if 'duodenitis' == d:
591 |                 report_labels[rid]['duodenitis'] = 1
592 |             if 'inconclusive outcome' == d:
593 |                 report_labels[rid]['inconclusive'] = 1
594 |             if 'negative result' == d:
595 |                 report_labels[rid]['normal'] = 1
596 |         # update when no label has been set to 1
597 |         if sum(report_labels[rid].values()) == 0:
598 |             report_labels[rid]['normal'] = 1
599 |     return report_labels
600 | 
601 | 
602 | # RADBOUD RELATED FUNCTIONS
603 | 
604 | def radboud_colon_concepts2labels(report_concepts):
605 |     """
606 |     Convert the concepts extracted from reports to the set of pre-defined labels used for classification
607 | 
608 |     Params:
609 |         report_concepts (dict(list)): the dict containing for each report the extracted concepts
610 | 
611 |     Returns: a dict containing for each report the set of pre-defined labels where 0 = absence and 1 = presence
612 |     """
613 | 
614 |     report_labels = dict()
615 |     # loop over reports
616 |     for rid, rconcepts in report_concepts.items():
617 |         report_labels[rid] = dict()
618 |         # assign pre-defined set of labels to current report
619 |         report_labels[rid]['labels'] = {'cancer': 0, 'hgd': 0, 'lgd': 0, 'hyperplastic': 0, 'ni': 0}
620 |         # textify diagnosis section
621 |         diagnosis = ' '.join([concept[1].lower() for concept in rconcepts['Diagnosis']])
622 |         # update pre-defined labels w/ 1 in case of label presence
623 |         if 'colon adenocarcinoma' in diagnosis: # update cancer
624 |             report_labels[rid]['labels']['cancer'] = 1
625 |         if 'dysplasia' in diagnosis: # diagnosis contains dysplasia
626 |             if 'mild' in diagnosis: # update lgd
627 |                 report_labels[rid]['labels']['lgd'] = 1
628 |             if 'moderate' in diagnosis: # update lgd
629 |                 report_labels[rid]['labels']['lgd'] = 1
630 |             if 'severe' in diagnosis: # update hgd
631 |                 report_labels[rid]['labels']['hgd'] = 1
632 |         if 'hyperplastic polyp' in diagnosis: # update hyperplastic
633 |             report_labels[rid]['labels']['hyperplastic'] = 1
634 |         if sum(report_labels[rid]['labels'].values()) == 0: # update ni
635 |             report_labels[rid]['labels']['ni'] = 1
636 |         if 'slide_ids' in rconcepts:
637 |             report_labels[rid]['slide_ids'] = rconcepts['slide_ids']
638 |     return report_labels
639 | 
640 | 
641 | def radboud_colon_labels2binary(report_labels):
642 |     """
643 |     Convert the pre-defined labels extracted from reports to binary labels used for classification
644 | 
645 |     Params:
646 |         report_labels (dict(list)): the dict containing for each report the pre-defined labels
647 | 
648 |     Returns: a dict containing for each report the set of binary labels where 0 = absence and 1 = presence
649 |     """
650 | 
651 |     binary_labels = dict()
652 |     # loop over reports
653 |     for rid, rlabels in report_labels.items():
654 |         binary_labels[rid] = dict()
655 |         # assign binary labels to current report
656 |         binary_labels[rid]['labels'] = {'cancer_or_dysplasia': 0, 'other': 0}
657 |         # update binary labels w/ 1 in case of label presence
658 |         if rlabels['labels']['cancer'] == 1 or rlabels['labels']['lgd'] == 1 or rlabels['labels']['hgd'] == 1: # update cancer_or_dysplasia label
659 |             binary_labels[rid]['labels']['cancer_or_dysplasia'] = 1
660 |         else: # update other label
661 |             binary_labels[rid]['labels']['other'] = 1
662 |         if 'slide_ids' in rlabels:
663 |             binary_labels[rid]['slide_ids'] = rlabels['slide_ids']
664 |     return binary_labels
665 | 
666 | 
667 | def radboud_cervix_concepts2labels(report_concepts):
668 |     """
669 |     Convert the concepts extracted from cervix reports to the set of pre-defined labels used for classification
670 | 
671 |     Params:
672 |         report_concepts (dict(list)): the dict containing for each cervix report the extracted concepts
673 | 
674 |     Returns: a dict containing for each cervix report the set of pre-defined labels where 0 = absence and 1 = presence
675 |     """
676 | 
677 |     report_labels = dict()
678 |     # loop over reports
679 |     for rid, rconcepts in report_concepts.items():
680 |         report_labels[rid] = dict()
681 |         # assign pre-defined set of labels to current report
682 |         report_labels[rid]['labels'] = {
683 |             'cancer_scc_inv': 0, 'cancer_scc_insitu': 0, 'cancer_adeno_inv': 0, 'cancer_adeno_insitu': 0,
684 |             'lgd': 0, 'hgd': 0,
685 |             'hpv': 0, 'koilocytes': 0,
686 |             'glands_norm': 0, 'squamous_norm': 0
687 |         }
688 |         # make diagnosis section a set
689 |         diagnosis = set([concept[1].lower() for concept in rconcepts['Diagnosis']])
690 |         # update pre-defined labels w/ 1 in case of label presence
691 |         for d in diagnosis:
692 |             if 'cervical squamous cell carcinoma' == d:
693 |                 report_labels[rid]['labels']['cancer_scc_inv'] = 1
694 |             if 'squamous carcinoma in situ' == d or 'squamous intraepithelial neoplasia' == d:
695 |                 report_labels[rid]['labels']['cancer_scc_insitu'] = 1
696 |             if 'cervical adenocarcinoma' in d:
697 |                 if 'cervical adenocarcinoma in situ' == d:
698 |                     report_labels[rid]['labels']['cancer_adeno_insitu'] = 1
699 |                 else:
700 |                     report_labels[rid]['labels']['cancer_adeno_inv'] = 1
701 |             if 'low grade cervical squamous intraepithelial neoplasia' == d:
702 |                 report_labels[rid]['labels']['lgd'] = 1
703 |             if 'squamous carcinoma in situ' == d or \
704 |                     'squamous intraepithelial neoplasia' == d or \
705 |                     'cervical squamous intraepithelial neoplasia 2' == d or \
706 |                     'cervical intraepithelial neoplasia grade 2/3' == d:
707 |                 report_labels[rid]['labels']['hgd'] = 1
708 |             if 'human papilloma virus infection' == d:
709 |                 report_labels[rid]['labels']['hpv'] = 1
710 |             if 'koilocytotic squamous cell' == d:
711 |                 report_labels[rid]['labels']['koilocytes'] = 1
712 |         # update when no label has been set to 1
713 |         if sum(report_labels[rid]['labels'].values()) == 0:
714 |             report_labels[rid]['labels']['glands_norm'] = 1
715 |             report_labels[rid]['labels']['squamous_norm'] = 1
716 | 
717 |         if 'slide_ids' in rconcepts:
718 |             report_labels[rid]['slide_ids'] = rconcepts['slide_ids']
719 |     return report_labels
720 | 
721 | 
722 | def radboud_cervix_labels2aggregates(report_labels):
723 |     """
724 |     Convert the pre-defined labels extracted from cervix reports to coarse- and fine-grained aggregated labels
725 | 
726 |     Params:
727 |         report_labels (dict(list)): the dict containing for each cervix report the pre-defined labels
728 |     Returns: two dicts containing for each cervix report the set of aggregated labels where 0 = absence and 1 = presence
729 |     """
730 | 
731 |     coarse_labels = dict()
732 |     fine_labels = dict()
733 |     # loop over reports
734 |     for rid, rlabels in report_labels.items():
735 |         coarse_labels[rid] = dict()
736 |         fine_labels[rid] = dict()
737 |         # assign aggregated labels to current report
738 |         coarse_labels[rid]['labels'] = {'cancer': 0, 'dysplasia': 0, 'normal': 0}
739 |         fine_labels[rid]['labels'] = {'cancer_adeno': 0, 'cancer_scc': 0, 'dysplasia': 0, 'glands_norm': 0, 'squamous_norm': 0}
740 |         # update aggregated labels w/ 1 in case of label presence
741 |         if rlabels['labels']['cancer_adeno_inv'] == 1 or rlabels['labels']['cancer_adeno_insitu'] == 1:
742 |             coarse_labels[rid]['labels']['cancer'] = 1
743 |             fine_labels[rid]['labels']['cancer_adeno'] = 1
744 |         if rlabels['labels']['cancer_scc_inv'] == 1 or rlabels['labels']['cancer_scc_insitu'] == 1:
745 |             coarse_labels[rid]['labels']['cancer'] = 1
746 |             fine_labels[rid]['labels']['cancer_scc'] = 1
747 |         if rlabels['labels']['lgd'] == 1 or rlabels['labels']['hgd'] == 1:
748 |             coarse_labels[rid]['labels']['dysplasia'] = 1
749 |             fine_labels[rid]['labels']['dysplasia'] = 1
750 |         if rlabels['labels']['glands_norm'] == 1:
751 |             coarse_labels[rid]['labels']['normal'] = 1
752 |             fine_labels[rid]['labels']['glands_norm'] = 1
753 |         if rlabels['labels']['squamous_norm'] == 1:
754 |             coarse_labels[rid]['labels']['normal'] = 1
755 |             fine_labels[rid]['labels']['squamous_norm'] = 1
756 |         if 'slide_ids' in rlabels:
757 |             coarse_labels[rid]['slide_ids'] = rlabels['slide_ids']
758 |             fine_labels[rid]['slide_ids'] = rlabels['slide_ids']
759 |     return coarse_labels, fine_labels
760 | 
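# A toy sketch for the function above: Radboud label dicts nest the labels
# under 'labels' and may carry WSI identifiers alongside (values made up):
#
#   toy = {'r1': {'labels': {'cancer_scc_inv': 0, 'cancer_scc_insitu': 0, 'cancer_adeno_inv': 0,
#                            'cancer_adeno_insitu': 0, 'lgd': 1, 'hgd': 0, 'hpv': 0, 'koilocytes': 0,
#                            'glands_norm': 0, 'squamous_norm': 0}, 'slide_ids': ['r1_s1']}}
#   coarse, fine = radboud_cervix_labels2aggregates(toy)
#   # coarse -> {'r1': {'labels': {'cancer': 0, 'dysplasia': 1, 'normal': 0}, 'slide_ids': ['r1_s1']}}

761 | def radboud_celiac_concepts2labels(report_concepts):
762 |     """
763 |     Convert the concepts extracted from celiac reports to the set of pre-defined labels used for classification
764 | 
765 |     Params:
766 |         report_concepts (dict(list)): the dict containing for each celiac report the extracted concepts
767 | 
768 |     Returns: a dict containing for each celiac report the set of pre-defined labels where 0 = absence and 1 = presence
769 |     """
770 | 
771 |     report_labels = dict()
772 |     # loop over reports
773 |     for rid, rconcepts in report_concepts.items():
774 |         report_labels[rid] = dict()
775 |         # assign pre-defined set of labels to current report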
776 |         report_labels[rid]['labels'] = {
777 |             'celiac_disease': 0, 'duodenitis': 0, 'inconclusive': 0, 'normal': 0}
778 |         # make diagnosis section a set
779 |         diagnosis = set([concept[1].lower() for concept in rconcepts['Diagnosis']])
780 |         # update pre-defined labels w/ 1 in case of label presence
781 |         for d in diagnosis:
782 |             if 'positive to celiac disease' == d:
783 |                 report_labels[rid]['labels']['celiac_disease'] = 1
784 |             if 'duodenitis' == d:
785 |                 report_labels[rid]['labels']['duodenitis'] = 1
786 |             if 'inconclusive outcome' == d:
787 |                 report_labels[rid]['labels']['inconclusive'] = 1
788 |             if 'negative result' == d:
789 |                 report_labels[rid]['labels']['normal'] = 1
790 |         # update when no label has been set to 1
791 |         if sum(report_labels[rid]['labels'].values()) == 0:
792 |             report_labels[rid]['labels']['normal'] = 1
793 |         if 'slide_ids' in rconcepts:
794 |             report_labels[rid]['slide_ids'] = rconcepts['slide_ids']
795 |     return report_labels
796 | 
797 | 
798 | # GENERAL-PURPOSE FUNCTIONS
799 | 
800 | def colon_concepts2labels(report_concepts):
801 |     """
802 |     Convert the concepts extracted from colon reports to the set of pre-defined labels used for classification
803 | 
804 |     Params:
805 |         report_concepts (dict(list)): the dict containing for each colon report the extracted concepts
806 | 
807 |     Returns: a dict containing for each colon report the set of pre-defined labels where 0 = absence and 1 = presence
808 |     """
809 | 
810 |     report_labels = dict()
811 |     # loop over reports
812 |     for rid, rconcepts in report_concepts.items():
813 |         report_labels[rid] = dict()
814 |         # assign pre-defined set of labels to current report
815 |         report_labels[rid] = {'cancer': 0, 'hgd': 0, 'lgd': 0, 'hyperplastic': 0, 'ni': 0}
816 |         # textify diagnosis section
817 |         diagnosis = ' '.join([concept[1].lower() for concept in rconcepts['Diagnosis']])
818 |         # update pre-defined labels w/ 1 in case of label presence
819 |         if 'colon adenocarcinoma' in diagnosis:  # update cancer
820 |             report_labels[rid]['cancer'] = 1
821 |         if 'dysplasia' in diagnosis:  # diagnosis contains dysplasia
822 |             if 'mild' in diagnosis:  # update lgd
823 |                 report_labels[rid]['lgd'] = 1
824 |             if 'moderate' in diagnosis:  # update lgd
825 |                 report_labels[rid]['lgd'] = 1
826 |             if 'severe' in diagnosis:  # update hgd
827 |                 report_labels[rid]['hgd'] = 1
828 |         if 'hyperplastic polyp' in diagnosis:  # update hyperplastic
829 |             report_labels[rid]['hyperplastic'] = 1
830 |         if sum(report_labels[rid].values()) == 0:  # update ni
831 |             report_labels[rid]['ni'] = 1
832 |     return report_labels
833 | 
834 | 
835 | def colon_labels2binary(report_labels):
836 |     """
837 |     Convert the pre-defined labels extracted from colon reports to binary labels used for classification
838 | 
839 |     Params:
840 |         report_labels (dict(list)): the dict containing for each colon report the pre-defined labels
841 | 
842 |     Returns: a dict containing for each colon report the set of binary labels where 0 = absence and 1 = presence
843 |     """
844 | 
845 |     binary_labels = dict()
846 |     # loop over reports
847 |     for rid, rlabels in report_labels.items():
848 |         # assign binary labels to current report
849 |         binary_labels[rid] = {'cancer_or_dysplasia': 0, 'other': 0}
850 |         # update binary labels w/ 1 in case of label presence
851 |         if rlabels['cancer'] == 1 or rlabels['lgd'] == 1 or rlabels['hgd'] == 1:  # update cancer_or_dysplasia label
852 |             binary_labels[rid]['cancer_or_dysplasia'] = 1
853 |         else:  # update other label
854 |             binary_labels[rid]['other'] = 1
855 | 
return binary_labels 856 | 857 | 858 | def cervix_concepts2labels(report_concepts): 859 | """ 860 | Convert the concepts extracted from cervix reports to the set of pre-defined labels used for classification 861 | 862 | Params: 863 | report_concepts (dict(list)): the dict containing for each cervix report the extracted concepts 864 | 865 | Returns: a dict containing for each cervix report the set of pre-defined labels where 0 = absence and 1 = presence 866 | """ 867 | 868 | report_labels = dict() 869 | # loop over reports 870 | for rid, rconcepts in report_concepts.items(): 871 | # assign pre-defined set of labels to current report 872 | report_labels[rid] = { 873 | 'cancer_scc_inv': 0, 'cancer_scc_insitu': 0, 'cancer_adeno_inv': 0, 'cancer_adeno_insitu': 0, 874 | 'lgd': 0, 'hgd': 0, 875 | 'hpv': 0, 'koilocytes': 0, 876 | 'glands_norm': 0, 'squamous_norm': 0 877 | } 878 | # make diagnosis section a set 879 | diagnosis = set([concept[1].lower() for concept in rconcepts['Diagnosis']]) 880 | # update pre-defined labels w/ 1 in case of label presence 881 | for d in diagnosis: 882 | if 'cervical squamous cell carcinoma' == d: 883 | report_labels[rid]['cancer_scc_inv'] = 1 884 | if 'squamous carcinoma in situ' == d or 'squamous intraepithelial neoplasia' == d: 885 | report_labels[rid]['cancer_scc_insitu'] = 1 886 | if 'cervical adenocarcinoma' in d: 887 | if 'cervical adenocarcinoma in situ' == d: 888 | report_labels[rid]['cancer_adeno_insitu'] = 1 889 | else: 890 | report_labels[rid]['cancer_adeno_inv'] = 1 891 | if 'low grade cervical squamous intraepithelial neoplasia' == d: 892 | report_labels[rid]['lgd'] = 1 893 | if 'squamous carcinoma in situ' == d or \ 894 | 'squamous intraepithelial neoplasia' == d or \ 895 | 'cervical squamous intraepithelial neoplasia 2' == d or \ 896 | 'cervical intraepithelial neoplasia grade 2/3' == d: 897 | report_labels[rid]['hgd'] = 1 898 | if 'human papilloma virus infection' == d: 899 | report_labels[rid]['hpv'] = 1 900 | if 'koilocytotic squamous cell' == d: 901 | report_labels[rid]['koilocytes'] = 1 902 | # update when no label has been set to 1 903 | if sum(report_labels[rid].values()) == 0: 904 | report_labels[rid]['glands_norm'] = 1 905 | report_labels[rid]['squamous_norm'] = 1 906 | return report_labels 907 | 908 | 909 | def cervix_labels2aggregates(report_labels): 910 | """ 911 | Convert the pre-defined labels extracted from cervix reports to coarse- and fine-grained aggregated labels 912 | Params: 913 | report_labels (dict(list)): the dict containing for each cervix report the pre-defined labels 914 | Returns: two dicts containing for each cervix report the set of aggregated labels where 0 = absence and 1 = presence 915 | """ 916 | 917 | coarse_labels = dict() 918 | fine_labels = dict() 919 | # loop over reports 920 | for rid, rlabels in report_labels.items(): 921 | # assign aggregated labels to current report 922 | coarse_labels[rid] = {'cancer': 0, 'dysplasia': 0, 'normal': 0} 923 | fine_labels[rid] = {'cancer_adeno': 0, 'cancer_scc': 0, 'dysplasia': 0, 'glands_norm': 0, 'squamous_norm': 0} 924 | # update aggregated labels w/ 1 in case of label presence 925 | if rlabels['cancer_adeno_inv'] == 1 or rlabels['cancer_adeno_insitu'] == 1: 926 | coarse_labels[rid]['cancer'] = 1 927 | fine_labels[rid]['cancer_adeno'] = 1 928 | if rlabels['cancer_scc_inv'] == 1 or rlabels['cancer_scc_insitu'] == 1: 929 | coarse_labels[rid]['cancer'] = 1 930 | fine_labels[rid]['cancer_scc'] = 1 931 | if rlabels['lgd'] == 1 or rlabels['hgd'] == 1: 932 | 
coarse_labels[rid]['dysplasia'] = 1 933 | fine_labels[rid]['dysplasia'] = 1 934 | if rlabels['glands_norm'] == 1: 935 | coarse_labels[rid]['normal'] = 1 936 | fine_labels[rid]['glands_norm'] = 1 937 | if rlabels['squamous_norm'] == 1: 938 | coarse_labels[rid]['normal'] = 1 939 | fine_labels[rid]['squamous_norm'] = 1 940 | return coarse_labels, fine_labels 941 | 942 | 943 | def lung_concepts2labels(report_concepts): 944 | """ 945 | Convert the concepts extracted from lung reports to the set of pre-defined labels used for classification 946 | 947 | Params: 948 | report_concepts (dict(list)): the dict containing for each lung report the extracted concepts 949 | 950 | Returns: a dict containing for each lung report the set of pre-defined labels where 0 = absence and 1 = presence 951 | """ 952 | 953 | report_labels = dict() 954 | # loop over reports 955 | for rid, rconcepts in report_concepts.items(): 956 | # assign pre-defined set of labels to current report 957 | report_labels[rid] = { 958 | 'cancer_scc': 0, 'cancer_nscc_adeno': 0, 'cancer_nscc_squamous': 0, 'cancer_nscc_large': 0, 'no_cancer': 0} 959 | # make diagnosis section a set 960 | diagnosis = set([concept[1].lower() for concept in rconcepts['Diagnosis']]) 961 | # update pre-defined labels w/ 1 in case of label presence 962 | for d in diagnosis: 963 | if 'small cell lung carcinoma' == d: 964 | report_labels[rid]['cancer_scc'] = 1 965 | if 'lung adenocarcinoma' == d or 'clear cell adenocarcinoma' == d or 'metastatic neoplasm' == d: 966 | report_labels[rid]['cancer_nscc_adeno'] = 1 967 | if 'non-small cell squamous lung carcinoma' == d: 968 | report_labels[rid]['cancer_nscc_squamous'] = 1 969 | if 'lung large cell carcinoma' == d: 970 | report_labels[rid]['cancer_nscc_large'] = 1 971 | # update when no label has been set to 1 972 | if sum(report_labels[rid].values()) == 0: 973 | report_labels[rid]['no_cancer'] = 1 974 | return report_labels 975 | 976 | 977 | def celiac_concepts2labels(report_concepts): 978 | """ 979 | Convert the concepts extracted from celiac reports to the set of pre-defined labels used for classification 980 | 981 | Params: 982 | report_concepts (dict(list)): the dict containing for each celiac report the extracted concepts 983 | 984 | Returns: a dict containing for each celiac report the set of pre-defined labels where 0 = absence and 1 = presence 985 | """ 986 | 987 | report_labels = dict() 988 | # loop over reports 989 | for rid, rconcepts in report_concepts.items(): 990 | # assign pre-defined set of labels to current report 991 | report_labels[rid] = { 992 | 'celiac_disease': 0, 'duodenitis': 0, 'inconclusive': 0, 'normal': 0} 993 | # make diagnosis section a set 994 | diagnosis = set([concept[1].lower() for concept in rconcepts['Diagnosis']]) 995 | # update pre-defined labels w/ 1 in case of label presence 996 | for d in diagnosis: 997 | if 'positive to celiac disease' == d: 998 | report_labels[rid]['celiac_disease'] = 1 999 | if 'duodenitis' == d: 1000 | report_labels[rid]['duodenitis'] = 1 1001 | if 'inconclusive outcome' == d: 1002 | report_labels[rid]['inconclusive'] = 1 1003 | if 'negative result' == d: 1004 | report_labels[rid]['normal'] = 1 1005 | # update when no label has been set to 1 1006 | if sum(report_labels[rid].values()) == 0: 1007 | report_labels[rid]['normal'] = 1 1008 | return report_labels 1009 | -------------------------------------------------------------------------------- /sket_server/sket_rest_app/__init__.py: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ExaNLP/sket/d9a3fcc42d5f3671dcb2ac6597ea663b9b259433/sket_server/sket_rest_app/__init__.py
--------------------------------------------------------------------------------
/sket_server/sket_rest_app/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 | 
3 | # Register your models here.
4 | 
--------------------------------------------------------------------------------
/sket_server/sket_rest_app/apps.py:
--------------------------------------------------------------------------------
1 | from django.apps import AppConfig
2 | 
3 | 
4 | class SketRestAppConfig(AppConfig):
5 |     default_auto_field = 'django.db.models.BigAutoField'
6 |     name = 'sket_server.sket_rest_app'
7 | 
--------------------------------------------------------------------------------
/sket_server/sket_rest_app/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ExaNLP/sket/d9a3fcc42d5f3671dcb2ac6597ea663b9b259433/sket_server/sket_rest_app/migrations/__init__.py
--------------------------------------------------------------------------------
/sket_server/sket_rest_app/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 | 
3 | # Create your models here.
4 | 
--------------------------------------------------------------------------------
/sket_server/sket_rest_app/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 | 
3 | # Create your tests here.
4 | 
--------------------------------------------------------------------------------
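tests.py above is an empty stub. A minimal sketch of what an endpoint test could look like with Django's test client follows; it is not part of the original repository, and the expected status codes simply mirror the behaviour of views.py further down. Note that importing the app's views pulls in the full SKET pipeline, so such tests are slow to start.

from django.test import TestCase


class AnnotateEndpointTests(TestCase):
    def test_get_returns_placeholder(self):
        # the GET branch of views.annotate returns a static placeholder dict
        response = self.client.get('/')
        self.assertEqual(response.status_code, 200)

    def test_post_without_use_case_and_language_is_rejected(self):
        # POSTing to the bare root leaves use_case/language unset, so views.annotate answers 400
        response = self.client.post('/', data={}, content_type='application/json')
        self.assertEqual(response.status_code, 400)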
/sket_server/sket_rest_app/urls.py:
--------------------------------------------------------------------------------
1 | from django.urls import path
2 | from . import views
3 | 
4 | app_name = 'sket_server.sket_rest_app'
5 | urlpatterns = [
6 |     path('', views.annotate, name='annotate'),
7 |     path('annotate/<str:use_case>/<str:language>/<str:obj>/<str:rdf_format>/', views.annotate, name='annotate'),
8 |     path('annotate/<str:use_case>/<str:language>/<str:obj>/', views.annotate, name='annotate'),
9 |     path('annotate/<str:use_case>/<str:language>/', views.annotate, name='annotate'),
10 | ]
11 | 
--------------------------------------------------------------------------------
/sket_server/sket_rest_app/views.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import shutil
4 | 
5 | from rest_framework import status
6 | from rest_framework.decorators import api_view
7 | from rest_framework.response import Response
8 | from django.core.files.storage import FileSystemStorage
9 | 
10 | from sket_server.sket_rest_config import sket_pipe
11 | 
12 | 
13 | @api_view(['GET', 'POST'])
14 | def annotate(request, use_case=None, language=None, obj=None, rdf_format=None):
15 |     json_resp_single = {'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}
16 | 
17 |     if request.method == 'GET':
18 |         return Response(json_resp_single)
19 | 
20 |     elif request.method == 'POST':
21 |         # read the similarity threshold from the server configuration
22 |         workpath = os.path.dirname(os.path.abspath(__file__))
23 |         config_path = os.path.join(workpath, '../sket_rest_config/config.json')
24 |         with open(config_path, 'r') as f:
25 |             thr = json.load(f)['thr']
26 |         files = []
27 |         labels = {}
28 |         concepts = {}
29 |         rdf_graphs = {}
30 | 
31 |         # validate the request path parameters
32 |         if use_case is None or language is None:
33 |             response = {'ERROR': 'Your request is invalid!'}
34 |             return Response(response, status=status.HTTP_400_BAD_REQUEST)
35 |         if obj not in ['concepts', 'graphs', 'labels', 'n3', 'turtle', 'all', 'trig', None]:
36 |             response = {'ERROR': 'Your request is invalid!'}
37 |             return Response(response, status=status.HTTP_400_BAD_REQUEST)
38 |         if obj == 'graphs' and rdf_format not in ['n3', 'turtle', 'trig']:
39 |             response = {'ERROR': 'Your request is invalid: the allowed rdf_formats are: turtle, n3, trig'}
40 |             return Response(response, status=status.HTTP_400_BAD_REQUEST)
41 | 
42 |         # store outputs on disk unless a single object type is requested back
43 |         store = obj not in ['concepts', 'graphs', 'labels']
44 |         if obj in ['concepts', 'labels']:
45 |             rdf_format = 'turtle'
46 |         if store and obj is None:
47 |             rdf_format = 'all'
48 |         if store and obj in ['n3', 'turtle', 'trig', 'all']:
49 |             rdf_format = obj
50 | 
51 |         if len(request.FILES) > 0:
52 |             for file in request.FILES.items():
53 |                 files.append(file[1])
54 | 
55 |         if len(files) == 0:
56 |             # JSON body: run the pipeline directly on the request payload
57 |             if isinstance(request.data, dict):
58 |                 concepts, labels, rdf_graphs = sket_pipe.med_pipeline(request.data, language, use_case, thr,
59 |                                                                       store, rdf_format, False, False)
60 |         else:
61 |             # uploaded files: save each file to ./tmp, process it, then clean the folder up
62 |             for file in files:
63 |                 fs = FileSystemStorage(os.path.join(workpath, './tmp'))
64 |                 file_up = fs.save(file.name, file)
65 |                 uploaded_file_path = os.path.join(workpath, './tmp/' + file_up)
66 |                 try:
67 |                     # note: with multiple uploads, each iteration overwrites the previous results
68 |                     concepts, labels, rdf_graphs = sket_pipe.med_pipeline(uploaded_file_path, language, use_case,
69 |                                                                           thr, store, rdf_format, False, False)
70 |                 except Exception as e:
71 |                     js_resp = {'error': 'an error occurred: ' + str(e) + '.'}
72 |                     return Response(js_resp, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
73 |                 finally:
74 |                     for root, dirs, tmp_files in os.walk(os.path.join(workpath, './tmp')):
75 |                         for tmp_file in tmp_files:
76 |                             os.unlink(os.path.join(root, tmp_file))
77 |                         for tmp_dir in dirs:
78 |                             shutil.rmtree(os.path.join(root, tmp_dir))
79 | 
80 |         if not store:
81 |             if obj == 'graphs':
82 |                 return Response(rdf_graphs, status=status.HTTP_201_CREATED)
83 |             elif obj == 'labels':
84 |                 return Response(labels, status=status.HTTP_201_CREATED)
85 |             elif obj == 'concepts':
86 |                 return Response(concepts, status=status.HTTP_201_CREATED)
87 |         json_resp = {'response': 'request handled with success.'}
88 |         return Response(json_resp, status=status.HTTP_201_CREATED)
89 | 
--------------------------------------------------------------------------------
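With the routes and the view above in place, a client can ask for exactly the output it needs. Below is a minimal sketch of such a call, assuming the server is running locally on port 8000; the payload shape is an illustrative placeholder (the accepted report formats are the ones shown in the examples/ folder), not a schema mandated here.

import requests

# ask only for the classification labels of an English colon report
report = {'reports': [{'diagnosis': 'hyperplastic polyp'}]}  # illustrative payload
response = requests.post('http://localhost:8000/annotate/colon/en/labels/', json=report)
print(response.status_code, response.json())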
/sket_server/sket_rest_config/__init__.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import os
4 | import sys
5 | 
6 | workpath = os.path.dirname(os.path.abspath(__file__))  # directory this file is in
7 | sys.path.insert(0, os.path.join(workpath, '../../'))  # make the top-level sket package importable
8 | 
9 | from sket.sket import SKET
10 | 
11 | print('start sket initialization')
12 | # load the matching-model switches and similarity threshold from config.json
13 | config_path = os.path.join(workpath, './config.json')
14 | with open(config_path, 'r') as f:
15 |     data = json.load(f)
16 | st = time.time()
17 | # positional arguments: use case, language, spaCy model, then the config.json keys in order
18 | sket_pipe = SKET('colon', 'en', 'en_core_sci_sm', data['w2v_model'], data['fasttext_model'], data['bert_model'], data['string_model'], data['gpu'])
19 | end = time.time()
20 | print('sket initialization completed in: ', str(end - st), ' seconds')
21 | 
--------------------------------------------------------------------------------
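Because this __init__.py instantiates SKET once at import time, any module can reuse the preloaded pipeline instead of paying the start-up cost again, exactly as views.py does. A minimal sketch of such a reuse follows, run from an environment where the package imports (e.g., the repository root); the payload is illustrative, and the positional call, including the two trailing booleans, is simply carried over from the views.py call site.

from sket_server.sket_rest_config import sket_pipe

report = {'reports': [{'diagnosis': 'colon adenocarcinoma'}]}  # illustrative payload
# positional arguments mirror views.py: data, language, use_case, thr, store, rdf_format
concepts, labels, rdf_graphs = sket_pipe.med_pipeline(report, 'en', 'colon', 0.9, False, 'turtle', False, False)
print(labels)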
/sket_server/sket_rest_config/asgi.py:
--------------------------------------------------------------------------------
1 | """
2 | ASGI config for sket_rest project.
3 | 
4 | It exposes the ASGI callable as a module-level variable named ``application``.
5 | 
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/3.2/howto/deployment/asgi/
8 | """
9 | 
10 | import os
11 | 
12 | from django.core.asgi import get_asgi_application
13 | 
14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'sket_server.sket_rest_config.settings')
15 | 
16 | application = get_asgi_application()
17 | 
--------------------------------------------------------------------------------
/sket_server/sket_rest_config/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "w2v_model":true,
3 | "fasttext_model": null,
4 | "bert_model":null,
5 | "string_model": false,
6 | "gpu":null,
7 | "thr":0.9
8 | }
9 | 
--------------------------------------------------------------------------------
/sket_server/sket_rest_config/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for sket_rest project.
3 | 
4 | Generated by 'django-admin startproject' using Django 3.2.7.
5 | 
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/3.2/topics/settings/
8 | 
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/3.2/ref/settings/
11 | """
12 | 
13 | from pathlib import Path
14 | import os
15 | # Build paths inside the project like this: BASE_DIR / 'subdir'.
16 | BASE_DIR = Path(__file__).resolve().parent.parent 17 | 18 | # Quick-start development settings - unsuitable for production 19 | # See https://docs.djangoproject.com/en/3.2/howto/deployment/checklist/ 20 | 21 | # SECURITY WARNING: keep the secret key used in production secret! 22 | SECRET_KEY = 'django-insecure-co+vta_0^gtvr&-m7@254lf)3zw!i!yel^=bd9b7yjf_@&h6-t' 23 | 24 | # SECURITY WARNING: don't run with debug turned on in production! 25 | DEBUG = True 26 | 27 | ALLOWED_HOSTS = ['*'] 28 | 29 | 30 | # Application definition 31 | 32 | INSTALLED_APPS = [ 33 | 'django.contrib.admin', 34 | 'django.contrib.auth', 35 | 'django.contrib.contenttypes', 36 | 'django.contrib.sessions', 37 | 'django.contrib.messages', 38 | 'django.contrib.staticfiles', 39 | 'sket_server.sket_rest_app.apps.SketRestAppConfig', 40 | 'rest_framework', 41 | ] 42 | 43 | MIDDLEWARE = [ 44 | 'django.middleware.security.SecurityMiddleware', 45 | 'django.contrib.sessions.middleware.SessionMiddleware', 46 | 'django.middleware.common.CommonMiddleware', 47 | 'django.middleware.csrf.CsrfViewMiddleware', 48 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 49 | 'django.contrib.messages.middleware.MessageMiddleware', 50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 51 | ] 52 | 53 | ROOT_URLCONF = 'sket_server.sket_rest_config.urls' 54 | 55 | TEMPLATES = [ 56 | { 57 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 58 | 'DIRS': [BASE_DIR / 'templates'] 59 | , 60 | 'APP_DIRS': True, 61 | 'OPTIONS': { 62 | 'context_processors': [ 63 | 'django.template.context_processors.debug', 64 | 'django.template.context_processors.request', 65 | 'django.contrib.auth.context_processors.auth', 66 | 'django.contrib.messages.context_processors.messages', 67 | ], 68 | }, 69 | }, 70 | ] 71 | 72 | WSGI_APPLICATION = 'sket_server.sket_rest_config.wsgi.application' 73 | REST_FRAMEWORK = { 74 | 'DEFAULT_AUTHENTICATION_CLASSES': [], 75 | 'DEFAULT_PERMISSION_CLASSES': [], 76 | } 77 | 78 | # Database 79 | # https://docs.djangoproject.com/en/3.2/ref/settings/#databases 80 | 81 | 82 | 83 | # Password validation 84 | # https://docs.djangoproject.com/en/3.2/ref/settings/#auth-password-validators 85 | 86 | AUTH_PASSWORD_VALIDATORS = [ 87 | { 88 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 89 | }, 90 | { 91 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 92 | }, 93 | { 94 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 95 | }, 96 | { 97 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 98 | }, 99 | ] 100 | 101 | 102 | # Internationalization 103 | # https://docs.djangoproject.com/en/3.2/topics/i18n/ 104 | 105 | LANGUAGE_CODE = 'en-us' 106 | 107 | TIME_ZONE = 'UTC' 108 | 109 | USE_I18N = True 110 | 111 | USE_L10N = True 112 | 113 | USE_TZ = True 114 | 115 | 116 | # Static files (CSS, JavaScript, Images) 117 | # https://docs.djangoproject.com/en/3.2/howto/static-files/ 118 | 119 | STATIC_URL = '/static/' 120 | 121 | # Default primary key field type 122 | # https://docs.djangoproject.com/en/3.2/ref/settings/#default-auto-field 123 | 124 | DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' 125 | -------------------------------------------------------------------------------- /sket_server/sket_rest_config/urls.py: -------------------------------------------------------------------------------- 1 | """sket_rest URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. 
For more information please see: 4 | https://docs.djangoproject.com/en/3.2/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: path('', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.urls import include, path 14 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 15 | """ 16 | from django.contrib import admin 17 | from django.urls import path 18 | from django.urls import include 19 | 20 | urlpatterns = [ 21 | path('', include('sket_server.sket_rest_app.urls')), 22 | path('api-auth/', include('rest_framework.urls', namespace='rest_framework')) 23 | ] 24 | -------------------------------------------------------------------------------- /sket_server/sket_rest_config/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for sket_rest project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.2/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'sket_server.sket_rest_config.settings') 15 | 16 | application = get_wsgi_application() 17 | --------------------------------------------------------------------------------
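Closing the loop on the label-conversion helpers listed earlier, here is a toy end-to-end run of the colon converters, assuming they are importable from sket.utils.utils with the repository root on sys.path; the report id and concept tuple are illustrative, and only index 1 of each concept tuple is read by the converter.

from sket.utils.utils import colon_concepts2labels, colon_labels2binary

# one toy colon report whose diagnosis section holds a single (IRI, name) concept pair
report_concepts = {'report_1': {'Diagnosis': [('https://example.org/concept/adenocarcinoma', 'Colon adenocarcinoma')]}}
report_labels = colon_concepts2labels(report_concepts)
# -> {'report_1': {'cancer': 1, 'hgd': 0, 'lgd': 0, 'hyperplastic': 0, 'ni': 0}}
binary_labels = colon_labels2binary(report_labels)
# -> {'report_1': {'cancer_or_dysplasia': 1, 'other': 0}}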