├── NLPreprocessing
    ├── __init__.py
    ├── text_process
    │   ├── __init__.py
    │   ├── text_case_formatter.py
    │   └── text_special_cases.py
    ├── .gitignore
    ├── README.md
    ├── file_utils
    │   ├── nlp_io.py
    │   ├── create_sent_map_files.py
    │   └── create_train_dev_test_set.py
    ├── LICENSE
    └── annotation2BIO.py
├── ClinicalTransformerRelationExtraction
    ├── src
    │   ├── __init__.py
    │   ├── data_processing
    │   │   ├── __init__.py
    │   │   ├── data_format_conf.py
    │   │   ├── io_utils.py
    │   │   └── post_processing.py
    │   ├── config.py
    │   ├── relation_extraction_json.py
    │   ├── utils.py
    │   ├── model_utils.py
    │   ├── run_app.py
    │   ├── relation_extraction.py
    │   ├── models.py
    │   └── data_utils.py
    ├── requirements.txt
    ├── .gitignore
    ├── run_json.sh
    ├── config_experiment_sample.json
    ├── LICENSE
    ├── run.sh
    └── readme.md
├── .gitignore
├── ClinicalTransformerNER
    ├── src
    │   ├── __init__.py
    │   ├── common_utils
    │   │   ├── __init__.py
    │   │   ├── common_config.py
    │   │   ├── common_log.py
    │   │   ├── common_io.py
    │   │   ├── output_format_converter.py
    │   │   └── bio_prf_eval.py
    │   ├── eval_scripts
    │   │   ├── __init__.py
    │   │   └── old_bio_eval.py
    │   ├── transformer_ner
    │   │   ├── __init__.py
    │   │   ├── transfomer_log.py
    │   │   ├── test_transfomer.py
    │   │   └── model_utils.py
    │   ├── run_format_bio_output.py
    │   ├── run_transformer_batch_prediction.py
    │   └── run_transformer_ner.py
    ├── .gitignore
    ├── requirements.txt
    ├── LICENSE
    ├── run_transformer_batch_prediction.sh
    ├── run_transformer_ner.sh
    └── README.md
├── requirements.txt
├── LICENSE
├── scipts
    ├── compare_ner.py
    ├── run_ner.py
    ├── get_ann.py
    ├── run_pred.sh
    ├── training_process.sh
    ├── get_statistics.py
    ├── training_ner.py
    └── make_relation.py
└── README.md


/NLPreprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/NLPreprocessing/text_process/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea/
3 | data/
4 | models/
5 | 


--------------------------------------------------------------------------------
/NLPreprocessing/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea
3 | __pycache__


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/data_processing/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | transformers
3 | tqdm
4 | numpy
5 | scikit-learn
6 | packaging


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/common_utils/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/eval_scripts/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/common_utils/common_config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/transformer_ner/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.pyc
3 | .idea/
4 | __pycahce__
5 | new_ner_model
6 | /.python-version


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/requirements.txt:
--------------------------------------------------------------------------------
1 | torch>=1.2.0
2 | transformers>=3.1.0
3 | tqdm>=4.36.1
4 | numpy
5 | scikit-learn
6 | packaging


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea
3 | __pycache__
4 | /data/
5 | .ipynb_checkpoints/
6 | /notebook/
7 | /test/
8 | 


--------------------------------------------------------------------------------
/NLPreprocessing/README.md:
--------------------------------------------------------------------------------
1 | # NLPpreprocessing
2 | A comprehensive NLP preprocessing package for clinical notes sentence boundary detection, tokenization
3 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/requirements.txt:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 
4 | torch>=1.6.0
5 | transformers==3.1.0
6 | tqdm>=4.36.1
7 | numpy==1.16.0
8 | packaging
9 | 


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/run_json.sh:
--------------------------------------------------------------------------------
1 | # example of training using json config to initialize all experiment parameters
2 | export CUDA_VISIBLE_DEVICES=1
3 | 
4 | python ./src/relation_extraction_json.py \
5 | 		--config_json "./config_experiment_sample.json"


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/data_processing/data_format_conf.py:
--------------------------------------------------------------------------------
1 | NON_RELATION_TAG = "NonRel"
2 | BRAT_REL_TEMPLATE = "R{}\t{} Arg1:{} Arg2:{}"
3 | EN1_START = "[s1]"
4 | EN1_END = "[e1]"
5 | EN2_START = "[s2]"
6 | EN2_END = "[e2]"
7 | SPEC_TAGS = [EN1_START, EN1_END, EN2_START, EN2_END]


--------------------------------------------------------------------------------
/NLPreprocessing/file_utils/nlp_io.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle as pkl
 3 | 
 4 | 
 5 | def make_dir(mdir):
 6 |     if not os.path.isdir(mdir):
 7 |         os.mkdir(mdir)
 8 | 
 9 | 
10 | def pkl_dump(data, file):
11 |     with open(file, "wb") as f:
12 |         pkl.dump(data, f)
13 | 
14 | 
15 | def pkl_load(file):
16 |     with open(file, "rb") as f:
17 |         data = pkl.load(f)
18 |     return data
19 | 
20 | 
21 | def read_file(file, encoding="utf-8"):
22 |     with open(file, "r", encoding=encoding) as f:
23 |         text = f.read().strip()
24 |     return text
25 | 
26 | 
27 | def write_file(text, file, encoding="utf-8"):
28 |     with open(file, "w", encoding=encoding) as f:
29 |         f.write(text)
30 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/transformer_ner/transfomer_log.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/python
 2 | #  -*- coding: utf-8 -*-
 3 | 
 4 | from common_utils.common_log import create_logger
 5 | import logging
 6 | from pathlib import Path
 7 | 
 8 | 
 9 | class TransformerNERLogger:
10 |     def __init__(self, logger_file=None, logger_level=logging.DEBUG):
11 |         self.lf = logger_file
12 |         self.lvl = logger_level
13 | 
14 |     def set_log_info(self, logger_file, logger_level):
15 |         self.lf = logger_file
16 |         self.lvl = logger_level
17 | 
18 |     def get_logger(self):
19 |         Path(self.lf).parent.mkdir(parents=True, exist_ok=True)
20 |         return create_logger("Transformer_NER", log_level=self.lvl, set_file=self.lf)
21 | 


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/data_processing/io_utils.py:
--------------------------------------------------------------------------------
 1 | import pickle as pkl
 2 | import json
 3 | 
 4 | 
 5 | def load_text(ifn):
 6 |     with open(ifn, "r") as f:
 7 |         txt = f.read()
 8 |     return txt
 9 | 
10 | 
11 | def save_text(text, ofn):
12 |     with open(ofn, "w") as f:
13 |         f.write(text)
14 | 
15 | 
16 | def pkl_save(data, file):
17 |     with open(file, "wb") as f:
18 |         pkl.dump(data, f, protocol=pkl.HIGHEST_PROTOCOL)
19 | 
20 | 
21 | def pkl_load(file):
22 |     with open(file, "rb") as f:
23 |         data = pkl.load(f)
24 |     return data
25 | 
26 | 
27 | def load_json(file):
28 |     with open(file, "r") as f:
29 |         data = json.load(f)
30 |     return data
31 | 
32 | 
33 | def save_json(data, file):
34 |     with open(file, "w") as f:
35 |         json.dump(data, f)
36 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/common_utils/common_log.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | import logging
 4 | 
 5 | LOG_LVLs = {
 6 |     'i': logging.INFO,
 7 |     'd': logging.DEBUG,
 8 |     'e': logging.ERROR,
 9 |     'w': logging.WARN
10 | }
11 | 
12 | 
13 | def create_logger(logger_name="", log_level="d", set_file=None):
14 |     logger = logging.getLogger(logger_name)
15 |     formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
16 |     logger.setLevel(LOG_LVLs[log_level])
17 |     if set_file:
18 |         fh = logging.FileHandler(set_file)
19 |         fh.setFormatter(formatter)
20 |         fh.setLevel(LOG_LVLs[log_level])
21 |         logger.addHandler(fh)
22 |     else:
23 |         ch = logging.StreamHandler()
24 |         ch.setFormatter(formatter)
25 |         ch.setLevel(LOG_LVLs[log_level])
26 |         logger.addHandler(ch)
27 | 
28 |     return logger
29 | 


--------------------------------------------------------------------------------
/NLPreprocessing/text_process/text_case_formatter.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | 
 4 | def all2lower(ifn):
 5 |     idx = ifn.rfind(".")
 6 |     ofn = ifn[:idx] + ".lower.txt"
 7 | 
 8 |     with open(ifn, "r") as fr, open(ofn, "w") as fw:
 9 |         for i, line in enumerate(fr):
10 |             nline = " ".join([w.lower() for w in line.split(" ")])
11 |             fw.write(nline)
12 | 
13 | 
14 | def all2upper(ifn):
15 |     idx = ifn.rfind(".")
16 |     ofn = ifn[:idx] + ".upper.txt"
17 | 
18 |     with open(ifn, "r") as fr, open(ofn, "w") as fw:
19 |         for i, line in enumerate(fr):
20 |             nline = " ".join([w.upper() for w in line.split(" ")])
21 |             fw.write(nline)
22 | 
23 | 
24 | def all2capitalized(ifn):
25 |     idx = ifn.rfind(".")
26 |     ofn = ifn[:idx] + ".capitalized.txt"
27 | 
28 |     with open(ifn, "r") as fr, open(ofn, "w") as fw:
29 |         for i, line in enumerate(fr):
30 |             nline = " ".join([w.capitalize() for w in line.split(" ")])
31 |             fw.write(nline)


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/config_experiment_sample.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "model_type": "deberta",
 3 |   "data_format_mode": 0,
 4 |   "classification_scheme": 2,
 5 |   "pretrained_model": "microsoft/deberta-base",
 6 |   "data_dir": "../sample_data",
 7 |   "new_model_dir": "../deberta_re_model",
 8 |   "predict_output_file": "../deberta_re_predict.txt",
 9 |   "overwrite_model_dir": true,
10 |   "seed": 1234,
11 |   "max_seq_length": 128,
12 |   "cache_data": false,
13 |   "data_file_header": true,
14 |   "do_train": true,
15 |   "do_eval": false,
16 |   "do_predict": true,
17 |   "do_lower_case": true,
18 |   "train_batch_size": 2,
19 |   "eval_batch_size": 32,
20 |   "learning_rate": 1e-05,
21 |   "num_train_epochs": 5,
22 |   "gradient_accumulation_steps": 1,
23 |   "do_warmup": true,
24 |   "warmup_ratio": 0.1,
25 |   "weight_decay": 0.0,
26 |   "adam_epsilon": 1e-08,
27 |   "max_grad_norm": 1.0,
28 |   "max_num_checkpoints": 0,
29 |   "log_file": null,
30 |   "log_lvl": "i",
31 |   "log_step": 2,
32 |   "num_core": 4,
33 |   "non_relation_label": "nonRel",
34 |   "progress_bar": false,
35 |   "fp16": false,
36 |   "fp16_opt_level": "O1"
37 | }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 uf-hobi-informatics-lab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NLPreprocessing/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 uf-hobi-informatics-lab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 uf-hobi-informatics-lab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 uf-hobi-informatics-lab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/scipts/compare_ner.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from pathlib import Path
 3 | import shutil
 4 | 
 5 | gs_root=Path(str(sys.argv[1]))
 6 | bert_root=Path(str(sys.argv[2]))
 7 | save_root=Path(str(sys.argv[3]))
 8 | save_root.mkdir(parents=True, exist_ok=True)
 9 | for k in gs_root.glob('*.ann'):
10 |     fid=k.stem
11 |     txt_fn = gs_root / (fid + ".txt")
12 |     ann_fn = gs_root / (fid + ".ann")
13 |     txt_fn1 = save_root / (fid + ".txt")
14 |     ann_fn1 = save_root / (fid + ".ann")
15 |     shutil.copyfile(txt_fn, txt_fn1)
16 |     shutil.copyfile(ann_fn, ann_fn1)
17 | 
18 | for k in save_root.glob('*.ann'):
19 |     #print(k.stem)
20 |     with open(bert_root/(k.stem+'.ann')) as f:
21 |         lines=f.readlines()
22 |         lines_used=[]
23 |         i=300
24 |         for line in lines:
25 |             if line[0]=='T':
26 |                 entity_name=line.split('\t',2)[1].split(' ',1)[0]
27 |                 entity_num=line.split('\t',2)[1].split(' ',1)[1]
28 |                 #print(entity_name)
29 |                 lines_used = lines_used+['T'+str(i)+'\t'+entity_name+'_predicted '+entity_num+'\t'+line.split('\t',2)[2]]
30 |                 i+=1
31 |         with open(k, "a") as f1:
32 |             f1.writelines(lines_used)


--------------------------------------------------------------------------------
/ClinicalTransformerNER/run_transformer_batch_prediction.sh:
--------------------------------------------------------------------------------
 1 | : '
 2 | The script is used to run multi-file batch prediction using transformer ner
 3 | We only use bert as example, the roberta, XLNet should be the same
 4 | The input files must have offset information
 5 | If no offset information, just combine all the files into one test.txt and use the do_pred from run_transformer_ner.sh for prediction
 6 | This script is design for mainly production using to generate brat/BioC formatted outputs with offset information.
 7 | '
 8 | 
 9 | ################# BERT example #####################
10 | export CUDA_VISIBLE_DEVICES=0
11 | 
12 | # config and tokenizer information can be found in the pretrained model dir
13 | # use format 1 for BRAT, 2 for BioC, 0 as default for BIO
14 | python ./src/run_transformer_batch_prediction.py \
15 |       --model_type bert \
16 |       --pretrained_model <your pretrained model path> \
17 |       --raw_text_dir <path to the original text files> \
18 |       --preprocessed_text_dir <path to the bio formatted files> \
19 |       --output_dir <path to save predicted results> \
20 |       --max_seq_length 128 \
21 |       --do_lower_case \
22 |       --eval_batch_size 8 \
23 |       --log_file ./log.txt\
24 |       --do_format 1 \
25 |       --do_copy \
26 |       --data_has_offset_information


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/config.py:
--------------------------------------------------------------------------------
 1 | from transformers import (BertConfig, RobertaConfig, XLNetConfig, AlbertConfig, LongformerConfig,
 2 |                           BertTokenizer, RobertaTokenizer, XLNetTokenizer, AlbertTokenizer, LongformerTokenizer,
 3 |                           DebertaConfig, DebertaTokenizer)
 4 | from models import (BertForRelationIdentification, RoBERTaForRelationIdentification,
 5 |                     XLNetForRelationIdentification, AlbertForRelationIdentification,
 6 |                     LongFormerForRelationIdentification, DebertaForRelationIdentification)
 7 | 
 8 | 
 9 | EN1_START = "[s1]"
10 | EN1_END = "[e1]"
11 | EN2_START = "[s2]"
12 | EN2_END = "[e2]"
13 | # keep the seq order
14 | SPEC_TAGS = [EN1_START, EN1_END, EN2_START, EN2_END]
15 | 
16 | MODEL_REQUIRE_SEGMENT_ID = {'bert', 'xlnet', 'albert', 'deberta'}
17 | 
18 | MODEL_DICT = {
19 |     "bert": (BertForRelationIdentification, BertConfig, BertTokenizer),
20 |     "roberta": (RoBERTaForRelationIdentification, RobertaConfig, RobertaTokenizer),
21 |     "xlnet": (XLNetForRelationIdentification, XLNetConfig, XLNetTokenizer),
22 |     "albert": (AlbertForRelationIdentification, AlbertConfig, AlbertTokenizer),
23 |     "longformer": (LongFormerForRelationIdentification, LongformerConfig, LongformerTokenizer),
24 |     "deberta": (DebertaForRelationIdentification, DebertaConfig, DebertaTokenizer)
25 | }
26 | 
27 | TOKENIZER_USE_FOUR_SPECIAL_TOKs = {'roberta', 'longformer'}


--------------------------------------------------------------------------------
/scipts/run_ner.py:
--------------------------------------------------------------------------------
 1 | #run NER
 2 | import sys
 3 | sys.path.append("../ClinicalTransformerNER/")
 4 | sys.path.append("../NLPreprocessing/")
 5 | import os
 6 | from pathlib import Path
 7 | from collections import defaultdict, Counter
 8 | import numpy as np
 9 | from sklearn.model_selection import train_test_split
10 | import shutil 
11 | import fileinput
12 | from annotation2BIO import generate_BIO, pre_processing, read_annotation_brat, BIOdata_to_file
13 | MIMICIII_PATTERN = "\[\*\*|\*\*\]"
14 | 
15 | 
16 | data_dir=sys.argv[1]
17 | output_name=sys.argv[2]
18 | 
19 | #data stat
20 | file_ids = set()
21 | enss = []
22 | 
23 | for fn in Path(data_dir).glob("*.ann"):
24 |     file_ids.add(fn.stem)
25 |     _, ens, _ = read_annotation_brat(fn)
26 |     #print( _)
27 |     enss.extend(ens)
28 | 
29 | print("number of test files: ", len(file_ids))
30 | print("total number of test eneitites: ", len(enss))
31 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()]))
32 | 
33 | # generate bio
34 | test_root = Path(data_dir)
35 | test_bio = "../temp/"+output_name
36 | output_root = Path(test_bio)
37 | output_root.mkdir(parents=True, exist_ok=True)
38 | 
39 | for fn in test_root.glob("*.txt"):
40 |     txt_fn = fn
41 |     bio_fn = output_root / (fn.stem + ".bio.txt")
42 |     
43 |     txt, sents = pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN)
44 |     nsents, sent_bound = generate_BIO(sents, [], file_id=txt_fn, no_overlap=False)
45 |     
46 |     BIOdata_to_file(bio_fn, nsents)
47 | 
48 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/run_format_bio_output.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | The script is used to format the BIO output to target output format like BRAT
 6 | The script is used to help all the prediction without format
 7 | """
 8 | 
 9 | import argparse
10 | import traceback
11 | from pathlib import Path
12 | from common_utils.output_format_converter import main as format_converter
13 | 
14 | 
15 | def main(args):
16 |     base_path = Path(args.bio_dir)
17 |     output_formatted_dir = base_path.parent / f"{base_path.stem}_formatted_output"
18 |     output_formatted_dir.mkdir(parents=True, exist_ok=True)
19 |     format_converter(text_dir=args.raw_text_dir,
20 |                      input_bio_dir=args.bio_dir,
21 |                      output_dir=output_formatted_dir,
22 |                      formatter=args.do_format,
23 |                      do_copy_text=args.do_copy)
24 | 
25 | 
26 | if __name__ == '__main__':
27 |     parser = argparse.ArgumentParser()
28 |     parser.add_argument("--raw_text_dir", type=str, required=True,
29 |                         help="The input data directory.")
30 |     parser.add_argument("--bio_dir", type=str, required=True,
31 |                         help="The output data directory.")
32 |     parser.add_argument("--do_format", default=0, type=int,
33 |                         help="0=bio (not format change will be applied); 1=brat; 2=bioc")
34 |     parser.add_argument("--do_copy", action='store_true',
35 |                         help="if copy the original plain text to output folder")
36 |     global_args = parser.parse_args()
37 | 
38 |     try:
39 |         main(global_args)
40 |     except Exception as ex:
41 |         traceback.print_exc()


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/common_utils/common_io.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import pickle as pkl
 4 | import json
 5 | 
 6 | 
 7 | def read_from_file(ifn):
 8 |     with open(ifn, "r") as f:
 9 |         text = f.read()
10 |     return text
11 | 
12 | 
13 | def write_to_file(text, ofn):
14 |     with open(ofn, "w") as f:
15 |         f.write(text)
16 |     return True
17 | 
18 | 
19 | def pkl_load(ifn):
20 |     with open(ifn, "rb") as f:
21 |         pdata = pkl.load(f)
22 |     return pdata
23 | 
24 | 
25 | def pkl_dump(pdata, ofn):
26 |     with open(ofn, "wb") as f:
27 |         pkl.dump(pdata, f)
28 |     return True
29 | 
30 | 
31 | def json_load(ifn):
32 |     with open(ifn, "r") as f:
33 |         jdata = json.load(f)
34 |     return jdata
35 | 
36 | 
37 | def json_dump(jdata, ofn):
38 |     with open(ofn, "w") as f:
39 |         json.dump(jdata, f)
40 |     return True
41 | 
42 | 
43 | def load_bio_file_into_sents(bio_file, word_sep=" ", do_lower=False):
44 |     bio_text = read_from_file(bio_file)
45 |     bio_text = bio_text.strip()
46 |     if do_lower:
47 |         bio_text = bio_text.lower()
48 | 
49 |     new_sents = []
50 |     sents = bio_text.split("\n\n")
51 | 
52 |     for sent in sents:
53 |         new_sent = []
54 |         words = sent.split("\n")
55 |         for word in words:
56 |             new_word = word.split(word_sep)
57 |             new_sent.append(new_word)
58 |         new_sents.append(new_sent)
59 | 
60 |     return new_sents
61 | 
62 | 
63 | def output_bio(bio_data, output_file, sep=" "):
64 |     with open(output_file, "w") as f:
65 |         for sent in bio_data:
66 |             for word in sent:
67 |                 line = sep.join(word)
68 |                 f.write(line)
69 |                 f.write("\n")
70 |             f.write("\n")
71 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Requirement
 2 | - python env: 3.8+
 3 | - use ```pip install -r requirements.txt``` to install dependencies
 4 | 
 5 | 
 6 | # Models
 7 | - we have two models trained for NER and Relation
 8 | - Both models base on BERT architecture with different classifiers
 9 | - we provide models on Huggingface:
10 | -   NER:https://huggingface.co/nvbic/SODA_BERT_NER
11 | -   RE:https://huggingface.co/nvbic/SODA_BERT_RE
12 | - contact: zehao.yu@ufl.edu; alexgre@ufl.edu; yonghui.wu@ufl.edu
13 | 
14 | 
15 | # SDoH_NLPend2end System
16 | - The system aims for extract SDoH information from clinical notes
17 | - We support text format for production and brat format for evaluation
18 | - The system is a two stage pipeline
19 |   - The first stage is to extract SDoH concepts
20 |   - The second stage is to identify relations between extracted concepts
21 |   
22 | 
23 | # Usage
24 | - download the models and unzip into this project root directory, you should have:
25 |     - ./models/ner_bert
26 |     - ./models/re_bert
27 | - then, cd to the ```./scripts``` directory
28 | - execute pipeline as 
29 | ```shell
30 | bash run_pred.sh -i <input data directory> -c gpu_id
31 | ```
32 | - "input data directory" is the location of the data you annotated (*.txt and *.ann) e.g., ./test_data
33 | - gpu_id is the id where you want to run the program. e.g, 0 - use the GPU with id as 0
34 | - if GPU is not available, try -1 to use CPU which is slow but should work.
35 | 
36 | 
37 | # Results
38 | - in the main directory (./SDoH_NLPend2end), we will create three directories for outputs
39 | - the first is ./logs which saves all the running logs
40 | - the second is ./temp which saves all the intermediate generated files
41 | - the third is ./results where the eval_results.txt stores the final performance measurement and the rest directories are the e2e outputs
42 | 


--------------------------------------------------------------------------------
/NLPreprocessing/file_utils/create_sent_map_files.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | from annotation2BIO import pre_processing
 4 | import logging
 5 | logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.DEBUG)
 6 | logger = logging.getLogger('pre_processing clinical notes')
 7 | 
 8 | 
 9 | def token2file(fw, sents):
10 |     for sent in sents:
11 |         for word in sent:
12 |             new_line = "\t".join(map(lambda x: str(x),
13 |                                      [word[0], word[1][0], word[1][1], word[2][0], word[2][1], "\n"]))
14 |             fw.write(new_line)
15 | 
16 | 
17 | def output_mapping_sent_files(raw_data_dir, output_dir, deid_pattern=None):
18 |     raw_data_dir = raw_data_dir
19 |     output_dir = output_dir
20 | 
21 |     if not os.path.isdir(raw_data_dir):
22 |         raise RuntimeError("Input data source directory is not exist.")
23 | 
24 |     if not os.path.isdir(output_dir):
25 |         os.mkdir(output_dir)
26 | 
27 |     for input_file in os.listdir(raw_data_dir):
28 |         logger.info(f'Current processing {input_file}')
29 | 
30 |         output_sent_file = "".join([output_dir, "/", input_file.split(".")[0], ".sent.txt"])
31 |         output_map_file = "".join([output_dir, "/", input_file.split(".")[0], ".map.txt"])
32 |         input_file = "".join([raw_data_dir, "/", input_file])
33 | 
34 |         with open(output_map_file, "w") as fw_map, open(output_sent_file, "w") as fw_sent:
35 |             sents, tokens = pre_processing(input_file, deid_pattern=deid_pattern)
36 |             fw_sent.write(sents)
37 |             token2file(fw_map, tokens)
38 | 
39 | 
40 | if __name__ == '__main__':
41 |     # output_mapping_sent_files("data_sample/test", "data_sample/test_output", deid_pattern="\[\*\*|\*\*\]")
42 |     assert len(sys.argv) == 4, "must provide input, output file directories and de-identifier pattern using # if None"
43 |     if sys.argv[3] == '#':
44 |         dp = None
45 |     else:
46 |         dp = sys.argv[3]
47 |     output_mapping_sent_files(sys.argv[1], sys.argv[2], deid_pattern=dp)
48 | 


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/run.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=0
 2 | 
 3 | bz=4
 4 | epn=3
 5 | sc=2
 6 | dfmm=0
 7 | model_type=bert
 8 | pm=bert-base-uncased
 9 | data_dir=/home/zehao.yu/workspace/py3/data/dr_relation_aio_th1
10 | nmd=./new_model
11 | pof=./predictions.txt
12 | log=./logs/log_1.txt
13 | 
14 | python3 ./src/relation_extraction.py \
15 | 		--model_type $model_type \
16 | 		--data_format_mode $dfmm \
17 | 		--classification_scheme $sc \
18 | 		--pretrained_model $pm \
19 | 		--data_dir $data_dir \
20 | 		--new_model_dir $nmd \
21 | 		--predict_output_file $pof \
22 | 		--overwrite_model_dir \
23 | 		--seed 13 \
24 | 		--max_seq_length 512 \
25 | 		--cache_data \
26 | 		--do_train \
27 | 		--do_predict \
28 | 		--do_lower_case \
29 | 		--train_batch_size $bz \
30 | 		--eval_batch_size $bz \
31 | 		--learning_rate 1e-5 \
32 | 		--num_train_epochs $epn \
33 | 		--gradient_accumulation_steps 1 \
34 | 		--do_warmup \
35 | 		--warmup_ratio 0.1 \
36 | 		--weight_decay 0 \
37 | 		--max_num_checkpoints 1 \
38 | 		--log_file $log \
39 | 
40 | 
41 | # example of testing and convert predictions to brat
42 | export CUDA_VISIBLE_DEVICES=1
43 | python3 ./src/relation_extraction.py \
44 | 		--model_type $model_type \
45 | 		--data_format_mode $dfmm \
46 | 		--classification_scheme $sc \
47 | 		--pretrained_model $pm \
48 | 		--data_dir $data_dir \
49 | 		--new_model_dir $nmd \
50 | 		--predict_output_file $pof \
51 | 		--overwrite_model_dir \
52 | 		--seed 13 \
53 | 		--max_seq_length 512 \
54 | 		--cache_data \
55 | 		--do_predict \
56 | 		--do_lower_case \
57 | 		--train_batch_size $bz \
58 | 		--eval_batch_size $bz \
59 | 		--learning_rate 1e-5 \
60 | 		--num_train_epochs $epn \
61 | 		--gradient_accumulation_steps 1 \
62 | 		--do_warmup \
63 | 		--warmup_ratio 0.1 \
64 | 		--weight_decay 0 \
65 | 		--max_num_checkpoints 1 \
66 | 		--log_file $log \
67 | 
68 | edr="./data_annotation_entity_only"
69 | pod="./predicted_results"
70 | python3 src/data_processing/post_processing.py \
71 | 		--mode mul \
72 | 		--predict_result_file $pof \
73 | 		--entity_data_dir $edr \
74 | 		--test_data_file ${data_dir}/test.tsv \
75 | 		--brat_result_output_dir $pod
76 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/run_transformer_ner.sh:
--------------------------------------------------------------------------------
 1 | : '
 2 | The script contains the example shell commands you can use to run the transformer_ner tasks
 3 | We include two groups of commands:
 4 | 1. train and predict: The commands in train and predict section demonstrated how to run training and prediction in sequence
 5 | 2. only predict: If you have a trained model, you can run prediction only following the commands in the only predict section
 6 | 3. the prediction here is only for one file (test.txt) prediction. If you need batch prediction on group of files, use run_transformer_batch_prediction.sh instead.
 7 | 
 8 | Each section contains an example for BERT
 9 | We do support ALBERT, DISTILBERT, XLNet, RoBERTa as well. You can find more model information at https://huggingface.co/transformers/pretrained_models.html.
10 | We did not include examples using fp16 training mode but you can train model with fp16 (read run_transformer_ner.py source code)
11 | We currently do not support distraibuted multi-GPU training since fine-tuning task is not heavy on most clinical NER datasets.
12 | '
13 | 
14 | ########################### train and predict ###########################
15 | # tell system which GPU to use
16 | export CUDA_VISIBLE_DEVICES=0
17 | 
18 | ########################### train and predict ###########################
19 | #bert
20 | python src/run_transformer_ner.py \
21 |       --model_type bert \
22 |       --pretrained_model bert-base-uncased \
23 |       --data_dir ./test_data/conll-2003 \
24 |       --new_model_dir ./new_bert_ner_model \
25 |       --overwrite_model_dir \
26 |       --predict_output_file ./bert_pred.txt \
27 |       --max_seq_length 256 \
28 |       --save_model_core \
29 |       --do_train \
30 |       --do_predict \
31 |       --model_selection_scoring strict-f_score-1 \
32 |       --do_lower_case \
33 |       --train_batch_size 8 \
34 |       --eval_batch_size 8 \
35 |       --train_steps 500 \
36 |       --learning_rate 1e-5 \
37 |       --num_train_epochs 1 \
38 |       --gradient_accumulation_steps 1 \
39 |       --do_warmup \
40 |       --seed 13 \
41 |       --warmup_ratio 0.1 \
42 |       --max_num_checkpoints 3 \
43 |       --log_file ./log.txt \
44 |       --progress_bar \
45 |       --early_stop 3


--------------------------------------------------------------------------------
/NLPreprocessing/text_process/text_special_cases.py:
--------------------------------------------------------------------------------
 1 | SYMBOLS = {',', '?', '!', ':', '\'', '"', '(', ')', ';', '@', '^', '^', '&', '&', '$', '$', '£',
 2 |            '[', ']', '{', '}', '<', '>', '+', '-', "*", "#", "%", "=", "~", '/', "_"}
 3 | 
 4 | PREP = {'about', 'above', 'across', 'after', 'against', 'aka', 'along', 'and', 'anti', 'apart', 'around', 'as',
 5 |          'astride', 'at', 'away', 'because', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond',
 6 |          'but', 'by', 'contra', 'down', 'due to', 'during', 'ex', 'except', 'excluding', 'following', 'for', 'from',
 7 |          'given', 'in', 'including', 'inside', 'into', 'like', 'near', 'nearby', 'neath', 'of', 'off', 'on', 'onto',
 8 |          'or', 'out', 'over', 'past', 'per', 'plus', 'since', 'so', 'than', 'though', 'through', 'til', 'to',
 9 |          'toward', 'towards', 'under', 'underneath', 'versus', 'via', 'where', 'while', 'with', 'within', 'without',
10 |          'also'}
11 | 
12 | DET = {'a', 'an', 'the'}
13 | 
14 | NON_STOP_PUNCT = {',', ';'}
15 | 
16 | STOP_PUNCT = {'.', '?', '!'}
17 | 
18 | SENT_WORD = {'we', 'us', 'patient', 'denies', 'reveals', 'no', 'none', 'he', 'she', 'his', 'her', 'they', 'them', 'is',
19 |              'was', 'who', 'when', 'where', 'which', 'are', 'be', 'have', 'had', 'has', 'this', 'will', 'that', 'the',
20 |              'to', 'in', 'with', 'for', 'an', 'and', 'but', 'or', 'as', 'at', 'of', 'have', 'it', 'that', 'by', 'from',
21 |              'on', 'include', 'other', 'another'}
22 | 
23 | UNIT = {'mg', 'lb', 'kg', 'mm', 'cm', 'm', 'doz', 'am', 'pm', 'mph', 'oz', 'ml', 'l', 'mb', 'mmHg', 'min', 'cm2', 'm2', 'M2',
24 |         'mm2', 'mL', 'F', 'ppd', 'L', 'g', 'cc', "MG", "Munits", "pack", "mcg", "K", "hrs", "N", "inch", "d",
25 |         "AM", "PM", "HS", "QAM", "QPM", "BID", "mEq", "hr", "cGy", "mGy", "mLs", "mOsm"}
26 | 
27 | MIMICIII_DEID_PATTERN = "\[\*\*|\*\*\]"
28 | 
29 | NAME_PREFIX_SUFFIX = {
30 |     'Dr', 'Mr', 'Mrs', 'Jr', 'Ms', 'Prof'
31 | }
32 | 
33 | PROFESSIONAL_TITLE = {
34 |     'M.D.', 'Ph.D.', 'Pharm.D.'
35 | }
36 | 
37 | SPECIAL_ABBV = {
38 |     'e.c.', 'p.o.', 'b.i.d.', 'p.r.n.', 'i.v.', 'i.m.', 'b.i.d', 'p.r.n', 'i.m', 'i.v', 'p.o', 'd.o.b', 'vo.', 'm.o',
39 |     'r.i.', 'y.o.'
40 | }
41 | 
42 | ROMAN_NUM = {
43 |     'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX'
44 | }
45 | 
46 | WHITE_LIST = {
47 |     'NaCl', 'KCl', 'HandiHaler', 'MetroCream', 'ChloraPrep', 'NovoLog', 'FlexPen', 'EpiPen', 'CellCept', 'iPad', 'eConsult', 'PreserVision'
48 | }


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/transformer_ner/test_transfomer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | import sys
 4 | from pathlib import Path
 5 | from transformer_ner.task import run_task
 6 | from transformer_ner.transfomer_log import TransformerNERLogger
 7 | 
 8 | 
 9 | class Args:
10 |     def __init__(self, model_type, pretrained_model):
11 |         self.model_type = model_type
12 |         self.pretrained_model = pretrained_model
13 |         self.config_name = self.pretrained_model
14 |         self.tokenizer_name = self.pretrained_model
15 |         self.do_lower_case = True
16 |         self.overwrite_model_dir = True
17 |         self.data_dir = Path(__file__).resolve().parent.parent.parent/'test_data/conll-2003'
18 |         self.data_has_offset_information = False
19 |         self.new_model_dir = Path(__file__).resolve().parent.parent.parent/f'new_ner_model/{model_type}_new_ner_model'
20 |         self.predict_output_file = Path(__file__).resolve().parent.parent.parent/f"new_ner_model/{model_type}_new_ner_model/pred.txt"
21 |         self.overwrite_output_dir = True
22 |         self.max_seq_length = 16
23 |         self.do_train = True
24 |         self.do_predict = True
25 |         self.model_selection_scoring = "strict-f_score-1"
26 |         self.train_batch_size = 4
27 |         self.eval_batch_size = 4
28 |         self.learning_rate = 0.00001
29 |         self.seed = 13
30 |         self.logger = TransformerNERLogger(
31 |             logger_level="i",
32 |             logger_file=Path(__file__).resolve().parent.parent.parent/"new_ner_model/log.txt").get_logger()
33 |         self.num_train_epochs = 2
34 |         self.gradient_accumulation_steps = 1
35 |         self.do_warmup = True
36 |         self.label2idx = None
37 |         self.idx2label = None
38 |         self.max_num_checkpoints = 1
39 |         self.warmup_ratio = 0.1
40 |         self.weight_decay = 0.0
41 |         self.adam_epsilon = 0.00000001
42 |         self.max_grad_norm = 1.0
43 |         self.log_file = None
44 |         self.log_lvl = None
45 |         self.fp16 = False
46 |         self.local_rank = -1
47 |         self.device = "cpu"
48 |         self.train_steps = 100
49 |         self.early_stop = -1
50 |         self.progress_bar = True
51 |         self.save_model_core = True
52 |         self.use_crf = False
53 | 
54 | 
55 | def test():
56 |     for each in [('deberta', "microsoft/deberta-base"),
57 |                  ('bert', 'bert-base-uncased'),
58 |                  ('roberta', 'roberta-base'),
59 |                  ('xlnet', 'xlnet-base-cased')]:
60 |         args = Args(each[0], each[1])
61 |         run_task(args)
62 | 
63 | 
64 | if __name__ == '__main__':
65 |     test()
66 | 


--------------------------------------------------------------------------------
/scipts/get_ann.py:
--------------------------------------------------------------------------------
 1 | run NER
 2 | import sys
 3 | sys.path.append("../ClinicalTransformerNER/")
 4 | sys.path.append("../NLPreprocessing/")
 5 | import os
 6 | from pathlib import Path
 7 | from collections import defaultdict, Counter
 8 | import numpy as np
 9 | from sklearn.model_selection import train_test_split
10 | import shutil 
11 | import fileinput
12 | from annotation2BIO import generate_BIO, pre_processing, read_annotation_brat, BIOdata_to_file
13 | MIMICIII_PATTERN = "\[\*\*|\*\*\]"
14 | 
15 | #check number of ann in 150/50 split
16 | train_dev_root1 = Path('../data/training_set_150')
17 | test_root1 = Path('../data/test_set_150')
18 | #data stat
19 | file_ids = set()
20 | enss = []
21 | 
22 | for fn in test_root1.glob("*.ann"):
23 |     file_ids.add(fn.stem)
24 |     _, ens, _ = read_annotation_brat(fn)
25 |     #print( _)
26 |     enss.extend(ens)
27 | print("150 files as training, test files: ", len(file_ids), list(file_ids)[:5])
28 | print("150 files as training, total test eneitites: ", len(enss))
29 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()]))
30 | 
31 | 
32 | file_ids = set()
33 | enss = []
34 | 
35 | for fn in train_dev_root1.glob("*.ann"):
36 |     file_ids.add(fn.stem)
37 |     _, ens, _ = read_annotation_brat(fn)
38 |     #print( _)
39 |     enss.extend(ens)
40 | print("150 files as training, training files: ", len(file_ids), list(file_ids)[:5])
41 | print("150 files as training, total training eneitites: ", len(enss))
42 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()]))
43 | 
44 | #check ann in 100/100 split
45 | 
46 | train_dev_root2 = Path('../data/training_set_100')
47 | test_root1 = Path('../data/test_set_100')
48 | #data stat
49 | file_ids = set()
50 | enss = []
51 | 
52 | for fn in test_root2.glob("*.ann"):
53 |     file_ids.add(fn.stem)
54 |     _, ens, _ = read_annotation_brat(fn)
55 |     #print( _)
56 |     enss.extend(ens)
57 | print("100 files as training, test files: ", len(file_ids), list(file_ids)[:5])
58 | print("100 files as training, total test eneitites: ", len(enss))
59 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()]))
60 | 
61 | 
62 | file_ids = set()
63 | enss = []
64 | 
65 | for fn in train_dev_root2.glob("*.ann"):
66 |     file_ids.add(fn.stem)
67 |     _, ens, _ = read_annotation_brat(fn)
68 |     #print( _)
69 |     enss.extend(ens)
70 | print("100 files as training, training files: ", len(file_ids), list(file_ids)[:5])
71 | print("100 files as training, total training eneitites: ", len(enss))
72 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()]))


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/relation_extraction_json.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import json
 3 | import torch
 4 | from utils import TransformerLogger
 5 | from relation_extraction import app as main_app
 6 | 
 7 | 
 8 | class Args:
 9 |     """
10 |         used to hold all parameters
11 |         actual parameters for experiments will be loaded from the user defined json config file
12 |     """
13 |     def __init__(self, **kwargs):
14 |         self.model_type = "bert"
15 |         self.data_format_mode = 0
16 |         self.classification_scheme = 2
17 |         self.pretrained_model = "bert-base-uncased" # microsoft/deberta-large; microsoft/deberta-xlarge
18 |         self.data_dir = "../sample_data"
19 |         self.new_model_dir = "./bert_re_model"
20 |         self.predict_output_file = "./bert_re_predict.txt"
21 |         self.overwrite_model_dir = True
22 |         self.seed = 1234
23 |         self.max_seq_length = 128
24 |         self.cache_data = False
25 |         self.data_file_header = True
26 |         self.do_train = True
27 |         self.do_eval = False
28 |         self.do_predict = True
29 |         self.do_lower_case = True
30 |         self.train_batch_size = 8
31 |         self.eval_batch_size = 32
32 |         self.learning_rate = 1e-5
33 |         self.num_train_epochs = 4
34 |         self.gradient_accumulation_steps = 1
35 |         self.do_warmup = True
36 |         self.warmup_ratio = 0.1
37 |         self.weight_decay = 0.0
38 |         self.adam_epsilon = 1e-8
39 |         self.max_grad_norm = 1.0
40 |         self.max_num_checkpoints = 0
41 |         self.log_file = "./bert_re_log_txt"
42 |         self.log_lvl = "i"
43 |         self.log_step = 100
44 |         self.num_core = 4
45 |         self.non_relation_label = "nonRel"
46 |         self.progress_bar = True
47 |         self.fp16 = False
48 |         self.fp16_opt_level = "O1"
49 | 
50 |         self.__update_args(**kwargs)
51 | 
52 |     def __update_args(self, **kwargs):
53 |         for k, v in kwargs.items():
54 |             setattr(self, k, v)
55 | 
56 |     def __repr__(self):
57 |         return repr(self.__dict__)
58 | 
59 | 
60 | def json2args(jsondata):
61 |     return Args(**jsondata)
62 | 
63 | 
64 | def app(gargs):
65 |     main_app(gargs)
66 | 
67 | 
68 | if __name__ == '__main__':
69 |     parser = argparse.ArgumentParser()
70 |     # parse arguments
71 |     parser.add_argument("--config_json", default="./config.json", type=str, required=True,
72 |                         help="json file for experiment configurations")
73 |     args = parser.parse_args()
74 | 
75 |     with open(args.config_json, "r") as f:
76 |         configs = json.load(f, object_hook=json2args)
77 | 
78 |     # other setup
79 |     configs.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
80 |     configs.logger = TransformerLogger(
81 |         logger_file=configs.log_file, logger_level=configs.log_lvl).get_logger()
82 | 
83 |     app(configs)
84 | 


--------------------------------------------------------------------------------
/NLPreprocessing/file_utils/create_train_dev_test_set.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | from numpy import random
 4 | from shutil import copyfile
 5 | import logging
 6 | 
 7 | logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.DEBUG)
 8 | logger = logging.getLogger('train_test_split')
 9 | 
10 | 
11 | def __create_files_list(src_dir):
12 |     assert os.path.isdir(src_dir), f"{src_dir} is not exist."
13 |     return list(map(lambda x: "/".join([src_dir, x]), os.listdir(src_dir))), os.listdir(src_dir)
14 | 
15 | 
16 | def train_test_ids_to_file(fids, dir, cate='train'):
17 |     file_name = f"{cate}_set_all_ids.txt"
18 | 
19 |     fids = list(map(lambda x: x.split('/')[-1], fids))
20 | 
21 |     with open(f"{dir}/{file_name}", "w") as f:
22 |         f.write("\n".join(fids))
23 | 
24 | 
25 | def __write2file(file_list, output_dir, output_file_name):
26 |     with open("/".join([output_dir, output_file_name]), "w") as f_tr:
27 |         for file in file_list:
28 |             file_id = file.split("/")[-1]
29 |             # f_tr.write("\t".join([f"__doc {file_id}__", "-1", "-1", "-1", "-1", "O", "\n\n"]))
30 |             f_tr.write(f"-DOCSTART- __doc {file_id}__\n\n")
31 |             with open(file, "r") as fr:
32 |                 txt = fr.read().strip()
33 |             f_tr.write(txt)
34 |             f_tr.write("\n\n")
35 | 
36 | 
37 | def create_train_test_sets(src_dir, test_proportion=0.2, merge=True, shuffle_num=3):
38 |     file_list, file_id_list = __create_files_list(src_dir)
39 | 
40 |     for _ in range(shuffle_num):
41 |         random.shuffle(file_list)
42 | 
43 |     slice_index = int(len(file_list) * test_proportion)
44 | 
45 |     test_set = file_list[:slice_index]
46 |     train_set = file_list[slice_index:]
47 |     logger.info(f"train set size: {len(train_set)}; test set size: {len(test_set)}")
48 | 
49 |     output_dir = "_".join([src_dir, "train_test_split"])
50 | 
51 |     if not os.path.isdir(output_dir):
52 |         os.mkdir(output_dir)
53 | 
54 |     # write train and test ids to files
55 |     train_test_ids_to_file(train_set, output_dir, "train")
56 |     train_test_ids_to_file(test_set, output_dir, "test")
57 | 
58 |     if not merge:
59 |         train_dir = "/".join([output_dir, "training_set"])
60 |         test_dir = "/".join([output_dir, "test_set"])
61 |         if not os.path.isdir(train_dir):
62 |             os.mkdir(train_dir)
63 |         if not os.path.isdir(test_dir):
64 |             os.mkdir(test_dir)
65 |         for file in train_set:
66 |             new_file = "/".join([train_dir, file.split("/")[-1]])
67 |             copyfile(file, new_file)
68 |         for file in test_set:
69 |             new_file = "/".join([test_dir, file.split("/")[-1]])
70 |             copyfile(file, new_file)
71 | 
72 |     __write2file(train_set, output_dir, "training_set.txt")
73 |     __write2file(test_set, output_dir, "testing_set.txt")
74 | 
75 | 
76 | def test():
77 |     # create_train_test_sets("data_sample/bio", test_proportion=0.5, merge=False)
78 |     # print(os.getcwd())
79 |     create_train_test_sets(src_dir="/Users/alexgre/workspace/py3/2019AMIA_DEID/2019amia_train_bio",
80 |                            test_proportion=0.25)
81 |     pass
82 | 
83 | 
84 | if __name__ == '__main__':
85 |     test()
86 | 


--------------------------------------------------------------------------------
/scipts/run_pred.sh:
--------------------------------------------------------------------------------
 1 | #run prediction from notes
 2 | #git clone https://github.com/uf-hobi-informatics-lab/ClinicalTransformerRelationExtraction.git
 3 | #git clone https://github.com/uf-hobi-informatics-lab/ClinicalTransformerNER.git
 4 | #git clone https://github.com/uf-hobi-informatics-lab/NLPreprocessing.git
 5 | 
 6 | while getopts :i:d:n:c: flag
 7 | do
 8 |     case "${flag}" in
 9 |         i) input_dir=${OPTARG};;
10 |         c) cuda=${OPTARG};;
11 |     esac
12 | done
13 | echo "Input dir: $input_dir";
14 | echo "CUDA used: $cuda";
15 | 
16 | output_dir=../results
17 | output_name=bio
18 | 
19 | 
20 | mkdir $output_dir
21 | export CUDA_VISIBLE_DEVICES=$cuda
22 | 
23 | python3 ./run_ner.py $input_dir $output_name
24 | 
25 | python3 ../ClinicalTransformerNER/src/run_transformer_batch_prediction.py \
26 |       --model_type bert \
27 |       --pretrained_model ../models/ner_bert \
28 |       --raw_text_dir $input_dir \
29 |       --preprocessed_text_dir ../temp/${output_name} \
30 |       --output_dir ../temp/${output_name} \
31 |       --max_seq_length 128 \
32 |       --do_lower_case \
33 |       --eval_batch_size 8 \
34 |       --log_file ../logs/log_ner.txt\
35 |       --do_format 1 \
36 |       --do_copy \
37 |       --data_has_offset_information
38 | 
39 | 
40 | python3 ./make_relation.py $input_dir $output_name
41 | 
42 | 
43 | bz=4
44 | epn=3
45 | sc=2
46 | dfmm=0
47 | model_type=bert
48 | pm=bert-large
49 | data_dir_re=../temp/${output_name}_aio_th1
50 | nmd=../models/re_bert
51 | pof=../temp/predictions_${output_name}.txt
52 | log=../logs/log_re_${output_name}.txt
53 | 
54 | python3 ../ClinicalTransformerRelationExtraction/src/relation_extraction.py \
55 |                 --model_type $model_type \
56 |                 --data_format_mode $dfmm \
57 |                 --classification_scheme $sc \
58 |                 --pretrained_model $pm \
59 |                 --data_dir $data_dir_re \
60 |                 --new_model_dir $nmd \
61 |                 --predict_output_file $pof \
62 |                 --overwrite_model_dir \
63 |                 --seed 13 \
64 |                 --max_seq_length 512 \
65 |                 --num_core 10 \
66 |                 --cache_data \
67 |                 --do_predict \
68 |                 --do_lower_case \
69 |                 --train_batch_size $bz \
70 |                 --eval_batch_size $bz \
71 |                 --learning_rate 1e-5 \
72 |                 --num_train_epochs $epn \
73 |                 --gradient_accumulation_steps 1 \
74 |                 --do_warmup \
75 |                 --warmup_ratio 0.1 \
76 |                 --weight_decay 0 \
77 |                 --max_num_checkpoints 0 \
78 |                 --log_file $log
79 | 
80 | mkdir ${output_dir}/result
81 | mkdir ${output_dir}/result/eval
82 | mkdir ${output_dir}/result/RE
83 | 
84 | edr=../temp/${output_name}_formatted_output
85 | pod=${output_dir}/result/RE/${output_name}_relation_predicted_results
86 | python3 ../ClinicalTransformerRelationExtraction/src/data_processing/post_processing.py \
87 |                 --mode mul \
88 |                 --predict_result_file $pof \
89 |                 --entity_data_dir $edr \
90 |                 --test_data_file ${data_dir_re}/test.tsv \
91 |                 --brat_result_output_dir $pod\
92 |                 --log_file $log
93 | 
94 | python brat_eval.py --f1 $input_dir --f2 $pod >> ${output_dir}/eval_result.txt
95 | 
96 | 


--------------------------------------------------------------------------------
/scipts/training_process.sh:
--------------------------------------------------------------------------------
  1 | #training from pre-trained model on 1FL dataset
  2 | while getopts :i:d:n:c: flag
  3 | do
  4 |     case "${flag}" in
  5 |         i) input_dir=${OPTARG};;
  6 |         c) cuda=${OPTARG};;
  7 |     esac
  8 | done
  9 | echo "Input dir: $input_dir";
 10 | echo "CUDA used: $cuda";
 11 | export CUDA_VISIBLE_DEVICES=$cuda
 12 | output_dir=../results
 13 | output_name=bio_training
 14 | mkdir ../models/SDOH_bert_updated_150
 15 | mkdir ../models/SDOH_bert_updated_100
 16 | mkdir ${output_dir}
 17 | python3 ./training_ner.py $input_dir 
 18 | python3 ../ClinicalTransformerNER/src/run_transformer_ner.py \
 19 |       --model_type bert \
 20 |       --pretrained_model ../models/ner_bert \
 21 |       --data_dir ../bio/bio_training_150 \
 22 |       --new_model_dir ../models/SDOH_bert_updated_150 \
 23 |       --overwrite_model_dir \
 24 |       --max_seq_length 128 \
 25 |       --data_has_offset_information \
 26 |       --save_model_core \
 27 |       --do_train \
 28 |       --model_selection_scoring strict-f_score-1 \
 29 |       --do_lower_case \
 30 |       --train_batch_size 8 \
 31 |       --train_steps 1000 \
 32 |       --learning_rate 1e-5 \
 33 |       --num_train_epochs 30 \
 34 |       --gradient_accumulation_steps 1 \
 35 |       --do_warmup \
 36 |       --seed 13 \
 37 |       --warmup_ratio 0.1 \
 38 |       --max_num_checkpoints 3 \
 39 |       --log_file ../logs/log_ner_training.txt \
 40 |       --progress_bar \
 41 |       --early_stop 3 
 42 | 
 43 | 
 44 | python3 ../ClinicalTransformerNER/src/run_transformer_batch_prediction.py \
 45 |       --model_type bert \
 46 |       --pretrained_model ../models/SDOH_bert_updated_150 \
 47 |       --raw_text_dir ../data/test_set_150 \
 48 |       --preprocessed_text_dir ../bio/bio_test_150 \
 49 |       --output_dir ../result/training_result_150 \
 50 |       --max_seq_length 128 \
 51 |       --do_lower_case \
 52 |       --eval_batch_size 8 \
 53 |       --log_file ../logs/log_ner_training.txt\
 54 |       --do_format 1 \
 55 |       --do_copy \
 56 |       --data_has_offset_information
 57 | 
 58 | python ./brat_eval.py --f1 ../data/test_set_150 --f2 ../result/training_result_150_formatted_output >> ${output_dir}/eval_result_training_150.txt
 59 | 
 60 | 
 61 | # training process on 1:1 split 
 62 | 
 63 | python3 ../ClinicalTransformerNER/src/run_transformer_ner.py \
 64 |       --model_type bert \
 65 |       --pretrained_model ../models/ner_bert \
 66 |       --data_dir ../bio/bio_training_100 \
 67 |       --new_model_dir ../models/SDOH_bert_updated_100 \
 68 |       --overwrite_model_dir \
 69 |       --max_seq_length 128 \
 70 |       --data_has_offset_information \
 71 |       --save_model_core \
 72 |       --do_train \
 73 |       --model_selection_scoring strict-f_score-1 \
 74 |       --do_lower_case \
 75 |       --train_batch_size 8 \
 76 |       --train_steps 1000 \
 77 |       --learning_rate 1e-5 \
 78 |       --num_train_epochs 30 \
 79 |       --gradient_accumulation_steps 1 \
 80 |       --do_warmup \
 81 |       --seed 13 \
 82 |       --warmup_ratio 0.1 \
 83 |       --max_num_checkpoints 3 \
 84 |       --log_file ../logs/log_ner_training.txt \
 85 |       --progress_bar \
 86 |       --early_stop 3 
 87 | 
 88 | 
 89 | python3 ../ClinicalTransformerNER/src/run_transformer_batch_prediction.py \
 90 |       --model_type bert \
 91 |       --pretrained_model ../models/SDOH_bert_updated_100 \
 92 |       --raw_text_dir ../data/test_set_100 \
 93 |       --preprocessed_text_dir ../bio/bio_test_100 \
 94 |       --output_dir ../result/training_result_100 \
 95 |       --max_seq_length 128 \
 96 |       --do_lower_case \
 97 |       --eval_batch_size 8 \
 98 |       --log_file ../logs/log_ner_training.txt\
 99 |       --do_format 1 \
100 |       --do_copy \
101 |       --data_has_offset_information
102 | 
103 | python ./brat_eval.py --f1 ../data/test_set_100 --f2 ../result/training_result_100_formatted_output >> ${output_dir}/eval_result_training_100.txt
104 | 
105 | 


--------------------------------------------------------------------------------
/scipts/get_statistics.py:
--------------------------------------------------------------------------------
  1 | #get stat result
  2 | from pathlib import Path
  3 | import numpy as np
  4 | import os
  5 | import pickle as pkl
  6 | import sys
  7 | import pandas as pd
  8 | def pkl_save(data, file):
  9 |     with open(file, "wb") as f:
 10 |         pkl.dump(data, f)
 11 | 
 12 | def pkl_load(file):
 13 |     with open(file, "rb") as f:
 14 |         data = pkl.load(f)
 15 |     return data
 16 | 
 17 | def ann_stat(data_root1):
 18 |     dict1=dict()
 19 |     for fn in Path(data_root1).glob("*.ann"):
 20 | #        i+=1
 21 |        # print(fn.stem.split('_')[-1])
 22 |  #       file_ids.add(fn)
 23 |         fid=fn.stem
 24 |         if fid not in dict1.keys():
 25 |             dict1.update({fid:{}})
 26 |         with open(fn,'r') as f:
 27 |             lines=f.readlines()
 28 |        # if not lines:
 29 |        #     continue
 30 |         #else:
 31 |             for line in lines:
 32 |                 line=line.strip()
 33 |                 try:
 34 |                     ann_cate=line.split('\t')[1].split(' ')[0]
 35 |                     ann_res=line.split('\t')[2].split('\n')[0]
 36 |                # print(ann_cate)
 37 |                # print(ann_res)
 38 |                     if ann_cate not in dict1[fid].keys():
 39 |                         dict1[fid].update({ann_cate:[ann_res]})
 40 |                     else:
 41 |                         dict1[fid][ann_cate].append(ann_res)
 42 |                 except:
 43 |                # print('except')
 44 |                # print(line)
 45 |                     continue
 46 |     return dict1
 47 | 
 48 | data_dir1=sys.argv[1]
 49 | data_dir2=sys.argv[2]
 50 | 
 51 | 
 52 | dict1=ann_stat(data_dir1)
 53 | dict2=ann_stat(data_dir2)
 54 | 
 55 | def find_agg_data(dict1):
 56 |     null_notes=set()
 57 |     notes=set()
 58 |     dict_agg=dict()
 59 |     for k,v in dict1.items():
 60 |         if len(v)==0:
 61 |             null_notes.add(k)
 62 |         else:
 63 |             notes.add(k)
 64 |             for k1,v1 in v.items():
 65 |                 if k1 not in dict_agg.keys():
 66 |                     dict_agg.update({k1:set()})
 67 |                     dict_agg[k1].add(k)
 68 |                 else:
 69 |                     dict_agg[k1].add(k)
 70 |     return null_notes,notes,dict_agg
 71 | 
 72 | 
 73 | pd_null,pd_pts,pd_dict_agg=find_agg_data(dict1)
 74 | gs_null,gs_pts,gs_dict_agg=find_agg_data(dict2)
 75 | sdoh_cate=sorted(list(pd_dict_agg.keys())+list(gs_dict_agg.keys()))
 76 | def find_agg_data_3(dict1):
 77 |     dict_agg=dict()
 78 |     for k in sdoh_cate:
 79 |         dict_agg.update({k:[]})
 80 |     for k,v in dict1.items():
 81 | 
 82 |         for sdoh_label in sdoh_cate:
 83 |             if sdoh_label not in v.keys():
 84 |                 dict_agg[sdoh_label].append(0)
 85 |             else:
 86 |                 dict_agg[sdoh_label].append(len(v[sdoh_label]))
 87 | 
 88 |     return dict_agg
 89 | pd_dict_2=find_agg_data_3(dict1)
 90 | gs_dict_2=find_agg_data_3(dict2)
 91 | 
 92 | data={'SDoH_cate':sorted(list(pd_dict_agg.keys())+list(gs_dict_agg.keys())+['null_note'])}
 93 | df=pd.DataFrame(data)
 94 | def count_pts(x,dict_agg):
 95 |     if x in dict_agg.keys():
 96 |         return len(dict_agg[x])
 97 |     else:
 98 |         return 0
 99 | def sum_pts_cate(x,dict_agg):
100 |     if x in dict_agg.keys():
101 |         return sum(dict_agg[x])
102 |     else:
103 |         return 0
104 | 
105 | df['concept_sum_pred']=df.apply(lambda x: sum_pts_cate(x['SDoH_cate'],pd_dict_2),axis=1)
106 | df['concept_sum_ann']=df.apply(lambda x: sum_pts_cate(x['SDoH_cate'],gs_dict_2),axis=1)
107 | df['notes_count_pred']=df.apply(lambda x: count_pts(x['SDoH_cate'],pd_dict_agg),axis=1)
108 | df['notes_count_ann']=df.apply(lambda x: count_pts(x['SDoH_cate'],gs_dict_agg),axis=1)
109 | df.loc[(df.SDoH_cate == 'null_note'),'notes_count_pred']=len(pd_null)
110 | df.loc[(df.SDoH_cate == 'null_note'),'notes_count_pred']=len(gs_null)
111 | Path('../results').mkdir(parents=True, exist_ok=True)
112 | df.to_csv('../results/count_concepts.csv')
113 | #print(df)


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/utils.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from sklearn.metrics import accuracy_score
  3 | import traceback
  4 | from collections import defaultdict
  5 | 
  6 | 
  7 | def try_catch_annotator(func):
  8 |     def try_catch(*args, **kwargs):
  9 |         try:
 10 |             return func(*args, **kwargs)
 11 |         except Exception as ex:
 12 |             traceback.print_exc()
 13 |             return None
 14 |     return try_catch
 15 | 
 16 | 
 17 | class TransformerLogger:
 18 |     LOG_LVLs = {
 19 |         'i': logging.INFO,
 20 |         'd': logging.DEBUG,
 21 |         'e': logging.ERROR,
 22 |         'w': logging.WARN
 23 |     }
 24 | 
 25 |     def __init__(self, logger_file=None, logger_level='d'):
 26 |         self.lf = logger_file
 27 |         self.lvl = logger_level
 28 | 
 29 |     def set_log_info(self, logger_file, logger_level):
 30 |         self.lf = logger_file
 31 |         self.lvl = logger_level
 32 | 
 33 |     def _create_logger(self, logger_name=""):
 34 |         logger = logging.getLogger(logger_name)
 35 |         formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s",
 36 |                                       datefmt="%Y-%m-%d %H:%M:%S")
 37 |         logger.setLevel(self.LOG_LVLs[self.lvl])
 38 |         if self.lf:
 39 |             fh = logging.FileHandler(self.lf)
 40 |             fh.setFormatter(formatter)
 41 |             fh.setLevel(self.LOG_LVLs[self.lvl])
 42 |             logger.addHandler(fh)
 43 |         else:
 44 |             ch = logging.StreamHandler()
 45 |             ch.setFormatter(formatter)
 46 |             ch.setLevel(self.LOG_LVLs[self.lvl])
 47 |             logger.addHandler(ch)
 48 | 
 49 |         return logger
 50 | 
 51 |     def get_logger(self):
 52 |         return self._create_logger("Transformer_Relation_Extraction")
 53 | 
 54 | 
 55 | class PRF:
 56 |     def __init__(self):
 57 |         self.tp = 0
 58 |         self.fp = 0
 59 | 
 60 |     def __repr__(self):
 61 |         return f'tp: {self.tp}; fp: {self.fp}'
 62 | 
 63 | def calc(tp, tp_fp, tp_tn):
 64 |     if tp_fp != 0:
 65 |         pre = tp / tp_fp
 66 |     else:
 67 |         pre = 0
 68 | 
 69 |     if tp_tn == 0:
 70 |         rec = 0
 71 |     else:
 72 |         rec = tp / tp_tn
 73 | 
 74 |     if pre == 0 and rec == 0:
 75 |         f1 = 0
 76 |     else:
 77 |         f1 = 2 * pre * rec / (pre + rec)
 78 | 
 79 |     return round(pre, 4), round(rec, 4), round(f1, 4)
 80 | 
 81 | 
 82 | def measure_prf(preds, gs_labels, non_rel_label):
 83 |     res = dict()
 84 |     temp = defaultdict(PRF)
 85 |     total_tp, total_tp_fp, total_tp_tn = 0, 0, 0
 86 |     tn_dict = defaultdict(lambda: 0)
 87 | 
 88 |     assert preds == gs_labels, "prediction and gold standard is not equal"
 89 | 
 90 |     labels = set(gs_labels)
 91 |     for l in labels:
 92 |         for p, g in zip(preds, gs_labels):
 93 |             if g == l:
 94 |                 tn_dict[l] += 1
 95 |             if g == p == l:
 96 |                 temp[l].tp += 1
 97 |             elif g != l and p == l:
 98 |                 temp[l].fp += 1
 99 | 
100 |     for l in labels:
101 |         if l == non_rel_label:
102 |             continue
103 |         tp, fp = temp[l].tp, temp[l].fp
104 |         tp_fp = tp + fp
105 |         tp_tn = tn_dict[l]
106 |         res[l] = calc(tp, tp_fp, tp_tn)
107 | 
108 |         total_tp += tp
109 |         total_tp_fp += tp_fp
110 |         total_tp_tn += tp_tn
111 | 
112 |     res['micro_average_pre_rec_f1'] = calc(total_tp, total_tp_fp, total_tp_tn)
113 |     f1 = res['micro_average_pre_rec_f1'][-1]
114 | 
115 |     return res, f1
116 | 
117 | 
118 | def acc_and_f1(labels, preds, label2idx, non_rel_label):
119 |     acc = accuracy_score(labels, preds)
120 | 
121 |     idx2label = {v: k for k, v in label2idx.items()}
122 |     new_labels = [idx2label[e] for e in labels]
123 |     new_preds = [idx2label[e] for e in preds]
124 |     prf_list, f1 = measure_prf(new_preds, new_labels, non_rel_label)
125 |     prf_list = sorted(prf_list.items(), key=lambda x: len(x[0]))
126 |     res = []
127 |     for k, v in prf_list:
128 |         res.append(f"{k} - pre: {v[0]}, rec: {v[1]}, f1: {v[2]}")
129 | 
130 |     return acc, "\n".join(res), f1
131 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/transformer_ner/model_utils.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch import nn
  3 | import math
  4 | 
  5 | 
  6 | def gelu(x):
  7 |     """
  8 |     Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
  9 |     the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
 10 |     """
 11 |     return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
 12 | 
 13 | 
 14 | def get_mask(input, local_context):
 15 |     if not isinstance(local_context, DropoutContext):
 16 |         dropout = local_context
 17 |         mask = None
 18 |     else:
 19 |         dropout = local_context.dropout
 20 |         dropout *= local_context.scale
 21 |         mask = local_context.mask if local_context.reuse_mask else None
 22 | 
 23 |     if dropout > 0 and mask is None:
 24 |         mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).bool()
 25 | 
 26 |     if isinstance(local_context, DropoutContext):
 27 |         if local_context.mask is None:
 28 |             local_context.mask = mask
 29 | 
 30 |     return mask, dropout
 31 | 
 32 | 
 33 | class DropoutContext(object):
 34 |     def __init__(self):
 35 |         self.dropout = 0
 36 |         self.mask = None
 37 |         self.scale = 1
 38 |         self.reuse_mask = True
 39 | 
 40 | 
 41 | class XDropout(torch.autograd.Function):
 42 |     """Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""
 43 | 
 44 |     @staticmethod
 45 |     def forward(ctx, input, local_ctx):
 46 |         mask, dropout = get_mask(input, local_ctx)
 47 |         ctx.scale = 1.0 / (1 - dropout)
 48 |         if dropout > 0:
 49 |             ctx.save_for_backward(mask)
 50 |             return input.masked_fill(mask, 0) * ctx.scale
 51 |         else:
 52 |             return input
 53 | 
 54 |     @staticmethod
 55 |     def backward(ctx, grad_output):
 56 |         if ctx.scale > 1:
 57 |             (mask,) = ctx.saved_tensors
 58 |             return grad_output.masked_fill(mask, 0) * ctx.scale, None
 59 |         else:
 60 |             return grad_output, None
 61 | 
 62 | 
 63 | class StableDropout(torch.nn.Module):
 64 |     """
 65 |     Optimized dropout module for stabilizing the training
 66 |     Args:
 67 |         drop_prob (float): the dropout probabilities
 68 |     """
 69 | 
 70 |     def __init__(self, drop_prob):
 71 |         super().__init__()
 72 |         self.drop_prob = drop_prob
 73 |         self.count = 0
 74 |         self.context_stack = None
 75 | 
 76 |     def forward(self, x):
 77 |         """
 78 |         Call the module
 79 |         Args:
 80 |             x (:obj:`torch.tensor`): The input tensor to apply dropout
 81 |         """
 82 |         if self.training and self.drop_prob > 0:
 83 |             return XDropout.apply(x, self.get_context())
 84 |         return x
 85 | 
 86 |     def clear_context(self):
 87 |         self.count = 0
 88 |         self.context_stack = None
 89 | 
 90 |     def init_context(self, reuse_mask=True, scale=1):
 91 |         if self.context_stack is None:
 92 |             self.context_stack = []
 93 |         self.count = 0
 94 |         for c in self.context_stack:
 95 |             c.reuse_mask = reuse_mask
 96 |             c.scale = scale
 97 | 
 98 |     def get_context(self):
 99 |         if self.context_stack is not None:
100 |             if self.count >= len(self.context_stack):
101 |                 self.context_stack.append(DropoutContext())
102 |             ctx = self.context_stack[self.count]
103 |             ctx.dropout = self.drop_prob
104 |             self.count += 1
105 |             return ctx
106 |         else:
107 |             return self.drop_prob
108 | 
109 | 
110 | class ContextPooler(nn.Module):
111 |     def __init__(self, config):
112 |         super().__init__()
113 |         self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
114 |         self.dropout = StableDropout(config.pooler_dropout)
115 |         self.config = config
116 | 
117 |     def forward(self, hidden_states):
118 |         # We "pool" the model by simply taking the hidden state corresponding
119 |         # to the first token.
120 | 
121 |         context_token = hidden_states[:, 0]
122 |         context_token = self.dropout(context_token)
123 |         pooled_output = self.dense(context_token)
124 |         pooled_output = gelu(pooled_output)
125 |         return pooled_output
126 | 
127 |     @property
128 |     def output_dim(self):
129 |         return self.config.hidden_size


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/model_utils.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch import nn
  3 | import math
  4 | 
  5 | 
  6 | def gelu(x):
  7 |     """
  8 |     Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
  9 |     the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
 10 |     """
 11 |     return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
 12 | 
 13 | 
 14 | def get_mask(input, local_context):
 15 |     if not isinstance(local_context, DropoutContext):
 16 |         dropout = local_context
 17 |         mask = None
 18 |     else:
 19 |         dropout = local_context.dropout
 20 |         dropout *= local_context.scale
 21 |         mask = local_context.mask if local_context.reuse_mask else None
 22 | 
 23 |     if dropout > 0 and mask is None:
 24 |         mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).bool()
 25 | 
 26 |     if isinstance(local_context, DropoutContext):
 27 |         if local_context.mask is None:
 28 |             local_context.mask = mask
 29 | 
 30 |     return mask, dropout
 31 | 
 32 | 
 33 | class DropoutContext(object):
 34 |     def __init__(self):
 35 |         self.dropout = 0
 36 |         self.mask = None
 37 |         self.scale = 1
 38 |         self.reuse_mask = True
 39 | 
 40 | 
 41 | class XDropout(torch.autograd.Function):
 42 |     """Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""
 43 | 
 44 |     @staticmethod
 45 |     def forward(ctx, input, local_ctx):
 46 |         mask, dropout = get_mask(input, local_ctx)
 47 |         ctx.scale = 1.0 / (1 - dropout)
 48 |         if dropout > 0:
 49 |             ctx.save_for_backward(mask)
 50 |             return input.masked_fill(mask, 0) * ctx.scale
 51 |         else:
 52 |             return input
 53 | 
 54 |     @staticmethod
 55 |     def backward(ctx, grad_output):
 56 |         if ctx.scale > 1:
 57 |             (mask,) = ctx.saved_tensors
 58 |             return grad_output.masked_fill(mask, 0) * ctx.scale, None
 59 |         else:
 60 |             return grad_output, None
 61 | 
 62 | 
 63 | class StableDropout(torch.nn.Module):
 64 |     """
 65 |     Optimized dropout module for stabilizing the training
 66 | 
 67 |     Args:
 68 |         drop_prob (float): the dropout probabilities
 69 |     """
 70 | 
 71 |     def __init__(self, drop_prob):
 72 |         super().__init__()
 73 |         self.drop_prob = drop_prob
 74 |         self.count = 0
 75 |         self.context_stack = None
 76 | 
 77 |     def forward(self, x):
 78 |         """
 79 |         Call the module
 80 | 
 81 |         Args:
 82 |             x (:obj:`torch.tensor`): The input tensor to apply dropout
 83 |         """
 84 |         if self.training and self.drop_prob > 0:
 85 |             return XDropout.apply(x, self.get_context())
 86 |         return x
 87 | 
 88 |     def clear_context(self):
 89 |         self.count = 0
 90 |         self.context_stack = None
 91 | 
 92 |     def init_context(self, reuse_mask=True, scale=1):
 93 |         if self.context_stack is None:
 94 |             self.context_stack = []
 95 |         self.count = 0
 96 |         for c in self.context_stack:
 97 |             c.reuse_mask = reuse_mask
 98 |             c.scale = scale
 99 | 
100 |     def get_context(self):
101 |         if self.context_stack is not None:
102 |             if self.count >= len(self.context_stack):
103 |                 self.context_stack.append(DropoutContext())
104 |             ctx = self.context_stack[self.count]
105 |             ctx.dropout = self.drop_prob
106 |             self.count += 1
107 |             return ctx
108 |         else:
109 |             return self.drop_prob
110 | 
111 | 
112 | class ContextPooler(nn.Module):
113 |     def __init__(self, config):
114 |         super().__init__()
115 |         self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
116 |         self.dropout = StableDropout(config.pooler_dropout)
117 |         self.config = config
118 | 
119 |     def forward(self, hidden_states):
120 |         # We "pool" the model by simply taking the hidden state corresponding
121 |         # to the first token.
122 | 
123 |         context_token = hidden_states[:, 0]
124 |         context_token = self.dropout(context_token)
125 |         pooled_output = self.dense(context_token)
126 |         pooled_output = gelu(pooled_output)
127 |         return pooled_output
128 | 
129 |     @property
130 |     def output_dim(self):
131 |         return self.config.hidden_size
132 | 


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/run_app.py:
--------------------------------------------------------------------------------
  1 | """
  2 | The py file is an example for
  3 | 
  4 | 1. how to run RE as an python app not through command line
  5 | 2. how to manually add new models other than those available in the models.py
  6 | 
  7 | We used deberta as an example
  8 | """
  9 | from models import BaseModel
 10 | from data_utils import RelationDataFormatSepProcessor
 11 | from transformers import DebertaForSequenceClassification, DebertaModel, DebertaConfig, DebertaTokenizer
 12 | from task import TaskRunner
 13 | from utils import TransformerLogger
 14 | 
 15 | import numpy as np
 16 | import torch
 17 | import traceback
 18 | 
 19 | 
 20 | class DeBERTaRelationExtraction(DebertaForSequenceClassification, BaseModel):
 21 |     def __init__(self, config):
 22 |         super().__init__(config)
 23 | 
 24 |     def forward(
 25 |             self,
 26 |             input_ids=None,
 27 |             attention_mask=None,
 28 |             token_type_ids=None,
 29 |             position_ids=None,
 30 |             inputs_embeds=None,
 31 |             labels=None,
 32 |             output_attentions=None,
 33 |             output_hidden_states=None,
 34 |             return_dict=None,
 35 |     ):
 36 |         outputs = self.deberta(
 37 |             input_ids,
 38 |             token_type_ids=token_type_ids,
 39 |             attention_mask=attention_mask,
 40 |             position_ids=position_ids,
 41 |             inputs_embeds=inputs_embeds,
 42 |             output_attentions=output_attentions,
 43 |             output_hidden_states=output_hidden_states,
 44 |             return_dict=return_dict,
 45 |         )
 46 |         seq_output = outputs[0]
 47 |         pooled_output = self.pooler(seq_output)
 48 | 
 49 |         pooled_output = self.dropout(pooled_output)
 50 |         seq_output = self.dropout(seq_output)
 51 | 
 52 |         logits = self.output2logits(pooled_output, seq_output, input_ids)
 53 |         outputs = (logits,) + outputs[2:]
 54 |         loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
 55 |         outputs = (loss,) + outputs
 56 | 
 57 |         return outputs
 58 | 
 59 | 
 60 | class DeBERTaDataProcessor(RelationDataFormatSepProcessor):
 61 |     def __init__(self, data_dir=None, max_seq_len=128, num_core=-1, header=True, tokenizer_type='deberta'):
 62 |         super().__init__(
 63 |             data_dir=data_dir, max_seq_len=max_seq_len, num_core=num_core, header=True, tokenizer_type='deberta')
 64 |         self.total_special_token_num = 4
 65 | 
 66 | 
 67 | class Args:
 68 |     """
 69 |         used to init all parameters
 70 |         deberta use roberta vocab
 71 |         deberta-v2 need new tokenizer as XLNet based on SPM
 72 |     """
 73 |     def __init__(self, **kwargs):
 74 |         self.model_type = "deberta"
 75 |         self.data_format_mode = 0
 76 |         self.classification_scheme = 2
 77 |         self.pretrained_model = "microsoft/deberta-base" # microsoft/deberta-large; microsoft/deberta-xlarge
 78 |         self.data_dir = "../sample_data"
 79 |         self.new_model_dir = "../deberta_re_model"
 80 |         self.predict_output_file = "../deberta_re_predict.txt"
 81 |         self.overwrite_model_dir = True
 82 |         self.seed = 1234
 83 |         self.max_seq_length = 128
 84 |         self.cache_data = False
 85 |         self.data_file_header = True
 86 |         self.do_train = True
 87 |         self.do_eval = False
 88 |         self.do_predict = True
 89 |         self.do_lower_case = True
 90 |         self.train_batch_size = 2
 91 |         self.eval_batch_size = 32
 92 |         self.learning_rate = 1e-5
 93 |         self.num_train_epochs = 5
 94 |         self.gradient_accumulation_steps = 1
 95 |         self.do_warmup = True
 96 |         self.warmup_ratio = 0.1
 97 |         self.weight_decay = 0.0
 98 |         self.adam_epsilon = 1e-8
 99 |         self.max_grad_norm = 1.0
100 |         self.max_num_checkpoints = 0
101 |         self.log_file = None
102 |         self.log_lvl = "i"
103 |         self.log_step = 2
104 |         self.num_core = 4
105 |         self.non_relation_label = "nonRel"
106 |         self.progress_bar = False
107 |         self.fp16 = False
108 |         self.fp16_opt_level = "O1"
109 | 
110 |         self.__update_args(**kwargs)
111 | 
112 |     def __update_args(self, **kwargs):
113 |         for k, v in kwargs.items():
114 |             setattr(self, k, v)
115 | 
116 | 
117 | def app():
118 |     args = Args()
119 |     args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
120 |     args.logger = TransformerLogger(logger_file=args.log_file, logger_level='i').get_logger()
121 | 
122 |     np.random.seed(args.seed)
123 |     torch.manual_seed(args.seed)
124 | 
125 |     task_runner = TaskRunner(args)
126 | 
127 |     # add deberta to model dict
128 |     task_runner.model_dict['deberta'] = (DeBERTaRelationExtraction, DebertaConfig, DebertaTokenizer)
129 |     # set deberta data processor for data processing
130 |     task_runner.data_processor = DeBERTaDataProcessor(
131 |         max_seq_len=args.max_seq_length, num_core=args.num_core)
132 | 
133 |     task_runner.task_runner_default_init()
134 | 
135 |     if args.do_train:
136 |         try:
137 |             task_runner.train()
138 |         except Exception as ex:
139 |             raise RuntimeError(traceback.print_exc())
140 | 
141 |     if args.do_predict:
142 |         try:
143 |             task_runner.predict()
144 |         except Exception as ex:
145 |             raise RuntimeError(traceback.print_exc())
146 | 
147 | 
148 | if __name__ == '__main__':
149 |     app()
150 | 


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/readme.md:
--------------------------------------------------------------------------------
  1 | # Clinical Relation Extration with Transformers
  2 | 
  3 | ## Aim
  4 | This package is developed for researchers easily to use state-of-the-art transformers models for extracting relations from clinical notes. 
  5 | No prior knowledge of transformers is required. We handle the whole process from data preprocessing to training to prediction.
  6 | 
  7 | ## Dependency
  8 | The package is built on top of the Transformers developed by the HuggingFace. 
  9 | We have the requirement.txt to specify the packages required to run the project.
 10 | 
 11 | ## Background
 12 | Our training strategy is inspired by the paper: https://arxiv.org/abs/1906.03158
 13 | We only support train-dev mode, but you can do 5-fold CV.
 14 | 
 15 | ## Available models
 16 | - BERT
 17 | - XLNet
 18 | - RoBERTa
 19 | - ALBERT
 20 | - DeBERTa
 21 | - Longformer
 22 | > We will keep adding new models.
 23 | 
 24 | ## usage and example
 25 | - data format
 26 | > see sample_data dir (train.tsv and test.tsv) for the train and test data format
 27 | 
 28 | > The sample data is a small subset of the data prepared from the 2018 umass made1.0 challenge corpus
 29 | 
 30 | ```
 31 | # data format: tsv file with 8 columns:
 32 | 1. relation_type: adverse
 33 | 2. sentence_1: ALLERGIES : [s1] Penicillin [e1] .
 34 | 3. sentence_2: [s2] ALLERGIES [e2] : Penicillin .
 35 | 4. entity_type_1: Drug
 36 | 5. entity_type_2: ADE
 37 | 6. entity_id_1: T1
 38 | 7. entity_id2: T2
 39 | 8. file_id: 13_10
 40 | 
 41 | note: 
 42 | 1) the entity between [s1][e1] is the first entity in a relation; the second entity in the relation is inbetween [s2][e2]
 43 | 2) even the two entities in the same sentenc, we still require to put them separately
 44 | 3) in the test.tsv, you can set all labels to neg or no_relation or whatever, because we will not use the label anyway
 45 | 4) We recommend to evaluate the test performance in a separate process based on prediction. (see **post-processing**)
 46 | 5) We recommend using official evaluation scripts to do evaluation to make sure the results reported are reliable.
 47 | ```
 48 | 
 49 | - preprocess data (see the preprocess.ipynb script for more details on usage)
 50 | > we did not provide a script for training and test data generation
 51 | 
 52 | > we have a jupyter notebook with preprocessing 2018 n2c2 data as an example
 53 | 
 54 | > you can follow our example to generate your own dataset
 55 | 
 56 | - special tags
 57 | > we use 4 special tags to identify two entities in a relation
 58 | ```
 59 | # the defaults tags we defined in the repo are
 60 | 
 61 | EN1_START = "[s1]"
 62 | EN1_END = "[e1]"
 63 | EN2_START = "[s2]"
 64 | EN2_END = "[e2]"
 65 | 
 66 | If you need to customize these tags, you can change them in
 67 | config.py
 68 | ```
 69 | 
 70 | - training
 71 | > please refer to the wiki page for all details of the parameters
 72 | > [flag details](https://github.com/uf-hobi-informatics-lab/ClinicalTransformerRelationExtraction/wiki/All-flags-explained-for-training-and-test)
 73 | 
 74 | ```shell script
 75 | export CUDA_VISIBLE_DEVICES=1
 76 | data_dir=./sample_data
 77 | nmd=./new_model
 78 | pof=./predictions.txt
 79 | log=./log.txt
 80 | 
 81 | python ./src/relation_extraction.py \
 82 | 		--model_type bert \
 83 | 		--data_format_mode 0 \
 84 | 		--classification_scheme 1 \
 85 | 		--pretrained_model bert-base-uncased \
 86 | 		--data_dir $data_dir \
 87 | 		--new_model_dir $nmd \
 88 | 		--predict_output_file $pof \
 89 | 		--overwrite_model_dir \
 90 | 		--seed 13 \
 91 | 		--max_seq_length 256 \
 92 | 		--cache_data \
 93 | 		--do_train \
 94 | 		--do_lower_case \
 95 | 		--train_batch_size 4 \
 96 | 		--eval_batch_size 4 \
 97 | 		--learning_rate 1e-5 \
 98 | 		--num_train_epochs 3 \
 99 | 		--gradient_accumulation_steps 1 \
100 | 		--do_warmup \
101 | 		--warmup_ratio 0.1 \
102 | 		--weight_decay 0 \
103 | 		--max_num_checkpoints 1 \
104 | 		--log_file $log \
105 | ```
106 | 
107 | - prediction
108 | ```shell script
109 | export CUDA_VISIBLE_DEVICES=1
110 | data_dir=./sample_data
111 | nmd=./new_model
112 | pof=./predictions.txt
113 | log=./log.txt
114 | 
115 | # we have to set data_dir, new_model_dir, model_type, log_file, and eval_batch_size, data_format_mode
116 | python ./src/relation_extraction.py \
117 | 		--model_type bert \
118 | 		--data_format_mode 0 \
119 | 		--classification_scheme 1 \
120 | 		--pretrained_model bert-base-uncased \
121 | 		--data_dir $data_dir \
122 | 		--new_model_dir $nmd \
123 | 		--predict_output_file $pof \
124 | 		--overwrite_model_dir \
125 | 		--seed 13 \
126 | 		--max_seq_length 256 \
127 | 		--cache_data \
128 | 		--do_predict \
129 | 		--do_lower_case \
130 | 		--eval_batch_size 4 \
131 | 		--log_file $log \
132 | ```
133 | 
134 | - post-processing (we only support transformation to brat format)
135 | ```shell script
136 | # see --help for more information
137 | data_dir=./sample_data
138 | pof=./predictions.txt
139 | 
140 | python src/data_processing/post_processing.py \
141 | 		--mode mul \
142 | 		--predict_result_file $pof \
143 | 		--entity_data_dir ./test_data_entity_only \
144 | 		--test_data_file ${data_dir}/test.tsv \
145 | 		--brat_result_output_dir ./brat_output
146 | ```
147 | 
148 | 
149 | ## Using json file for experiment config instead of commend line
150 | 
151 | - to simplify using the package, we support using json file for configuration
152 | - using json, you can define all parameters in a separate json file instead of input via commend line
153 | - config_experiment_sample.json is a sample json file you can follow to develop yours
154 | - to run experiment with json config, you need to follow run_json.sh
155 | ```shell script
156 | export CUDA_VISIBLE_DEVICES=1
157 | 
158 | python ./src/relation_extraction_json.py \
159 | 		--config_json "./config_experiment_sample.json"
160 | ```
161 | 
162 | ## Issues
163 | raise an issue if you have problems. 
164 | 
165 | ## Citation
166 | please cite our paper:
167 | ```
168 | 
169 | ```
170 | 
171 | ## Clinical Pre-trained Transformer Models
172 | We have a series transformer models pre-trained on MIMIC-III.
173 | You can find them here:
174 | - https://transformer-models.s3.amazonaws.com/mimiciii_albert_10e_128b.zip
175 | - https://transformer-models.s3.amazonaws.com/mimiciii_bert_10e_128b.zip
176 | - https://transformer-models.s3.amazonaws.com/mimiciii_electra_5e_128b.zip
177 | - https://transformer-models.s3.amazonaws.com/mimiciii_roberta_10e_128b.zip
178 | - https://transformer-models.s3.amazonaws.com/mimiciii_xlnet_5e_128b.zip
179 | - https://transformer-models.s3.amazonaws.com/mimiciii_deberta_10e_128b.tar.gz
180 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/common_utils/output_format_converter.py:
--------------------------------------------------------------------------------
  1 | # !/usr/bin/python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | The script provide a tool to convert BIO formatted file to either Brat or BioC format
  6 | The script also provide a tool to merge several brat or BioC formatted files into one file by concatenating all the unique entities.
  7 | 
  8 | The pre-request is BIO data must have offset information
  9 | """
 10 | 
 11 | from pathlib import Path
 12 | from common_utils.common_io import read_from_file, load_bio_file_into_sents
 13 | import shutil
 14 | import traceback
 15 | 
 16 | 
 17 | BRAT_TEMPLATE = "{}\t{} {} {}\t{}"
 18 | BIOC_TEMPLATE = """
 19 |       <annotation id="{a}">
 20 |         <infon key="type">{e}</infon>
 21 |         <location length="{d}" offset="{c}"/>
 22 |         <text>{b}</text>
 23 |       </annotation>\n
 24 | """
 25 | BIOC_HEADER = """
 26 | <?xml version='1.0' encoding='utf-8' standalone='yes'?>
 27 | <collection>
 28 |   <source></source>
 29 |   <date></date>
 30 |   <key></key>
 31 |   <document>
 32 |     <id>{}</id>
 33 |     <passage>
 34 |       <offset>0</offset>
 35 | """
 36 | BIOC_END = """
 37 |     </passage>
 38 |   </document>
 39 | </collection>
 40 | """
 41 | 
 42 | 
 43 | def __prepare_path(text_dir, input_dir, output_dir):
 44 |     t_input = Path(text_dir)
 45 |     p_input = Path(input_dir)
 46 |     p_output = Path(output_dir)
 47 |     p_output.mkdir(parents=True, exist_ok=True)
 48 | 
 49 |     return t_input, p_input, p_output
 50 | 
 51 | 
 52 | def tag2entity(sents):
 53 |     entities = []
 54 |     for i, sent in enumerate(sents):
 55 |         term, start, end, sem_tag, prev_tag = [], None, None, None, "O"
 56 |         for j, word in enumerate(sent):
 57 |             text, w_s, w_e, w_a_s, w_a_e, predict_tag = word  # must have offset information
 58 |             if predict_tag == "O":
 59 |                 if prev_tag != "O":
 60 |                     entities.append((" ".join(term), start, end, sem_tag))
 61 |                     term, start, end, sem_tag = [], None, None, None
 62 |             else:
 63 |                 boundary, ttag = predict_tag.split("-")
 64 |                 if boundary == "B":
 65 |                     if prev_tag != "O":
 66 |                         entities.append((" ".join(term), start, end, sem_tag))
 67 |                         term, start, end, sem_tag = [], None, None, None
 68 |                     term.append(text)
 69 |                     start, end, sem_tag = w_s, w_e, ttag
 70 |                 elif boundary == "I":
 71 |                     if sem_tag == ttag:
 72 |                         term.append(text)
 73 |                         end = w_e
 74 |                     else:
 75 |                         if prev_tag != "O":
 76 |                             entities.append((" ".join(term), start, end, sem_tag))
 77 |                             term, start, end, sem_tag = [], None, None, None
 78 |                         term.append(text)
 79 |                         start, end, sem_tag = w_s, w_e, ttag
 80 |                 else:
 81 |                     raise ValueError('The BIO scheme only support B, I but get {}-{} in {}'.format(boundary, ttag, sent))
 82 |             prev_tag = predict_tag
 83 | 
 84 |         if term:
 85 |             entities.append((" ".join(term), start, end, sem_tag))
 86 | 
 87 |     return entities
 88 | 
 89 | 
 90 | def bio2output(text_dir, input_dir, output_dir, output_template, do_copy_text, file_suffix='ann'):
 91 |     """
 92 |     we expect the input as a directory of all bio files end with .txt suffix
 93 |     we expect the each bio file contain the offset info (start; end position of each words) and tag info;
 94 |     original words are not required
 95 |     convert the bio formatted files to brat formatted .ann file
 96 |     the output directory will not contain the .txt file
 97 |     """
 98 |     t_input, p_input, p_output = __prepare_path(text_dir, input_dir, output_dir)
 99 |     for ifn in p_input.glob("*.txt"):
100 |         try:
101 |             ifn_stem = ifn.stem.split(".")[0]
102 |             doc_text_file = t_input / "{}.txt".format(ifn_stem)
103 |             ofn = p_output / "{}.{}".format(ifn_stem, file_suffix)
104 |             sents = load_bio_file_into_sents(ifn, do_lower=False)
105 |             doc_text = read_from_file(doc_text_file)
106 |             entities = tag2entity(sents)
107 |             output_entities = []
108 |             for idx, entity in enumerate(entities):
109 |                 ann_text, offset_s, offset_e, sem_tag = entity
110 |                 offset_s, offset_e = int(offset_s), int(offset_e)
111 |                 # we need to use original text not the ann text here
112 |                 # you can use ann_text for debugging
113 |                 raw_entity_text = doc_text[offset_s:offset_e]
114 | 
115 |                 if "\n" in raw_entity_text:
116 |                     idx = raw_entity_text.index("\n")
117 |                     offset_s = "{} {};{}".format(offset_s, offset_s+idx, offset_s+idx+1)
118 |                     raw_entity_text = raw_entity_text.replace("\n", " ")
119 | 
120 |                 if file_suffix == "ann":
121 |                     formatted_output = output_template.format("T{}".format(idx+1), sem_tag, offset_s, offset_e, raw_entity_text)
122 |                 elif file_suffix == "xml":
123 |                     formatted_output = output_template.format(a=idx+1, b=raw_entity_text, c=offset_s, d=offset_e-offset_s, e=sem_tag)
124 |                 else:
125 |                     formatted_output = None
126 |                     print('formatted output is None due to unknown formatter code')
127 | 
128 |                 output_entities.append(formatted_output)
129 | 
130 |             if do_copy_text:
131 |                 new_text_file = p_output / "{}.txt".format(ifn_stem)
132 |                 shutil.copy2(doc_text_file.as_posix(), new_text_file.as_posix())
133 | 
134 |             with open(ofn, "w") as f:
135 |                 formatted_output = "\n".join(output_entities)
136 |                 if file_suffix == "xml":
137 |                     formatted_output = BIOC_HEADER.format(ifn.stem) + formatted_output + BIOC_END
138 |                 f.write(formatted_output)
139 |                 f.write("\n")
140 |         except Exception as ex:
141 |             traceback.print_exc()
142 | 
143 | 
144 | def main(text_dir=None, input_bio_dir=None, output_dir=None, formatter=1, do_copy_text=True):
145 |     if formatter == 1:
146 |         bio2output(text_dir, input_bio_dir, output_dir, BRAT_TEMPLATE, do_copy_text, file_suffix="ann")
147 |     elif formatter == 2:
148 |         bio2output(text_dir, input_bio_dir, output_dir, BIOC_TEMPLATE, do_copy_text, file_suffix='xml')
149 |     else:
150 |         raise RuntimeError("Only support formatter as 1 and 2 but get {}; see help for more information.".format(formatter))
151 | 


--------------------------------------------------------------------------------
/scipts/training_ner.py:
--------------------------------------------------------------------------------
  1 | #create training and test bio for NER
  2 | import sys
  3 | sys.path.append("../ClinicalTransformerNER/")
  4 | sys.path.append("../NLPreprocessing/")
  5 | import os
  6 | from pathlib import Path
  7 | from collections import defaultdict, Counter
  8 | import numpy as np
  9 | from sklearn.model_selection import train_test_split
 10 | import shutil 
 11 | import fileinput
 12 | from annotation2BIO import generate_BIO, pre_processing, read_annotation_brat, BIOdata_to_file
 13 | MIMICIII_PATTERN = "\[\*\*|\*\*\]"
 14 | 
 15 | data_dir=sys.argv[1]
 16 | #output_name='test'
 17 | 
 18 | #data stat
 19 | file_ids = set()
 20 | enss = []
 21 | 
 22 | for fn in Path(data_dir).glob("*.ann"):
 23 |     file_ids.add(fn.stem)
 24 |     _, ens, _ = read_annotation_brat(fn)
 25 |     #print( _)
 26 |     enss.extend(ens)
 27 | print("test files: ", len(file_ids), list(file_ids)[:5])
 28 | print("total test eneitites: ", len(enss))
 29 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()]))
 30 | # generate bio
 31 | file_ids = list(file_ids)
 32 | train_dev_ids, test_ids = train_test_split(file_ids, train_size=0.75, random_state=13, shuffle=True)#use  150 for training
 33 | print('length of training and test')
 34 | len(train_dev_ids), len(test_ids)
 35 | train_dev_root = Path('../data/training_set_150')
 36 | test_root = Path('../data/test_set_150')
 37 | #create notes file
 38 | Path(train_dev_root).mkdir(parents=True, exist_ok=True)
 39 | Path(test_root).mkdir(parents=True, exist_ok=True)
 40 | train_root=Path(data_dir)
 41 | #copy file to train and test
 42 | for fid in train_dev_ids:
 43 |     txt_fn = train_root / (fid + ".txt")
 44 |     ann_fn = train_root / (fid + ".ann")
 45 |     txt_fn1 = train_dev_root / (fid + ".txt")
 46 |     ann_fn1 = train_dev_root / (fid + ".ann")
 47 |     shutil.copyfile(txt_fn, txt_fn1)
 48 |     shutil.copyfile(ann_fn, ann_fn1)
 49 | for fid in test_ids:
 50 |     txt_fn = train_root / (fid + ".txt")
 51 |     ann_fn = train_root / (fid + ".ann")
 52 |     txt_fn1 = test_root / (fid + ".txt")
 53 |     ann_fn1 = test_root / (fid + ".ann")
 54 |     shutil.copyfile(txt_fn, txt_fn1)
 55 |     shutil.copyfile(ann_fn, ann_fn1)
 56 | 
 57 | train_dev_ids = list(train_dev_ids)
 58 | train_ids, dev_ids = train_test_split(train_dev_ids, train_size=0.9, random_state=13, shuffle=True)
 59 | test_bio = "../bio/"+'bio_test_150'
 60 | training_bio = "../bio/"+'bio_training_150'
 61 | output_root1 = Path(test_bio)
 62 | output_root2 = Path(training_bio)
 63 | output_root1.mkdir(parents=True, exist_ok=True)
 64 | output_root2.mkdir(parents=True, exist_ok=True)
 65 | 
 66 | for fid in train_dev_ids:
 67 |     txt_fn = train_dev_root / (fid + ".txt")
 68 |     ann_fn = train_dev_root / (fid + ".ann")
 69 |     bio_fn = output_root2 / (fid + ".bio.txt")
 70 |     
 71 |     txt, sents = pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN)
 72 |     e2idx, entities, rels = read_annotation_brat(ann_fn)
 73 |     nsents, sent_bound = generate_BIO(sents, entities, file_id=fid, no_overlap=False)
 74 |     #print(nsents)
 75 |     #print(bio_fn)
 76 |     #break
 77 |     BIOdata_to_file(bio_fn, nsents)
 78 | # train
 79 | with open(training_bio+"/train.txt", "w") as f:
 80 |     for fid in train_ids:
 81 |         f.writelines(fileinput.input(output_root2 / (fid + ".bio.txt")))
 82 |     fileinput.close()
 83 |         
 84 | # dev
 85 | with open(training_bio+"/dev.txt", "w") as f:
 86 |     for fid in dev_ids:
 87 |         f.writelines(fileinput.input(output_root2 / (fid + ".bio.txt")))
 88 |     fileinput.close()
 89 | 
 90 | #test
 91 | for fn in test_root.glob("*.txt"):
 92 |     txt_fn = fn
 93 |     bio_fn = output_root1 / (fn.stem + ".bio.txt")
 94 |     
 95 |     txt, sents = pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN)
 96 |     nsents, sent_bound = generate_BIO(sents, [], file_id=txt_fn, no_overlap=False)
 97 |     
 98 |     BIOdata_to_file(bio_fn, nsents)
 99 | 
100 | #same process but have train test split as 1:1
101 | train_dev_ids, test_ids = train_test_split(file_ids, train_size=0.5, random_state=13, shuffle=True)#use  8:2 split
102 | print('length of training and test')
103 | len(train_dev_ids), len(test_ids)
104 | train_dev_root = Path('../data/training_set_100')
105 | test_root = Path('../data/test_set_100')
106 | #create notes file
107 | Path(train_dev_root).mkdir(parents=True, exist_ok=True)
108 | Path(test_root).mkdir(parents=True, exist_ok=True)
109 | train_root=Path(data_dir)
110 | #copy file to train and test
111 | for fid in train_dev_ids:
112 |     txt_fn = train_root / (fid + ".txt")
113 |     ann_fn = train_root / (fid + ".ann")
114 |     txt_fn1 = train_dev_root / (fid + ".txt")
115 |     ann_fn1 = train_dev_root / (fid + ".ann")
116 |     shutil.copyfile(txt_fn, txt_fn1)
117 |     shutil.copyfile(ann_fn, ann_fn1)
118 | for fid in test_ids:
119 |     txt_fn = train_root / (fid + ".txt")
120 |     ann_fn = train_root / (fid + ".ann")
121 |     txt_fn1 = test_root / (fid + ".txt")
122 |     ann_fn1 = test_root / (fid + ".ann")
123 |     shutil.copyfile(txt_fn, txt_fn1)
124 |     shutil.copyfile(ann_fn, ann_fn1)
125 | 
126 | train_dev_ids = list(train_dev_ids)
127 | train_ids, dev_ids = train_test_split(train_dev_ids, train_size=0.9, random_state=13, shuffle=True)
128 | test_bio = "../bio/"+'bio_test_100'
129 | training_bio = "../bio/"+'bio_training_100'
130 | output_root1 = Path(test_bio)
131 | output_root2 = Path(training_bio)
132 | output_root1.mkdir(parents=True, exist_ok=True)
133 | output_root2.mkdir(parents=True, exist_ok=True)
134 | 
135 | for fid in train_dev_ids:
136 |     txt_fn = train_dev_root / (fid + ".txt")
137 |     ann_fn = train_dev_root / (fid + ".ann")
138 |     bio_fn = output_root2 / (fid + ".bio.txt")
139 |     
140 |     txt, sents = pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN)
141 |     e2idx, entities, rels = read_annotation_brat(ann_fn)
142 |     nsents, sent_bound = generate_BIO(sents, entities, file_id=fid, no_overlap=False)
143 |     #print(nsents)
144 |     #print(bio_fn)
145 |     #break
146 |     BIOdata_to_file(bio_fn, nsents)
147 | # train
148 | with open(training_bio+"/train.txt", "w") as f:
149 |     for fid in train_ids:
150 |         f.writelines(fileinput.input(output_root2 / (fid + ".bio.txt")))
151 |     fileinput.close()
152 |         
153 | # dev
154 | with open(training_bio+"/dev.txt", "w") as f:
155 |     for fid in dev_ids:
156 |         f.writelines(fileinput.input(output_root2 / (fid + ".bio.txt")))
157 |     fileinput.close()
158 | 
159 | #test
160 | for fn in test_root.glob("*.txt"):
161 |     txt_fn = fn
162 |     bio_fn = output_root1 / (fn.stem + ".bio.txt")
163 |     
164 |     txt, sents = pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN)
165 |     nsents, sent_bound = generate_BIO(sents, [], file_id=txt_fn, no_overlap=False)
166 |     
167 |     BIOdata_to_file(bio_fn, nsents)


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/run_transformer_batch_prediction.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 | The input files must have offset information. In input file, for each word in line, it must have at least text, start, end, tag information
  5 | output file suffix will be set to .bio.txt
  6 | """
  7 | 
  8 | import torch
  9 | import argparse
 10 | import os
 11 | import traceback
 12 | from pathlib import Path
 13 | 
 14 | from common_utils.common_io import json_load, output_bio
 15 | from transformer_ner.data_utils import TransformerNerDataProcessor, transformer_convert_data_to_features
 16 | from transformer_ner.task import load_model, predict, MODEL_CLASSES, _output_bio
 17 | from transformer_ner.transfomer_log import TransformerNERLogger
 18 | from common_utils.common_log import LOG_LVLs
 19 | from common_utils.output_format_converter import main as format_converter
 20 | 
 21 | import transformers
 22 | from packaging import version
 23 | 
 24 | 
 25 | pytorch_version = version.parse(transformers.__version__)
 26 | assert pytorch_version >= version.parse('3.0.0'), \
 27 |     'we now only support transformers version >=3.0.0, but your version is {}'.format(pytorch_version)
 28 | 
 29 | 
 30 | def main(args):
 31 |     label2idx = json_load(os.path.join(args.pretrained_model, "label2idx.json"))
 32 |     num_labels = len(label2idx)
 33 |     idx2label = {v: k for k, v in label2idx.items()}
 34 |     args.label2idx = label2idx
 35 |     args.idx2label = idx2label
 36 |     # get config, model and tokenizer
 37 |     model_config, _, model_tokenizer = MODEL_CLASSES[args.model_type]
 38 |     tokenizer = model_tokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case)
 39 |     args.tokenizer = tokenizer
 40 |     config = model_config.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case)
 41 |     args.config = config
 42 |     args.use_crf = config.use_crf
 43 |     model = load_model(args, args.pretrained_model)
 44 |     model.to(args.device)
 45 | 
 46 |     ner_data_processor = TransformerNerDataProcessor()
 47 |     ner_data_processor.set_logger(args.logger)
 48 |     ner_data_processor.set_data_dir(args.preprocessed_text_dir)
 49 |     if args.data_has_offset_information:
 50 |         ner_data_processor.offset_info_available()
 51 | 
 52 |     # fids = [each.stem.split(".")[0] for each in Path(args.preprocessed_text_dir).glob("*.txt")]
 53 |     for each_file in Path(args.preprocessed_text_dir).glob("*.txt"):
 54 |         try:
 55 |             test_example = ner_data_processor.get_test_examples(file_name=each_file.name)
 56 |             test_features = transformer_convert_data_to_features(args=args,
 57 |                                                                  input_examples=test_example,
 58 |                                                                  label2idx=label2idx,
 59 |                                                                  tokenizer=tokenizer,
 60 |                                                                  max_seq_len=args.max_seq_length)
 61 |             predictions = predict(args, model, test_features)
 62 |             Path(args.output_dir).mkdir(parents=True, exist_ok=True)
 63 |             ofn = each_file.stem.split(".")[0] + ".bio.txt"
 64 |             args.predict_output_file = os.path.join(args.output_dir, ofn)
 65 |             _output_bio(args, test_example, predictions)
 66 |         except Exception as ex:
 67 |             args.logger.error(f"Encountered an error when processing predictions for file: {each_file.name}")
 68 |             args.logger.error(traceback.format_exc())
 69 | 
 70 |     if args.do_format:
 71 |         base_path = Path(args.output_dir)
 72 |         output_formatted_dir = base_path.parent / f"{base_path.stem}_formatted_output"
 73 |         output_formatted_dir.mkdir(parents=True, exist_ok=True)
 74 |         format_converter(text_dir=args.raw_text_dir,
 75 |                          input_bio_dir=args.output_dir,
 76 |                          output_dir=output_formatted_dir,
 77 |                          formatter=args.do_format,
 78 |                          do_copy_text=args.do_copy)
 79 | 
 80 | 
 81 | if __name__ == '__main__':
 82 |     parser = argparse.ArgumentParser()
 83 | 
 84 |     parser.add_argument("--model_type", default='bert', type=str, required=True,
 85 |                         help="valid values: bert, roberta or xlnet, albert, distilbert")
 86 |     parser.add_argument("--pretrained_model", type=str, required=True,
 87 |                         help="The pretrained model file or directory for fine tuning.")
 88 |     parser.add_argument("--preprocessed_text_dir", type=str, required=True,
 89 |                         help="The input data directory.")
 90 |     parser.add_argument("--raw_text_dir", type=str, required=True,
 91 |                         help="The input data directory.")
 92 |     parser.add_argument("--data_has_offset_information", action='store_true',
 93 |                         help="The input data directory.")
 94 |     parser.add_argument("--output_dir", type=str, required=True,
 95 |                         help="The output data directory.")
 96 |     parser.add_argument("--do_lower_case", action='store_true',
 97 |                         help="Set this flag if you are using an uncased model.")
 98 |     parser.add_argument("--eval_batch_size", default=8, type=int,
 99 |                         help="Total batch size for eval.")
100 |     parser.add_argument("--max_seq_length", default=128, type=int,
101 |                         help="maximum number of tokens allowed in each sentence")
102 |     parser.add_argument("--log_file", default=None,
103 |                         help="where to save the log information")
104 |     parser.add_argument("--log_lvl", default="i", type=str,
105 |                         help="d=DEBUG; i=INFO; w=WARNING; e=ERROR")
106 |     parser.add_argument("--do_format", default=0, type=int,
107 |                         help="0=bio (not format change will be applied); 1=brat; 2=bioc")
108 |     parser.add_argument("--do_copy", action='store_true',
109 |                         help="if copy the original plain text to output folder")
110 |     parser.add_argument("--progress_bar", action='store_true',
111 |                         help="show progress during the training in tqdm")
112 |     parser.add_argument("--use_crf", action='store_true',
113 |                         help="Whether to use crf layer as classifier.")
114 | 
115 |     global_args = parser.parse_args()
116 |     # create logger
117 |     logger = TransformerNERLogger(global_args.log_file, global_args.log_lvl).get_logger()
118 |     global_args.logger = logger
119 |     # device
120 |     global_args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
121 |     logger.info("Task will use cuda device: GPU_{}.".format(torch.cuda.current_device()) if torch.cuda.device_count() else 'Task will use CPU.')
122 | 
123 |     main(global_args)
124 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/run_transformer_ner.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import argparse
  5 | import torch
  6 | from transformer_ner.task import run_task
  7 | from transformer_ner.transfomer_log import TransformerNERLogger
  8 | from traceback import format_exc
  9 | 
 10 | from packaging import version
 11 | import transformers
 12 | 
 13 | 
 14 | pytorch_version = version.parse(transformers.__version__)
 15 | assert pytorch_version >= version.parse('3.0.0'), \
 16 |     'we now only support transformers version >=3.0.0, but your version is {}'.format(pytorch_version)
 17 | 
 18 | 
 19 | def main():
 20 |     parser = argparse.ArgumentParser()
 21 | 
 22 |     # add arguments
 23 |     parser.add_argument("--model_type", default='bert', type=str, required=True,
 24 |                         help="valid values: bert, roberta or xlnet")
 25 |     parser.add_argument("--pretrained_model", type=str, required=True,
 26 |                         help="The pretrained model file or directory for fine tuning.")
 27 |     parser.add_argument("--config_name", default=None, type=str,
 28 |                         help="Pretrained config name or path if not the same as pretrained_model")
 29 |     parser.add_argument("--tokenizer_name", default=None, type=str,
 30 |                         help="Pretrained tokenizer name or path if not the same as pretrained_model")
 31 |     parser.add_argument("--data_dir", type=str, required=True,
 32 |                         help="The input data directory.")
 33 |     parser.add_argument("--data_has_offset_information", action='store_true',
 34 |                         help="The input data directory.")
 35 |     parser.add_argument("--new_model_dir", type=str, required=True,
 36 |                         help="directory for saving new model checkpoints (keep latest n only)")
 37 |     parser.add_argument("--save_model_core", action='store_true',
 38 |                         help="""save the transformer core of the model 
 39 |                         which allows model to be used as base model for further pretraining""")
 40 |     parser.add_argument("--predict_output_file", type=str, default=None,
 41 |                         help="predicted results output file.")
 42 |     parser.add_argument('--overwrite_model_dir', action='store_true',
 43 |                         help="Overwrite the content of the new model directory")
 44 |     parser.add_argument("--seed", default=3, type=int,
 45 |                         help='random seed')
 46 |     parser.add_argument("--max_seq_length", default=128, type=int,
 47 |                         help="maximum number of tokens allowed in each sentence")
 48 |     parser.add_argument("--do_train", action='store_true',
 49 |                         help="Whether to run training.")
 50 |     parser.add_argument("--model_selection_scoring", default='strict-f_score-1', type=str,
 51 |                         help="""The scoring methos used to select model on dev dataset 
 52 |                         only support strict-f_score-n, relax-f_score-n (n is 0.5, 1, or 2)""")
 53 |     parser.add_argument("--do_predict", action='store_true',
 54 |                         help="Whether to run prediction on the test set.")
 55 |     parser.add_argument("--use_crf", action='store_true',
 56 |                         help="Whether to use crf layer as classifier.")
 57 |     parser.add_argument("--do_lower_case", action='store_true',
 58 |                         help="Set this flag if you are using an uncased model.")
 59 |     parser.add_argument("--train_batch_size", default=8, type=int,
 60 |                         help="The batch size for training.")
 61 |     parser.add_argument("--eval_batch_size", default=8, type=int,
 62 |                         help="The batch size for eval.")
 63 |     parser.add_argument('--train_steps', type=int, default=-1,
 64 |                         help="Number of trianing steps between two evaluations on the dev set; if <0 then evaluate after each epoch")
 65 |     parser.add_argument("--learning_rate", default=1e-5, type=float,
 66 |                         help="The initial learning rate for Adam.")
 67 |     parser.add_argument("--num_train_epochs", default=10, type=float,
 68 |                         help="Total number of training epochs to perform.")
 69 |     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
 70 |                         help="Number of updates steps to accumulate before performing a backward/update pass.")
 71 |     parser.add_argument("--do_warmup", action='store_true',
 72 |                         help='Whether to apply warmup strategy in optimizer.')
 73 |     parser.add_argument("--warmup_ratio", default=0.1, type=float,
 74 |                         help="Linear warmup over warmup_ratio.")
 75 |     parser.add_argument("--weight_decay", default=0.0, type=float,
 76 |                         help="Weight deay if we apply some.")
 77 |     parser.add_argument("--adam_epsilon", default=1e-8, type=float,
 78 |                         help="Epsilon for Adam optimizer.")
 79 |     parser.add_argument("--max_grad_norm", default=1.0, type=float,
 80 |                         help="Max gradient norm.")
 81 |     parser.add_argument("--max_num_checkpoints", default=3, type=int,
 82 |                         help="max number of checkpoints saved during training, old checkpoints will be removed.")
 83 |     parser.add_argument("--log_file", default=None,
 84 |                         help="where to save the log information")
 85 |     parser.add_argument("--log_lvl", default="i", type=str,
 86 |                         help="d=DEBUG; i=INFO; w=WARNING; e=ERROR")
 87 |     parser.add_argument("--progress_bar", action='store_true',
 88 |                         help="show progress during the training in tqdm")
 89 |     parser.add_argument("--early_stop", default=-1, type=int,
 90 |                         help="""The training will stop after num of epoch without performance improvement. If set to 0 or -1, then not use early stop.""")
 91 | 
 92 |     # fp16 and distributed training
 93 |     parser.add_argument('--fp16', action='store_true',
 94 |                         help="Whether to use 16-bit float precision instead of 32-bit")
 95 |     # parser.add_argument("--fp16_opt_level", type=str, default="O1",
 96 |     #                     help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
 97 |     #                          "See details at https://nvidia.github.io/apex/amp.html")
 98 |     # parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
 99 |     # parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
100 |     # parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
101 | 
102 |     global_args = parser.parse_args()
103 | 
104 |     # create logger
105 |     logger = TransformerNERLogger(global_args.log_file, global_args.log_lvl).get_logger()
106 |     global_args.logger = logger
107 | 
108 |     # set and check cuda (we recommend to set up CUDA device in shell)
109 |     # os.environ['CUDA_VISIBLE_DEVICES'] = global_args.cuda_ids
110 |     global_args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
111 |     logger.info("Task will use cuda device: GPU_{}.".format(torch.cuda.current_device()) if torch.cuda.device_count() else 'Task will use CPU.')
112 | 
113 |     # if args.tokenizer_name and args.config_name are not specially set, set them as pretrained_model
114 |     if not global_args.tokenizer_name:
115 |         global_args.tokenizer_name = global_args.pretrained_model
116 |         logger.warning("set tokenizer as {}".format(global_args.tokenizer_name))
117 | 
118 |     if not global_args.config_name:
119 |         global_args.config_name = global_args.pretrained_model
120 |         logger.warning("set config as {}".format(global_args.config_name))
121 | 
122 |     if global_args.do_predict and not global_args.predict_output_file:
123 |         raise RuntimeError("Running prediction but predict output file is not set.")
124 | 
125 |     try:
126 |         run_task(global_args)
127 |     except Exception as ex:
128 |         logger.error(format_exc())
129 | 
130 | 
131 | if __name__ == '__main__':
132 |     main()
133 | 


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/relation_extraction.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import torch
  3 | import numpy as np
  4 | import random
  5 | from utils import TransformerLogger
  6 | from task import TaskRunner
  7 | from pathlib import Path
  8 | from data_processing.io_utils import save_text
  9 | import traceback
 10 | 
 11 | 
 12 | def set_seed(gargs):
 13 |     random.seed(gargs.seed)
 14 |     np.random.seed(gargs.seed)
 15 |     torch.manual_seed(gargs.seed)
 16 | 
 17 | 
 18 | def app(gargs):
 19 |     set_seed(gargs)
 20 | 
 21 |     # do_eval is used with do_train in most cases for 5-CV
 22 |     if gargs.do_eval and not gargs.do_train:
 23 |         raise RuntimeError("Evaluation mode (do_eval) is only available when do_train is used.\n"
 24 |                            "You may want to use do_predict instead.")
 25 | 
 26 |     # make it case in-sensitive
 27 |     gargs.model_type = gargs.model_type.lower()
 28 |     task_runner = TaskRunner(gargs)
 29 |     task_runner.task_runner_default_init()
 30 | 
 31 |     if gargs.do_train:
 32 |         if Path(gargs.new_model_dir).exists() and not gargs.overwrite_model_dir:
 33 |             raise RuntimeError("{} is exist and overwrite this dir is not permitted.".format(gargs.new_model_dir))
 34 | 
 35 |         # training
 36 |         try:
 37 |             task_runner.train()
 38 |         except Exception as ex:
 39 |             gargs.logger.error("Training error:\n{}".format(traceback.format_exc()))
 40 |             traceback.print_exc()
 41 |             raise RuntimeError()
 42 | 
 43 |     if gargs.do_predict:
 44 |         # run prediction
 45 |         try:
 46 |             preds = task_runner.predict()
 47 |         except Exception as ex:
 48 |             gargs.logger.error("Prediction error:\n{}".format(traceback.format_exc()))
 49 |             raise RuntimeError(traceback.format_exc())
 50 | 
 51 |         pred_res = "\n".join([str(pred) for pred in preds])
 52 | 
 53 |         # predict_output_file must be a file, we will create parent dir automatically
 54 |         Path(gargs.predict_output_file).parent.mkdir(parents=True, exist_ok=True)
 55 |         save_text(pred_res, gargs.predict_output_file)
 56 | 
 57 | 
 58 | if __name__ == '__main__':
 59 |     parser = argparse.ArgumentParser()
 60 |     # parse arguments
 61 |     parser.add_argument("--model_type", default='bert', type=str, required=True,
 62 |                         help="valid values: bert, roberta, albert or xlnet")
 63 |     parser.add_argument("--data_format_mode", default=0, type=int,
 64 |                         help="valid values: 0: sep mode - [CLS]S1[SEP]S2[SEP]; 1: uni mode - [CLS]S1S2[SEP]")
 65 |     parser.add_argument("--classification_scheme", default=2, type=int,
 66 |                         help="special tokens used for classification. "
 67 |                              "Valid values: "
 68 |                              "0: [CLS]; 1: [CLS], [S1], [S2]; 2: [CLS], [S1], [S2], [E1], [E2]; 3: [S1], [S2]")
 69 |     parser.add_argument("--pretrained_model", type=str,
 70 |                         help="The pretrained model file or directory for fine tuning.")
 71 |     parser.add_argument("--data_dir", type=str, required=True,
 72 |                         help="The input data directory. Should have at least a file named train.tsv")
 73 |     parser.add_argument("--new_model_dir", type=str, required=True,
 74 |                         help="directory for saving new model checkpoints (keep latest n only)")
 75 |     parser.add_argument("--predict_output_file", type=str, default=None,
 76 |                         help="predicted results output file.")
 77 |     parser.add_argument('--overwrite_model_dir', action='store_true',
 78 |                         help="Overwrite the content of the new model directory")
 79 |     parser.add_argument("--seed", default=1234, type=int,
 80 |                         help='random seed')
 81 |     parser.add_argument("--max_seq_length", default=512, type=int,
 82 |                         help="maximum number of tokens allowed in each sentence")
 83 |     parser.add_argument("--cache_data", action='store_true',
 84 |                         help="Whether to cache the features after tokenization (save training initialization time)")
 85 |     parser.add_argument("--data_file_header", default=True, type=bool,
 86 |                         help="flag used to define whether the data tsv file has header or not. "
 87 |                              "If has header, we will skip the first line")
 88 |     parser.add_argument("--do_train", action='store_true',
 89 |                         help="Whether to run training.")
 90 |     parser.add_argument("--do_eval", action='store_true',
 91 |                         help="Whether to run evaluation on dev. (require dev.tsv)")
 92 |     parser.add_argument("--do_predict", action='store_true',
 93 |                         help="Whether to run prediction on the test set. (require test.tsv)")
 94 |     parser.add_argument("--do_lower_case", action='store_true',
 95 |                         help="Set this flag if you are using an uncased model.")
 96 |     parser.add_argument("--train_batch_size", default=8, type=int,
 97 |                         help="The batch size for training.")
 98 |     parser.add_argument("--eval_batch_size", default=8, type=int,
 99 |                         help="The batch size for eval.")
100 |     parser.add_argument("--learning_rate", default=1e-5, type=float,
101 |                         help="The initial learning rate for Adam.")
102 |     parser.add_argument("--num_train_epochs", default=10, type=int,
103 |                         help="Total number of training epochs to perform.")
104 |     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
105 |                         help="Number of updates steps to accumulate before performing a backward/update pass.")
106 |     parser.add_argument("--do_warmup", action='store_true',
107 |                         help='Whether to apply warmup strategy in optimizer.')
108 |     parser.add_argument("--warmup_ratio", default=0.1, type=float,
109 |                         help="Linear warmup over warmup_ratio.")
110 |     parser.add_argument("--weight_decay", default=0.0, type=float,
111 |                         help="Weight deay if we apply some.")
112 |     parser.add_argument("--adam_epsilon", default=1e-8, type=float,
113 |                         help="Epsilon for Adam optimizer.")
114 |     parser.add_argument("--max_grad_norm", default=1.0, type=float,
115 |                         help="Max gradient norm.")
116 |     parser.add_argument("--max_num_checkpoints", default=0, type=int,
117 |                         help="max number of checkpoints saved during training, old checkpoints will be removed."
118 |                              "if 0, then only save the last one at the end of training")
119 |     parser.add_argument("--log_file", default=None,
120 |                         help="where to save the log information")
121 |     parser.add_argument("--log_lvl", default="i", type=str,
122 |                         help="d=DEBUG; i=INFO; w=WARNING; e=ERROR")
123 |     parser.add_argument("--log_step", default=1000, type=int,
124 |                         help="logging after how many steps of training. If < 0, no log during training")
125 |     parser.add_argument("--num_core", default=1, type=int,
126 |                         help="how many cores used for multiple process for data generation")
127 |     parser.add_argument("--non_relation_label", default="NonRel", type=str,
128 |                         help="The label used for representing "
129 |                              "candidate entity pairs that is not a true relation (negative sample)")
130 |     parser.add_argument("--progress_bar", action='store_true',
131 |                         help="show progress during the training in tqdm")
132 |     parser.add_argument('--fp16', action='store_true',
133 |                         help="Whether to use 16-bit float precision instead of 32-bit")
134 |     parser.add_argument("--fp16_opt_level", type=str, default="O1",
135 |                         help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
136 |                              "See details at https://nvidia.github.io/apex/amp.html")
137 | 
138 |     args = parser.parse_args()
139 | 
140 |     # other setup
141 |     args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
142 |     args.logger = TransformerLogger(logger_file=args.log_file, logger_level=args.log_lvl).get_logger()
143 |     app(args)
144 | 


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/data_processing/post_processing.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Post processing
  3 | 
  4 | Using this script to merge the system prediction with the entities
  5 | The output format will be in BRAT
  6 | 
  7 | We will automatically align the predictions to entity pairs and file ids
  8 | The results will be write out with the entity information into a new file
  9 | We will not copy the original text to the results output dir
 10 | """
 11 | import argparse
 12 | from pathlib import Path
 13 | import numpy as np
 14 | from io_utils import load_text, save_text, pkl_load
 15 | from collections import defaultdict
 16 | from data_format_conf import NON_RELATION_TAG, BRAT_REL_TEMPLATE
 17 | import traceback
 18 | 
 19 | # import logger from upper level dir
 20 | import os
 21 | import sys
 22 | sys.path.append(Path(os.path.abspath(__file__)).parent.parent.as_posix())
 23 | from utils import TransformerLogger
 24 | 
 25 | 
 26 | def load_mappings(map_file):
 27 |     maps = []
 28 |     text = load_text(map_file)
 29 |     for idx, line in enumerate(text.strip().split("\n")):
 30 |         if idx == 0:
 31 |             continue
 32 |         info = line.split("\t")
 33 |         maps.append(info[-3:])
 34 | 
 35 |     return maps
 36 | 
 37 | 
 38 | def load_predictions(result_file):
 39 |     results = []
 40 |     text = load_text(result_file)
 41 |     for each in text.strip().split("\n"):
 42 |         results.append(each.strip())
 43 | 
 44 |     return results
 45 | 
 46 | 
 47 | def map_results(res):
 48 |     mapped_preds = defaultdict(list)
 49 |     prev_fid = "no previous file id"
 50 |     rel_idx = 1
 51 | 
 52 |     for each in res:
 53 |         fid, rt, arg1, arg2 = each
 54 |         if prev_fid != fid:
 55 |             prev_fid = fid
 56 |             rel_idx = 1
 57 |         brat_res = BRAT_REL_TEMPLATE.format(rel_idx, rt, arg1, arg2)
 58 |         mapped_preds[fid].append(brat_res)
 59 |         rel_idx += 1
 60 | 
 61 |     return mapped_preds
 62 | 
 63 | 
 64 | def output_results(mapped_predictions, entity_data_dir, output_dir):
 65 |     entity_data_dir = Path(entity_data_dir)
 66 | 
 67 |     output_dir = Path(output_dir)
 68 |     output_dir.mkdir(parents=True, exist_ok=True)
 69 | 
 70 |     for fid in entity_data_dir.glob("*.ann"):
 71 |         fid_key = fid.stem
 72 |         ofn = output_dir / "{}.ann".format(fid_key)
 73 |         entities = load_text(fid).strip()
 74 |         if fid_key in mapped_predictions:
 75 |             rels = mapped_predictions[fid_key]
 76 |             rels = "\n".join(rels)
 77 |             outputs = "\n".join([entities, rels])
 78 |             save_text(outputs, ofn)
 79 |         else:
 80 |             save_text(entities, ofn)
 81 | 
 82 | 
 83 | def combine_maps_predictions_mul(args):
 84 |     comb_map_pred = []
 85 | 
 86 |     for mf, pf in zip(args.test_data_file, args.predict_result_file):
 87 |         maps = load_mappings(mf)
 88 |         preds = load_predictions(pf)
 89 |         llp = len(preds)
 90 |         llm = len(maps)
 91 |         assert llp == llm, \
 92 |             f"prediction results and mappings should have same amount data, but got preds: {llp} and maps: {llm}"
 93 |         for m, rel_type in zip(maps, preds):
 94 |             if rel_type == NON_RELATION_TAG:
 95 |                 continue
 96 |             arg1, arg2, fid = m
 97 |             comb_map_pred.append((fid, rel_type, arg1, arg2))
 98 | 
 99 |     comb_map_pred.sort(key=lambda x: x[0])
100 |     return comb_map_pred
101 | 
102 | 
103 | def load_mappings_bin(map_file):
104 |     maps = []
105 |     text = load_text(map_file)
106 |     for idx, line in enumerate(text.strip().split("\n")):
107 |         if idx == 0:
108 |             continue
109 |         info = line.split("\t")
110 |         maps.append(info[-5:])
111 | 
112 |     return maps
113 | 
114 | 
115 | def combine_maps_predictions_bin(args):
116 |     if not args.type_map:
117 |         raise RuntimeError("no type maps (entity-relation) provided. See help.")
118 |     type_maps = pkl_load(args.type_map)
119 | 
120 |     comb_map_pred = []
121 | 
122 |     for mf, pf in zip(args.test_data_file, args.predict_result_file):
123 |         maps = load_mappings_bin(mf)
124 |         preds = load_predictions(pf)
125 |         llp = len(preds)
126 |         llm = len(maps)
127 |         assert llp == llm, \
128 |             f"prediction results and mappings should have same amount data, but got preds: {llp} and maps: {llm}"
129 |         for m, rel_type in zip(maps, preds):
130 |             if rel_type == NON_RELATION_TAG:
131 |                 continue
132 |             en_type_1, en_type_2, arg1, arg2, fid = m
133 |             real_rel_type = type_maps[(en_type_1, en_type_2)]
134 |             comb_map_pred.append((fid, real_rel_type, arg1, arg2))
135 | 
136 |     comb_map_pred.sort(key=lambda x: x[0])
137 |     return comb_map_pred
138 | 
139 | 
140 | def app(args):
141 |     lltf = len(args.test_data_file)
142 |     llpf = len(args.predict_result_file)
143 | 
144 |     args.logger.info("mode: {}; predict file: {}; output: {}".format(
145 |         args.mode,
146 |         args.predict_result_file,
147 |         args.brat_result_output_dir
148 |     ))
149 | 
150 |     try:
151 |         assert lltf == llpf
152 |     except AssertionError as ex:
153 |         args.logger.error(
154 |             f"test and prediction file number should be same but get test: {lltf} and preduction {llpf}.")
155 |         raise RuntimeError(
156 |             f"test and prediction file number should be same but get test: {lltf} and preduction {llpf}.")
157 | 
158 |     if args.mode == "mul":
159 |         combined_results = combine_maps_predictions_mul(args)
160 |     elif args.mode == "bin":
161 |         combined_results = combine_maps_predictions_bin(args)
162 |     else:
163 |         args.logger.error("expect mode to be mul or bin but get {}".format(args.mode))
164 |         raise RuntimeError("expect mode to be mul or bin but get {}".format(args.mode))
165 | 
166 |     try:
167 |         combined_results = map_results(combined_results)
168 |         output_results(combined_results, args.entity_data_dir, args.brat_result_output_dir)
169 |     except Exception as ex:
170 |         args.logger.error(traceback.print_exc())
171 | 
172 | 
173 | if __name__ == '__main__':
174 |     parser = argparse.ArgumentParser()
175 |     # parse arguments
176 |     """
177 |         To input multiple test data and prediction files, using following syntax in terminal;
178 |         You need to make sure the files order between test and prediction is correct
179 |         
180 |         bash:
181 |             python post_processing.py --test_data_file tf1.txt --test_data_file tf2.txt --predict_result_file res1.txt
182 |                 --predict_result_file res2.txx
183 |         
184 |         in the program:
185 |             args.test_data_file = ['tf1.txt', 'tf2.txt']
186 |             args.predict_result_file = ['res1.txt', 'res2.txt']
187 |             
188 |         if use bin model, you need a map file to map positive relation to its relation type.
189 |         We use entity type pair as key to conduct this mapping
190 |         example:
191 |             (ADE, Drug): Drug-ADE
192 |     """
193 |     parser.add_argument("--mode", type=str, default='mul', required=True,
194 |                         help="we have two mode for binary (bin) and multiple (mul) classes classification")
195 |     parser.add_argument("--type_map", type=str, default=None,
196 |                         help="a map of entity pair types to relation types (only use when mode is bin)")
197 |     parser.add_argument("--test_data_file", type=str, nargs='+', required=True,
198 |                         help="The test data file in which we need to read the maps; available to accept multiple files")
199 |     parser.add_argument("--entity_data_dir", type=str, required=True,
200 |                         help="The annotation files with all the entities")
201 |     parser.add_argument("--predict_result_file", nargs='+', type=str, required=True,
202 |                         help="prediction results; available to accept multiple files")
203 |     parser.add_argument("--brat_result_output_dir", type=str, required=True,
204 |                         help="prediction results")
205 |     parser.add_argument("--log_file", default="./log.txt",
206 |                         help="where to save the log information")
207 |     pargs = parser.parse_args()
208 | 
209 |     pargs.logger = TransformerLogger(logger_file=pargs.log_file,
210 |                                      logger_level='i').get_logger()
211 | 
212 |     app(pargs)
213 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/README.md:
--------------------------------------------------------------------------------
  1 | # Clinical Transformer NER
  2 | 
  3 | ## Aim
  4 | The package is the implementation of a transformer based NER system for clinical information extraction task. We aim to provide a simple and quick tool for researchers to conduct clinical NER without comprehensive knowledge of transformers. We also implemented a strategy to handle the sequence with length longer than the general transformer limits (512 tokens) without truncating any tokens.
  5 | 
  6 | ## Current available models
  7 | - BERT (base, large, mimiciii-pretrained)
  8 | - RoBERTa (base, large, mimiciii-pretrained)
  9 | - ALBERT (base, large, xlarge, xxlarge, mimiciii-pretrained)
 10 | - ELECTRA (base, large, mimiciii-pretrained)
 11 | - DistilBERT (base)
 12 | - XLNet (base, large, mimiciii-pretrained)
 13 | - Longformer (allenai/longformer-base-4096, allenai/longformer-large-4096)
 14 | - DeBERTa (microsoft/deberta-base, microsoft/deberta-large, microsoft/deberta-xlarge)
 15 | > note: 1. all mimic-pretrained models are based on base transformer architecture (Download is available in the section MIMIC-III pre-trained models); 2. DeBERTa is not support xlarge-v2 due to tokenizer change in original implementation
 16 | 
 17 | ## Usage and example
 18 | - Training and test with BIO 
 19 | 
 20 | ```shell script
 21 | # set GPU
 22 | export CUDA_VISIBLE_DEVICES=0
 23 | 
 24 | # use bert
 25 | python src/run_transformer_ner.py \
 26 |       --model_type bert \
 27 |       --pretrained_model bert-base-uncased \
 28 |       --data_dir ./test_data/conll-2003 \
 29 |       --new_model_dir ./new_bert_ner_model \
 30 |       --overwrite_model_dir \
 31 |       --predict_output_file ./bert_pred.txt \
 32 |       --max_seq_length 256 \
 33 |       --save_model_core \
 34 |       --do_train \
 35 |       --do_predict \
 36 |       --model_selection_scoring strict-f_score-1 \
 37 |       --do_lower_case \
 38 |       --train_batch_size 8 \
 39 |       --eval_batch_size 8 \
 40 |       --train_steps 500 \
 41 |       --learning_rate 1e-5 \
 42 |       --num_train_epochs 1 \
 43 |       --gradient_accumulation_steps 1 \
 44 |       --do_warmup \
 45 |       --seed 13 \
 46 |       --warmup_ratio 0.1 \
 47 |       --max_num_checkpoints 3 \
 48 |       --log_file ./log.txt \
 49 |       --progress_bar \
 50 |       --early_stop 3
 51 | ```
 52 | 
 53 | - Test on multiple files and convert bio to brat format
 54 | 
 55 | ```shell script
 56 | ##### note ######
 57 | # In the script below, you are asked to provide a preprocessed_text_dir which contains all the preprocessed file.
 58 | # 
 59 | # If you only use the BIO format for output (you have to remove --data_has_offset_information flag
 60 | # and set --do_format flag to 0), and the data format will be the format exactly as the conll-2003 dataset.
 61 | # 
 62 | # If you need BRAT or BioC format as output (as the example script), then you have to add offset information 
 63 | # to the BIO data to indicate where each word is located in the raw text. 
 64 | # We suggest you to follow the format below:
 65 | # 
 66 | # The original sentences: "Name: John Doe\nAge: 18"
 67 | # The two sentences after preprocesing "Name : John Doe\nAge : 18"
 68 | # 
 69 | # then, you can convert the data into BIO format similar as the Conll-2003 as
 70 | # """
 71 | # Name 0 4 0 4 O
 72 | # : 4 5 5 6 O
 73 | # John 6 10 7 11 B-name
 74 | # Doe 11 14 12 15 I-name
 75 | # 
 76 | # Age 15 18 16 19 O
 77 | # : 18 19 19 20 O
 78 | # 18 20 22 22 24 B-age
 79 | # 
 80 | # For test purposes, you do not need to assign a real BIO label for each word, 
 81 | # you can just simple assign "O" to all of them. 
 82 | # It will not influence the prediction results since the predictions will be converted to brat/BioC, 
 83 | # and you need to use those for evaluation.
 84 | # """
 85 | # 
 86 | # The first two numbers are the offsets of a word in the original text and the following 
 87 | # two numbers are the offsets of a word in the preprocessed text. 
 88 | # If you do not need to perform any preprocessing, then you have to set the second set of offsets as the first one.
 89 | #################
 90 | 
 91 | export CUDA_VISIBLE_DEVICES=0
 92 | 
 93 | # config and tokenizer information can be found in the pretrained model dir
 94 | # use format 1 for BRAT, 2 for BioC, 0 as default for BIO
 95 | python ./src/run_transformer_batch_prediction.py \
 96 |       --model_type bert \
 97 |       --pretrained_model <your pretrained model path> \
 98 |       --raw_text_dir <path to the original text files> \
 99 |       --preprocessed_text_dir <path to the bio formatted files> \
100 |       --output_dir <path to save predicted results> \
101 |       --max_seq_length 128 \
102 |       --do_lower_case \
103 |       --eval_batch_size 8 \
104 |       --log_file ./log.txt\
105 |       --do_format 1 \
106 |       --do_copy \
107 |       --data_has_offset_information
108 | 
109 | ####
110 | # note: If you use do_format, then we have two outputs: 
111 | # 1) all bio outputs in output_dir; 
112 | # 2) 2) we create a formatted output dir (this dir's name is output_dir's name with a suffix of '_formatted_output') for the formatted # outputs (brat format if you set do_format=1). If you set --do_copy, we will copy the .txt files to the formatted output dir, otherwise we only put .ann files in the formatted output dir.
113 | ####
114 | ```
115 | 
116 | ## Wiki for all parameters
117 | [wiki](https://github.com/uf-hobi-informatics-lab/ClinicalTransformerNER/wiki/Parameters)
118 | 
119 | ## Organization
120 | - Department of Health Outcomes and Biomedical Informatics, College of Medicine, University of Florida
121 | 
122 | ## Authors
123 | - Xi Yang (alexgre@ufl.edu)
124 | - Jiang Bian (bianjiang@ufl.edu)
125 | - Yonghui Wu (yonghui.wu@ufl.edu)
126 | 
127 | ## Contact
128 | - If you have any questions, please raise an issue in the GitHub
129 | 
130 | ## Reference
131 | please cite our paper:
132 | > Xi Yang, Jiang Bian, William R Hogan, Yonghui Wu, Clinical concept extraction using transformers, Journal of the American Medical Informatics Association, ocaa189, https://doi.org/10.1093/jamia/ocaa189
133 | 
134 | ```
135 | @article{10.1093/jamia/ocaa189,
136 |     author = {Yang, Xi and Bian, Jiang and Hogan, William R and Wu, Yonghui},
137 |     title = "{Clinical concept extraction using transformers}",
138 |     journal = {Journal of the American Medical Informatics Association},
139 |     year = {2020},
140 |     month = {10},
141 |     abstract = "{The goal of this study is to explore transformer-based models (eg, Bidirectional Encoder Representations from Transformers [BERT]) for clinical concept extraction and develop an open-source package with pretrained clinical models to facilitate concept extraction and other downstream natural language processing (NLP) tasks in the medical domain.We systematically explored 4 widely used transformer-based architectures, including BERT, RoBERTa, ALBERT, and ELECTRA, for extracting various types of clinical concepts using 3 public datasets from the 2010 and 2012 i2b2 challenges and the 2018 n2c2 challenge. We examined general transformer models pretrained using general English corpora as well as clinical transformer models pretrained using a clinical corpus and compared them with a long short-term memory conditional random fields (LSTM-CRFs) mode as a baseline. Furthermore, we integrated the 4 clinical transformer-based models into an open-source package.The RoBERTa-MIMIC model achieved state-of-the-art performance on 3 public clinical concept extraction datasets with F1-scores of 0.8994, 0.8053, and 0.8907, respectively. Compared to the baseline LSTM-CRFs model, RoBERTa-MIMIC remarkably improved the F1-score by approximately 4\\% and 6\\% on the 2010 and 2012 i2b2 datasets. This study demonstrated the efficiency of transformer-based models for clinical concept extraction. Our methods and systems can be applied to other clinical tasks. The clinical transformer package with 4 pretrained clinical models is publicly available at https://github.com/uf-hobi-informatics-lab/ClinicalTransformerNER. We believe this package will improve current practice on clinical concept extraction and other tasks in the medical domain.}",
142 |     issn = {1527-974X},
143 |     doi = {10.1093/jamia/ocaa189},
144 |     url = {https://doi.org/10.1093/jamia/ocaa189},
145 |     note = {ocaa189},
146 |     eprint = {https://academic.oup.com/jamia/advance-article-pdf/doi/10.1093/jamia/ocaa189/34055422/ocaa189.pdf},
147 | }
148 | ```
149 | 
150 | ## MIMIC-III pre-trained models
151 | - https://transformer-models.s3.amazonaws.com/mimiciii_albert_10e_128b.zip
152 | - https://transformer-models.s3.amazonaws.com/mimiciii_bert_10e_128b.zip
153 | - https://transformer-models.s3.amazonaws.com/mimiciii_electra_5e_128b.zip
154 | - https://transformer-models.s3.amazonaws.com/mimiciii_roberta_10e_128b.zip
155 | - https://transformer-models.s3.amazonaws.com/mimiciii_xlnet_5e_128b.zip
156 | - https://transformer-models.s3.amazonaws.com/mimiciii_deberta_10e_128b.tar.gz
157 | - https://transformer-models.s3.amazonaws.com/mimiciii_longformer_5e_128b.zip
158 | > note: all model pretraining tasks were done with the scripts at https://github.com/huggingface/transformers/tree/master/examples/language-modeling with a few customization.
159 | 


--------------------------------------------------------------------------------
/NLPreprocessing/annotation2BIO.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This script aims to convert BRAT format data into BIO format data for NER
  3 | Entities will be mapped from their original offsets to the new offsets after sentence tokenization
  4 | Two sentences are separated by a empty line
  5 | entities and relations information are also provided in json format
  6 | """
  7 | 
  8 | import os
  9 | import sys
 10 | import logging
 11 | from text_process.sentence_tokenization import SentenceBoundaryDetection
 12 | logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.DEBUG)
 13 | logger = logging.getLogger(__file__)
 14 | # logger.disabled = True
 15 | MIMICIII_PATTERN = "\[\*\*|\*\*\]"
 16 | 
 17 | 
 18 | def __ann_info(ann):
 19 |     en_info = ann.split(" ")
 20 |     return en_info[0], int(en_info[1]), int(en_info[-1])
 21 | 
 22 | 
 23 | def __rel_info(rel_id, rel, rep):
 24 |     info = rel.split(" ")
 25 |     assert len(info) == 3, f"{rel_id}\t{rel} is not a valid relation"
 26 | 
 27 |     arg1 = info[1].split(":")[1]
 28 |     arg2 = info[2].split(":")[1]
 29 |     rel_type = info[0]
 30 | 
 31 |     if rep:
 32 |         rel_type = rel_type.replace("-", "_") # format rel_type replace - with _
 33 | 
 34 |     return rel_type, arg1, arg2
 35 | 
 36 | 
 37 | def read_annotation_brat(ann_file, rep=False):
 38 |     """
 39 |     load annotation data
 40 |     entity_id2index_map -> {'T1': 0}
 41 |     entites -> ('T1', 'anticoagulant medications', 'Drug', (1000, 1025))
 42 |     relations -> ('Route-Drug', 'T3', 'T2')
 43 |     """
 44 |     # map the entity id (e.g., T1) to its index in entities list
 45 |     entity_id2index_map = dict()
 46 |     entites = []
 47 |     relations = []
 48 |     with open(ann_file, "r") as f:
 49 |         for line in f:
 50 |             line = line.strip()
 51 |             if not line:
 52 |                 continue
 53 |             anns = line.split("\t")
 54 |             ann_id = anns[0]
 55 |             if ann_id.startswith("T"):
 56 |                 t_type = anns[-1]
 57 |                 # for each in __ann_info(anns[1]):
 58 |                 #     entites.append((t_type, each[0], each[1]))
 59 |                 entity_words, offset_s, offset_e = __ann_info(anns[1])
 60 |                 entites.append((t_type,  entity_words, (offset_s, offset_e)))
 61 |                 entity_id2index_map[ann_id] = len(entites) - 1
 62 |             elif ann_id.startswith("R"):
 63 |                 relations.append(__rel_info(ann_id, anns[1], rep))
 64 | 
 65 |     # sort entities list
 66 |     # entites = sorted(entites, key=lambda x: x[2][1])
 67 | 
 68 |     return entity_id2index_map, entites, relations
 69 | 
 70 | 
 71 | def pre_processing(abs_file_path, deid_pattern=None, word_level=True, replace_number=False):
 72 |     sent_tokenizer = SentenceBoundaryDetection()
 73 | 
 74 |     if replace_number and not word_level:
 75 |         logger.info("sentence level tokenization")
 76 |         return sent_tokenizer.sent_tokenizer(replace_number)
 77 | 
 78 |     if deid_pattern:
 79 |         sent_tokenizer.set_deid_pattern(deid_pattern)
 80 | 
 81 |     sent_tokenizer.set_input_file(abs_file_path)
 82 | 
 83 |     logger.info(f"word level tokenization with replace_number set to {replace_number}")
 84 | 
 85 |     return sent_tokenizer.sent_word_tokenization_and_mapping(replace_number)
 86 | 
 87 | 
 88 | def __remove_overlap_entity(sorted_entities):
 89 |     valid_en = []
 90 |     for idx, en in enumerate(sorted_entities):
 91 |         if idx == 0:
 92 |             valid_en.append(en)
 93 |             continue
 94 |         pre_en = sorted_entities[idx-1]
 95 |         c_s = en[2][0]
 96 |         c_e = en[2][1]
 97 |         p_s = pre_en[2][0]
 98 |         p_e = pre_en[2][1]
 99 |         if c_s > p_e:
100 |             valid_en.append(en)
101 |     return valid_en
102 | 
103 | 
104 | def generate_BIO(sents, entities, file_id="", no_overlap=False, record_pos=False, tag_types=None,
105 |                  exclude_tag_types=None):
106 |     """
107 |     assign annotation information to each token
108 |     if two token have overlapped offsets, the second one will be discarded
109 |     if define tag_types (iterable type), only the types in the tag_types list will be labeled to the corpus
110 |     if define exclude_tag_types (iterable type), the tags will not be annotated
111 |     """
112 |     nsents = []
113 |     if file_id:
114 |         logger.info(f"process {file_id} file")
115 | 
116 |     entities = sorted(entities, key=lambda x: x[2][0])
117 |      
118 |     if tag_types:
119 |         entities = list(filter(lambda x: x[1] in tag_types, entities))
120 | 
121 |     if exclude_tag_types:
122 |         entities = list(filter(lambda x: x[1] not in exclude_tag_types, entities))
123 | 
124 |     if no_overlap:
125 |         entities = __remove_overlap_entity(entities)
126 | 
127 |     entities_iter = iter(entities)
128 |     entity = next(entities_iter, None)
129 |     for i, sent in enumerate(sents):
130 |         nsent = []
131 |         for j, token in enumerate(sent):
132 |             if record_pos:
133 |                 token.append((i, j))
134 |             if not entity:
135 |                 token.append('O')
136 |             else:
137 |                 # token: ('Admission', (0, 9), (0, 9))
138 |                 offset_start = token[1][0]
139 |                 offset_end = token[1][1]
140 |                 en_s = entity[2][0]
141 |                 en_e = entity[2][1]
142 |                 en_type = entity[1]
143 |                 if offset_start < en_s and offset_end < en_e:
144 |                     token.append('O')
145 |                 elif offset_start == en_s:
146 |                     token.append("-".join(['B', en_type]))
147 |                     if offset_end >= en_e:
148 |                         entity = next(entities_iter, None)
149 |                 elif offset_start > en_s and offset_end < en_e:
150 |                     token.append("-".join(['I', en_type]))
151 |                 elif offset_start > en_s and offset_end == en_e:
152 |                     token.append("-".join(['I', en_type]))
153 |                     entity = next(entities_iter, None)
154 |                 else:
155 |                     # check entity position and token position
156 |                     logger.warning(f"{entity} offset is overlapped with previous entity; current tok not overlap")
157 |                     entity = next(entities_iter, None)
158 |                     if not entity:
159 |                         token.append('O')
160 |                         continue
161 |                     if offset_start > en_e:
162 |                         # logger.warning(f"{entity} offset is overlapped with previous entity; current tok not overlap")
163 |                         # entity = next(entities_iter, None)
164 |                         en_s = entity[2][0]
165 |                         en_e = entity[2][1]
166 |                         en_type = entity[1]
167 |                         if offset_end <= en_s:
168 |                             token.append('O')
169 |                         else:
170 |                             if offset_start == en_s:
171 |                                 token.append("-".join(['B', en_type]))
172 |                                 if offset_end >= en_e:
173 |                                     entity = next(entities_iter, None)
174 |                             else:
175 |                                 logger.error(f"{token}\t{entity} not matched by their offsets.")
176 |                                 token.append('O')
177 |                                 entity = next(entities_iter, None)
178 |                     else:
179 |                         # logger.warning(f"{entity} offset is overlapped with previous entity; current tok not overlap")
180 |                         # entity = next(entities_iter, None)
181 |                         en_s = entity[2][0]
182 |                         en_e = entity[2][1]
183 |                         en_type = entity[1]
184 |                         if offset_start == en_s:
185 |                             token.append("-".join(['B', en_type]))
186 |                             if offset_end >= en_e:
187 |                                 entity = next(entities_iter, None)
188 |                         elif offset_end < en_s:
189 |                             token.append('O')
190 |                         else:
191 |                             logger.error(f"{token}\t{entity} not matched by their offsets.")
192 |                             # token.append("-".join(['B', en_type]))
193 |                             token.append('O')
194 |                             entity = next(entities_iter, None)
195 |             nsent.append(token)
196 |         nsents.append(nsent)
197 | 
198 |     sent_bound_range = dict()  # key: sent id; value: boundary range
199 |     for i, each in enumerate(nsents):
200 |         try:
201 |             sent_start_index = each[0][1][0]
202 |             sent_end_index = each[-1][1][1]
203 |             sent_bound_range[i] = (sent_start_index, sent_end_index)
204 |         except Exception as ex:
205 |             if i != len(nsents) - 1:
206 |                 raise RuntimeError(f'The {i}th sentence is an empty sentence')
207 | 
208 |     # if record_pos:
209 |     #     nsents = [w for e in nsents for w in e]
210 | 
211 |     return nsents, sent_bound_range
212 | 
213 | 
214 | def __flat(data, to_str=False):
215 |     flatted = []
216 | 
217 |     for each in data:
218 |         if isinstance(each, list) or isinstance(each, tuple):
219 |             for e in each:
220 |                 flatted.append(e)
221 |         else:
222 |             flatted.append(each)
223 | 
224 |     if to_str:
225 |         flatted = list(map(lambda x: str(x), flatted))
226 | 
227 |     return flatted
228 | 
229 | 
230 | def BIOdata_to_file(file_name, sents, sep=" "):
231 |     # the data must be list of list
232 |     assert isinstance(sents, list), "the data object must be list and generated from generate_BIO()."
233 |     with open(file_name, "w") as fw:
234 |         # 'anticoagulant', (1000, 1013), (976, 989), 'B-Drug'
235 |         for sent in sents:
236 |             for word in sent:
237 |                 word = __flat(word, to_str=True)
238 |                 # word.append("\n")
239 |                 fw.write(sep.join(word)+"\n")
240 |             fw.write("\n")
241 | 
242 | 
243 | def load_mapping_file(mapping_file, sep=" "):
244 |     with open(mapping_file, "r") as f:
245 |         txt = f.read().strip()
246 |         sents = txt.split("\n\n")
247 |         nsents = []
248 |         for sent in sents:
249 |             words = sent.split("\n")
250 |             for word in words:
251 |                 info = word.strip().split(sep)
252 |                 ninfo = list(map(lambda x: int(x) if x.isdigit() else x, info))
253 |                 nsents.append(ninfo)
254 | 
255 |         mapping_dict = {(each[-2], each[-1]): each for each in nsents}
256 | 
257 |     return nsents, mapping_dict
258 | 
259 | 
260 | def __find_B_tag(word_seq, c_index):
261 |     for k in range(c_index, -1, -1):
262 |         c_tag = word_seq[k][-1].split("-")[0]
263 |         if c_tag == 'B':
264 |             return k
265 |         elif c_tag == 'O':
266 |             raise RuntimeError(f'check {word_seq[k]} since the label should be either I or B not O')
267 |     raise RuntimeError("No B-tag has been labeled in the data.")
268 | 
269 | 
270 | def window_sliding_sample_creation(bio_data, window_size):
271 |     pass
272 | 
273 | 
274 | def test():
275 |     pass
276 | 
277 | if __name__ == '__main__':
278 |     test()


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/models.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch import nn
  3 | from torch.nn import CrossEntropyLoss
  4 | from utils import TransformerLogger
  5 | from transformers.modeling_utils import SequenceSummary
  6 | from transformers import (BertForSequenceClassification, BertModel,
  7 |                           XLNetForSequenceClassification, XLNetModel,
  8 |                           RobertaForSequenceClassification, RobertaModel,
  9 |                           AlbertForSequenceClassification, AlbertModel,
 10 |                           LongformerForSequenceClassification, LongformerModel,
 11 |                           DebertaForSequenceClassification, DebertaModel,
 12 |                           PreTrainedModel)
 13 | from model_utils import StableDropout
 14 | 
 15 | 
 16 | logger = TransformerLogger(logger_level='i').get_logger()
 17 | 
 18 | 
 19 | class BaseModel(PreTrainedModel):
 20 | 
 21 |     def __init__(self, config):
 22 |         super().__init__(config)
 23 | 
 24 |         self.spec_tag1, self.spec_tag2, self.spec_tag3, self.spec_tag4 = config.tags
 25 |         self.scheme = config.scheme
 26 |         self.num_labels = config.num_labels
 27 |         self.loss_fct = CrossEntropyLoss()
 28 | 
 29 |         self.drop_out = StableDropout(config.hidden_dropout_prob)
 30 | 
 31 |         if self.scheme == 1:
 32 |             self.classifier_dim = config.hidden_size * 3
 33 |         elif self.scheme == 2:
 34 |             self.classifier_dim = config.hidden_size * 5
 35 |         elif self.scheme == 3:
 36 |             self.classifier_dim = config.hidden_size * 2
 37 |         else:
 38 |             self.classifier_dim = config.hidden_size
 39 | 
 40 |         self.base_classifier = nn.Linear(self.classifier_dim, self.num_labels)
 41 | 
 42 |     @staticmethod
 43 |     def special_tag_representation(seq_output, input_ids, special_tag):
 44 |         spec_idx = (input_ids == special_tag).nonzero(as_tuple=False)
 45 | 
 46 |         temp = []
 47 |         for idx in spec_idx:
 48 |             temp.append(seq_output[idx[0], idx[1], :])
 49 |         tags_rep = torch.stack(temp, dim=0)
 50 | 
 51 |         return tags_rep
 52 | 
 53 |     def output2logits(self, pooled_output, seq_output, input_ids):
 54 |         if self.scheme == 1:
 55 |             seq_tags = []
 56 |             for each_tag in [self.spec_tag1, self.spec_tag3]:
 57 |                 seq_tags.append(self.special_tag_representation(seq_output, input_ids, each_tag))
 58 |             new_pooled_output = torch.cat((pooled_output, *seq_tags), dim=1)
 59 |         elif self.scheme == 2:
 60 |             seq_tags = []
 61 |             for each_tag in [self.spec_tag1, self.spec_tag2, self.spec_tag3, self.spec_tag4]:
 62 |                 seq_tags.append(self.special_tag_representation(seq_output, input_ids, each_tag))
 63 |             new_pooled_output = torch.cat((pooled_output, *seq_tags), dim=1)
 64 |         elif self.scheme == 3:
 65 |             seq_tags = []
 66 |             for each_tag in [self.spec_tag1, self.spec_tag3]:
 67 |                 seq_tags.append(self.special_tag_representation(seq_output, input_ids, each_tag))
 68 |             new_pooled_output = torch.cat(seq_tags, dim=1)
 69 |         else:
 70 |             new_pooled_output = pooled_output
 71 | 
 72 |         logits = self.base_classifier(self.drop_out(new_pooled_output))
 73 | 
 74 |         return logits
 75 | 
 76 |     def calc_loss(self, logits, outputs, labels):
 77 |         new_outputs = (logits,) + outputs[2:]
 78 |         loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
 79 |         new_outputs = (loss,) + new_outputs
 80 | 
 81 |         return new_outputs
 82 | 
 83 | 
 84 | class BertForRelationIdentification(BertForSequenceClassification, BaseModel):
 85 |     def __init__(self, config):
 86 |         super().__init__(config)
 87 |         self.bert = BertModel(config)
 88 |         self.init_weights()
 89 | 
 90 |     def forward(self,
 91 |                 input_ids=None,
 92 |                 attention_mask=None,
 93 |                 token_type_ids=None,
 94 |                 position_ids=None,
 95 |                 head_mask=None,
 96 |                 inputs_embeds=None,
 97 |                 labels=None,
 98 |                 output_attentions=None,
 99 |                 **kwargs):
100 | 
101 |         outputs = self.bert(
102 |             input_ids,
103 |             attention_mask=attention_mask,
104 |             token_type_ids=token_type_ids,
105 |             position_ids=position_ids,
106 |             head_mask=head_mask
107 |         )
108 | 
109 |         pooled_output = outputs[1]
110 |         seq_output = outputs[0]
111 |         logits = self.output2logits(pooled_output, seq_output, input_ids)
112 | 
113 |         return self.calc_loss(logits, outputs, labels)
114 | 
115 | 
116 | class RoBERTaForRelationIdentification(RobertaForSequenceClassification, BaseModel):
117 |     def __init__(self, config):
118 |         super().__init__(config)
119 |         self.roberta = RobertaModel(config)
120 |         self.init_weights()
121 | 
122 |     def forward(self,
123 |                 input_ids=None,
124 |                 attention_mask=None,
125 |                 token_type_ids=None,
126 |                 position_ids=None,
127 |                 head_mask=None,
128 |                 inputs_embeds=None,
129 |                 labels=None,
130 |                 output_attentions=None,
131 |                 output_hidden_states=None,
132 |                 **kwargs):
133 | 
134 |         outputs = self.roberta(
135 |             input_ids,
136 |             attention_mask=attention_mask,
137 |             token_type_ids=token_type_ids,
138 |             position_ids=position_ids,
139 |             head_mask=head_mask,
140 |             output_attentions=output_attentions,
141 |             output_hidden_states=output_hidden_states
142 |         )
143 | 
144 |         pooled_output = outputs[1]
145 |         seq_output = outputs[0]
146 |         logits = self.output2logits(pooled_output, seq_output, input_ids)
147 | 
148 |         return self.calc_loss(logits, outputs, labels)
149 | 
150 | 
151 | class AlbertForRelationIdentification(AlbertForSequenceClassification, BaseModel):
152 |     def __init__(self, config):
153 |         super().__init__(config)
154 |         self.albert = AlbertModel(config)
155 |         self.init_weights()
156 | 
157 |     def forward(self,
158 |                 input_ids=None,
159 |                 attention_mask=None,
160 |                 token_type_ids=None,
161 |                 position_ids=None,
162 |                 head_mask=None,
163 |                 inputs_embeds=None,
164 |                 labels=None,
165 |                 output_attentions=None,
166 |                 output_hidden_states=None,
167 |                 **kwargs):
168 | 
169 |         outputs = self.albert(
170 |             input_ids=input_ids,
171 |             attention_mask=attention_mask,
172 |             token_type_ids=token_type_ids,
173 |             position_ids=position_ids,
174 |             head_mask=head_mask,
175 |             inputs_embeds=inputs_embeds,
176 |             output_attentions=output_attentions,
177 |             output_hidden_states=output_hidden_states
178 |         )
179 | 
180 |         pooled_output = outputs[1]
181 |         seq_output = outputs[0]
182 |         logits = self.output2logits(pooled_output, seq_output, input_ids)
183 | 
184 |         return self.calc_loss(logits, outputs, labels)
185 | 
186 | 
187 | class XLNetForRelationIdentification(XLNetForSequenceClassification, BaseModel):
188 |     def __init__(self, config):
189 |         super().__init__(config)
190 |         self.transformer = XLNetModel(config)
191 |         self.sequence_summary = SequenceSummary(config)
192 |         self.init_weights()
193 | 
194 |     def forward(self,
195 |                 input_ids=None,
196 |                 attention_mask=None,
197 |                 mems=None,
198 |                 perm_mask=None,
199 |                 target_mapping=None,
200 |                 token_type_ids=None,
201 |                 input_mask=None,
202 |                 head_mask=None,
203 |                 inputs_embeds=None,
204 |                 use_cache=True,
205 |                 labels=None,
206 |                 output_attentions=None,
207 |                 output_hidden_states=None,
208 |                 **kwargs):
209 | 
210 |         outputs = self.transformer(
211 |                 input_ids,
212 |                 attention_mask=attention_mask,
213 |                 mems=mems,
214 |                 perm_mask=perm_mask,
215 |                 target_mapping=target_mapping,
216 |                 token_type_ids=token_type_ids,
217 |                 input_mask=input_mask,
218 |                 head_mask=head_mask,
219 |                 inputs_embeds=inputs_embeds,
220 |                 use_cache=use_cache,
221 |                 output_attentions=output_attentions,
222 |                 output_hidden_states=output_hidden_states,
223 |                 **kwargs)
224 | 
225 |         seq_output = outputs[0]
226 |         pooled_output = self.sequence_summary(seq_output)
227 |         logits = self.output2logits(pooled_output, seq_output, input_ids)
228 | 
229 |         return self.calc_loss(logits, outputs, labels)
230 | 
231 | 
232 | class LongFormerForRelationIdentification(LongformerForSequenceClassification, BaseModel):
233 |     def __init__(self, config):
234 |         super().__init__(config)
235 |         self.longformer = LongformerModel(config)
236 |         self.init_weights()
237 | 
238 |     def forward(self,
239 |                 input_ids=None,
240 |                 attention_mask=None,
241 |                 global_attention_mask=None,
242 |                 token_type_ids=None,
243 |                 position_ids=None,
244 |                 inputs_embeds=None,
245 |                 labels=None,
246 |                 output_attentions=None,
247 |                 output_hidden_states=None,
248 |                 **kwargs):
249 | 
250 |         outputs = self.longformer(
251 |             input_ids,
252 |             attention_mask=attention_mask,
253 |             global_attention_mask=global_attention_mask,
254 |             token_type_ids=token_type_ids,
255 |             position_ids=position_ids,
256 |             inputs_embeds=inputs_embeds,
257 |             output_attentions=output_attentions,
258 |             output_hidden_states=output_hidden_states
259 |         )
260 | 
261 |         pooled_output = outputs[1]
262 |         seq_output = outputs[0]
263 |         logits = self.output2logits(pooled_output, seq_output, input_ids)
264 | 
265 |         return self.calc_loss(logits, outputs, labels)
266 | 
267 | 
268 | class DebertaForRelationIdentification(DebertaForSequenceClassification, BaseModel):
269 |     def __init__(self, config):
270 |         from model_utils import ContextPooler
271 |         super().__init__(config)
272 |         self.deberta = DebertaModel(config)
273 |         self.pooler = ContextPooler(config)
274 | 
275 |         self.init_weights()
276 | 
277 |     def forward(
278 |             self,
279 |             input_ids=None,
280 |             attention_mask=None,
281 |             token_type_ids=None,
282 |             position_ids=None,
283 |             inputs_embeds=None,
284 |             labels=None,
285 |             output_attentions=None,
286 |             output_hidden_states=None,
287 |             return_dict=None,
288 |     ):
289 |         outputs = self.deberta(
290 |             input_ids,
291 |             token_type_ids=token_type_ids,
292 |             attention_mask=attention_mask,
293 |             position_ids=position_ids,
294 |             inputs_embeds=inputs_embeds,
295 |             output_attentions=output_attentions,
296 |             output_hidden_states=output_hidden_states,
297 |             return_dict=return_dict,
298 |         )
299 | 
300 |         seq_output = outputs[0]
301 |         pooled_output = self.pooler(seq_output)
302 |         logits = self.output2logits(pooled_output, seq_output, input_ids)
303 | 
304 |         return self.calc_loss(logits, outputs, labels)
305 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/eval_scripts/old_bio_eval.py:
--------------------------------------------------------------------------------
  1 | ###
  2 |  # <p>Title:  </p>
  3 |  # <p>Create Date: 21:23:36 01/28/18</p>
  4 |  # <p>Copyright: College of Medicine </p>
  5 |  # <p>Organization: University of Florida</p>
  6 |  # @author Yonghui Wu
  7 |  # @version 1.0
  8 |  # <p>Description: </p>
  9 |  ##
 10 | # from create_log import create_logger
 11 | 
 12 | from __future__ import print_function
 13 | 
 14 | 
 15 | def read_from_file(ifn):
 16 |     with open(ifn, "r") as f:
 17 |         text = f.read()
 18 |     return text
 19 | 
 20 | 
 21 | class PRF:
 22 |     def __init__(self):
 23 |         self.true=0
 24 |         self.false=0
 25 |         
 26 | 
 27 | class BioEval:
 28 |     def __init__(self, ifn, log_name=None):
 29 |         self.ifn=ifn
 30 |         self.acc=PRF()
 31 |         self.all_strict=PRF()
 32 |         self.all_relax=PRF()
 33 |         self.cate_strict={}
 34 |         self.cate_relax={}
 35 | 
 36 |         self.gold_all=0
 37 |         self.gold_cate={}
 38 |         # self.entities=[]
 39 |         self.log_name = log_name
 40 | 
 41 |     def eval_fn(self):
 42 |         text=read_from_file(self.ifn).strip().lower()
 43 |         secs=text.split('\n\n')
 44 |         for sec in secs:
 45 |             sec=sec.strip()
 46 |             lines=sec.split('\n')
 47 |             bio=[]
 48 |             for line in lines:
 49 |                 words=line.split(None)
 50 |                 #words.append(words[-1])
 51 |                 bio.append(words)
 52 |             self.handle(bio)
 53 |         self.prf()
 54 | 
 55 |     def feed_bio(self,bio):
 56 |         self.handle(bio)
 57 | 
 58 |     def train_msg(self):
 59 |         stt="Entities: "
 60 |         for k, v in self.gold_cate.items():
 61 |             stt=stt+k+":"+str(v)+"  "
 62 |         if (self.acc.true+self.acc.false) > 0:
 63 |             acc=float(self.acc.true)/(self.acc.true+self.acc.false)
 64 |         else:
 65 |             acc=0.0
 66 |         if (self.all_strict.true+self.all_strict.false) > 0 and self.gold_all>0:
 67 |             pre = float(self.all_strict.true)/(self.all_strict.true+self.all_strict.false)
 68 |             rec = float(self.all_strict.true)/self.gold_all
 69 |             if pre+rec>0.0:
 70 |                 f1=2*pre*rec/(pre+rec)
 71 |             else:
 72 |                 f1=0.0
 73 |         else:
 74 |             pre=0.0
 75 |             rec=0.0
 76 |             f1=0.0
 77 | 
 78 |         #all_relex
 79 |         if (self.all_relax.true+self.all_relax.false) > 0 and self.gold_all>0:
 80 |             rpre = float(self.all_relax.true)/(self.all_relax.true+self.all_relax.false)
 81 |             rrec = float(self.all_relax.true)/self.gold_all
 82 |             if (rpre+rrec) > 0.0:
 83 |                 rf1=2*rpre*rrec/(rpre+rrec)
 84 |             else:
 85 |                 rf1=0.0
 86 |         else:
 87 |             rpre=0.0
 88 |             rrec=0.0
 89 |             rf1=0.0
 90 | 
 91 |         return([stt,f1,pre,rec,rf1,rpre,rrec,acc])
 92 | 
 93 |     def prf(self):
 94 |         # print "Total %s entities " % self.gold_all
 95 |         log_info = "Total %s entities " % self.gold_all + "\n"
 96 |         for k,v in self.gold_cate.items():
 97 |             # print "    %s : %s" % (k,v)
 98 |             log_info += "    %s : %s\n" % (k,v)
 99 | 
100 |         acc=float(self.acc.true)/(self.acc.true+self.acc.false)
101 |         # print "\nAccuracy : %s" % acc
102 |         log_info += "\nAccuracy : %s\n" % acc
103 | 
104 |         pre = float(self.all_strict.true)/(self.all_strict.true+self.all_strict.false)
105 |         rec = float(self.all_strict.true)/self.gold_all
106 |         try:
107 |             f1=2*pre*rec/(pre+rec)
108 |         except ZeroDivisionError:
109 |             f1 = 0.0
110 | 
111 |         # print "\n\nStrict score ----- "
112 |         log_info += "\n\nStrict score ----- \n"
113 |         # print 'precision : %s , recall : %s , f1 : %s' % (pre,rec,f1)
114 |         log_info += 'precision : %s , recall : %s , f1 : %s\n' % (pre,rec,f1)
115 |         # print 'find : %s , true : %s , false : %s' % (self.all_strict.true+self.all_strict.false,self.all_strict.true,self.all_strict.false)
116 |         log_info += 'find : %s , true : %s , false : %s \n' % (self.all_strict.true+self.all_strict.false,
117 |                                                              self.all_strict.true,self.all_strict.false)
118 |         #all_relex
119 |         pre = float(self.all_relax.true)/(self.all_relax.true+self.all_relax.false)
120 |         rec = float(self.all_relax.true)/self.gold_all
121 |         try:
122 |             f1=2*pre*rec/(pre+rec)
123 |         except ZeroDivisionError:
124 |             f1 = 0.0
125 | 
126 |         # print "\nRelax score -----"
127 |         log_info += "\nRelax score -----\n"
128 |         # print 'precision : %s , recall : %s , f1 : %s' % (pre,rec,f1)
129 |         log_info += 'precision : %s , recall : %s , f1 : %s\n' % (pre,rec,f1)
130 |         # print 'find : %s , true : %s , false : %s' % (self.all_relax.true+self.all_relax.false,self.all_relax.true,self.all_relax.false)
131 |         log_info += 'find : %s , true : %s , false : %s \n' % (self.all_relax.true+self.all_relax.false,
132 |                                                              self.all_relax.true,self.all_relax.false)
133 |         ##category score
134 |         # print "\nstrict score by cate -----"
135 |         log_info += "\nstrict score by cate -----\n"
136 |         for k,v in self.cate_strict.items():
137 |             pre = float(v.true)/(v.true+v.false)
138 |             if k not in self.gold_cate:
139 |                 rec=0.0
140 |                 f1=0.0
141 |             else:
142 |                 rec = float(v.true)/self.gold_cate[k]
143 |                 try:
144 |                     f1 = 2 * pre * rec / (pre + rec)
145 |                 except ZeroDivisionError:
146 |                     f1 = 0.0
147 | 
148 |             # print "Cate : %s, precision : %s , recall : %s , f1 : %s" % (k,pre,rec,f1)
149 |             log_info += "Cate : %s, precision : %s , recall : %s , f1 : %s\n" % (k,pre,rec,f1)
150 |             # print 'find : %s , true : %s , false : %s' % (v.true+v.false,v.true,v.false)
151 |             log_info += 'find : %s , true : %s , false : %s\n' % (v.true+v.false,v.true,v.false)
152 | 
153 |         # print "\nrelax score by cate -----"
154 |         log_info += "\nrelax score by cate -----\n"
155 |         for k,v in self.cate_relax.items():
156 |             pre = float(v.true)/(v.true+v.false)
157 |             if k not in self.gold_cate:
158 |                 rec = 0.0
159 |                 f1 = 0.0
160 |             else:
161 |                 rec = float(v.true)/self.gold_cate[k]
162 |                 try:
163 |                     f1 = 2 * pre * rec / (pre + rec)
164 |                 except ZeroDivisionError:
165 |                     f1 = 0.0
166 | 
167 |             # print "Cate : %s, precision : %s , recall : %s , f1 : %s" % (k,pre,rec,f1)
168 |             log_info += "Cate : %s, precision : %s , recall : %s , f1 : %s\n" % (k,pre,rec,f1)
169 |             # print 'find : %s , true : %s , false : %s' % (v.true+v.false,v.true,v.false)
170 |             log_info += 'find : %s , true : %s , false : %s\n' % (v.true+v.false,v.true,v.false)
171 | 
172 |         print(log_info)
173 |         # if self.log_name:
174 |         #     logger = create_logger(self.log_name, "--evaluation--")
175 |         #     logger.info(log_info)
176 | 
177 |     def same(self,bio,starti,endi):
178 |         '''
179 |         whether the ner (starti : endi) is exactly match
180 |         '''
181 |         flag=True
182 |         pcate=bio[starti][-1][2:]
183 |         if bio[starti][-2].startswith("i-"):
184 |             cate=bio[starti][-2][2:]
185 |             if cate != pcate:
186 |                 flag=False
187 |             else:
188 |                 #check starti-1
189 |                 if starti -1 >= 0 and bio[starti-1][-2] == "i-"+cate or bio[starti-1][-2] == "b-"+cate:
190 |                     flag=False
191 |             if flag:
192 |                 for i in range(starti+1,endi):
193 |                     if bio[i][-2] != "i-"+cate:
194 |                         flag=False
195 |             if flag:# check endi
196 |                 if endi < len(bio) and bio[endi][-2] == "i-"+cate:
197 |                     flag=False
198 |         elif bio[starti][-2].startswith("b-"):
199 |             cate=bio[starti][-2][2:]
200 |             if cate != pcate:
201 |                 flag=False
202 |             # do not need check starti -1
203 |             if flag:
204 |                 for i in range(starti+1,endi):
205 |                     if bio[i][-2] != "i-"+cate:
206 |                         flag=False
207 |             if flag:# check endi
208 |                 if endi < len(bio) and bio[endi][-2] == "i-"+cate:
209 |                     flag=False
210 |         else:
211 |             flag=False
212 |             
213 |         return flag
214 | 
215 |     def overlap(self,bio,starti,endi):
216 |         flag=False
217 |         for i in range(starti,endi):
218 |             if len(bio[i][-2])> 2 and bio[i][-1][2:] == bio[i][-2][2:]:
219 |                 flag=True
220 |                 break
221 |         return flag
222 | 
223 |     def add_tp_strict(self,cate):
224 |         self.all_strict.true=self.all_strict.true+1
225 |         self.all_relax.true=self.all_relax.true+1
226 |         if cate not in self.cate_strict:
227 |             self.cate_strict[cate]=PRF()
228 |         self.cate_strict[cate].true=self.cate_strict[cate].true+1
229 |         if cate not in self.cate_relax:
230 |             self.cate_relax[cate]=PRF()
231 |         self.cate_relax[cate].true=self.cate_relax[cate].true+1
232 | 
233 |     def add_tp_overlap(self,cate):
234 |         self.all_relax.true=self.all_relax.true+1
235 |         if cate not in self.cate_relax:
236 |             self.cate_relax[cate]=PRF()
237 |         self.cate_relax[cate].true=self.cate_relax[cate].true+1
238 |         # treat as false by strict
239 |         self.all_strict.false=self.all_strict.false+1
240 |         if cate not in self.cate_strict:
241 |             self.cate_strict[cate]=PRF()
242 |         self.cate_strict[cate].false=self.cate_strict[cate].false+1
243 | 
244 |     def add_nolap(self,cate):
245 |         self.all_strict.false=self.all_strict.false+1
246 |         self.all_relax.false=self.all_relax.false+1
247 | 
248 |         if cate not in self.cate_strict:
249 |             self.cate_strict[cate]=PRF()
250 |         self.cate_strict[cate].false=self.cate_strict[cate].false+1
251 | 
252 |         if cate not in self.cate_relax:
253 |             self.cate_relax[cate]=PRF()
254 |         self.cate_relax[cate].false=self.cate_relax[cate].false+1
255 | 
256 |     def handle(self,bio):
257 |         llen=len(bio)
258 | 
259 |         #accumulate accuracy data
260 |         for i in range(llen):
261 |             if bio[i][-1].strip() == bio[i][-2].strip():
262 |                 self.acc.true=self.acc.true+1
263 |             else:
264 |                 self.acc.false=self.acc.false+1
265 |                 
266 |         i=0
267 |         # handle system prediction
268 |         while i < llen:
269 |             if bio[i][-1] == 'o':
270 |                 i=i+1
271 |             else:
272 |                 # find the start and end pos
273 |                 starti=i
274 |                 endi=i+1
275 |                 cate=bio[starti][-1][2:].strip()
276 |                 while endi<llen and bio[endi][-1].startswith('i-'+cate):
277 |                     endi=endi+1
278 |                 #find the categor
279 |                 # exactly match
280 |                 if self.same(bio,starti,endi):
281 |                     self.add_tp_strict(cate)
282 |                 # overlap        
283 |                 elif self.overlap(bio,starti,endi):
284 |                     self.add_tp_overlap(cate)
285 |                 else: # no overlap
286 |                     self.add_nolap(cate)
287 |                 i=endi
288 | 
289 |         #handle the ground truth label
290 |         i=0
291 |         while i < llen:
292 |             if bio[i][-2] == 'o':
293 |                 i=i+1
294 |             else:
295 |                 # find the start and end pos
296 |                 starti=i
297 |                 endi=i+1
298 |                 cate=bio[starti][-2][2:].strip()
299 |                 while endi<llen and bio[endi][-2].startswith('i-'+cate):
300 |                     endi=endi+1
301 |                 self.gold_all=self.gold_all+1
302 |                 if cate not in self.gold_cate:
303 |                     self.gold_cate[cate]=0
304 |                 self.gold_cate[cate]=self.gold_cate[cate]+1
305 | 
306 |                 i=endi
307 | 
308 | 
309 | def load_bio_file_into_sents(bio_file, word_sep=" ", do_lower=False):
310 |     bio_text = read_from_file(bio_file)
311 |     bio_text = bio_text.strip()
312 |     if do_lower:
313 |         bio_text = bio_text.lower()
314 | 
315 |     new_sents = []
316 |     sents = bio_text.split("\n\n")
317 | 
318 |     for sent in sents:
319 |         new_sent = []
320 |         words = sent.split("\n")
321 |         for word in words:
322 |             new_word = word.split(word_sep)
323 |             new_sent.append(new_word)
324 |         new_sents.append(new_sent)
325 | 
326 |     return new_sents
327 | 
328 | 
329 | def output_bio(bio_data, output_file, sep=" "):
330 |     with open(output_file, "w") as f:
331 |         for sent in bio_data:
332 |             for word in sent:
333 |                 line = sep.join(word)
334 |                 f.write(line)
335 |                 f.write("\n")
336 |             f.write("\n")
337 | 
338 | 
339 | def fmerge(f1, f2, ofn):
340 |     ss1 = load_bio_file_into_sents(f1)
341 |     ss2 = load_bio_file_into_sents(f2)
342 | 
343 |     assert len(ss1) == len(ss2), "There are {} sents in GS but {} sents in prediction".format(len(ss1), len(ss2))
344 |     ss = []
345 |     for s1, s2 in zip(ss1, ss2):
346 |         assert len(s1) == len(s2), "There are {} words in GS but {} words in prediction".format(len(s1), len(s2))
347 |         s = []
348 |         for w1, w2 in zip(s1, s2):
349 |             s.append((w1[0], w1[-1], w2[-1]))
350 |         ss.append(s)
351 | 
352 |     output_bio(ss, ofn)
353 | 
354 | 
355 | def main(file1, file2):
356 |     import os
357 |     ofn = "temp.txt"
358 |     fmerge(file1, file2, ofn)
359 |     evaluate = BioEval(ofn)
360 |     evaluate.eval_fn()
361 |     # evaluate.train_msg()
362 |     os.remove(ofn)
363 | 
364 | def test():
365 |     a = [['O', 'O', 'B-misc', 'O', 'O', 'B-misc', 'I-misc', 'I-misc', 'I-misc', 'I-misc', 'O']]
366 |     b = [['O', 'O', 'B-misc', 'O', 'O', 'B-misc', 'I-misc', 'O', 'O', 'B-misc', 'O']]
367 |     a = [[e.lower() for e in each] for each in a]
368 |     b = [[e.lower() for e in each] for each in b]
369 |     c = [[x for x in zip(each_a, each_b)] for each_a, each_b in zip(a, b)]
370 |     e = BioEval(None)
371 |     for each in c:
372 |         e.handle(each)
373 |     e.prf()
374 | 
375 | 
376 | if __name__ == "__main__":
377 | #     import sys
378 | #     import os
379 | #     f1 = sys.argv[1]
380 | #     f2 = sys.argv[2]
381 | #     main(f1, f2)
382 |     test()
383 | 


--------------------------------------------------------------------------------
/scipts/make_relation.py:
--------------------------------------------------------------------------------
  1 | #make relation
  2 | from pathlib import Path
  3 | import pickle as pkl
  4 | from collections import defaultdict, Counter
  5 | from itertools import permutations, combinations
  6 | from functools import reduce
  7 | import numpy as np
  8 | import os
  9 | 
 10 | 
 11 | def pkl_save(data, file):
 12 |     with open(file, "wb") as f:
 13 |         pkl.dump(data, f)
 14 | 
 15 |         
 16 | def pkl_load(file):
 17 |     with open(file, "rb") as f:
 18 |         data = pkl.load(f)
 19 |     return data
 20 | 
 21 | 
 22 | def load_text(ifn):
 23 |     with open(ifn, "r") as f:
 24 |         txt = f.read()
 25 |     return txt
 26 | 
 27 | 
 28 | def save_text(text, ofn):
 29 |     with open(ofn, "w") as f:
 30 |         f.write(text)
 31 | 
 32 | 
 33 | import sys
 34 | # https://github.com/uf-hobi-informatics-lab/NLPreprocessing (git clone this repo to local)
 35 | sys.path.append("../NLPreprocessing/")
 36 | sys.path.append("../NLPreprocessing/text_process")
 37 | from annotation2BIO import pre_processing, read_annotation_brat, generate_BIO
 38 | MIMICIII_PATTERN = "\[\*\*|\*\*\]"
 39 | from sentence_tokenization import logger as l1
 40 | from annotation2BIO import logger as l2
 41 | l1.disabled = True
 42 | l2.disabled = True
 43 | 
 44 | 
 45 | data_dir=sys.argv[1]
 46 | output_name=sys.argv[2]
 47 | 
 48 | 
 49 | def create_entity_to_sent_mapping(nnsents, entities, idx2e):
 50 |     loc_ens = []
 51 |     
 52 |     ll = len(nnsents)
 53 |     mapping = defaultdict(list)
 54 |     for idx, each in enumerate(entities):
 55 |         en_label = idx2e[idx]
 56 |         en_s = each[2][0]
 57 |         en_e = each[2][1]
 58 |         new_en = []
 59 |         
 60 |         i = 0
 61 |         while i < ll and nnsents[i][1][0] < en_s:
 62 |             i += 1
 63 |         s_s = nnsents[i][1][0]
 64 |         s_e = nnsents[i][1][1]
 65 | 
 66 |         if en_s == s_s:
 67 |             mapping[en_label].append(i)
 68 | 
 69 |             while i < ll and s_e < en_e:
 70 |                 i += 1
 71 |                 s_e = nnsents[i][1][1]
 72 |             if s_e == en_e:
 73 |                  mapping[en_label].append(i)
 74 |             else:
 75 |                 mapping[en_label].append(i)
 76 |                 print("last index not match ", each)
 77 |         else:
 78 |             mapping[en_label].append(i)
 79 |             print("first index not match ", each)
 80 | 
 81 |             while i < ll and s_e < en_e:
 82 |                 i += 1
 83 |                 s_e = nnsents[i][1][1]
 84 |             if s_e == en_e:
 85 |                  mapping[en_label].append(i)
 86 |             else:
 87 |                 mapping[en_label].append(i)
 88 |                 print("last index not match ", each)
 89 |     return mapping
 90 | 
 91 | 
 92 | def get_permutated_relation_pairs(eid2idx):
 93 |     all_pairs = []
 94 |     all_ids = [k for k, v in eid2idx.items()]
 95 |     for e1, e2 in permutations(all_ids, 2):
 96 |         all_pairs.append((e1, e2))
 97 |     return all_pairs
 98 | def validate_rels(rels, valid):
 99 |     nrels = []
100 |     for rel in rels:
101 |         rtype = rel[0]
102 |         if tuple(rtype) not in valid:
103 |             print("invalid: ", rel)
104 |             continue
105 |         nrels.append(rel)
106 |     return nrels
107 | 
108 | 
109 | def check_tags(s1, s2):
110 |     assert EN1_START in s1 and EN1_END in s1, f"tag error: {s1}"
111 |     assert EN2_START in s2 and EN2_END in s2, f"tag error: {s2}"
112 | 
113 | 
114 | def format_relen(en, rloc, nsents):
115 |     if rloc == 1:
116 |         spec1, spec2 = EN1_START, EN1_END
117 |     else:
118 |         spec1, spec2 = EN2_START, EN2_END
119 |     sn1, tn1 = en[0][3]
120 |     sn2, tn2 = en[-1][3]
121 |     target_sent = nsents[sn1]
122 |     target_sent = [each[0] for each in target_sent]
123 |     ors =  " ".join(target_sent)
124 |     
125 |     if sn1 != sn2:
126 |         tt = nsents[sn2]
127 |         tt = [each[0] for each in tt]
128 |         target_sent.insert(tn1, spec1)
129 |         tt.insert(tn2+1, spec2)
130 |         target_sent = target_sent + tt
131 |     else:
132 |         target_sent.insert(tn1, spec1)
133 |         target_sent.insert(tn2+2, spec2)
134 |     
135 |     fs = " ".join(target_sent)
136 |     
137 |     return sn1, sn2, fs, ors
138 | 
139 | 
140 | def gene_true_relations(rels, mappings, ens, e2i, nnsents, nsents, valid_comb, fid=None):
141 |     true_pairs = set()
142 |     pos_samples = []
143 |     
144 |     for rel in rels:
145 |         rel_type = rel[0]
146 |         enid1, enid2 = rel[1:]
147 |         """
148 |         [['100', (15443, 15446), (16473, 16476), (231, 4), 'B-Strength'], 
149 |         ['mg', (15447, 15449), (16477, 16479), (231, 5), 'I-Strength']] 
150 |         [['Metoprolol', (15422, 15432), (16452, 16462), (231, 2), 'B-Drug'], 
151 |         ['Succinate', (15433, 15442), (16463, 16472), (231, 3), 'I-Drug']]
152 |         """
153 |         enbs1, enbe1 = mappings[enid1]
154 |         en1 = nnsents[enbs1: enbe1+1]
155 |         si1, sii1, fs1, ors1 = format_relen(en1, 1, nsents)
156 |         enbs2, enbe2 = mappings[enid2]
157 |         en2 = nnsents[enbs2: enbe2+1]
158 |         si2, sii2, fs2, ors2 = format_relen(en2, 2, nsents)
159 |         sent_diff = abs(si1 - si2)
160 |         
161 |         en1t = en1[0][-1].split("-")[-1]
162 |         en2t = en2[0][-1].split("-")[-1]
163 | 
164 |         true_pairs.add((enid1, enid2))
165 |         
166 |         if (en1t, en2t) not in valid_comb:
167 |             continue
168 |         
169 |         if sent_diff <= CUTOFF:
170 |             check_tags(fs1, fs2)
171 |             assert (en1t, en2t) in valid_comb, f"{en1t} {en2t}"
172 |             if DO_BIN:
173 |                 pos_samples.append((sent_diff, "pos", fs1, fs2, en1t, en2t, enid1, enid2, fid))
174 |             else:
175 |                 pos_samples.append((sent_diff, rel_type, fs1, fs2, en1t, en2t, enid1, enid2, fid))
176 | 
177 |     return pos_samples, true_pairs
178 |         
179 | 
180 | def gene_neg_relation(perm_pairs, true_pairs, mappings, ens, e2i, nnsents, nsents, valid_comb, fid=None):
181 |     neg_samples = []
182 |     for each in perm_pairs:
183 |         enid1, enid2 = each
184 |         
185 |         # not in true relation
186 |         if (enid1, enid2) in true_pairs:
187 |             continue
188 |         
189 |         enc1 = ens[e2i[enid1]]
190 |         enc2 = ens[e2i[enid2]]
191 | 
192 |         enbs1, enbe1 = mappings[enid1]
193 |         en1 = nnsents[enbs1: enbe1+1]
194 |         si1, sii1, fs1, ors1 = format_relen(en1, 1, nsents)
195 |         enbs2, enbe2 = mappings[enid2]
196 |         en2 = nnsents[enbs2: enbe2+1]
197 |         si2, sii2, fs2, ors2 = format_relen(en2, 2, nsents)
198 |         sent_diff = abs(si1 - si2)
199 |         
200 |         en1t = en1[0][-1].split("-")[-1]
201 |         en2t = en2[0][-1].split("-")[-1]
202 |         
203 |         if (en1t, en2t) not in valid_comb:
204 |             continue
205 |         
206 |         if sent_diff <= CUTOFF:
207 |             check_tags(fs1, fs2)
208 |             assert (en1t, en2t) in valid_comb, f"{en1t} {en2t}"
209 |             if fid:
210 |                 neg_samples.append((sent_diff, NEG_REL, fs1, fs2, en1t, en2t, enid1, enid2, fid))
211 |             else:
212 |                 neg_samples.append((sent_diff, NEG_REL, fs1, fs2, en1t, en2t, enid1, enid2))
213 |     
214 |     return neg_samples
215 | 
216 |     
217 | def create_training_samples(file_path, valids=None, valid_comb=None):
218 |     fids = []
219 |     root = Path(file_path)
220 |     
221 |     dpos = defaultdict(list)
222 |     dneg = defaultdict(list)
223 |     
224 |     for txt_fn in root.glob("*.txt"):
225 |         fids.append(txt_fn.stem)
226 |         ann_fn = root / (txt_fn.stem+".ann")
227 | 
228 |         # load text
229 |         txt = load_text(txt_fn)
230 |         pre_txt, sents = pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN)
231 |         e2i, ens, rels = read_annotation_brat(ann_fn)
232 |         i2e = {v: k for k, v in e2i.items()}
233 |         
234 |         nsents, sent_bound = generate_BIO(sents, ens, file_id="", no_overlap=False, record_pos=True)
235 |         total_len = len(nsents)
236 |         nnsents = [w for sent in nsents for w in sent]
237 |         mappings = create_entity_to_sent_mapping(nnsents, ens, i2e)
238 | 
239 |         pos_samples, true_pairs = gene_true_relations(
240 |             rels, mappings, ens, e2i, nnsents, nsents, valid_comb, fid=txt_fn.stem)
241 |         perm_pairs = get_permutated_relation_pairs(e2i)
242 |         neg_samples = gene_neg_relation(
243 |             perm_pairs, true_pairs, mappings, ens, e2i, nnsents, nsents, valid_comb, fid=txt_fn.stem)
244 |         
245 |         for pos_sample in pos_samples:
246 |             dpos[pos_sample[0]].append(pos_sample)
247 |         for neg_sample in neg_samples:
248 |             dneg[neg_sample[0]].append(neg_sample)
249 |         
250 |     return dpos, dneg
251 | 
252 | 
253 | def create_test_samples(file_path, valids=None, valid_comb=None):
254 |     #create a separate mapping file
255 |     rel_mappings = []
256 |     #
257 |     fids = []
258 |     root = Path(file_path)
259 |     preds = defaultdict(list)
260 |     
261 |     for txt_fn in root.glob("*.txt"):
262 |         fids.append(txt_fn.stem)
263 |         ann_fn = root / (txt_fn.stem + ".ann")
264 | 
265 |         # load text
266 |         txt = load_text(txt_fn)
267 |         pre_txt, sents = pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN)
268 |         e2i, ens, _ = read_annotation_brat(ann_fn)
269 |         i2e = {v: k for k, v in e2i.items()}
270 |         
271 |         nsents, sent_bound = generate_BIO(sents, ens, file_id="", no_overlap=False, record_pos=True)
272 |         total_len = len(nsents)
273 |         nnsents = [w for sent in nsents for w in sent]
274 |         mappings = create_entity_to_sent_mapping(nnsents, ens, i2e)
275 |         
276 |         perm_pairs = get_permutated_relation_pairs(e2i)
277 |         pred = gene_neg_relation(perm_pairs, set(), mappings, ens, e2i, nnsents, nsents, valid_comb, fid=txt_fn.stem)
278 |         for idx, pred_s in enumerate(pred):
279 |             preds[pred_s[0]].append(pred_s)
280 |             
281 |     return preds
282 | 
283 | 
284 | def en_sent_id(en_pos, send_bound):
285 |     e_s = en_pos[0]
286 |     e_e = en_pos[1]
287 |     for k, v in sent_bound.items():
288 |         s_s = v[0]
289 |         s_e = v[1]
290 |         if e_s >= s_s and e_s <= s_e and e_e >s_e :
291 |             print("entity is in two sentence")
292 |         if e_s >= s_s and e_s <= s_e:
293 |             return k
294 |         
295 | 
296 | def extract_entity_comb_for_relation(e2idx, entities, rels, sent_bound):
297 |     #'T1': 0
298 |     #'meropenem', 'Drug', (4534, 4543)
299 |     #('Strength-Drug', 'T5', 'T39')
300 |     rn = defaultdict(list)
301 |     rl = []
302 |     for rel in rels:
303 |         rtype = rel[0]
304 |         en1 = rel[1]
305 |         en2 = rel[2]
306 |         en1_type = entities[e2idx[en1]][1]
307 |         en2_type = entities[e2idx[en2]][1]
308 |         rn[rtype].append((en1_type, en2_type))
309 |         en1_pos = entities[e2idx[en1]][2]
310 |         e1_n = en_sent_id(en1_pos, sent_bound)
311 |         en2_pos = entities[e2idx[en2]][2]
312 |         e2_n = en_sent_id(en2_pos, sent_bound)
313 |         rl.append(abs(e1_n-e2_n))
314 |     return rn, rl
315 | 
316 | 
317 | def to_tsv(data, fn):
318 |     header = "\t".join([str(i+1) for i in range(len(data[0]))])
319 |     with open(fn, "w") as f:
320 |         f.write(f"{header}\n")
321 |         for each in data:
322 |             d = "\t".join([str(e) for e in each])
323 |             f.write(f"{d}\n")
324 | 
325 | 
326 | def to_5_cv(data, ofd):
327 |     if not os.path.isdir(ofd):
328 |         os.mkdir(ofd)
329 |     
330 |     np.random.seed(13)
331 |     np.random.shuffle(data)
332 |     
333 |     dfs = np.array_split(data, 5)
334 |     a = [0,1,2,3,4]
335 |     for each in combinations(a, 4):
336 |         b = list(set(a) - set(each))[0]
337 |         n = dfs[b]
338 |         m = []
339 |         for k in each:
340 |             m.extend(dfs[k])
341 |         if not os.path.isdir(os.path.join(ofd, f"sample{b}")):
342 |             os.mkdir(os.path.join(ofd, f"sample{b}"))
343 |         
344 |         to_tsv(m, os.path.join(ofd, f"sample{b}", "train.tsv"))
345 |         to_tsv(n, os.path.join(ofd, f"sample{b}", "dev.tsv"))
346 | 
347 | 
348 | def all_in_one(*dd, dn="2018n2c2", do_train=True):
349 |     data = []
350 |     for d in dd:
351 |         for k, v in d.items():
352 |             for each in v:
353 |                 data.append(each[1:])
354 |     
355 |     output_path = f"../temp/{dn}_aio_th{CUTOFF}"
356 |     p = Path(output_path)
357 |     p.mkdir(parents=True, exist_ok=True)
358 |     
359 |     if do_train:
360 |         to_tsv(data, p/"train.tsv")
361 |         if OUTPUT_CV:
362 |             to_5_cv(data, p.as_posix())
363 |     else:
364 |         to_tsv(data, p/"test.tsv")
365 |     
366 | 
367 | def all_in_unique(*dd, dn="2018n2c2", do_train=True):
368 |     for idx in range(CUTOFF+1):
369 |         data = []
370 |         for d in dd:
371 |             for k, v in d.items():
372 |                 for each in v:
373 |                     if k == idx:
374 |                         data.append(each[1:])
375 |         
376 |         output_path = f"../temp/{dn}_aiu_th{CUTOFF}"
377 |         p = Path(output_path) / f"cutoff_{idx}"
378 |         p.mkdir(parents=True, exist_ok=True)
379 |         if do_train:
380 |             to_tsv(data, p/"train.tsv")
381 |             if OUTPUT_CV:
382 |                 to_5_cv(data, p.as_posix())
383 |         else:
384 |             to_tsv(data, p/"test.tsv")
385 | 
386 | 
387 | # general pre-defined special tags
388 | EN1_START = "[s1]"
389 | EN1_END = "[e1]"
390 | EN2_START = "[s2]"
391 | EN2_END = "[e2]"
392 | NEG_REL = "NonRel"
393 | # max valid cross sentence distance
394 | CUTOFF = 1
395 | # output 5-fold cross validation data
396 | OUTPUT_CV = False
397 | # do binary classification (if false, then we do multiclass classification)
398 | DO_BIN = False
399 | 
400 | sdoh_valid_comb = {
401 |         ('Tobacco_use', 'Substance_use_status'), ('Substance_use_status', 'Smoking_type'),
402 |         ('Substance_use_status', 'Smoking_freq_ppd'), ('Substance_use_status', 'Smoking_freq_py'), 
403 |         ('Substance_use_status', 'Smoking_freq_qy'), ('Substance_use_status', 'Smoking_freq_sy'),
404 |         ('Substance_use_status', 'Smoking_freq_other'), ('Alcohol_use', 'Substance_use_status'),
405 |         ('Substance_use_status', 'Alcohol_freq'), ('Substance_use_status', 'Alcohol_type'), 
406 |         ('Substance_use_status', 'Alcohol_other'), ('Drug_use', 'Substance_use_status'),
407 |         ('Substance_use_status', 'Drug_freq'), ('Substance_use_status', 'Drug_type'),('Substance_use_status', 'Drug_other'), ('Sex_act', 'Sdoh_status'),
408 |         ('Sex_act', 'Partner'), ('Sex_act', 'Protection'), 
409 |         ('Sex_act', 'Sex_act_other'), ('Occupation', 'Employment_status'),
410 |         ('Occupation', 'Employment_location'), ('Gender', 'Sdoh_status'),('Social_cohesion', 'Social_method'), ('Social_method', 'Sdoh_status'),
411 |         ('Physical_act', 'Sdoh_status'), ('Physical_act', 'Sdoh_freq'), 
412 |         ('Living_supply', 'Sdoh_status'), ('Abuse', 'Sdoh_status'),
413 |         ('Transportation', 'Sdoh_status'), ('Health_literacy', 'Sdoh_status'),
414 |         ('Financial_constrain', 'Sdoh_status'), ('Social_cohesion', 'Sdoh_status'),
415 |         ('Social_cohesion', 'Sdoh_freq'), ('Gender', 'Sdoh_status'), 
416 |         ('Race', 'Sdoh_status'), ('Ethnicity', 'Sdoh_status'),
417 |         ('Living_Condition', 'Sdoh_status')
418 |     }
419 | 
420 | test_root=f'../temp/{output_name}_formatted_output'
421 | preds = create_test_samples(test_root, None, sdoh_valid_comb)
422 | all_in_one(preds, dn=output_name, do_train=False)
423 | 
424 | 
425 | 
426 | 
427 | 
428 | 


--------------------------------------------------------------------------------
/ClinicalTransformerRelationExtraction/src/data_utils.py:
--------------------------------------------------------------------------------
  1 | import traceback
  2 | from config import MODEL_REQUIRE_SEGMENT_ID, SPEC_TAGS, TOKENIZER_USE_FOUR_SPECIAL_TOKs
  3 | import csv
  4 | from pathlib import Path
  5 | import torch
  6 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
  7 | import re
  8 | from tqdm import tqdm
  9 | from functools import partial
 10 | from concurrent.futures import ProcessPoolExecutor
 11 | import numpy as np
 12 | 
 13 | 
 14 | class InputExample(object):
 15 |     """A single training/test example for simple sequence classification."""
 16 | 
 17 |     def __init__(self, guid, text_a, text_b=None, label=None):
 18 |         """Constructs a InputExample.
 19 | 
 20 |         Args:
 21 |             guid: Unique id for the example.
 22 |             text_a: string. The not tokenized text of the first sequence. For single
 23 |             sequence tasks, only this sequence must be specified.
 24 |             text_b: (Optional) string. The not tokenized text of the second sequence.
 25 |             Only must be specified for sequence pair tasks.
 26 |             label: (Optional) string. The label of the example. This should be
 27 |             specified for train and dev examples, but not for test examples.
 28 |         """
 29 |         self.guid = guid
 30 |         self.text_a = text_a
 31 |         self.text_b = text_b
 32 |         self.label = label
 33 | 
 34 |     def __str__(self):
 35 |         s = ""
 36 |         for k, v in self.__dict__.items():
 37 |             s += "{}={}\n".format(k, v)
 38 |         return s
 39 | 
 40 | 
 41 | class InputFeatures(object):
 42 |     """A single set of features of data."""
 43 | 
 44 |     def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
 45 |         self.input_ids = input_ids
 46 |         self.attention_mask = attention_mask
 47 |         self.token_type_ids = token_type_ids
 48 |         self.label = label
 49 | 
 50 |     def __str__(self):
 51 |         s = ""
 52 |         for k, v in self.__dict__.items():
 53 |             s += "{}={}\n".format(k, v)
 54 |         return s
 55 | 
 56 | 
 57 | def convert_examples_to_relation_extraction_features(
 58 |         examples, label2idx, tokenizer, max_length=128):
 59 |     """This function is the same as transformers.glue_convert_examples_to_features"""
 60 |     features = []
 61 | 
 62 |     for idx, example in enumerate(tqdm(examples)):
 63 |         text_a, text_b = example.text_a, example.text_b
 64 | 
 65 |         tokens_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_a))
 66 | 
 67 |         if text_b:
 68 |             tokens_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_b))
 69 |         else:
 70 |             tokens_b = None
 71 | 
 72 |         inputs = tokenizer.encode_plus(
 73 |             tokens_a, tokens_b, pad_to_max_length=True, max_length=max_length, truncation=False)
 74 | 
 75 |         label = label2idx[example.label]
 76 |         feature = InputFeatures(**inputs, label=label)
 77 |         features.append(feature)
 78 | 
 79 |         if idx < 3:
 80 |             print("###exampel###\nguide: {}\ntext: {}\ntoken ids: {}\nmasks: {}\nlabel: {}\n########".format(
 81 |                 example.guid,
 82 |                 example.text_a + " " + example.text_b,
 83 |                 feature.input_ids,
 84 |                 feature.attention_mask,
 85 |                 feature.label))
 86 | 
 87 |     return features
 88 | 
 89 | 
 90 | def features2tensors(features, logger=None):
 91 |     tensor_input_ids = []
 92 |     tensor_attention_masks = []
 93 |     tensor_token_type_ids = []
 94 |     tensor_label_ids = []
 95 | 
 96 |     for idx, feature in enumerate(features):
 97 |         if logger and idx < 3:
 98 |             logger.info("Feature{}:\n{}\n".format(idx + 1, feature))
 99 | 
100 |         tensor_input_ids.append(feature.input_ids)
101 |         tensor_attention_masks.append(feature.attention_mask)
102 |         tensor_label_ids.append(feature.label)
103 | 
104 |         if feature.token_type_ids:
105 |             tensor_token_type_ids.append(feature.token_type_ids)
106 | 
107 |     tensor_input_ids = torch.tensor(tensor_input_ids, dtype=torch.long)
108 |     tensor_attention_masks = torch.tensor(tensor_attention_masks, dtype=torch.long)
109 |     tensor_label_ids = torch.tensor(tensor_label_ids, dtype=torch.long)
110 |     tensor_token_type_ids = torch.tensor(tensor_token_type_ids, dtype=torch.long) if tensor_token_type_ids \
111 |         else torch.zeros(tensor_attention_masks.shape)
112 | 
113 |     return TensorDataset(tensor_input_ids, tensor_attention_masks, tensor_token_type_ids, tensor_label_ids)
114 | 
115 | 
116 | def relation_extraction_data_loader(dataset, batch_size=2, task='train', logger=None):
117 |     """
118 |     task has two levels:
119 |     train for training using RandomSampler
120 |     test for evaluation and prediction using SequentialSampler
121 | 
122 |     if set auto to True we will default call convert_features_to_tensors,
123 |     so features can be directly passed into the function
124 |     """
125 |     dataset = features2tensors(dataset, logger=logger)
126 | 
127 |     if task == 'train':
128 |         sampler = RandomSampler(dataset)
129 |     elif task == 'test':
130 |         sampler = SequentialSampler(dataset)
131 |     else:
132 |         raise ValueError('task argument only support train or test but get {}'.format(task))
133 | 
134 |     data_loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size, pin_memory=True)
135 | 
136 |     return data_loader
137 | 
138 | 
139 | def batch_to_model_input(batch, model_type="bert", device=torch.device("cpu")):
140 |     return {"input_ids": batch[0].to(device),
141 |             "attention_mask": batch[1].to(device),
142 |             "labels": batch[3].to(device),
143 |             "token_type_ids": batch[2].to(device) if model_type in MODEL_REQUIRE_SEGMENT_ID else None}
144 | 
145 | 
146 | class DataProcessor(object):
147 |     """Base class for data converters for sequence classification data sets."""
148 | 
149 |     def __init__(self, data_dir=None, max_seq_len=128, num_core=1, header=True, tokenizer_type='bert'):
150 |         if data_dir:
151 |             self.data_dir = Path(data_dir)
152 |         else:
153 |             self.data_dir = data_dir
154 | 
155 |         self.tokenizer = None
156 |         self.max_seq_len = max_seq_len
157 |         self.num_core = num_core
158 |         self.header = header
159 |         self.tokenizer_type = tokenizer_type
160 |         self.total_special_token_num = 3
161 | 
162 |     def __str__(self):
163 |         rep = [f"key: {k}; val: {v}" for k, v in self.__dict__.items()]
164 |         return "\n".join(rep)
165 | 
166 |     def set_data_dir(self, data_dir):
167 |         self.data_dir = Path(data_dir)
168 | 
169 |     def set_tokenizer(self, tokenizer):
170 |         self.tokenizer = tokenizer
171 | 
172 |     def set_max_seq_len(self, max_seq_len):
173 |         self.max_seq_len = max_seq_len
174 | 
175 |     def set_tokenizer_type(self, tokenizer_type):
176 |         self.tokenizer_type = tokenizer_type
177 | 
178 |     def set_num_core(self, num_core):
179 |         self.num_core = num_core
180 | 
181 |     def set_header(self, header):
182 |         self.header = header
183 | 
184 |     def get_train_examples(self, filename=None):
185 |         """See base class."""
186 |         input_file_name = self.data_dir / filename if filename else self.data_dir / "train.tsv"
187 | 
188 |         return self._create_examples(
189 |             self._read_tsv(input_file_name), "train")
190 | 
191 |     def get_dev_examples(self, filename=None):
192 |         """See base class."""
193 |         input_file_name = self.data_dir / filename if filename else self.data_dir / "dev.tsv"
194 | 
195 |         return self._create_examples(
196 |             self._read_tsv(input_file_name), "dev")
197 | 
198 |     def get_test_examples(self, filename=None):
199 |         """See base class."""
200 |         input_file_name = self.data_dir / filename if filename else self.data_dir / "test.tsv"
201 | 
202 |         return self._create_examples(
203 |             self._read_tsv(input_file_name), "test")
204 | 
205 |     def get_labels(self, train_file=None, label_file=None):
206 |         """
207 |             Gets the list of labels for this data set.
208 |             1. use labels in train file for indexing
209 |                 In all different formats, the first column always should be label
210 |             2. add a label index file
211 |                 A plain text with each unique label in one line
212 |         """
213 |         if label_file:
214 |             with open(label_file, "r") as f:
215 |                 unique_labels = [e.strip() for e in f.read().strip().split("\n")]
216 |         elif label_file is None and train_file:
217 |             lines = self._read_tsv(train_file)
218 |             unique_labels = set()
219 |             for (i, line) in enumerate(lines):
220 |                 unique_labels.add(line[0])
221 |         elif label_file is None and train_file is None and self.data_dir:
222 |             lines = self._read_tsv(self.data_dir / "train.tsv")
223 |             unique_labels = set()
224 |             for (i, line) in enumerate(lines):
225 |                 unique_labels.add(line[0])
226 |         else:
227 |             raise RuntimeError("Cannot find files to generate labels"
228 |                                "You need one of label_file, train_file (full path) or data_dir setup")
229 | 
230 |         label2idx = {k: v for v, k in enumerate(unique_labels)}
231 |         idx2label = {v: k for k, v in label2idx.items()}
232 | 
233 |         return unique_labels, label2idx, idx2label
234 | 
235 |     def _create_examples(self, lines, set_type):
236 |         """Creates examples for the training and dev sets."""
237 |         raise NotImplementedError(
238 |             "You must use FamilyHistoryRelationDataFormatSep or FamilyHistoryRelationDataFormatOne.")
239 | 
240 |     @staticmethod
241 |     def _read_tsv(input_file, header=True, quotechar=None):
242 |         """Reads a tab separated value file."""
243 |         lines = []
244 | 
245 |         with open(input_file, "r", encoding="utf-8") as f:
246 |             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
247 |             for line in reader:
248 |                 lines.append(line)
249 |         if header:
250 |             lines = lines[1:]
251 | 
252 |         return lines
253 | 
254 | 
255 | class RelationDataFormatSepProcessor(DataProcessor):
256 |     """
257 |         data format:
258 |             [CLS] sent1 [SEP] sent2 [SEP] : BERT
259 |             <s> sent1 </s> </s> sent2 </s> : RoBERTa, LongFormer
260 |             sent1 <sep> sent2 <sep> <cls>
261 |     """
262 | 
263 |     def _create_examples_helper(self, lines_idx, set_type, total_special_toks):
264 |         start_idx, lines = lines_idx
265 |         examples = []
266 |         for (i, line) in enumerate(tqdm(lines)):
267 |             guid = "{}_{}_{}".format(set_type, start_idx, i)
268 |             text_a = line[1]
269 |             text_b = line[2]
270 |             label = line[0]
271 |             # text after tokenization has a len > max_seq_len:
272 |             # 1. skip all these cases
273 |             # 2. use truncate strategy
274 |             # we adopt truncate way (2) in this implementation as _process_seq_len
275 |             text_a, text_b = self._process_seq_len(text_a, text_b, total_special_toks=total_special_toks)
276 |             examples.append(
277 |                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
278 |         return examples
279 | 
280 |     def _create_examples(self, lines, set_type):
281 |         """Creates examples for the training and dev sets."""
282 | 
283 |         if self.tokenizer_type in TOKENIZER_USE_FOUR_SPECIAL_TOKs:
284 |             self.total_special_token_num = 4
285 | 
286 |         if self.num_core < 2:
287 |             # single process - maybe too slow - replace with multiprocess
288 |             examples = self._create_examples_helper((0, lines), set_type, self.total_special_token_num)
289 |         else:
290 |             # multi-process - assume the first read-in data is the csv title with no data information
291 |             # use multi-cores to process data if you have many long sentences;
292 |             # otherwise single process should be faster
293 |             examples = []
294 |             array_lines = np.array_split(lines, self.num_core)
295 |             with ProcessPoolExecutor(max_workers=self.num_core) as exe:
296 |                 for each in exe.map(partial(self._create_examples_helper,
297 |                                             set_type=set_type,
298 |                                             total_special_toks=self.total_special_token_num),
299 |                                     enumerate(array_lines)):
300 |                     examples.extend(each)
301 | 
302 |         return examples
303 | 
304 |     @staticmethod
305 |     def _truncate_helper(text):
306 |         tokens = text.split(" ")
307 |         spec_tag_idx1, spec_tag_idx2 = [idx for (idx, tk) in enumerate(tokens) if tk.lower() in SPEC_TAGS]
308 |         start_idx, end_idx = 0, len(tokens) - 1
309 |         truncate_space_head = spec_tag_idx1 - start_idx
310 |         truncate_space_tail = end_idx - spec_tag_idx2
311 | 
312 |         if truncate_space_head == truncate_space_tail == 0:
313 |             return text
314 | 
315 |         if truncate_space_head > truncate_space_tail:
316 |             tokens.pop(0)
317 |         else:
318 |             tokens.pop(-1)
319 | 
320 |         return " ".join(tokens)
321 | 
322 |     def _process_seq_len(self, text_a, text_b, total_special_toks=3):
323 |         """
324 |             This function is used to truncate sequences with len > max_seq_len
325 |             Truncate strategy:
326 |             1. find all the index for special tags
327 |             3. count distances between leading word to first tag and second tag to last.
328 |             first -1- tag1 entity tag2 -2- last
329 |             4. pick the longest distance from (1, 2), if 1 remove first token, if 2 remove last token
330 |             5. repeat until len is equal to max_seq_len
331 |         """
332 |         flag = True
333 | 
334 |         while len(self.tokenizer.tokenize(text_a) + self.tokenizer.tokenize(text_b)) \
335 |                 > (self.max_seq_len - total_special_toks):
336 | 
337 |             if flag:
338 |                 text_a = self._truncate_helper(text_a)
339 |             else:
340 |                 text_b = self._truncate_helper(text_b)
341 | 
342 |             flag = not flag
343 | 
344 |         return text_a, text_b
345 | 
346 | 
347 | class RelationDataFormatUniProcessor(DataProcessor):
348 |     """
349 |         data format:
350 |             [CLS] sent1 sent2 [SEP]
351 |     """
352 | 
353 |     def _create_examples_helper(self, lines_idx, set_type, total_special_toks):
354 |         examples = []
355 |         start_idx, lines = lines_idx
356 |         for (i, line) in enumerate(lines):
357 |             guid = "%s-%s-%s" % (set_type, start_idx, i)
358 |             text_a = line[1]
359 |             text_a_1 = line[2]
360 |             text_a = " ".join([text_a, text_a_1])
361 |             label = line[0]
362 |             # text after tokenization has a len > max_seq_len:
363 |             # 1. skip all these cases
364 |             # 2. use truncate strategy (truncate from both side) (adopted)
365 |             text_a = self._process_seq_len(text_a)
366 | 
367 |             examples.append(
368 |                 InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
369 | 
370 |         return examples
371 | 
372 |     def _create_examples(self, lines, set_type):
373 |         """Creates examples for the training and dev sets."""
374 | 
375 |         if self.tokenizer_type in TOKENIZER_USE_FOUR_SPECIAL_TOKs:
376 |             self.total_special_token_num = 4
377 | 
378 |         if self.num_core < 2:
379 |             # single process
380 |             examples = self._create_examples_helper((0, lines), set_type, self.total_special_token_num)
381 |         else:
382 |             # multi-process
383 |             examples = []
384 |             array_lines = np.array_split(lines, self.num_core)
385 |             with ProcessPoolExecutor(max_workers=self.num_core) as exe:
386 |                 for each in exe.map(partial(self._create_examples_helper,
387 |                                             set_type=set_type,
388 |                                             total_special_toks=self.total_special_token_num),
389 |                                     enumerate(array_lines)):
390 |                     examples.extend(each)
391 | 
392 |         return examples
393 | 
394 |     def _process_seq_len(self, text_a):
395 |         """
396 |             see RelationDataFormatSepProcessor._process_seq_len for details
397 |         """
398 |         while len(self.tokenizer.tokenize(text_a)) > (self.max_seq_len - 2):
399 |             w1 = text_a.split(" ")
400 |             t1, t2, t3, t4 = [idx for (idx, w) in enumerate(w1) if w.lower() in SPEC_TAGS]
401 |             ss1, mid1, se1 = 0, (len(w1) - 1) // 2, len(w1) - 1
402 | 
403 |             a1 = t1 - ss1
404 |             b1 = se1 - t4
405 |             c1 = mid1 - t2
406 |             d1 = t3 - mid1
407 |             m_idx = max(a1, b1, c1, d1)
408 |             if a1 == m_idx:
409 |                 w1.pop(0)
410 |             elif b1 == m_idx:
411 |                 w1.pop(-1)
412 |             elif c1 == m_idx:
413 |                 w1.pop((t2 + c1 // 2))
414 |             else:
415 |                 w1.pop((t3 - d1 // 2))
416 | 
417 |             text_a = " ".join(w1)
418 | 
419 |         return text_a
420 | 


--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/common_utils/bio_prf_eval.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | ############################################################################################################
  3 | # the performance return a dict with structure:
  4 | #     {'category': {'relax': {'xx': {'f_score': 0.85714,
  5 | #                                    'precision': 0.75,
  6 | #                                    'recall': 1.0},
  7 | #                             'yy': {'f_score': 0.8,
  8 | #                                    'precision': 1.0,
  9 | #                                    'recall': 0.6667}},
 10 | #                   'strict': {'xx': {'f_score': 0.8571,
 11 | #                                     'precision': 0.75,
 12 | #                                     'recall': 1.0},
 13 | #                              'yy': {'f_score': 0.4,
 14 | #                                     'precision': 0.5,
 15 | #                                     'recall': 0.3333}}},
 16 | #      'overall': {'acc': 0.7857,
 17 | #                  'relax': {'f_score': 0.8334,
 18 | #                            'precision': 0.8334,
 19 | #                            'recall': 0.8334},
 20 | #                  'strict': {'f_score': 0.6667,
 21 | #                             'precision': 0.6667,
 22 | #                             'recall': 0.6667}}}
 23 | 
 24 | # also get entity counts by call BioEval.counts
 25 | # return a dictionary:
 26 | #     {'expect': {'overall': 6, 'xx': 3, 'yy': 3},
 27 | #      'prediction': {'relax': {'overall': {'false': 1, 'total': 6, 'true': 5},
 28 | #                               'xx': {'false': 1, 'total': 4, 'true': 3},
 29 | #                               'yy': {'false': 0, 'total': 2, 'true': 2}},
 30 | #                     'strict': {'overall': {'false': 2, 'total': 6, 'true': 4},
 31 | #                                'xx': {'false': 1, 'total': 4, 'true': 3},
 32 | #                                'yy': {'false': 1, 'total': 2, 'true': 1}}}}
 33 | #
 34 | # see test() for use cases
 35 | ############################################################################################################
 36 | 
 37 | 
 38 | from common_utils.common_io import load_bio_file_into_sents
 39 | from itertools import chain
 40 | from collections import defaultdict
 41 | from common_utils.common_log import create_logger
 42 | from math import pow
 43 | from pathlib import Path
 44 | import argparse
 45 | 
 46 | 
 47 | class PRF:
 48 |     def __init__(self):
 49 |         self.true = 0
 50 |         self.false = 0
 51 | 
 52 |     def add_true_case(self):
 53 |         self.true += 1
 54 | 
 55 |     def add_false_case(self):
 56 |         self.false += 1
 57 | 
 58 |     def get_true_false_counts(self):
 59 |         return self.true, self.false
 60 | 
 61 |     def __str__(self):
 62 |         return str(self.__dict__)
 63 | 
 64 | 
 65 | class BioEval:
 66 |     def __init__(self):
 67 |         self.logger = create_logger('BioEval')
 68 |         self.acc = PRF()
 69 |         # prediction
 70 |         self.all_strict = PRF()
 71 |         self.all_relax = PRF()
 72 |         self.cat_strict = defaultdict(PRF)
 73 |         self.cat_relax = defaultdict(PRF)
 74 |         # gold standard
 75 |         self.gs_all = 0
 76 |         self.gs_cat = defaultdict(int)
 77 |         self.performance = dict()
 78 |         self.counts = dict()
 79 |         self.beta = 1
 80 |         self.label_not_for_eval = {'o'}
 81 | 
 82 |     def reset(self):
 83 |         self.acc = PRF()
 84 |         self.all_strict = PRF()
 85 |         self.all_relax = PRF()
 86 |         self.cat_strict = defaultdict(PRF)
 87 |         self.cat_relax = defaultdict(PRF)
 88 |         self.gs_all = 0
 89 |         self.gs_cat = defaultdict(int)
 90 |         self.performance = dict()
 91 |         self.counts = dict()
 92 | 
 93 |     def set_beta_for_f_score(self, beta):
 94 |         self.logger.warning("Using beta={} for calculating F-score".format(beta))
 95 |         self.beta = beta
 96 | 
 97 |     def set_logger(self, logger):
 98 |         self.logger = logger
 99 | 
100 |     def add_labels_not_for_eval(self, *labels):
101 |         for each in labels:
102 |             self.label_not_for_eval.add(each.lower())
103 | 
104 |     def __calc_prf(self, tp, fp, tp_tn):
105 |         """
106 |         Using this function to calculate F-beta score, beta=1 is f_score-score, set beta=2 favor recall, and set beta=0.5 favor precision.
107 |         Using set_beta_for_f_score function to change beta value.
108 |         """
109 |         tp_fp = tp + fp
110 |         pre = 1.0 * tp / tp_fp if tp_fp > 0 else 0.0
111 |         rec = 1.0 * tp / tp_tn if tp_tn > 0 else 0.0
112 |         beta2 = pow(self.beta, 2)
113 |         f_beta = (1 + beta2) * pre * rec / (beta2 * pre + rec) if (pre + rec) > 0 else 0.0
114 |         return pre, rec, f_beta
115 | 
116 |     def __measure_performance(self):
117 |         self.performance['overall'] = dict()
118 | 
119 |         acc_true_num, acc_false_num = self.acc.get_true_false_counts()
120 |         total_acc_num = acc_true_num + acc_false_num
121 |         # calc acc
122 |         overall_acc = round(1.0 * acc_true_num / total_acc_num, 4) if total_acc_num > 0 else 0.0
123 |         self.performance['overall']['acc'] = overall_acc
124 | 
125 |         strict_true_counts, strict_false_counts = self.all_strict.get_true_false_counts()
126 |         strict_pre, strict_rec, strict_f_score = self.__calc_prf(strict_true_counts, strict_false_counts, self.gs_all)
127 |         self.performance['overall']['strict'] = dict()
128 |         self.performance['overall']['strict']['precision'] = strict_pre
129 |         self.performance['overall']['strict']['recall'] = strict_rec
130 |         self.performance['overall']['strict']['f_score'] = strict_f_score
131 | 
132 |         relax_true_counts, relax_false_counts = self.all_relax.get_true_false_counts()
133 |         relax_pre, relax_rec, relax_f_score = self.__calc_prf(relax_true_counts, relax_false_counts, self.gs_all)
134 |         self.performance['overall']['relax'] = dict()
135 |         self.performance['overall']['relax']['precision'] = relax_pre
136 |         self.performance['overall']['relax']['recall'] = relax_rec
137 |         self.performance['overall']['relax']['f_score'] = relax_f_score
138 | 
139 |         self.performance['category'] = dict()
140 |         self.performance['category']['strict'] = dict()
141 |         for k, v in self.cat_strict.items():
142 |             self.performance['category']['strict'][k] = dict()
143 |             stc, sfc = v.get_true_false_counts()
144 |             p, r, f = self.__calc_prf(stc, sfc, self.gs_cat[k])
145 |             self.performance['category']['strict'][k]['precision'] = p
146 |             self.performance['category']['strict'][k]['recall'] = r
147 |             self.performance['category']['strict'][k]['f_score'] = f
148 | 
149 |         self.performance['category']['relax'] = dict()
150 |         for k, v in self.cat_relax.items():
151 |             self.performance['category']['relax'][k] = dict()
152 |             rtc, rfc = v.get_true_false_counts()
153 |             p, r, f = self.__calc_prf(rtc, rfc, self.gs_cat[k])
154 |             self.performance['category']['relax'][k]['precision'] = p
155 |             self.performance['category']['relax'][k]['recall'] = r
156 |             self.performance['category']['relax'][k]['f_score'] = f
157 | 
158 |     def __measure_counts(self):
159 |         # gold standard
160 |         self.counts['expect'] = dict()
161 |         self.counts['expect']['overall'] = self.gs_all
162 |         for k, v in self.gs_cat.items():
163 |             self.counts['expect'][k] = v
164 |         # prediction
165 |         self.counts['prediction'] = {'strict': dict(), 'relax': dict()}
166 |         # strict
167 |         strict_true_counts, strict_false_counts = self.all_strict.get_true_false_counts()
168 |         self.counts['prediction']['strict']['overall'] = dict()
169 |         self.counts['prediction']['strict']['overall']['total'] = strict_true_counts + strict_false_counts
170 |         self.counts['prediction']['strict']['overall']['true'] = strict_true_counts
171 |         self.counts['prediction']['strict']['overall']['false'] = strict_false_counts
172 |         for k, v in self.cat_strict.items():
173 |             t, f = v.get_true_false_counts()
174 |             self.counts['prediction']['strict'][k] = dict()
175 |             self.counts['prediction']['strict'][k]['total'] = t + f
176 |             self.counts['prediction']['strict'][k]['true'] = t
177 |             self.counts['prediction']['strict'][k]['false'] = f
178 |         # relax
179 |         relax_true_counts, relax_false_counts = self.all_relax.get_true_false_counts()
180 |         self.counts['prediction']['relax']['overall'] = dict()
181 |         self.counts['prediction']['relax']['overall']['total'] = relax_true_counts + relax_false_counts
182 |         self.counts['prediction']['relax']['overall']['true'] = relax_true_counts
183 |         self.counts['prediction']['relax']['overall']['false'] = relax_false_counts
184 |         for k, v in self.cat_relax.items():
185 |             t, f = v.get_true_false_counts()
186 |             self.counts['prediction']['relax'][k] = dict()
187 |             self.counts['prediction']['relax'][k]['total'] = t + f
188 |             self.counts['prediction']['relax'][k]['true'] = t
189 |             self.counts['prediction']['relax'][k]['false'] = f
190 | 
191 |     @staticmethod
192 |     def __strict_match(gs, pred, s_idx, e_idx, en_type):
193 |         if e_idx < len(gs) and gs[e_idx] == f"i-{en_type}":
194 |             # check token after end in GS is not continued entity token
195 |             return False
196 |         elif gs[s_idx] != f"b-{en_type}" or pred[s_idx] != f"b-{en_type}":
197 |             # force first token to be B-
198 |             return False
199 |         # check every token in span is the same
200 |         for idx in range(s_idx, e_idx):
201 |             if gs[idx] != pred[idx]:
202 |                 return False
203 |         return True
204 | 
205 |     @staticmethod
206 |     def __relax_match(gs, pred, s_idx, e_idx, en_type):
207 |         # we adopt the partial match strategy which is very loose compare to right-left or approximate match
208 |         for idx in range(s_idx, e_idx):
209 |             gs_cate = gs[idx].split("-")[-1]
210 |             pred_bound, pred_cate = pred[idx].split("-")
211 |             if gs_cate == pred_cate == en_type:
212 |                 return True
213 |         return False
214 | 
215 |     @staticmethod
216 |     def __check_evaluated_already(gs_dict, cate, start_idx, end_idx):
217 |         for k, v in gs_dict.items():
218 |             c, s, e = k
219 |             if not (e < start_idx or s > end_idx) and c == cate:
220 |                 if v == 0:
221 |                     return True
222 |                 else:
223 |                     gs_dict[k] -= 1
224 |                     return False
225 |         return False
226 | 
227 |     def __process_bio(self, gs_bio, pred_bio):
228 |         # measure acc
229 |         for w_idx, (gs_word, pred_word) in enumerate(zip(gs_bio, pred_bio)):
230 |             # measure acc
231 |             if gs_word == pred_word:
232 |                 self.acc.add_true_case()
233 |             else:
234 |                 self.acc.add_false_case()
235 | 
236 |         # process gold standard
237 |         llen = len(gs_bio)
238 |         gs_dict = defaultdict(int)
239 |         cur_idx = 0
240 |         while cur_idx < llen:
241 |             if gs_bio[cur_idx].strip() in self.label_not_for_eval:
242 |                 cur_idx += 1
243 |             else:
244 |                 start_idx = cur_idx
245 |                 end_idx = start_idx + 1
246 |                 _, cate = gs_bio[start_idx].strip().split('-')
247 |                 while end_idx < llen and gs_bio[end_idx].strip() == f"i-{cate}":
248 |                     end_idx += 1
249 |                 self.gs_all += 1
250 |                 self.gs_cat[cate] += 1
251 |                 gs_dict[(cate, start_idx, end_idx)] += 1
252 |                 cur_idx = end_idx
253 |         # process predictions
254 |         cur_idx = 0
255 |         while cur_idx < llen:
256 |             if pred_bio[cur_idx].strip() in self.label_not_for_eval:
257 |                 cur_idx += 1
258 |             else:
259 |                 start_idx = cur_idx
260 |                 end_idx = start_idx + 1
261 |                 _, cate = pred_bio[start_idx].strip().split("-")
262 |                 while end_idx < llen and pred_bio[end_idx].strip() == f"i-{cate}":
263 |                     end_idx += 1
264 |                 if self.__strict_match(gs_bio, pred_bio, start_idx, end_idx, cate):
265 |                     self.all_strict.add_true_case()
266 |                     self.cat_strict[cate].add_true_case()
267 |                     self.all_relax.add_true_case()
268 |                     self.cat_relax[cate].add_true_case()
269 |                 elif self.__relax_match(gs_bio, pred_bio, start_idx, end_idx, cate):
270 |                     if self.__check_evaluated_already(gs_dict, cate, start_idx, end_idx):
271 |                         cur_idx = end_idx
272 |                         continue
273 |                     self.all_strict.add_false_case()
274 |                     self.cat_strict[cate].add_false_case()
275 |                     self.all_relax.add_true_case()
276 |                     self.cat_relax[cate].add_true_case()
277 |                 else:
278 |                     self.all_strict.add_false_case()
279 |                     self.cat_strict[cate].add_false_case()
280 |                     self.all_relax.add_false_case()
281 |                     self.cat_relax[cate].add_false_case()
282 |                 cur_idx = end_idx
283 | 
284 |     def eval_file(self, gs_file, pred_file):
285 |         self.logger.info("processing gold standard file: {} and prediciton file: {}".format(gs_file, pred_file))
286 |         pred_bio_sents = load_bio_file_into_sents(pred_file, do_lower=True)
287 |         gs_bio_sents = load_bio_file_into_sents(gs_file, do_lower=True)
288 |         # process bio data
289 |         # check two data have same amount of sents
290 |         assert len(gs_bio_sents) == len(pred_bio_sents), \
291 |             "gold standard and prediction have different dimension: gs: {}; pred: {}".format(len(gs_bio_sents), len(pred_bio_sents))
292 |         # measure performance
293 |         for s_idx, (gs_sent, pred_sent) in enumerate(zip(gs_bio_sents, pred_bio_sents)):
294 |             # check two sents have same No. of words
295 |             assert len(gs_sent) == len(pred_sent), \
296 |                 "In {}th sentence, the words counts are different; gs: {}; pred: {}".format(s_idx, gs_sent, pred_sent)
297 |             gs_sent = list(map(lambda x: x[-1], gs_sent))
298 |             pred_sent = list(map(lambda x: x[-1], pred_sent))
299 |             self.__process_bio(gs_sent, pred_sent)
300 |         # get the evaluation matrix
301 |         self.__measure_performance()
302 |         self.__measure_counts()
303 | 
304 |     def eval_mem(self, gs, pred, do_flat=False):
305 |         # flat sents to sent; we assume input sequences only have 1 dimension (only labels)
306 |         if do_flat:
307 |             self.logger.warning('Sentences have been flatten to 1 dim.')
308 |             gs = list(chain(*gs))
309 |             pred = list(chain(*pred))
310 |             gs = list(map(lambda x: x.lower(), gs))
311 |             pred = list(map(lambda x: x.lower(), pred))
312 |             self.__process_bio(gs, pred)
313 |         else:
314 |             for sidx, (gs_s, pred_s) in enumerate(zip(gs, pred)):
315 |                 gs_s = list(map(lambda x: x.lower(), gs_s))
316 |                 pred_s = list(map(lambda x: x.lower(), pred_s))
317 |                 self.__process_bio(gs_s, pred_s)
318 | 
319 |         self.__measure_performance()
320 |         self.__measure_counts()
321 | 
322 |     def get_performance(self):
323 |         return self.performance
324 | 
325 |     def get_counts(self):
326 |         return self.counts
327 | 
328 |     def show_evaluation(self, digits=4):
329 |         if len(self.performance) == 0:
330 |             raise RuntimeError('call eval_mem() first to get the performance attribute')
331 | 
332 |         cate = self.performance['category']['strict'].keys()
333 | 
334 |         headers = ['precision', 'recall', 'f1']
335 |         width = max(max([len(c) for c in cate]), len('overall'), digits)
336 |         head_fmt = '{:>{width}s} ' + ' {:>9}' * len(headers)
337 | 
338 |         report = head_fmt.format(u'', *headers, width=width)
339 |         report += '\n\nstrict\n'
340 | 
341 |         row_fmt = '{:>{width}s} ' + ' {:>9.{digits}f}' * 3 + '\n'
342 |         for c in cate:
343 |             precision = self.performance['category']['strict'][c]['precision']
344 |             recall = self.performance['category']['strict'][c]['recall']
345 |             f1 = self.performance['category']['strict'][c]['f_score']
346 |             report += row_fmt.format(c, *[precision, recall, f1], width=width, digits=digits)
347 | 
348 |         report += '\nrelax\n'
349 | 
350 |         for c in cate:
351 |             precision = self.performance['category']['relax'][c]['precision']
352 |             recall = self.performance['category']['relax'][c]['recall']
353 |             f1 = self.performance['category']['relax'][c]['f_score']
354 |             report += row_fmt.format(c, *[precision, recall, f1], width=width, digits=digits)
355 | 
356 |         report += '\n\noverall\n'
357 |         report += 'acc: ' + str(self.performance['overall']['acc'])
358 |         report += '\nstrict\n'
359 |         report += row_fmt.format('', *[self.performance['overall']['strict']['precision'],
360 |                                        self.performance['overall']['strict']['recall'],
361 |                                        self.performance['overall']['strict']['f_score']], width=width, digits=digits)
362 | 
363 |         report += '\nrelax\n'
364 |         report += row_fmt.format('', *[self.performance['overall']['relax']['precision'],
365 |                                        self.performance['overall']['relax']['recall'],
366 |                                        self.performance['overall']['relax']['f_score']], width=width, digits=digits)
367 |         return report
368 | 


--------------------------------------------------------------------------------