├── NLPreprocessing ├── __init__.py ├── text_process │ ├── __init__.py │ ├── text_case_formatter.py │ └── text_special_cases.py ├── .gitignore ├── README.md ├── file_utils │ ├── nlp_io.py │ ├── create_sent_map_files.py │ └── create_train_dev_test_set.py ├── LICENSE └── annotation2BIO.py ├── ClinicalTransformerRelationExtraction ├── src │ ├── __init__.py │ ├── data_processing │ │ ├── __init__.py │ │ ├── data_format_conf.py │ │ ├── io_utils.py │ │ └── post_processing.py │ ├── config.py │ ├── relation_extraction_json.py │ ├── utils.py │ ├── model_utils.py │ ├── run_app.py │ ├── relation_extraction.py │ ├── models.py │ └── data_utils.py ├── requirements.txt ├── .gitignore ├── run_json.sh ├── config_experiment_sample.json ├── LICENSE ├── run.sh └── readme.md ├── .gitignore ├── ClinicalTransformerNER ├── src │ ├── __init__.py │ ├── common_utils │ │ ├── __init__.py │ │ ├── common_config.py │ │ ├── common_log.py │ │ ├── common_io.py │ │ ├── output_format_converter.py │ │ └── bio_prf_eval.py │ ├── eval_scripts │ │ ├── __init__.py │ │ └── old_bio_eval.py │ ├── transformer_ner │ │ ├── __init__.py │ │ ├── transfomer_log.py │ │ ├── test_transfomer.py │ │ └── model_utils.py │ ├── run_format_bio_output.py │ ├── run_transformer_batch_prediction.py │ └── run_transformer_ner.py ├── .gitignore ├── requirements.txt ├── LICENSE ├── run_transformer_batch_prediction.sh ├── run_transformer_ner.sh └── README.md ├── requirements.txt ├── LICENSE ├── scipts ├── compare_ner.py ├── run_ner.py ├── get_ann.py ├── run_pred.sh ├── training_process.sh ├── get_statistics.py ├── training_ner.py └── make_relation.py └── README.md /NLPreprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NLPreprocessing/text_process/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea/ 3 | data/ 4 | models/ 5 | -------------------------------------------------------------------------------- /NLPreprocessing/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea 3 | __pycache__ -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/data_processing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | tqdm 4 | numpy 5 | scikit-learn 6 | packaging -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/common_utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/eval_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/common_utils/common_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/transformer_ner/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | .idea/ 4 | __pycahce__ 5 | new_ner_model 6 | /.python-version -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.2.0 2 | transformers>=3.1.0 3 | tqdm>=4.36.1 4 | numpy 5 | scikit-learn 6 | packaging -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea 3 | __pycache__ 4 | /data/ 5 | .ipynb_checkpoints/ 6 | /notebook/ 7 | /test/ 8 | -------------------------------------------------------------------------------- /NLPreprocessing/README.md: -------------------------------------------------------------------------------- 1 | # NLPpreprocessing 2 | A comprehensive NLP preprocessing package for clinical notes sentence boundary detection, tokenization 3 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/requirements.txt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | torch>=1.6.0 5 | transformers==3.1.0 6 | tqdm>=4.36.1 7 | numpy==1.16.0 8 | packaging 9 | -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/run_json.sh: -------------------------------------------------------------------------------- 1 | # example of training using json config to initialize all experiment parameters 2 | export CUDA_VISIBLE_DEVICES=1 3 | 4 | python ./src/relation_extraction_json.py \ 5 | --config_json "./config_experiment_sample.json" -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/data_processing/data_format_conf.py: -------------------------------------------------------------------------------- 1 | NON_RELATION_TAG = "NonRel" 2 | BRAT_REL_TEMPLATE = "R{}\t{} Arg1:{} Arg2:{}" 3 | EN1_START = "[s1]" 4 | EN1_END = "[e1]" 5 | EN2_START = "[s2]" 6 | EN2_END = "[e2]" 7 | SPEC_TAGS = [EN1_START, EN1_END, EN2_START, EN2_END] -------------------------------------------------------------------------------- 
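Note: the marker tags and the brat template defined in data_format_conf.py above are the bridge between entity annotations and the relation classifier: the [s1]/[e1] and [s2]/[e2] tags mark the two candidate entities in the input sequence, and accepted predictions are written back as brat R-lines. Below is a minimal Python sketch of both uses; the helper name, the example sentence, and the relation label are illustrative assumptions, not code from this repository (the actual logic lives in the data_processing modules).

from data_format_conf import (BRAT_REL_TEMPLATE, EN1_START, EN1_END,
                              EN2_START, EN2_END)  # assumes src/data_processing is on sys.path


def mark_entity_pair(tokens, en1_span, en2_span):
    # Wrap two candidate entities (inclusive token-index spans) with the special tags.
    (s1, e1), (s2, e2) = en1_span, en2_span
    out = []
    for i, tok in enumerate(tokens):
        if i == s1:
            out.append(EN1_START)
        if i == s2:
            out.append(EN2_START)
        out.append(tok)
        if i == e1:
            out.append(EN1_END)
        if i == e2:
            out.append(EN2_END)
    return " ".join(out)


# prints: he [s1] smokes [e1] [s2] one pack per day [e2]
print(mark_entity_pair("he smokes one pack per day".split(), (1, 1), (2, 5)))

# prints a tab-separated brat relation line linking two entity ids: R1 Tobacco_use Arg1:T3 Arg2:T7
print(BRAT_REL_TEMPLATE.format(1, "Tobacco_use", "T3", "T7"))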
/NLPreprocessing/file_utils/nlp_io.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle as pkl 3 | 4 | 5 | def make_dir(mdir): 6 | if not os.path.isdir(mdir): 7 | os.mkdir(mdir) 8 | 9 | 10 | def pkl_dump(data, file): 11 | with open(file, "wb") as f: 12 | pkl.dump(data, f) 13 | 14 | 15 | def pkl_load(file): 16 | with open(file, "rb") as f: 17 | data = pkl.load(f) 18 | return data 19 | 20 | 21 | def read_file(file, encoding="utf-8"): 22 | with open(file, "r", encoding=encoding) as f: 23 | text = f.read().strip() 24 | return text 25 | 26 | 27 | def write_file(text, file, encoding="utf-8"): 28 | with open(file, "w", encoding=encoding) as f: 29 | f.write(text) 30 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/transformer_ner/transfomer_log.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from common_utils.common_log import create_logger 5 | import logging 6 | from pathlib import Path 7 | 8 | 9 | class TransformerNERLogger: 10 | def __init__(self, logger_file=None, logger_level=logging.DEBUG): 11 | self.lf = logger_file 12 | self.lvl = logger_level 13 | 14 | def set_log_info(self, logger_file, logger_level): 15 | self.lf = logger_file 16 | self.lvl = logger_level 17 | 18 | def get_logger(self): 19 | Path(self.lf).parent.mkdir(parents=True, exist_ok=True) 20 | return create_logger("Transformer_NER", log_level=self.lvl, set_file=self.lf) 21 | -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/data_processing/io_utils.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import json 3 | 4 | 5 | def load_text(ifn): 6 | with open(ifn, "r") as f: 7 | txt = f.read() 8 | return txt 9 | 10 | 11 | def save_text(text, ofn): 12 | with open(ofn, "w") as f: 13 | f.write(text) 14 | 15 | 16 | def pkl_save(data, file): 17 | with open(file, "wb") as f: 18 | pkl.dump(data, f, protocol=pkl.HIGHEST_PROTOCOL) 19 | 20 | 21 | def pkl_load(file): 22 | with open(file, "rb") as f: 23 | data = pkl.load(f) 24 | return data 25 | 26 | 27 | def load_json(file): 28 | with open(file, "r") as f: 29 | data = json.load(f) 30 | return data 31 | 32 | 33 | def save_json(data, file): 34 | with open(file, "w") as f: 35 | json.dump(data, f) 36 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/common_utils/common_log.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | 5 | LOG_LVLs = { 6 | 'i': logging.INFO, 7 | 'd': logging.DEBUG, 8 | 'e': logging.ERROR, 9 | 'w': logging.WARN 10 | } 11 | 12 | 13 | def create_logger(logger_name="", log_level="d", set_file=None): 14 | logger = logging.getLogger(logger_name) 15 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") 16 | logger.setLevel(LOG_LVLs[log_level]) 17 | if set_file: 18 | fh = logging.FileHandler(set_file) 19 | fh.setFormatter(formatter) 20 | fh.setLevel(LOG_LVLs[log_level]) 21 | logger.addHandler(fh) 22 | else: 23 | ch = logging.StreamHandler() 24 | ch.setFormatter(formatter) 25 | ch.setLevel(LOG_LVLs[log_level]) 26 | logger.addHandler(ch) 27 | 28 | return logger 29 | 
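# Usage note (added for clarity, not part of the original module): create_logger
# returns a standard logging.Logger; it streams to stderr by default and writes
# to a file instead when set_file is given. log_level must be one of the
# single-letter keys in LOG_LVLs ("i", "d", "e", "w"), e.g.:
#
#     logger = create_logger("Transformer_NER", log_level="i", set_file="./ner.log")
#     logger.info("training started")
#
# TransformerNERLogger in transformer_ner/transfomer_log.py is a thin wrapper
# that calls this function with the fixed name "Transformer_NER".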
-------------------------------------------------------------------------------- /NLPreprocessing/text_process/text_case_formatter.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def all2lower(ifn): 5 | idx = ifn.rfind(".") 6 | ofn = ifn[:idx] + ".lower.txt" 7 | 8 | with open(ifn, "r") as fr, open(ofn, "w") as fw: 9 | for i, line in enumerate(fr): 10 | nline = " ".join([w.lower() for w in line.split(" ")]) 11 | fw.write(nline) 12 | 13 | 14 | def all2upper(ifn): 15 | idx = ifn.rfind(".") 16 | ofn = ifn[:idx] + ".upper.txt" 17 | 18 | with open(ifn, "r") as fr, open(ofn, "w") as fw: 19 | for i, line in enumerate(fr): 20 | nline = " ".join([w.upper() for w in line.split(" ")]) 21 | fw.write(nline) 22 | 23 | 24 | def all2capitalized(ifn): 25 | idx = ifn.rfind(".") 26 | ofn = ifn[:idx] + ".capitalized.txt" 27 | 28 | with open(ifn, "r") as fr, open(ofn, "w") as fw: 29 | for i, line in enumerate(fr): 30 | nline = " ".join([w.capitalize() for w in line.split(" ")]) 31 | fw.write(nline) -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/config_experiment_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "deberta", 3 | "data_format_mode": 0, 4 | "classification_scheme": 2, 5 | "pretrained_model": "microsoft/deberta-base", 6 | "data_dir": "../sample_data", 7 | "new_model_dir": "../deberta_re_model", 8 | "predict_output_file": "../deberta_re_predict.txt", 9 | "overwrite_model_dir": true, 10 | "seed": 1234, 11 | "max_seq_length": 128, 12 | "cache_data": false, 13 | "data_file_header": true, 14 | "do_train": true, 15 | "do_eval": false, 16 | "do_predict": true, 17 | "do_lower_case": true, 18 | "train_batch_size": 2, 19 | "eval_batch_size": 32, 20 | "learning_rate": 1e-05, 21 | "num_train_epochs": 5, 22 | "gradient_accumulation_steps": 1, 23 | "do_warmup": true, 24 | "warmup_ratio": 0.1, 25 | "weight_decay": 0.0, 26 | "adam_epsilon": 1e-08, 27 | "max_grad_norm": 1.0, 28 | "max_num_checkpoints": 0, 29 | "log_file": null, 30 | "log_lvl": "i", 31 | "log_step": 2, 32 | "num_core": 4, 33 | "non_relation_label": "nonRel", 34 | "progress_bar": false, 35 | "fp16": false, 36 | "fp16_opt_level": "O1" 37 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 uf-hobi-informatics-lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NLPreprocessing/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 uf-hobi-informatics-lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 uf-hobi-informatics-lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 uf-hobi-informatics-lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scipts/compare_ner.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | import shutil 4 | 5 | gs_root=Path(str(sys.argv[1])) 6 | bert_root=Path(str(sys.argv[2])) 7 | save_root=Path(str(sys.argv[3])) 8 | save_root.mkdir(parents=True, exist_ok=True) 9 | for k in gs_root.glob('*.ann'): 10 | fid=k.stem 11 | txt_fn = gs_root / (fid + ".txt") 12 | ann_fn = gs_root / (fid + ".ann") 13 | txt_fn1 = save_root / (fid + ".txt") 14 | ann_fn1 = save_root / (fid + ".ann") 15 | shutil.copyfile(txt_fn, txt_fn1) 16 | shutil.copyfile(ann_fn, ann_fn1) 17 | 18 | for k in save_root.glob('*.ann'): 19 | #print(k.stem) 20 | with open(bert_root/(k.stem+'.ann')) as f: 21 | lines=f.readlines() 22 | lines_used=[] 23 | i=300 24 | for line in lines: 25 | if line[0]=='T': 26 | entity_name=line.split('\t',2)[1].split(' ',1)[0] 27 | entity_num=line.split('\t',2)[1].split(' ',1)[1] 28 | #print(entity_name) 29 | lines_used = lines_used+['T'+str(i)+'\t'+entity_name+'_predicted '+entity_num+'\t'+line.split('\t',2)[2]] 30 | i+=1 31 | with open(k, "a") as f1: 32 | f1.writelines(lines_used) -------------------------------------------------------------------------------- /ClinicalTransformerNER/run_transformer_batch_prediction.sh: -------------------------------------------------------------------------------- 1 | : ' 2 | The script is used to run multi-file batch prediction using transformer ner 3 | We only use bert as example, the roberta, XLNet should be the same 4 | The input files must have offset information 5 | If no offset information, just combine all the files into one test.txt and use the do_pred from run_transformer_ner.sh for prediction 6 | This script is design for mainly production using to generate brat/BioC formatted outputs with offset information. 
7 | ' 8 | 9 | ################# BERT example ##################### 10 | export CUDA_VISIBLE_DEVICES=0 11 | 12 | # config and tokenizer information can be found in the pretrained model dir 13 | # use format 1 for BRAT, 2 for BioC, 0 as default for BIO 14 | python ./src/run_transformer_batch_prediction.py \ 15 | --model_type bert \ 16 | --pretrained_model \ 17 | --raw_text_dir \ 18 | --preprocessed_text_dir \ 19 | --output_dir \ 20 | --max_seq_length 128 \ 21 | --do_lower_case \ 22 | --eval_batch_size 8 \ 23 | --log_file ./log.txt\ 24 | --do_format 1 \ 25 | --do_copy \ 26 | --data_has_offset_information -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/config.py: -------------------------------------------------------------------------------- 1 | from transformers import (BertConfig, RobertaConfig, XLNetConfig, AlbertConfig, LongformerConfig, 2 | BertTokenizer, RobertaTokenizer, XLNetTokenizer, AlbertTokenizer, LongformerTokenizer, 3 | DebertaConfig, DebertaTokenizer) 4 | from models import (BertForRelationIdentification, RoBERTaForRelationIdentification, 5 | XLNetForRelationIdentification, AlbertForRelationIdentification, 6 | LongFormerForRelationIdentification, DebertaForRelationIdentification) 7 | 8 | 9 | EN1_START = "[s1]" 10 | EN1_END = "[e1]" 11 | EN2_START = "[s2]" 12 | EN2_END = "[e2]" 13 | # keep the seq order 14 | SPEC_TAGS = [EN1_START, EN1_END, EN2_START, EN2_END] 15 | 16 | MODEL_REQUIRE_SEGMENT_ID = {'bert', 'xlnet', 'albert', 'deberta'} 17 | 18 | MODEL_DICT = { 19 | "bert": (BertForRelationIdentification, BertConfig, BertTokenizer), 20 | "roberta": (RoBERTaForRelationIdentification, RobertaConfig, RobertaTokenizer), 21 | "xlnet": (XLNetForRelationIdentification, XLNetConfig, XLNetTokenizer), 22 | "albert": (AlbertForRelationIdentification, AlbertConfig, AlbertTokenizer), 23 | "longformer": (LongFormerForRelationIdentification, LongformerConfig, LongformerTokenizer), 24 | "deberta": (DebertaForRelationIdentification, DebertaConfig, DebertaTokenizer) 25 | } 26 | 27 | TOKENIZER_USE_FOUR_SPECIAL_TOKs = {'roberta', 'longformer'} -------------------------------------------------------------------------------- /scipts/run_ner.py: -------------------------------------------------------------------------------- 1 | #run NER 2 | import sys 3 | sys.path.append("../ClinicalTransformerNER/") 4 | sys.path.append("../NLPreprocessing/") 5 | import os 6 | from pathlib import Path 7 | from collections import defaultdict, Counter 8 | import numpy as np 9 | from sklearn.model_selection import train_test_split 10 | import shutil 11 | import fileinput 12 | from annotation2BIO import generate_BIO, pre_processing, read_annotation_brat, BIOdata_to_file 13 | MIMICIII_PATTERN = "\[\*\*|\*\*\]" 14 | 15 | 16 | data_dir=sys.argv[1] 17 | output_name=sys.argv[2] 18 | 19 | #data stat 20 | file_ids = set() 21 | enss = [] 22 | 23 | for fn in Path(data_dir).glob("*.ann"): 24 | file_ids.add(fn.stem) 25 | _, ens, _ = read_annotation_brat(fn) 26 | #print( _) 27 | enss.extend(ens) 28 | 29 | print("number of test files: ", len(file_ids)) 30 | print("total number of test eneitites: ", len(enss)) 31 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()])) 32 | 33 | # generate bio 34 | test_root = Path(data_dir) 35 | test_bio = "../temp/"+output_name 36 | output_root = Path(test_bio) 37 | output_root.mkdir(parents=True, exist_ok=True) 38 | 39 | for fn in test_root.glob("*.txt"): 40 
| txt_fn = fn 41 | bio_fn = output_root / (fn.stem + ".bio.txt") 42 | 43 | txt, sents = pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN) 44 | nsents, sent_bound = generate_BIO(sents, [], file_id=txt_fn, no_overlap=False) 45 | 46 | BIOdata_to_file(bio_fn, nsents) 47 | 48 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/run_format_bio_output.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | The script is used to format the BIO output to target output format like BRAT 6 | The script is used to help all the prediction without format 7 | """ 8 | 9 | import argparse 10 | import traceback 11 | from pathlib import Path 12 | from common_utils.output_format_converter import main as format_converter 13 | 14 | 15 | def main(args): 16 | base_path = Path(args.bio_dir) 17 | output_formatted_dir = base_path.parent / f"{base_path.stem}_formatted_output" 18 | output_formatted_dir.mkdir(parents=True, exist_ok=True) 19 | format_converter(text_dir=args.raw_text_dir, 20 | input_bio_dir=args.bio_dir, 21 | output_dir=output_formatted_dir, 22 | formatter=args.do_format, 23 | do_copy_text=args.do_copy) 24 | 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--raw_text_dir", type=str, required=True, 29 | help="The input data directory.") 30 | parser.add_argument("--bio_dir", type=str, required=True, 31 | help="The output data directory.") 32 | parser.add_argument("--do_format", default=0, type=int, 33 | help="0=bio (not format change will be applied); 1=brat; 2=bioc") 34 | parser.add_argument("--do_copy", action='store_true', 35 | help="if copy the original plain text to output folder") 36 | global_args = parser.parse_args() 37 | 38 | try: 39 | main(global_args) 40 | except Exception as ex: 41 | traceback.print_exc() -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/common_utils/common_io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pickle as pkl 4 | import json 5 | 6 | 7 | def read_from_file(ifn): 8 | with open(ifn, "r") as f: 9 | text = f.read() 10 | return text 11 | 12 | 13 | def write_to_file(text, ofn): 14 | with open(ofn, "w") as f: 15 | f.write(text) 16 | return True 17 | 18 | 19 | def pkl_load(ifn): 20 | with open(ifn, "rb") as f: 21 | pdata = pkl.load(f) 22 | return pdata 23 | 24 | 25 | def pkl_dump(pdata, ofn): 26 | with open(ofn, "wb") as f: 27 | pkl.dump(pdata, f) 28 | return True 29 | 30 | 31 | def json_load(ifn): 32 | with open(ifn, "r") as f: 33 | jdata = json.load(f) 34 | return jdata 35 | 36 | 37 | def json_dump(jdata, ofn): 38 | with open(ofn, "w") as f: 39 | json.dump(jdata, f) 40 | return True 41 | 42 | 43 | def load_bio_file_into_sents(bio_file, word_sep=" ", do_lower=False): 44 | bio_text = read_from_file(bio_file) 45 | bio_text = bio_text.strip() 46 | if do_lower: 47 | bio_text = bio_text.lower() 48 | 49 | new_sents = [] 50 | sents = bio_text.split("\n\n") 51 | 52 | for sent in sents: 53 | new_sent = [] 54 | words = sent.split("\n") 55 | for word in words: 56 | new_word = word.split(word_sep) 57 | new_sent.append(new_word) 58 | new_sents.append(new_sent) 59 | 60 | return new_sents 61 | 62 | 63 | def output_bio(bio_data, output_file, sep=" "): 64 | with open(output_file, "w") as f: 65 | for sent in bio_data: 66 | for word in sent: 67 | 
line = sep.join(word) 68 | f.write(line) 69 | f.write("\n") 70 | f.write("\n") 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Requirement 2 | - python env: 3.8+ 3 | - use ```pip install -r requirements.txt``` to install dependencies 4 | 5 | 6 | # Models 7 | - we have two models trained for NER and Relation 8 | - Both models base on BERT architecture with different classifiers 9 | - we provide models on Huggingface: 10 | - NER:https://huggingface.co/nvbic/SODA_BERT_NER 11 | - RE:https://huggingface.co/nvbic/SODA_BERT_RE 12 | - contact: zehao.yu@ufl.edu; alexgre@ufl.edu; yonghui.wu@ufl.edu 13 | 14 | 15 | # SDoH_NLPend2end System 16 | - The system aims for extract SDoH information from clinical notes 17 | - We support text format for production and brat format for evaluation 18 | - The system is a two stage pipeline 19 | - The first stage is to extract SDoH concepts 20 | - The second stage is to identify relations between extracted concepts 21 | 22 | 23 | # Usage 24 | - download the models and unzip into this project root directory, you should have: 25 | - ./models/ner_bert 26 | - ./models/re_bert 27 | - then, cd to the ```./scripts``` directory 28 | - execute pipeline as 29 | ```shell 30 | bash run_pred.sh -i -c gpu_id 31 | ``` 32 | - "input data directory" is the location of the data you annotated (*.txt and *.ann) e.g., ./test_data 33 | - gpu_id is the id where you want to run the program. e.g, 0 - use the GPU with id as 0 34 | - if GPU is not available, try -1 to use CPU which is slow but should work. 35 | 36 | 37 | # Results 38 | - in the main directory (./SDoH_NLPend2end), we will create three directories for outputs 39 | - the first is ./logs which saves all the running logs 40 | - the second is ./temp which saves all the intermediate generated files 41 | - the third is ./results where the eval_results.txt stores the final performance measurement and the rest directories are the e2e outputs 42 | -------------------------------------------------------------------------------- /NLPreprocessing/file_utils/create_sent_map_files.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from annotation2BIO import pre_processing 4 | import logging 5 | logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.DEBUG) 6 | logger = logging.getLogger('pre_processing clinical notes') 7 | 8 | 9 | def token2file(fw, sents): 10 | for sent in sents: 11 | for word in sent: 12 | new_line = "\t".join(map(lambda x: str(x), 13 | [word[0], word[1][0], word[1][1], word[2][0], word[2][1], "\n"])) 14 | fw.write(new_line) 15 | 16 | 17 | def output_mapping_sent_files(raw_data_dir, output_dir, deid_pattern=None): 18 | raw_data_dir = raw_data_dir 19 | output_dir = output_dir 20 | 21 | if not os.path.isdir(raw_data_dir): 22 | raise RuntimeError("Input data source directory is not exist.") 23 | 24 | if not os.path.isdir(output_dir): 25 | os.mkdir(output_dir) 26 | 27 | for input_file in os.listdir(raw_data_dir): 28 | logger.info(f'Current processing {input_file}') 29 | 30 | output_sent_file = "".join([output_dir, "/", input_file.split(".")[0], ".sent.txt"]) 31 | output_map_file = "".join([output_dir, "/", input_file.split(".")[0], ".map.txt"]) 32 | input_file = "".join([raw_data_dir, "/", input_file]) 33 | 34 | with open(output_map_file, "w") as fw_map, open(output_sent_file, "w") as fw_sent: 35 | 
sents, tokens = pre_processing(input_file, deid_pattern=deid_pattern) 36 | fw_sent.write(sents) 37 | token2file(fw_map, tokens) 38 | 39 | 40 | if __name__ == '__main__': 41 | # output_mapping_sent_files("data_sample/test", "data_sample/test_output", deid_pattern="\[\*\*|\*\*\]") 42 | assert len(sys.argv) == 4, "must provide input, output file directories and de-identifier pattern using # if None" 43 | if sys.argv[3] == '#': 44 | dp = None 45 | else: 46 | dp = sys.argv[3] 47 | output_mapping_sent_files(sys.argv[1], sys.argv[2], deid_pattern=dp) 48 | -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/run.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | bz=4 4 | epn=3 5 | sc=2 6 | dfmm=0 7 | model_type=bert 8 | pm=bert-base-uncased 9 | data_dir=/home/zehao.yu/workspace/py3/data/dr_relation_aio_th1 10 | nmd=./new_model 11 | pof=./predictions.txt 12 | log=./logs/log_1.txt 13 | 14 | python3 ./src/relation_extraction.py \ 15 | --model_type $model_type \ 16 | --data_format_mode $dfmm \ 17 | --classification_scheme $sc \ 18 | --pretrained_model $pm \ 19 | --data_dir $data_dir \ 20 | --new_model_dir $nmd \ 21 | --predict_output_file $pof \ 22 | --overwrite_model_dir \ 23 | --seed 13 \ 24 | --max_seq_length 512 \ 25 | --cache_data \ 26 | --do_train \ 27 | --do_predict \ 28 | --do_lower_case \ 29 | --train_batch_size $bz \ 30 | --eval_batch_size $bz \ 31 | --learning_rate 1e-5 \ 32 | --num_train_epochs $epn \ 33 | --gradient_accumulation_steps 1 \ 34 | --do_warmup \ 35 | --warmup_ratio 0.1 \ 36 | --weight_decay 0 \ 37 | --max_num_checkpoints 1 \ 38 | --log_file $log \ 39 | 40 | 41 | # example of testing and convert predictions to brat 42 | export CUDA_VISIBLE_DEVICES=1 43 | python3 ./src/relation_extraction.py \ 44 | --model_type $model_type \ 45 | --data_format_mode $dfmm \ 46 | --classification_scheme $sc \ 47 | --pretrained_model $pm \ 48 | --data_dir $data_dir \ 49 | --new_model_dir $nmd \ 50 | --predict_output_file $pof \ 51 | --overwrite_model_dir \ 52 | --seed 13 \ 53 | --max_seq_length 512 \ 54 | --cache_data \ 55 | --do_predict \ 56 | --do_lower_case \ 57 | --train_batch_size $bz \ 58 | --eval_batch_size $bz \ 59 | --learning_rate 1e-5 \ 60 | --num_train_epochs $epn \ 61 | --gradient_accumulation_steps 1 \ 62 | --do_warmup \ 63 | --warmup_ratio 0.1 \ 64 | --weight_decay 0 \ 65 | --max_num_checkpoints 1 \ 66 | --log_file $log \ 67 | 68 | edr="./data_annotation_entity_only" 69 | pod="./predicted_results" 70 | python3 src/data_processing/post_processing.py \ 71 | --mode mul \ 72 | --predict_result_file $pof \ 73 | --entity_data_dir $edr \ 74 | --test_data_file ${data_dir}/test.tsv \ 75 | --brat_result_output_dir $pod 76 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/run_transformer_ner.sh: -------------------------------------------------------------------------------- 1 | : ' 2 | The script contains the example shell commands you can use to run the transformer_ner tasks 3 | We include two groups of commands: 4 | 1. train and predict: The commands in train and predict section demonstrated how to run training and prediction in sequence 5 | 2. only predict: If you have a trained model, you can run prediction only following the commands in the only predict section 6 | 3. the prediction here is only for one file (test.txt) prediction. 
If you need batch prediction on group of files, use run_transformer_batch_prediction.sh instead. 7 | 8 | Each section contains an example for BERT 9 | We do support ALBERT, DISTILBERT, XLNet, RoBERTa as well. You can find more model information at https://huggingface.co/transformers/pretrained_models.html. 10 | We did not include examples using fp16 training mode but you can train model with fp16 (read run_transformer_ner.py source code) 11 | We currently do not support distraibuted multi-GPU training since fine-tuning task is not heavy on most clinical NER datasets. 12 | ' 13 | 14 | ########################### train and predict ########################### 15 | # tell system which GPU to use 16 | export CUDA_VISIBLE_DEVICES=0 17 | 18 | ########################### train and predict ########################### 19 | #bert 20 | python src/run_transformer_ner.py \ 21 | --model_type bert \ 22 | --pretrained_model bert-base-uncased \ 23 | --data_dir ./test_data/conll-2003 \ 24 | --new_model_dir ./new_bert_ner_model \ 25 | --overwrite_model_dir \ 26 | --predict_output_file ./bert_pred.txt \ 27 | --max_seq_length 256 \ 28 | --save_model_core \ 29 | --do_train \ 30 | --do_predict \ 31 | --model_selection_scoring strict-f_score-1 \ 32 | --do_lower_case \ 33 | --train_batch_size 8 \ 34 | --eval_batch_size 8 \ 35 | --train_steps 500 \ 36 | --learning_rate 1e-5 \ 37 | --num_train_epochs 1 \ 38 | --gradient_accumulation_steps 1 \ 39 | --do_warmup \ 40 | --seed 13 \ 41 | --warmup_ratio 0.1 \ 42 | --max_num_checkpoints 3 \ 43 | --log_file ./log.txt \ 44 | --progress_bar \ 45 | --early_stop 3 -------------------------------------------------------------------------------- /NLPreprocessing/text_process/text_special_cases.py: -------------------------------------------------------------------------------- 1 | SYMBOLS = {',', '?', '!', ':', '\'', '"', '(', ')', ';', '@', '^', '^', '&', '&', '$', '$', '£', 2 | '[', ']', '{', '}', '<', '>', '+', '-', "*", "#", "%", "=", "~", '/', "_"} 3 | 4 | PREP = {'about', 'above', 'across', 'after', 'against', 'aka', 'along', 'and', 'anti', 'apart', 'around', 'as', 5 | 'astride', 'at', 'away', 'because', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', 6 | 'but', 'by', 'contra', 'down', 'due to', 'during', 'ex', 'except', 'excluding', 'following', 'for', 'from', 7 | 'given', 'in', 'including', 'inside', 'into', 'like', 'near', 'nearby', 'neath', 'of', 'off', 'on', 'onto', 8 | 'or', 'out', 'over', 'past', 'per', 'plus', 'since', 'so', 'than', 'though', 'through', 'til', 'to', 9 | 'toward', 'towards', 'under', 'underneath', 'versus', 'via', 'where', 'while', 'with', 'within', 'without', 10 | 'also'} 11 | 12 | DET = {'a', 'an', 'the'} 13 | 14 | NON_STOP_PUNCT = {',', ';'} 15 | 16 | STOP_PUNCT = {'.', '?', '!'} 17 | 18 | SENT_WORD = {'we', 'us', 'patient', 'denies', 'reveals', 'no', 'none', 'he', 'she', 'his', 'her', 'they', 'them', 'is', 19 | 'was', 'who', 'when', 'where', 'which', 'are', 'be', 'have', 'had', 'has', 'this', 'will', 'that', 'the', 20 | 'to', 'in', 'with', 'for', 'an', 'and', 'but', 'or', 'as', 'at', 'of', 'have', 'it', 'that', 'by', 'from', 21 | 'on', 'include', 'other', 'another'} 22 | 23 | UNIT = {'mg', 'lb', 'kg', 'mm', 'cm', 'm', 'doz', 'am', 'pm', 'mph', 'oz', 'ml', 'l', 'mb', 'mmHg', 'min', 'cm2', 'm2', 'M2', 24 | 'mm2', 'mL', 'F', 'ppd', 'L', 'g', 'cc', "MG", "Munits", "pack", "mcg", "K", "hrs", "N", "inch", "d", 25 | "AM", "PM", "HS", "QAM", "QPM", "BID", "mEq", "hr", "cGy", "mGy", "mLs", "mOsm"} 26 | 27 | MIMICIII_DEID_PATTERN = 
"\[\*\*|\*\*\]" 28 | 29 | NAME_PREFIX_SUFFIX = { 30 | 'Dr', 'Mr', 'Mrs', 'Jr', 'Ms', 'Prof' 31 | } 32 | 33 | PROFESSIONAL_TITLE = { 34 | 'M.D.', 'Ph.D.', 'Pharm.D.' 35 | } 36 | 37 | SPECIAL_ABBV = { 38 | 'e.c.', 'p.o.', 'b.i.d.', 'p.r.n.', 'i.v.', 'i.m.', 'b.i.d', 'p.r.n', 'i.m', 'i.v', 'p.o', 'd.o.b', 'vo.', 'm.o', 39 | 'r.i.', 'y.o.' 40 | } 41 | 42 | ROMAN_NUM = { 43 | 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX' 44 | } 45 | 46 | WHITE_LIST = { 47 | 'NaCl', 'KCl', 'HandiHaler', 'MetroCream', 'ChloraPrep', 'NovoLog', 'FlexPen', 'EpiPen', 'CellCept', 'iPad', 'eConsult', 'PreserVision' 48 | } -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/transformer_ner/test_transfomer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | from pathlib import Path 5 | from transformer_ner.task import run_task 6 | from transformer_ner.transfomer_log import TransformerNERLogger 7 | 8 | 9 | class Args: 10 | def __init__(self, model_type, pretrained_model): 11 | self.model_type = model_type 12 | self.pretrained_model = pretrained_model 13 | self.config_name = self.pretrained_model 14 | self.tokenizer_name = self.pretrained_model 15 | self.do_lower_case = True 16 | self.overwrite_model_dir = True 17 | self.data_dir = Path(__file__).resolve().parent.parent.parent/'test_data/conll-2003' 18 | self.data_has_offset_information = False 19 | self.new_model_dir = Path(__file__).resolve().parent.parent.parent/f'new_ner_model/{model_type}_new_ner_model' 20 | self.predict_output_file = Path(__file__).resolve().parent.parent.parent/f"new_ner_model/{model_type}_new_ner_model/pred.txt" 21 | self.overwrite_output_dir = True 22 | self.max_seq_length = 16 23 | self.do_train = True 24 | self.do_predict = True 25 | self.model_selection_scoring = "strict-f_score-1" 26 | self.train_batch_size = 4 27 | self.eval_batch_size = 4 28 | self.learning_rate = 0.00001 29 | self.seed = 13 30 | self.logger = TransformerNERLogger( 31 | logger_level="i", 32 | logger_file=Path(__file__).resolve().parent.parent.parent/"new_ner_model/log.txt").get_logger() 33 | self.num_train_epochs = 2 34 | self.gradient_accumulation_steps = 1 35 | self.do_warmup = True 36 | self.label2idx = None 37 | self.idx2label = None 38 | self.max_num_checkpoints = 1 39 | self.warmup_ratio = 0.1 40 | self.weight_decay = 0.0 41 | self.adam_epsilon = 0.00000001 42 | self.max_grad_norm = 1.0 43 | self.log_file = None 44 | self.log_lvl = None 45 | self.fp16 = False 46 | self.local_rank = -1 47 | self.device = "cpu" 48 | self.train_steps = 100 49 | self.early_stop = -1 50 | self.progress_bar = True 51 | self.save_model_core = True 52 | self.use_crf = False 53 | 54 | 55 | def test(): 56 | for each in [('deberta', "microsoft/deberta-base"), 57 | ('bert', 'bert-base-uncased'), 58 | ('roberta', 'roberta-base'), 59 | ('xlnet', 'xlnet-base-cased')]: 60 | args = Args(each[0], each[1]) 61 | run_task(args) 62 | 63 | 64 | if __name__ == '__main__': 65 | test() 66 | -------------------------------------------------------------------------------- /scipts/get_ann.py: -------------------------------------------------------------------------------- 1 | run NER 2 | import sys 3 | sys.path.append("../ClinicalTransformerNER/") 4 | sys.path.append("../NLPreprocessing/") 5 | import os 6 | from pathlib import Path 7 | from collections import defaultdict, Counter 8 | import numpy as np 9 | from sklearn.model_selection import 
train_test_split 10 | import shutil 11 | import fileinput 12 | from annotation2BIO import generate_BIO, pre_processing, read_annotation_brat, BIOdata_to_file 13 | MIMICIII_PATTERN = "\[\*\*|\*\*\]" 14 | 15 | #check number of ann in 150/50 split 16 | train_dev_root1 = Path('../data/training_set_150') 17 | test_root1 = Path('../data/test_set_150') 18 | #data stat 19 | file_ids = set() 20 | enss = [] 21 | 22 | for fn in test_root1.glob("*.ann"): 23 | file_ids.add(fn.stem) 24 | _, ens, _ = read_annotation_brat(fn) 25 | #print( _) 26 | enss.extend(ens) 27 | print("150 files as training, test files: ", len(file_ids), list(file_ids)[:5]) 28 | print("150 files as training, total test eneitites: ", len(enss)) 29 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()])) 30 | 31 | 32 | file_ids = set() 33 | enss = [] 34 | 35 | for fn in train_dev_root1.glob("*.ann"): 36 | file_ids.add(fn.stem) 37 | _, ens, _ = read_annotation_brat(fn) 38 | #print( _) 39 | enss.extend(ens) 40 | print("150 files as training, training files: ", len(file_ids), list(file_ids)[:5]) 41 | print("150 files as training, total training eneitites: ", len(enss)) 42 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()])) 43 | 44 | #check ann in 100/100 split 45 | 46 | train_dev_root2 = Path('../data/training_set_100') 47 | test_root1 = Path('../data/test_set_100') 48 | #data stat 49 | file_ids = set() 50 | enss = [] 51 | 52 | for fn in test_root2.glob("*.ann"): 53 | file_ids.add(fn.stem) 54 | _, ens, _ = read_annotation_brat(fn) 55 | #print( _) 56 | enss.extend(ens) 57 | print("100 files as training, test files: ", len(file_ids), list(file_ids)[:5]) 58 | print("100 files as training, total test eneitites: ", len(enss)) 59 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()])) 60 | 61 | 62 | file_ids = set() 63 | enss = [] 64 | 65 | for fn in train_dev_root2.glob("*.ann"): 66 | file_ids.add(fn.stem) 67 | _, ens, _ = read_annotation_brat(fn) 68 | #print( _) 69 | enss.extend(ens) 70 | print("100 files as training, training files: ", len(file_ids), list(file_ids)[:5]) 71 | print("100 files as training, total training eneitites: ", len(enss)) 72 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()])) -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/relation_extraction_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import torch 4 | from utils import TransformerLogger 5 | from relation_extraction import app as main_app 6 | 7 | 8 | class Args: 9 | """ 10 | used to hold all parameters 11 | actual parameters for experiments will be loaded from the user defined json config file 12 | """ 13 | def __init__(self, **kwargs): 14 | self.model_type = "bert" 15 | self.data_format_mode = 0 16 | self.classification_scheme = 2 17 | self.pretrained_model = "bert-base-uncased" # microsoft/deberta-large; microsoft/deberta-xlarge 18 | self.data_dir = "../sample_data" 19 | self.new_model_dir = "./bert_re_model" 20 | self.predict_output_file = "./bert_re_predict.txt" 21 | self.overwrite_model_dir = True 22 | self.seed = 1234 23 | self.max_seq_length = 128 24 | self.cache_data = False 25 | self.data_file_header = True 26 | 
self.do_train = True 27 | self.do_eval = False 28 | self.do_predict = True 29 | self.do_lower_case = True 30 | self.train_batch_size = 8 31 | self.eval_batch_size = 32 32 | self.learning_rate = 1e-5 33 | self.num_train_epochs = 4 34 | self.gradient_accumulation_steps = 1 35 | self.do_warmup = True 36 | self.warmup_ratio = 0.1 37 | self.weight_decay = 0.0 38 | self.adam_epsilon = 1e-8 39 | self.max_grad_norm = 1.0 40 | self.max_num_checkpoints = 0 41 | self.log_file = "./bert_re_log_txt" 42 | self.log_lvl = "i" 43 | self.log_step = 100 44 | self.num_core = 4 45 | self.non_relation_label = "nonRel" 46 | self.progress_bar = True 47 | self.fp16 = False 48 | self.fp16_opt_level = "O1" 49 | 50 | self.__update_args(**kwargs) 51 | 52 | def __update_args(self, **kwargs): 53 | for k, v in kwargs.items(): 54 | setattr(self, k, v) 55 | 56 | def __repr__(self): 57 | return repr(self.__dict__) 58 | 59 | 60 | def json2args(jsondata): 61 | return Args(**jsondata) 62 | 63 | 64 | def app(gargs): 65 | main_app(gargs) 66 | 67 | 68 | if __name__ == '__main__': 69 | parser = argparse.ArgumentParser() 70 | # parse arguments 71 | parser.add_argument("--config_json", default="./config.json", type=str, required=True, 72 | help="json file for experiment configurations") 73 | args = parser.parse_args() 74 | 75 | with open(args.config_json, "r") as f: 76 | configs = json.load(f, object_hook=json2args) 77 | 78 | # other setup 79 | configs.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 80 | configs.logger = TransformerLogger( 81 | logger_file=configs.log_file, logger_level=configs.log_lvl).get_logger() 82 | 83 | app(configs) 84 | -------------------------------------------------------------------------------- /NLPreprocessing/file_utils/create_train_dev_test_set.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from numpy import random 4 | from shutil import copyfile 5 | import logging 6 | 7 | logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.DEBUG) 8 | logger = logging.getLogger('train_test_split') 9 | 10 | 11 | def __create_files_list(src_dir): 12 | assert os.path.isdir(src_dir), f"{src_dir} is not exist." 
13 | return list(map(lambda x: "/".join([src_dir, x]), os.listdir(src_dir))), os.listdir(src_dir) 14 | 15 | 16 | def train_test_ids_to_file(fids, dir, cate='train'): 17 | file_name = f"{cate}_set_all_ids.txt" 18 | 19 | fids = list(map(lambda x: x.split('/')[-1], fids)) 20 | 21 | with open(f"{dir}/{file_name}", "w") as f: 22 | f.write("\n".join(fids)) 23 | 24 | 25 | def __write2file(file_list, output_dir, output_file_name): 26 | with open("/".join([output_dir, output_file_name]), "w") as f_tr: 27 | for file in file_list: 28 | file_id = file.split("/")[-1] 29 | # f_tr.write("\t".join([f"__doc {file_id}__", "-1", "-1", "-1", "-1", "O", "\n\n"])) 30 | f_tr.write(f"-DOCSTART- __doc {file_id}__\n\n") 31 | with open(file, "r") as fr: 32 | txt = fr.read().strip() 33 | f_tr.write(txt) 34 | f_tr.write("\n\n") 35 | 36 | 37 | def create_train_test_sets(src_dir, test_proportion=0.2, merge=True, shuffle_num=3): 38 | file_list, file_id_list = __create_files_list(src_dir) 39 | 40 | for _ in range(shuffle_num): 41 | random.shuffle(file_list) 42 | 43 | slice_index = int(len(file_list) * test_proportion) 44 | 45 | test_set = file_list[:slice_index] 46 | train_set = file_list[slice_index:] 47 | logger.info(f"train set size: {len(train_set)}; test set size: {len(test_set)}") 48 | 49 | output_dir = "_".join([src_dir, "train_test_split"]) 50 | 51 | if not os.path.isdir(output_dir): 52 | os.mkdir(output_dir) 53 | 54 | # write train and test ids to files 55 | train_test_ids_to_file(train_set, output_dir, "train") 56 | train_test_ids_to_file(test_set, output_dir, "test") 57 | 58 | if not merge: 59 | train_dir = "/".join([output_dir, "training_set"]) 60 | test_dir = "/".join([output_dir, "test_set"]) 61 | if not os.path.isdir(train_dir): 62 | os.mkdir(train_dir) 63 | if not os.path.isdir(test_dir): 64 | os.mkdir(test_dir) 65 | for file in train_set: 66 | new_file = "/".join([train_dir, file.split("/")[-1]]) 67 | copyfile(file, new_file) 68 | for file in test_set: 69 | new_file = "/".join([test_dir, file.split("/")[-1]]) 70 | copyfile(file, new_file) 71 | 72 | __write2file(train_set, output_dir, "training_set.txt") 73 | __write2file(test_set, output_dir, "testing_set.txt") 74 | 75 | 76 | def test(): 77 | # create_train_test_sets("data_sample/bio", test_proportion=0.5, merge=False) 78 | # print(os.getcwd()) 79 | create_train_test_sets(src_dir="/Users/alexgre/workspace/py3/2019AMIA_DEID/2019amia_train_bio", 80 | test_proportion=0.25) 81 | pass 82 | 83 | 84 | if __name__ == '__main__': 85 | test() 86 | -------------------------------------------------------------------------------- /scipts/run_pred.sh: -------------------------------------------------------------------------------- 1 | #run prediction from notes 2 | #git clone https://github.com/uf-hobi-informatics-lab/ClinicalTransformerRelationExtraction.git 3 | #git clone https://github.com/uf-hobi-informatics-lab/ClinicalTransformerNER.git 4 | #git clone https://github.com/uf-hobi-informatics-lab/NLPreprocessing.git 5 | 6 | while getopts :i:d:n:c: flag 7 | do 8 | case "${flag}" in 9 | i) input_dir=${OPTARG};; 10 | c) cuda=${OPTARG};; 11 | esac 12 | done 13 | echo "Input dir: $input_dir"; 14 | echo "CUDA used: $cuda"; 15 | 16 | output_dir=../results 17 | output_name=bio 18 | 19 | 20 | mkdir $output_dir 21 | export CUDA_VISIBLE_DEVICES=$cuda 22 | 23 | python3 ./run_ner.py $input_dir $output_name 24 | 25 | python3 ../ClinicalTransformerNER/src/run_transformer_batch_prediction.py \ 26 | --model_type bert \ 27 | --pretrained_model ../models/ner_bert \ 28 | --raw_text_dir 
$input_dir \ 29 | --preprocessed_text_dir ../temp/${output_name} \ 30 | --output_dir ../temp/${output_name} \ 31 | --max_seq_length 128 \ 32 | --do_lower_case \ 33 | --eval_batch_size 8 \ 34 | --log_file ../logs/log_ner.txt\ 35 | --do_format 1 \ 36 | --do_copy \ 37 | --data_has_offset_information 38 | 39 | 40 | python3 ./make_relation.py $input_dir $output_name 41 | 42 | 43 | bz=4 44 | epn=3 45 | sc=2 46 | dfmm=0 47 | model_type=bert 48 | pm=bert-large 49 | data_dir_re=../temp/${output_name}_aio_th1 50 | nmd=../models/re_bert 51 | pof=../temp/predictions_${output_name}.txt 52 | log=../logs/log_re_${output_name}.txt 53 | 54 | python3 ../ClinicalTransformerRelationExtraction/src/relation_extraction.py \ 55 | --model_type $model_type \ 56 | --data_format_mode $dfmm \ 57 | --classification_scheme $sc \ 58 | --pretrained_model $pm \ 59 | --data_dir $data_dir_re \ 60 | --new_model_dir $nmd \ 61 | --predict_output_file $pof \ 62 | --overwrite_model_dir \ 63 | --seed 13 \ 64 | --max_seq_length 512 \ 65 | --num_core 10 \ 66 | --cache_data \ 67 | --do_predict \ 68 | --do_lower_case \ 69 | --train_batch_size $bz \ 70 | --eval_batch_size $bz \ 71 | --learning_rate 1e-5 \ 72 | --num_train_epochs $epn \ 73 | --gradient_accumulation_steps 1 \ 74 | --do_warmup \ 75 | --warmup_ratio 0.1 \ 76 | --weight_decay 0 \ 77 | --max_num_checkpoints 0 \ 78 | --log_file $log 79 | 80 | mkdir ${output_dir}/result 81 | mkdir ${output_dir}/result/eval 82 | mkdir ${output_dir}/result/RE 83 | 84 | edr=../temp/${output_name}_formatted_output 85 | pod=${output_dir}/result/RE/${output_name}_relation_predicted_results 86 | python3 ../ClinicalTransformerRelationExtraction/src/data_processing/post_processing.py \ 87 | --mode mul \ 88 | --predict_result_file $pof \ 89 | --entity_data_dir $edr \ 90 | --test_data_file ${data_dir_re}/test.tsv \ 91 | --brat_result_output_dir $pod\ 92 | --log_file $log 93 | 94 | python brat_eval.py --f1 $input_dir --f2 $pod >> ${output_dir}/eval_result.txt 95 | 96 | -------------------------------------------------------------------------------- /scipts/training_process.sh: -------------------------------------------------------------------------------- 1 | #training from pre-trained model on 1FL dataset 2 | while getopts :i:d:n:c: flag 3 | do 4 | case "${flag}" in 5 | i) input_dir=${OPTARG};; 6 | c) cuda=${OPTARG};; 7 | esac 8 | done 9 | echo "Input dir: $input_dir"; 10 | echo "CUDA used: $cuda"; 11 | export CUDA_VISIBLE_DEVICES=$cuda 12 | output_dir=../results 13 | output_name=bio_training 14 | mkdir ../models/SDOH_bert_updated_150 15 | mkdir ../models/SDOH_bert_updated_100 16 | mkdir ${output_dir} 17 | python3 ./training_ner.py $input_dir 18 | python3 ../ClinicalTransformerNER/src/run_transformer_ner.py \ 19 | --model_type bert \ 20 | --pretrained_model ../models/ner_bert \ 21 | --data_dir ../bio/bio_training_150 \ 22 | --new_model_dir ../models/SDOH_bert_updated_150 \ 23 | --overwrite_model_dir \ 24 | --max_seq_length 128 \ 25 | --data_has_offset_information \ 26 | --save_model_core \ 27 | --do_train \ 28 | --model_selection_scoring strict-f_score-1 \ 29 | --do_lower_case \ 30 | --train_batch_size 8 \ 31 | --train_steps 1000 \ 32 | --learning_rate 1e-5 \ 33 | --num_train_epochs 30 \ 34 | --gradient_accumulation_steps 1 \ 35 | --do_warmup \ 36 | --seed 13 \ 37 | --warmup_ratio 0.1 \ 38 | --max_num_checkpoints 3 \ 39 | --log_file ../logs/log_ner_training.txt \ 40 | --progress_bar \ 41 | --early_stop 3 42 | 43 | 44 | python3 ../ClinicalTransformerNER/src/run_transformer_batch_prediction.py \ 45 | 
--model_type bert \ 46 | --pretrained_model ../models/SDOH_bert_updated_150 \ 47 | --raw_text_dir ../data/test_set_150 \ 48 | --preprocessed_text_dir ../bio/bio_test_150 \ 49 | --output_dir ../result/training_result_150 \ 50 | --max_seq_length 128 \ 51 | --do_lower_case \ 52 | --eval_batch_size 8 \ 53 | --log_file ../logs/log_ner_training.txt\ 54 | --do_format 1 \ 55 | --do_copy \ 56 | --data_has_offset_information 57 | 58 | python ./brat_eval.py --f1 ../data/test_set_150 --f2 ../result/training_result_150_formatted_output >> ${output_dir}/eval_result_training_150.txt 59 | 60 | 61 | # training process on 1:1 split 62 | 63 | python3 ../ClinicalTransformerNER/src/run_transformer_ner.py \ 64 | --model_type bert \ 65 | --pretrained_model ../models/ner_bert \ 66 | --data_dir ../bio/bio_training_100 \ 67 | --new_model_dir ../models/SDOH_bert_updated_100 \ 68 | --overwrite_model_dir \ 69 | --max_seq_length 128 \ 70 | --data_has_offset_information \ 71 | --save_model_core \ 72 | --do_train \ 73 | --model_selection_scoring strict-f_score-1 \ 74 | --do_lower_case \ 75 | --train_batch_size 8 \ 76 | --train_steps 1000 \ 77 | --learning_rate 1e-5 \ 78 | --num_train_epochs 30 \ 79 | --gradient_accumulation_steps 1 \ 80 | --do_warmup \ 81 | --seed 13 \ 82 | --warmup_ratio 0.1 \ 83 | --max_num_checkpoints 3 \ 84 | --log_file ../logs/log_ner_training.txt \ 85 | --progress_bar \ 86 | --early_stop 3 87 | 88 | 89 | python3 ../ClinicalTransformerNER/src/run_transformer_batch_prediction.py \ 90 | --model_type bert \ 91 | --pretrained_model ../models/SDOH_bert_updated_100 \ 92 | --raw_text_dir ../data/test_set_100 \ 93 | --preprocessed_text_dir ../bio/bio_test_100 \ 94 | --output_dir ../result/training_result_100 \ 95 | --max_seq_length 128 \ 96 | --do_lower_case \ 97 | --eval_batch_size 8 \ 98 | --log_file ../logs/log_ner_training.txt\ 99 | --do_format 1 \ 100 | --do_copy \ 101 | --data_has_offset_information 102 | 103 | python ./brat_eval.py --f1 ../data/test_set_100 --f2 ../result/training_result_100_formatted_output >> ${output_dir}/eval_result_training_100.txt 104 | 105 | -------------------------------------------------------------------------------- /scipts/get_statistics.py: -------------------------------------------------------------------------------- 1 | #get stat result 2 | from pathlib import Path 3 | import numpy as np 4 | import os 5 | import pickle as pkl 6 | import sys 7 | import pandas as pd 8 | def pkl_save(data, file): 9 | with open(file, "wb") as f: 10 | pkl.dump(data, f) 11 | 12 | def pkl_load(file): 13 | with open(file, "rb") as f: 14 | data = pkl.load(f) 15 | return data 16 | 17 | def ann_stat(data_root1): 18 | dict1=dict() 19 | for fn in Path(data_root1).glob("*.ann"): 20 | # i+=1 21 | # print(fn.stem.split('_')[-1]) 22 | # file_ids.add(fn) 23 | fid=fn.stem 24 | if fid not in dict1.keys(): 25 | dict1.update({fid:{}}) 26 | with open(fn,'r') as f: 27 | lines=f.readlines() 28 | # if not lines: 29 | # continue 30 | #else: 31 | for line in lines: 32 | line=line.strip() 33 | try: 34 | ann_cate=line.split('\t')[1].split(' ')[0] 35 | ann_res=line.split('\t')[2].split('\n')[0] 36 | # print(ann_cate) 37 | # print(ann_res) 38 | if ann_cate not in dict1[fid].keys(): 39 | dict1[fid].update({ann_cate:[ann_res]}) 40 | else: 41 | dict1[fid][ann_cate].append(ann_res) 42 | except: 43 | # print('except') 44 | # print(line) 45 | continue 46 | return dict1 47 | 48 | data_dir1=sys.argv[1] 49 | data_dir2=sys.argv[2] 50 | 51 | 52 | dict1=ann_stat(data_dir1) 53 | dict2=ann_stat(data_dir2) 54 | 55 | def 
find_agg_data(dict1): 56 | null_notes=set() 57 | notes=set() 58 | dict_agg=dict() 59 | for k,v in dict1.items(): 60 | if len(v)==0: 61 | null_notes.add(k) 62 | else: 63 | notes.add(k) 64 | for k1,v1 in v.items(): 65 | if k1 not in dict_agg.keys(): 66 | dict_agg.update({k1:set()}) 67 | dict_agg[k1].add(k) 68 | else: 69 | dict_agg[k1].add(k) 70 | return null_notes,notes,dict_agg 71 | 72 | 73 | pd_null,pd_pts,pd_dict_agg=find_agg_data(dict1) 74 | gs_null,gs_pts,gs_dict_agg=find_agg_data(dict2) 75 | sdoh_cate=sorted(list(pd_dict_agg.keys())+list(gs_dict_agg.keys())) 76 | def find_agg_data_3(dict1): 77 | dict_agg=dict() 78 | for k in sdoh_cate: 79 | dict_agg.update({k:[]}) 80 | for k,v in dict1.items(): 81 | 82 | for sdoh_label in sdoh_cate: 83 | if sdoh_label not in v.keys(): 84 | dict_agg[sdoh_label].append(0) 85 | else: 86 | dict_agg[sdoh_label].append(len(v[sdoh_label])) 87 | 88 | return dict_agg 89 | pd_dict_2=find_agg_data_3(dict1) 90 | gs_dict_2=find_agg_data_3(dict2) 91 | 92 | data={'SDoH_cate':sorted(list(pd_dict_agg.keys())+list(gs_dict_agg.keys())+['null_note'])} 93 | df=pd.DataFrame(data) 94 | def count_pts(x,dict_agg): 95 | if x in dict_agg.keys(): 96 | return len(dict_agg[x]) 97 | else: 98 | return 0 99 | def sum_pts_cate(x,dict_agg): 100 | if x in dict_agg.keys(): 101 | return sum(dict_agg[x]) 102 | else: 103 | return 0 104 | 105 | df['concept_sum_pred']=df.apply(lambda x: sum_pts_cate(x['SDoH_cate'],pd_dict_2),axis=1) 106 | df['concept_sum_ann']=df.apply(lambda x: sum_pts_cate(x['SDoH_cate'],gs_dict_2),axis=1) 107 | df['notes_count_pred']=df.apply(lambda x: count_pts(x['SDoH_cate'],pd_dict_agg),axis=1) 108 | df['notes_count_ann']=df.apply(lambda x: count_pts(x['SDoH_cate'],gs_dict_agg),axis=1) 109 | df.loc[(df.SDoH_cate == 'null_note'),'notes_count_pred']=len(pd_null) 110 | df.loc[(df.SDoH_cate == 'null_note'),'notes_count_ann']=len(gs_null) 111 | Path('../results').mkdir(parents=True, exist_ok=True) 112 | df.to_csv('../results/count_concepts.csv') 113 | #print(df) -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from sklearn.metrics import accuracy_score 3 | import traceback 4 | from collections import defaultdict 5 | 6 | 7 | def try_catch_annotator(func): 8 | def try_catch(*args, **kwargs): 9 | try: 10 | return func(*args, **kwargs) 11 | except Exception as ex: 12 | traceback.print_exc() 13 | return None 14 | return try_catch 15 | 16 | 17 | class TransformerLogger: 18 | LOG_LVLs = { 19 | 'i': logging.INFO, 20 | 'd': logging.DEBUG, 21 | 'e': logging.ERROR, 22 | 'w': logging.WARN 23 | } 24 | 25 | def __init__(self, logger_file=None, logger_level='d'): 26 | self.lf = logger_file 27 | self.lvl = logger_level 28 | 29 | def set_log_info(self, logger_file, logger_level): 30 | self.lf = logger_file 31 | self.lvl = logger_level 32 | 33 | def _create_logger(self, logger_name=""): 34 | logger = logging.getLogger(logger_name) 35 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s", 36 | datefmt="%Y-%m-%d %H:%M:%S") 37 | logger.setLevel(self.LOG_LVLs[self.lvl]) 38 | if self.lf: 39 | fh = logging.FileHandler(self.lf) 40 | fh.setFormatter(formatter) 41 | fh.setLevel(self.LOG_LVLs[self.lvl]) 42 | logger.addHandler(fh) 43 | else: 44 | ch = logging.StreamHandler() 45 | ch.setFormatter(formatter) 46 | ch.setLevel(self.LOG_LVLs[self.lvl]) 47 | logger.addHandler(ch) 48 |
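# note: logging.getLogger(logger_name) returns the same logger object for a given name,
# so calling _create_logger repeatedly will keep attaching additional handlers;
# guarding with `if not logger.handlers:` would be one possible safeguard (not part of the original code)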
return logger 50 | 51 | def get_logger(self): 52 | return self._create_logger("Transformer_Relation_Extraction") 53 | 54 | 55 | class PRF: 56 | def __init__(self): 57 | self.tp = 0 58 | self.fp = 0 59 | 60 | def __repr__(self): 61 | return f'tp: {self.tp}; fp: {self.fp}' 62 | 63 | def calc(tp, tp_fp, tp_tn): 64 | if tp_fp != 0: 65 | pre = tp / tp_fp 66 | else: 67 | pre = 0 68 | 69 | if tp_tn == 0: 70 | rec = 0 71 | else: 72 | rec = tp / tp_tn 73 | 74 | if pre == 0 and rec == 0: 75 | f1 = 0 76 | else: 77 | f1 = 2 * pre * rec / (pre + rec) 78 | 79 | return round(pre, 4), round(rec, 4), round(f1, 4) 80 | 81 | 82 | def measure_prf(preds, gs_labels, non_rel_label): 83 | res = dict() 84 | temp = defaultdict(PRF) 85 | total_tp, total_tp_fp, total_tp_tn = 0, 0, 0 86 | tn_dict = defaultdict(lambda: 0) 87 | 88 | assert len(preds) == len(gs_labels), "prediction and gold standard lengths are not equal" 89 | 90 | labels = set(gs_labels) 91 | for l in labels: 92 | for p, g in zip(preds, gs_labels): 93 | if g == l: 94 | tn_dict[l] += 1 95 | if g == p == l: 96 | temp[l].tp += 1 97 | elif g != l and p == l: 98 | temp[l].fp += 1 99 | 100 | for l in labels: 101 | if l == non_rel_label: 102 | continue 103 | tp, fp = temp[l].tp, temp[l].fp 104 | tp_fp = tp + fp 105 | tp_tn = tn_dict[l] 106 | res[l] = calc(tp, tp_fp, tp_tn) 107 | 108 | total_tp += tp 109 | total_tp_fp += tp_fp 110 | total_tp_tn += tp_tn 111 | 112 | res['micro_average_pre_rec_f1'] = calc(total_tp, total_tp_fp, total_tp_tn) 113 | f1 = res['micro_average_pre_rec_f1'][-1] 114 | 115 | return res, f1 116 | 117 | 118 | def acc_and_f1(labels, preds, label2idx, non_rel_label): 119 | acc = accuracy_score(labels, preds) 120 | 121 | idx2label = {v: k for k, v in label2idx.items()} 122 | new_labels = [idx2label[e] for e in labels] 123 | new_preds = [idx2label[e] for e in preds] 124 | prf_list, f1 = measure_prf(new_preds, new_labels, non_rel_label) 125 | prf_list = sorted(prf_list.items(), key=lambda x: len(x[0])) 126 | res = [] 127 | for k, v in prf_list: 128 | res.append(f"{k} - pre: {v[0]}, rec: {v[1]}, f1: {v[2]}") 129 | 130 | return acc, "\n".join(res), f1 131 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/transformer_ner/model_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import math 4 | 5 | 6 | def gelu(x): 7 | """ 8 | Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
Also see 9 | the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 10 | """ 11 | return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) 12 | 13 | 14 | def get_mask(input, local_context): 15 | if not isinstance(local_context, DropoutContext): 16 | dropout = local_context 17 | mask = None 18 | else: 19 | dropout = local_context.dropout 20 | dropout *= local_context.scale 21 | mask = local_context.mask if local_context.reuse_mask else None 22 | 23 | if dropout > 0 and mask is None: 24 | mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).bool() 25 | 26 | if isinstance(local_context, DropoutContext): 27 | if local_context.mask is None: 28 | local_context.mask = mask 29 | 30 | return mask, dropout 31 | 32 | 33 | class DropoutContext(object): 34 | def __init__(self): 35 | self.dropout = 0 36 | self.mask = None 37 | self.scale = 1 38 | self.reuse_mask = True 39 | 40 | 41 | class XDropout(torch.autograd.Function): 42 | """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" 43 | 44 | @staticmethod 45 | def forward(ctx, input, local_ctx): 46 | mask, dropout = get_mask(input, local_ctx) 47 | ctx.scale = 1.0 / (1 - dropout) 48 | if dropout > 0: 49 | ctx.save_for_backward(mask) 50 | return input.masked_fill(mask, 0) * ctx.scale 51 | else: 52 | return input 53 | 54 | @staticmethod 55 | def backward(ctx, grad_output): 56 | if ctx.scale > 1: 57 | (mask,) = ctx.saved_tensors 58 | return grad_output.masked_fill(mask, 0) * ctx.scale, None 59 | else: 60 | return grad_output, None 61 | 62 | 63 | class StableDropout(torch.nn.Module): 64 | """ 65 | Optimized dropout module for stabilizing the training 66 | Args: 67 | drop_prob (float): the dropout probabilities 68 | """ 69 | 70 | def __init__(self, drop_prob): 71 | super().__init__() 72 | self.drop_prob = drop_prob 73 | self.count = 0 74 | self.context_stack = None 75 | 76 | def forward(self, x): 77 | """ 78 | Call the module 79 | Args: 80 | x (:obj:`torch.tensor`): The input tensor to apply dropout 81 | """ 82 | if self.training and self.drop_prob > 0: 83 | return XDropout.apply(x, self.get_context()) 84 | return x 85 | 86 | def clear_context(self): 87 | self.count = 0 88 | self.context_stack = None 89 | 90 | def init_context(self, reuse_mask=True, scale=1): 91 | if self.context_stack is None: 92 | self.context_stack = [] 93 | self.count = 0 94 | for c in self.context_stack: 95 | c.reuse_mask = reuse_mask 96 | c.scale = scale 97 | 98 | def get_context(self): 99 | if self.context_stack is not None: 100 | if self.count >= len(self.context_stack): 101 | self.context_stack.append(DropoutContext()) 102 | ctx = self.context_stack[self.count] 103 | ctx.dropout = self.drop_prob 104 | self.count += 1 105 | return ctx 106 | else: 107 | return self.drop_prob 108 | 109 | 110 | class ContextPooler(nn.Module): 111 | def __init__(self, config): 112 | super().__init__() 113 | self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size) 114 | self.dropout = StableDropout(config.pooler_dropout) 115 | self.config = config 116 | 117 | def forward(self, hidden_states): 118 | # We "pool" the model by simply taking the hidden state corresponding 119 | # to the first token. 
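# hidden_states is expected to have shape (batch_size, seq_len, hidden_size); indexing
# [:, 0] keeps the vector at the first ([CLS]) position, so the pooled_output returned
# below has shape (batch_size, pooler_hidden_size)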
120 | 121 | context_token = hidden_states[:, 0] 122 | context_token = self.dropout(context_token) 123 | pooled_output = self.dense(context_token) 124 | pooled_output = gelu(pooled_output) 125 | return pooled_output 126 | 127 | @property 128 | def output_dim(self): 129 | return self.config.hidden_size -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/model_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import math 4 | 5 | 6 | def gelu(x): 7 | """ 8 | Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see 9 | the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 10 | """ 11 | return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) 12 | 13 | 14 | def get_mask(input, local_context): 15 | if not isinstance(local_context, DropoutContext): 16 | dropout = local_context 17 | mask = None 18 | else: 19 | dropout = local_context.dropout 20 | dropout *= local_context.scale 21 | mask = local_context.mask if local_context.reuse_mask else None 22 | 23 | if dropout > 0 and mask is None: 24 | mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).bool() 25 | 26 | if isinstance(local_context, DropoutContext): 27 | if local_context.mask is None: 28 | local_context.mask = mask 29 | 30 | return mask, dropout 31 | 32 | 33 | class DropoutContext(object): 34 | def __init__(self): 35 | self.dropout = 0 36 | self.mask = None 37 | self.scale = 1 38 | self.reuse_mask = True 39 | 40 | 41 | class XDropout(torch.autograd.Function): 42 | """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" 43 | 44 | @staticmethod 45 | def forward(ctx, input, local_ctx): 46 | mask, dropout = get_mask(input, local_ctx) 47 | ctx.scale = 1.0 / (1 - dropout) 48 | if dropout > 0: 49 | ctx.save_for_backward(mask) 50 | return input.masked_fill(mask, 0) * ctx.scale 51 | else: 52 | return input 53 | 54 | @staticmethod 55 | def backward(ctx, grad_output): 56 | if ctx.scale > 1: 57 | (mask,) = ctx.saved_tensors 58 | return grad_output.masked_fill(mask, 0) * ctx.scale, None 59 | else: 60 | return grad_output, None 61 | 62 | 63 | class StableDropout(torch.nn.Module): 64 | """ 65 | Optimized dropout module for stabilizing the training 66 | 67 | Args: 68 | drop_prob (float): the dropout probabilities 69 | """ 70 | 71 | def __init__(self, drop_prob): 72 | super().__init__() 73 | self.drop_prob = drop_prob 74 | self.count = 0 75 | self.context_stack = None 76 | 77 | def forward(self, x): 78 | """ 79 | Call the module 80 | 81 | Args: 82 | x (:obj:`torch.tensor`): The input tensor to apply dropout 83 | """ 84 | if self.training and self.drop_prob > 0: 85 | return XDropout.apply(x, self.get_context()) 86 | return x 87 | 88 | def clear_context(self): 89 | self.count = 0 90 | self.context_stack = None 91 | 92 | def init_context(self, reuse_mask=True, scale=1): 93 | if self.context_stack is None: 94 | self.context_stack = [] 95 | self.count = 0 96 | for c in self.context_stack: 97 | c.reuse_mask = reuse_mask 98 | c.scale = scale 99 | 100 | def get_context(self): 101 | if self.context_stack is not None: 102 | if self.count >= len(self.context_stack): 103 | self.context_stack.append(DropoutContext()) 104 | ctx = self.context_stack[self.count] 105 | ctx.dropout = self.drop_prob 106 | self.count += 1 107 | return ctx 108 
| else: 109 | return self.drop_prob 110 | 111 | 112 | class ContextPooler(nn.Module): 113 | def __init__(self, config): 114 | super().__init__() 115 | self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size) 116 | self.dropout = StableDropout(config.pooler_dropout) 117 | self.config = config 118 | 119 | def forward(self, hidden_states): 120 | # We "pool" the model by simply taking the hidden state corresponding 121 | # to the first token. 122 | 123 | context_token = hidden_states[:, 0] 124 | context_token = self.dropout(context_token) 125 | pooled_output = self.dense(context_token) 126 | pooled_output = gelu(pooled_output) 127 | return pooled_output 128 | 129 | @property 130 | def output_dim(self): 131 | return self.config.hidden_size 132 | -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/run_app.py: -------------------------------------------------------------------------------- 1 | """ 2 | The py file is an example for 3 | 4 | 1. how to run RE as an python app not through command line 5 | 2. how to manually add new models other than those available in the models.py 6 | 7 | We used deberta as an example 8 | """ 9 | from models import BaseModel 10 | from data_utils import RelationDataFormatSepProcessor 11 | from transformers import DebertaForSequenceClassification, DebertaModel, DebertaConfig, DebertaTokenizer 12 | from task import TaskRunner 13 | from utils import TransformerLogger 14 | 15 | import numpy as np 16 | import torch 17 | import traceback 18 | 19 | 20 | class DeBERTaRelationExtraction(DebertaForSequenceClassification, BaseModel): 21 | def __init__(self, config): 22 | super().__init__(config) 23 | 24 | def forward( 25 | self, 26 | input_ids=None, 27 | attention_mask=None, 28 | token_type_ids=None, 29 | position_ids=None, 30 | inputs_embeds=None, 31 | labels=None, 32 | output_attentions=None, 33 | output_hidden_states=None, 34 | return_dict=None, 35 | ): 36 | outputs = self.deberta( 37 | input_ids, 38 | token_type_ids=token_type_ids, 39 | attention_mask=attention_mask, 40 | position_ids=position_ids, 41 | inputs_embeds=inputs_embeds, 42 | output_attentions=output_attentions, 43 | output_hidden_states=output_hidden_states, 44 | return_dict=return_dict, 45 | ) 46 | seq_output = outputs[0] 47 | pooled_output = self.pooler(seq_output) 48 | 49 | pooled_output = self.dropout(pooled_output) 50 | seq_output = self.dropout(seq_output) 51 | 52 | logits = self.output2logits(pooled_output, seq_output, input_ids) 53 | outputs = (logits,) + outputs[2:] 54 | loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 55 | outputs = (loss,) + outputs 56 | 57 | return outputs 58 | 59 | 60 | class DeBERTaDataProcessor(RelationDataFormatSepProcessor): 61 | def __init__(self, data_dir=None, max_seq_len=128, num_core=-1, header=True, tokenizer_type='deberta'): 62 | super().__init__( 63 | data_dir=data_dir, max_seq_len=max_seq_len, num_core=num_core, header=True, tokenizer_type='deberta') 64 | self.total_special_token_num = 4 65 | 66 | 67 | class Args: 68 | """ 69 | used to init all parameters 70 | deberta use roberta vocab 71 | deberta-v2 need new tokenizer as XLNet based on SPM 72 | """ 73 | def __init__(self, **kwargs): 74 | self.model_type = "deberta" 75 | self.data_format_mode = 0 76 | self.classification_scheme = 2 77 | self.pretrained_model = "microsoft/deberta-base" # microsoft/deberta-large; microsoft/deberta-xlarge 78 | self.data_dir = "../sample_data" 79 | self.new_model_dir = 
"../deberta_re_model" 80 | self.predict_output_file = "../deberta_re_predict.txt" 81 | self.overwrite_model_dir = True 82 | self.seed = 1234 83 | self.max_seq_length = 128 84 | self.cache_data = False 85 | self.data_file_header = True 86 | self.do_train = True 87 | self.do_eval = False 88 | self.do_predict = True 89 | self.do_lower_case = True 90 | self.train_batch_size = 2 91 | self.eval_batch_size = 32 92 | self.learning_rate = 1e-5 93 | self.num_train_epochs = 5 94 | self.gradient_accumulation_steps = 1 95 | self.do_warmup = True 96 | self.warmup_ratio = 0.1 97 | self.weight_decay = 0.0 98 | self.adam_epsilon = 1e-8 99 | self.max_grad_norm = 1.0 100 | self.max_num_checkpoints = 0 101 | self.log_file = None 102 | self.log_lvl = "i" 103 | self.log_step = 2 104 | self.num_core = 4 105 | self.non_relation_label = "nonRel" 106 | self.progress_bar = False 107 | self.fp16 = False 108 | self.fp16_opt_level = "O1" 109 | 110 | self.__update_args(**kwargs) 111 | 112 | def __update_args(self, **kwargs): 113 | for k, v in kwargs.items(): 114 | setattr(self, k, v) 115 | 116 | 117 | def app(): 118 | args = Args() 119 | args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 120 | args.logger = TransformerLogger(logger_file=args.log_file, logger_level='i').get_logger() 121 | 122 | np.random.seed(args.seed) 123 | torch.manual_seed(args.seed) 124 | 125 | task_runner = TaskRunner(args) 126 | 127 | # add deberta to model dict 128 | task_runner.model_dict['deberta'] = (DeBERTaRelationExtraction, DebertaConfig, DebertaTokenizer) 129 | # set deberta data processor for data processing 130 | task_runner.data_processor = DeBERTaDataProcessor( 131 | max_seq_len=args.max_seq_length, num_core=args.num_core) 132 | 133 | task_runner.task_runner_default_init() 134 | 135 | if args.do_train: 136 | try: 137 | task_runner.train() 138 | except Exception as ex: 139 | raise RuntimeError(traceback.print_exc()) 140 | 141 | if args.do_predict: 142 | try: 143 | task_runner.predict() 144 | except Exception as ex: 145 | raise RuntimeError(traceback.print_exc()) 146 | 147 | 148 | if __name__ == '__main__': 149 | app() 150 | -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/readme.md: -------------------------------------------------------------------------------- 1 | # Clinical Relation Extration with Transformers 2 | 3 | ## Aim 4 | This package is developed for researchers easily to use state-of-the-art transformers models for extracting relations from clinical notes. 5 | No prior knowledge of transformers is required. We handle the whole process from data preprocessing to training to prediction. 6 | 7 | ## Dependency 8 | The package is built on top of the Transformers developed by the HuggingFace. 9 | We have the requirement.txt to specify the packages required to run the project. 10 | 11 | ## Background 12 | Our training strategy is inspired by the paper: https://arxiv.org/abs/1906.03158 13 | We only support train-dev mode, but you can do 5-fold CV. 14 | 15 | ## Available models 16 | - BERT 17 | - XLNet 18 | - RoBERTa 19 | - ALBERT 20 | - DeBERTa 21 | - Longformer 22 | > We will keep adding new models. 23 | 24 | ## usage and example 25 | - data format 26 | > see sample_data dir (train.tsv and test.tsv) for the train and test data format 27 | 28 | > The sample data is a small subset of the data prepared from the 2018 umass made1.0 challenge corpus 29 | 30 | ``` 31 | # data format: tsv file with 8 columns: 32 | 1. 
relation_type: adverse 33 | 2. sentence_1: ALLERGIES : [s1] Penicillin [e1] . 34 | 3. sentence_2: [s2] ALLERGIES [e2] : Penicillin . 35 | 4. entity_type_1: Drug 36 | 5. entity_type_2: ADE 37 | 6. entity_id_1: T1 38 | 7. entity_id_2: T2 39 | 8. file_id: 13_10 40 | 41 | note: 42 | 1) the entity between [s1][e1] is the first entity in a relation; the second entity in the relation is in between [s2][e2] 43 | 2) even if the two entities are in the same sentence, we still require them to be provided separately as sentence_1 and sentence_2 44 | 3) in the test.tsv, you can set all labels to neg, no_relation, or any other placeholder, because the labels are not used during prediction 45 | 4) We recommend evaluating the test performance in a separate process based on the predictions. (see **post-processing**) 46 | 5) We recommend using the official evaluation scripts for evaluation to make sure the reported results are reliable. 47 | ``` 48 | 49 | - preprocess data (see the preprocess.ipynb script for more details on usage) 50 | > we do not provide a script for training and test data generation 51 | 52 | > we have a Jupyter notebook that preprocesses the 2018 n2c2 data as an example 53 | 54 | > you can follow our example to generate your own dataset 55 | 56 | - special tags 57 | > we use 4 special tags to identify the two entities in a relation 58 | ``` 59 | # the default tags defined in the repo are 60 | 61 | EN1_START = "[s1]" 62 | EN1_END = "[e1]" 63 | EN2_START = "[s2]" 64 | EN2_END = "[e2]" 65 | 66 | If you need to customize these tags, you can change them in 67 | config.py 68 | ``` 69 | 70 | - training 71 | > please refer to the wiki page for all details of the parameters 72 | > [flag details](https://github.com/uf-hobi-informatics-lab/ClinicalTransformerRelationExtraction/wiki/All-flags-explained-for-training-and-test) 73 | 74 | ```shell script 75 | export CUDA_VISIBLE_DEVICES=1 76 | data_dir=./sample_data 77 | nmd=./new_model 78 | pof=./predictions.txt 79 | log=./log.txt 80 | 81 | python ./src/relation_extraction.py \ 82 | --model_type bert \ 83 | --data_format_mode 0 \ 84 | --classification_scheme 1 \ 85 | --pretrained_model bert-base-uncased \ 86 | --data_dir $data_dir \ 87 | --new_model_dir $nmd \ 88 | --predict_output_file $pof \ 89 | --overwrite_model_dir \ 90 | --seed 13 \ 91 | --max_seq_length 256 \ 92 | --cache_data \ 93 | --do_train \ 94 | --do_lower_case \ 95 | --train_batch_size 4 \ 96 | --eval_batch_size 4 \ 97 | --learning_rate 1e-5 \ 98 | --num_train_epochs 3 \ 99 | --gradient_accumulation_steps 1 \ 100 | --do_warmup \ 101 | --warmup_ratio 0.1 \ 102 | --weight_decay 0 \ 103 | --max_num_checkpoints 1 \ 104 | --log_file $log \ 105 | ``` 106 | 107 | - prediction 108 | ```shell script 109 | export CUDA_VISIBLE_DEVICES=1 110 | data_dir=./sample_data 111 | nmd=./new_model 112 | pof=./predictions.txt 113 | log=./log.txt 114 | 115 | # we have to set data_dir, new_model_dir, model_type, log_file, eval_batch_size, and data_format_mode 116 | python ./src/relation_extraction.py \ 117 | --model_type bert \ 118 | --data_format_mode 0 \ 119 | --classification_scheme 1 \ 120 | --pretrained_model bert-base-uncased \ 121 | --data_dir $data_dir \ 122 | --new_model_dir $nmd \ 123 | --predict_output_file $pof \ 124 | --overwrite_model_dir \ 125 | --seed 13 \ 126 | --max_seq_length 256 \ 127 | --cache_data \ 128 | --do_predict \ 129 | --do_lower_case \ 130 | --eval_batch_size 4 \ 131 | --log_file $log \ 132 | ``` 133 | 134 | - post-processing (we only support transformation to brat format) 135 | ```shell script 136 | # see --help for more information 137 |
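# --entity_data_dir should point to the directory of brat .ann files that already hold the
#   entity annotations (for example, the formatted NER output), and --test_data_file must be
#   the same test.tsv used at prediction time so the predicted labels can be aligned back to
#   entity pairs and file ids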
data_dir=./sample_data 138 | pof=./predictions.txt 139 | 140 | python src/data_processing/post_processing.py \ 141 | --mode mul \ 142 | --predict_result_file $pof \ 143 | --entity_data_dir ./test_data_entity_only \ 144 | --test_data_file ${data_dir}/test.tsv \ 145 | --brat_result_output_dir ./brat_output 146 | ``` 147 | 148 | 149 | ## Using a json file for experiment config instead of the command line 150 | 151 | - to simplify using the package, we support using a json file for configuration 152 | - using json, you can define all parameters in a separate json file instead of passing them via the command line 153 | - config_experiment_sample.json is a sample json file you can follow to develop your own 154 | - to run an experiment with a json config, follow run_json.sh 155 | ```shell script 156 | export CUDA_VISIBLE_DEVICES=1 157 | 158 | python ./src/relation_extraction_json.py \ 159 | --config_json "./config_experiment_sample.json" 160 | ``` 161 | 162 | ## Issues 163 | Raise an issue if you have problems. 164 | 165 | ## Citation 166 | Please cite our paper: 167 | ``` 168 | 169 | ``` 170 | 171 | ## Clinical Pre-trained Transformer Models 172 | We have a series of transformer models pre-trained on MIMIC-III. 173 | You can find them here: 174 | - https://transformer-models.s3.amazonaws.com/mimiciii_albert_10e_128b.zip 175 | - https://transformer-models.s3.amazonaws.com/mimiciii_bert_10e_128b.zip 176 | - https://transformer-models.s3.amazonaws.com/mimiciii_electra_5e_128b.zip 177 | - https://transformer-models.s3.amazonaws.com/mimiciii_roberta_10e_128b.zip 178 | - https://transformer-models.s3.amazonaws.com/mimiciii_xlnet_5e_128b.zip 179 | - https://transformer-models.s3.amazonaws.com/mimiciii_deberta_10e_128b.tar.gz 180 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/common_utils/output_format_converter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | The script provides a tool to convert BIO formatted files to either Brat or BioC format 6 | The script also provides a tool to merge several brat or BioC formatted files into one file by concatenating all the unique entities.
7 | 8 | The pre-request is BIO data must have offset information 9 | """ 10 | 11 | from pathlib import Path 12 | from common_utils.common_io import read_from_file, load_bio_file_into_sents 13 | import shutil 14 | import traceback 15 | 16 | 17 | BRAT_TEMPLATE = "{}\t{} {} {}\t{}" 18 | BIOC_TEMPLATE = """ 19 | 20 | {e} 21 | 22 | {b} 23 | \n 24 | """ 25 | BIOC_HEADER = """ 26 | 27 | 28 | 29 | 30 | 31 | 32 | {} 33 | 34 | 0 35 | """ 36 | BIOC_END = """ 37 | 38 | 39 | 40 | """ 41 | 42 | 43 | def __prepare_path(text_dir, input_dir, output_dir): 44 | t_input = Path(text_dir) 45 | p_input = Path(input_dir) 46 | p_output = Path(output_dir) 47 | p_output.mkdir(parents=True, exist_ok=True) 48 | 49 | return t_input, p_input, p_output 50 | 51 | 52 | def tag2entity(sents): 53 | entities = [] 54 | for i, sent in enumerate(sents): 55 | term, start, end, sem_tag, prev_tag = [], None, None, None, "O" 56 | for j, word in enumerate(sent): 57 | text, w_s, w_e, w_a_s, w_a_e, predict_tag = word # must have offset information 58 | if predict_tag == "O": 59 | if prev_tag != "O": 60 | entities.append((" ".join(term), start, end, sem_tag)) 61 | term, start, end, sem_tag = [], None, None, None 62 | else: 63 | boundary, ttag = predict_tag.split("-") 64 | if boundary == "B": 65 | if prev_tag != "O": 66 | entities.append((" ".join(term), start, end, sem_tag)) 67 | term, start, end, sem_tag = [], None, None, None 68 | term.append(text) 69 | start, end, sem_tag = w_s, w_e, ttag 70 | elif boundary == "I": 71 | if sem_tag == ttag: 72 | term.append(text) 73 | end = w_e 74 | else: 75 | if prev_tag != "O": 76 | entities.append((" ".join(term), start, end, sem_tag)) 77 | term, start, end, sem_tag = [], None, None, None 78 | term.append(text) 79 | start, end, sem_tag = w_s, w_e, ttag 80 | else: 81 | raise ValueError('The BIO scheme only support B, I but get {}-{} in {}'.format(boundary, ttag, sent)) 82 | prev_tag = predict_tag 83 | 84 | if term: 85 | entities.append((" ".join(term), start, end, sem_tag)) 86 | 87 | return entities 88 | 89 | 90 | def bio2output(text_dir, input_dir, output_dir, output_template, do_copy_text, file_suffix='ann'): 91 | """ 92 | we expect the input as a directory of all bio files end with .txt suffix 93 | we expect the each bio file contain the offset info (start; end position of each words) and tag info; 94 | original words are not required 95 | convert the bio formatted files to brat formatted .ann file 96 | the output directory will not contain the .txt file 97 | """ 98 | t_input, p_input, p_output = __prepare_path(text_dir, input_dir, output_dir) 99 | for ifn in p_input.glob("*.txt"): 100 | try: 101 | ifn_stem = ifn.stem.split(".")[0] 102 | doc_text_file = t_input / "{}.txt".format(ifn_stem) 103 | ofn = p_output / "{}.{}".format(ifn_stem, file_suffix) 104 | sents = load_bio_file_into_sents(ifn, do_lower=False) 105 | doc_text = read_from_file(doc_text_file) 106 | entities = tag2entity(sents) 107 | output_entities = [] 108 | for idx, entity in enumerate(entities): 109 | ann_text, offset_s, offset_e, sem_tag = entity 110 | offset_s, offset_e = int(offset_s), int(offset_e) 111 | # we need to use original text not the ann text here 112 | # you can use ann_text for debugging 113 | raw_entity_text = doc_text[offset_s:offset_e] 114 | 115 | if "\n" in raw_entity_text: 116 | idx = raw_entity_text.index("\n") 117 | offset_s = "{} {};{}".format(offset_s, offset_s+idx, offset_s+idx+1) 118 | raw_entity_text = raw_entity_text.replace("\n", " ") 119 | 120 | if file_suffix == "ann": 121 | formatted_output = 
output_template.format("T{}".format(idx+1), sem_tag, offset_s, offset_e, raw_entity_text) 122 | elif file_suffix == "xml": 123 | formatted_output = output_template.format(a=idx+1, b=raw_entity_text, c=offset_s, d=offset_e-offset_s, e=sem_tag) 124 | else: 125 | formatted_output = None 126 | print('formatted output is None due to unknown formatter code') 127 | 128 | output_entities.append(formatted_output) 129 | 130 | if do_copy_text: 131 | new_text_file = p_output / "{}.txt".format(ifn_stem) 132 | shutil.copy2(doc_text_file.as_posix(), new_text_file.as_posix()) 133 | 134 | with open(ofn, "w") as f: 135 | formatted_output = "\n".join(output_entities) 136 | if file_suffix == "xml": 137 | formatted_output = BIOC_HEADER.format(ifn.stem) + formatted_output + BIOC_END 138 | f.write(formatted_output) 139 | f.write("\n") 140 | except Exception as ex: 141 | traceback.print_exc() 142 | 143 | 144 | def main(text_dir=None, input_bio_dir=None, output_dir=None, formatter=1, do_copy_text=True): 145 | if formatter == 1: 146 | bio2output(text_dir, input_bio_dir, output_dir, BRAT_TEMPLATE, do_copy_text, file_suffix="ann") 147 | elif formatter == 2: 148 | bio2output(text_dir, input_bio_dir, output_dir, BIOC_TEMPLATE, do_copy_text, file_suffix='xml') 149 | else: 150 | raise RuntimeError("Only support formatter as 1 and 2 but get {}; see help for more information.".format(formatter)) 151 | -------------------------------------------------------------------------------- /scipts/training_ner.py: -------------------------------------------------------------------------------- 1 | #create training and test bio for NER 2 | import sys 3 | sys.path.append("../ClinicalTransformerNER/") 4 | sys.path.append("../NLPreprocessing/") 5 | import os 6 | from pathlib import Path 7 | from collections import defaultdict, Counter 8 | import numpy as np 9 | from sklearn.model_selection import train_test_split 10 | import shutil 11 | import fileinput 12 | from annotation2BIO import generate_BIO, pre_processing, read_annotation_brat, BIOdata_to_file 13 | MIMICIII_PATTERN = "\[\*\*|\*\*\]" 14 | 15 | data_dir=sys.argv[1] 16 | #output_name='test' 17 | 18 | #data stat 19 | file_ids = set() 20 | enss = [] 21 | 22 | for fn in Path(data_dir).glob("*.ann"): 23 | file_ids.add(fn.stem) 24 | _, ens, _ = read_annotation_brat(fn) 25 | #print( _) 26 | enss.extend(ens) 27 | print("test files: ", len(file_ids), list(file_ids)[:5]) 28 | print("total test eneitites: ", len(enss)) 29 | print("Entities distribution by types:\n", "\n".join([str(c) for c in Counter([each[1] for each in enss]).most_common()])) 30 | # generate bio 31 | file_ids = list(file_ids) 32 | train_dev_ids, test_ids = train_test_split(file_ids, train_size=0.75, random_state=13, shuffle=True)#use 150 for training 33 | print('length of training and test') 34 | len(train_dev_ids), len(test_ids) 35 | train_dev_root = Path('../data/training_set_150') 36 | test_root = Path('../data/test_set_150') 37 | #create notes file 38 | Path(train_dev_root).mkdir(parents=True, exist_ok=True) 39 | Path(test_root).mkdir(parents=True, exist_ok=True) 40 | train_root=Path(data_dir) 41 | #copy file to train and test 42 | for fid in train_dev_ids: 43 | txt_fn = train_root / (fid + ".txt") 44 | ann_fn = train_root / (fid + ".ann") 45 | txt_fn1 = train_dev_root / (fid + ".txt") 46 | ann_fn1 = train_dev_root / (fid + ".ann") 47 | shutil.copyfile(txt_fn, txt_fn1) 48 | shutil.copyfile(ann_fn, ann_fn1) 49 | for fid in test_ids: 50 | txt_fn = train_root / (fid + ".txt") 51 | ann_fn = train_root / (fid + ".ann") 52 
| txt_fn1 = test_root / (fid + ".txt") 53 | ann_fn1 = test_root / (fid + ".ann") 54 | shutil.copyfile(txt_fn, txt_fn1) 55 | shutil.copyfile(ann_fn, ann_fn1) 56 | 57 | train_dev_ids = list(train_dev_ids) 58 | train_ids, dev_ids = train_test_split(train_dev_ids, train_size=0.9, random_state=13, shuffle=True) 59 | test_bio = "../bio/"+'bio_test_150' 60 | training_bio = "../bio/"+'bio_training_150' 61 | output_root1 = Path(test_bio) 62 | output_root2 = Path(training_bio) 63 | output_root1.mkdir(parents=True, exist_ok=True) 64 | output_root2.mkdir(parents=True, exist_ok=True) 65 | 66 | for fid in train_dev_ids: 67 | txt_fn = train_dev_root / (fid + ".txt") 68 | ann_fn = train_dev_root / (fid + ".ann") 69 | bio_fn = output_root2 / (fid + ".bio.txt") 70 | 71 | txt, sents = pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN) 72 | e2idx, entities, rels = read_annotation_brat(ann_fn) 73 | nsents, sent_bound = generate_BIO(sents, entities, file_id=fid, no_overlap=False) 74 | #print(nsents) 75 | #print(bio_fn) 76 | #break 77 | BIOdata_to_file(bio_fn, nsents) 78 | # train 79 | with open(training_bio+"/train.txt", "w") as f: 80 | for fid in train_ids: 81 | f.writelines(fileinput.input(output_root2 / (fid + ".bio.txt"))) 82 | fileinput.close() 83 | 84 | # dev 85 | with open(training_bio+"/dev.txt", "w") as f: 86 | for fid in dev_ids: 87 | f.writelines(fileinput.input(output_root2 / (fid + ".bio.txt"))) 88 | fileinput.close() 89 | 90 | #test 91 | for fn in test_root.glob("*.txt"): 92 | txt_fn = fn 93 | bio_fn = output_root1 / (fn.stem + ".bio.txt") 94 | 95 | txt, sents = pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN) 96 | nsents, sent_bound = generate_BIO(sents, [], file_id=txt_fn, no_overlap=False) 97 | 98 | BIOdata_to_file(bio_fn, nsents) 99 | 100 | #same process but have train test split as 1:1 101 | train_dev_ids, test_ids = train_test_split(file_ids, train_size=0.5, random_state=13, shuffle=True)#use 8:2 split 102 | print('length of training and test') 103 | len(train_dev_ids), len(test_ids) 104 | train_dev_root = Path('../data/training_set_100') 105 | test_root = Path('../data/test_set_100') 106 | #create notes file 107 | Path(train_dev_root).mkdir(parents=True, exist_ok=True) 108 | Path(test_root).mkdir(parents=True, exist_ok=True) 109 | train_root=Path(data_dir) 110 | #copy file to train and test 111 | for fid in train_dev_ids: 112 | txt_fn = train_root / (fid + ".txt") 113 | ann_fn = train_root / (fid + ".ann") 114 | txt_fn1 = train_dev_root / (fid + ".txt") 115 | ann_fn1 = train_dev_root / (fid + ".ann") 116 | shutil.copyfile(txt_fn, txt_fn1) 117 | shutil.copyfile(ann_fn, ann_fn1) 118 | for fid in test_ids: 119 | txt_fn = train_root / (fid + ".txt") 120 | ann_fn = train_root / (fid + ".ann") 121 | txt_fn1 = test_root / (fid + ".txt") 122 | ann_fn1 = test_root / (fid + ".ann") 123 | shutil.copyfile(txt_fn, txt_fn1) 124 | shutil.copyfile(ann_fn, ann_fn1) 125 | 126 | train_dev_ids = list(train_dev_ids) 127 | train_ids, dev_ids = train_test_split(train_dev_ids, train_size=0.9, random_state=13, shuffle=True) 128 | test_bio = "../bio/"+'bio_test_100' 129 | training_bio = "../bio/"+'bio_training_100' 130 | output_root1 = Path(test_bio) 131 | output_root2 = Path(training_bio) 132 | output_root1.mkdir(parents=True, exist_ok=True) 133 | output_root2.mkdir(parents=True, exist_ok=True) 134 | 135 | for fid in train_dev_ids: 136 | txt_fn = train_dev_root / (fid + ".txt") 137 | ann_fn = train_dev_root / (fid + ".ann") 138 | bio_fn = output_root2 / (fid + ".bio.txt") 139 | 140 | txt, sents = 
pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN) 141 | e2idx, entities, rels = read_annotation_brat(ann_fn) 142 | nsents, sent_bound = generate_BIO(sents, entities, file_id=fid, no_overlap=False) 143 | #print(nsents) 144 | #print(bio_fn) 145 | #break 146 | BIOdata_to_file(bio_fn, nsents) 147 | # train 148 | with open(training_bio+"/train.txt", "w") as f: 149 | for fid in train_ids: 150 | f.writelines(fileinput.input(output_root2 / (fid + ".bio.txt"))) 151 | fileinput.close() 152 | 153 | # dev 154 | with open(training_bio+"/dev.txt", "w") as f: 155 | for fid in dev_ids: 156 | f.writelines(fileinput.input(output_root2 / (fid + ".bio.txt"))) 157 | fileinput.close() 158 | 159 | #test 160 | for fn in test_root.glob("*.txt"): 161 | txt_fn = fn 162 | bio_fn = output_root1 / (fn.stem + ".bio.txt") 163 | 164 | txt, sents = pre_processing(txt_fn, deid_pattern=MIMICIII_PATTERN) 165 | nsents, sent_bound = generate_BIO(sents, [], file_id=txt_fn, no_overlap=False) 166 | 167 | BIOdata_to_file(bio_fn, nsents) -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/run_transformer_batch_prediction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | The input files must have offset information. In input file, for each word in line, it must have at least text, start, end, tag information 5 | output file suffix will be set to .bio.txt 6 | """ 7 | 8 | import torch 9 | import argparse 10 | import os 11 | import traceback 12 | from pathlib import Path 13 | 14 | from common_utils.common_io import json_load, output_bio 15 | from transformer_ner.data_utils import TransformerNerDataProcessor, transformer_convert_data_to_features 16 | from transformer_ner.task import load_model, predict, MODEL_CLASSES, _output_bio 17 | from transformer_ner.transfomer_log import TransformerNERLogger 18 | from common_utils.common_log import LOG_LVLs 19 | from common_utils.output_format_converter import main as format_converter 20 | 21 | import transformers 22 | from packaging import version 23 | 24 | 25 | pytorch_version = version.parse(transformers.__version__) 26 | assert pytorch_version >= version.parse('3.0.0'), \ 27 | 'we now only support transformers version >=3.0.0, but your version is {}'.format(pytorch_version) 28 | 29 | 30 | def main(args): 31 | label2idx = json_load(os.path.join(args.pretrained_model, "label2idx.json")) 32 | num_labels = len(label2idx) 33 | idx2label = {v: k for k, v in label2idx.items()} 34 | args.label2idx = label2idx 35 | args.idx2label = idx2label 36 | # get config, model and tokenizer 37 | model_config, _, model_tokenizer = MODEL_CLASSES[args.model_type] 38 | tokenizer = model_tokenizer.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case) 39 | args.tokenizer = tokenizer 40 | config = model_config.from_pretrained(args.pretrained_model, do_lower_case=args.do_lower_case) 41 | args.config = config 42 | args.use_crf = config.use_crf 43 | model = load_model(args, args.pretrained_model) 44 | model.to(args.device) 45 | 46 | ner_data_processor = TransformerNerDataProcessor() 47 | ner_data_processor.set_logger(args.logger) 48 | ner_data_processor.set_data_dir(args.preprocessed_text_dir) 49 | if args.data_has_offset_information: 50 | ner_data_processor.offset_info_available() 51 | 52 | # fids = [each.stem.split(".")[0] for each in Path(args.preprocessed_text_dir).glob("*.txt")] 53 | for each_file in Path(args.preprocessed_text_dir).glob("*.txt"): 54 | try: 55 | 
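# for each preprocessed BIO file: build the test examples, convert them to features,
# run prediction, and write the results to <doc>.bio.txt under output_dir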
test_example = ner_data_processor.get_test_examples(file_name=each_file.name) 56 | test_features = transformer_convert_data_to_features(args=args, 57 | input_examples=test_example, 58 | label2idx=label2idx, 59 | tokenizer=tokenizer, 60 | max_seq_len=args.max_seq_length) 61 | predictions = predict(args, model, test_features) 62 | Path(args.output_dir).mkdir(parents=True, exist_ok=True) 63 | ofn = each_file.stem.split(".")[0] + ".bio.txt" 64 | args.predict_output_file = os.path.join(args.output_dir, ofn) 65 | _output_bio(args, test_example, predictions) 66 | except Exception as ex: 67 | args.logger.error(f"Encountered an error when processing predictions for file: {each_file.name}") 68 | args.logger.error(traceback.format_exc()) 69 | 70 | if args.do_format: 71 | base_path = Path(args.output_dir) 72 | output_formatted_dir = base_path.parent / f"{base_path.stem}_formatted_output" 73 | output_formatted_dir.mkdir(parents=True, exist_ok=True) 74 | format_converter(text_dir=args.raw_text_dir, 75 | input_bio_dir=args.output_dir, 76 | output_dir=output_formatted_dir, 77 | formatter=args.do_format, 78 | do_copy_text=args.do_copy) 79 | 80 | 81 | if __name__ == '__main__': 82 | parser = argparse.ArgumentParser() 83 | 84 | parser.add_argument("--model_type", default='bert', type=str, required=True, 85 | help="valid values: bert, roberta or xlnet, albert, distilbert") 86 | parser.add_argument("--pretrained_model", type=str, required=True, 87 | help="The pretrained model file or directory for fine tuning.") 88 | parser.add_argument("--preprocessed_text_dir", type=str, required=True, 89 | help="The input data directory.") 90 | parser.add_argument("--raw_text_dir", type=str, required=True, 91 | help="The input data directory.") 92 | parser.add_argument("--data_has_offset_information", action='store_true', 93 | help="The input data directory.") 94 | parser.add_argument("--output_dir", type=str, required=True, 95 | help="The output data directory.") 96 | parser.add_argument("--do_lower_case", action='store_true', 97 | help="Set this flag if you are using an uncased model.") 98 | parser.add_argument("--eval_batch_size", default=8, type=int, 99 | help="Total batch size for eval.") 100 | parser.add_argument("--max_seq_length", default=128, type=int, 101 | help="maximum number of tokens allowed in each sentence") 102 | parser.add_argument("--log_file", default=None, 103 | help="where to save the log information") 104 | parser.add_argument("--log_lvl", default="i", type=str, 105 | help="d=DEBUG; i=INFO; w=WARNING; e=ERROR") 106 | parser.add_argument("--do_format", default=0, type=int, 107 | help="0=bio (not format change will be applied); 1=brat; 2=bioc") 108 | parser.add_argument("--do_copy", action='store_true', 109 | help="if copy the original plain text to output folder") 110 | parser.add_argument("--progress_bar", action='store_true', 111 | help="show progress during the training in tqdm") 112 | parser.add_argument("--use_crf", action='store_true', 113 | help="Whether to use crf layer as classifier.") 114 | 115 | global_args = parser.parse_args() 116 | # create logger 117 | logger = TransformerNERLogger(global_args.log_file, global_args.log_lvl).get_logger() 118 | global_args.logger = logger 119 | # device 120 | global_args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 121 | logger.info("Task will use cuda device: GPU_{}.".format(torch.cuda.current_device()) if torch.cuda.device_count() else 'Task will use CPU.') 122 | 123 | main(global_args) 124 | 
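
For reference, the same batch-prediction entry point can also be driven programmatically rather than through the command line. The following is a minimal sketch and not part of the original repository: it assumes the package's src directory is on PYTHONPATH so that main can be imported from this module, and every path below is a placeholder that simply mirrors the flags defined above.

```python
import torch
from argparse import Namespace

from transformer_ner.transfomer_log import TransformerNERLogger
from run_transformer_batch_prediction import main  # assumes src/ is on PYTHONPATH

# mirror the CLI flags defined above; all paths are placeholders
args = Namespace(
    model_type="bert",
    pretrained_model="../models/SDOH_bert_updated_150",  # dir containing label2idx.json
    preprocessed_text_dir="../bio/bio_test_150",          # BIO files with offset info
    raw_text_dir="../data/test_set_150",                  # original .txt notes
    output_dir="../result/training_result_150",
    data_has_offset_information=True,
    do_lower_case=True,
    eval_batch_size=8,
    max_seq_length=128,
    do_format=1,        # 0 = keep BIO only, 1 = brat, 2 = BioC
    do_copy=True,       # copy the raw .txt next to the formatted output
    use_crf=False,      # overwritten by the saved model config inside main()
    log_file=None,
    log_lvl="i",
    progress_bar=False,
)
args.logger = TransformerNERLogger(args.log_file, args.log_lvl).get_logger()
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
main(args)
```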
-------------------------------------------------------------------------------- /ClinicalTransformerNER/src/run_transformer_ner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import torch 6 | from transformer_ner.task import run_task 7 | from transformer_ner.transfomer_log import TransformerNERLogger 8 | from traceback import format_exc 9 | 10 | from packaging import version 11 | import transformers 12 | 13 | 14 | pytorch_version = version.parse(transformers.__version__) 15 | assert pytorch_version >= version.parse('3.0.0'), \ 16 | 'we now only support transformers version >=3.0.0, but your version is {}'.format(pytorch_version) 17 | 18 | 19 | def main(): 20 | parser = argparse.ArgumentParser() 21 | 22 | # add arguments 23 | parser.add_argument("--model_type", default='bert', type=str, required=True, 24 | help="valid values: bert, roberta or xlnet") 25 | parser.add_argument("--pretrained_model", type=str, required=True, 26 | help="The pretrained model file or directory for fine tuning.") 27 | parser.add_argument("--config_name", default=None, type=str, 28 | help="Pretrained config name or path if not the same as pretrained_model") 29 | parser.add_argument("--tokenizer_name", default=None, type=str, 30 | help="Pretrained tokenizer name or path if not the same as pretrained_model") 31 | parser.add_argument("--data_dir", type=str, required=True, 32 | help="The input data directory.") 33 | parser.add_argument("--data_has_offset_information", action='store_true', 34 | help="The input data directory.") 35 | parser.add_argument("--new_model_dir", type=str, required=True, 36 | help="directory for saving new model checkpoints (keep latest n only)") 37 | parser.add_argument("--save_model_core", action='store_true', 38 | help="""save the transformer core of the model 39 | which allows model to be used as base model for further pretraining""") 40 | parser.add_argument("--predict_output_file", type=str, default=None, 41 | help="predicted results output file.") 42 | parser.add_argument('--overwrite_model_dir', action='store_true', 43 | help="Overwrite the content of the new model directory") 44 | parser.add_argument("--seed", default=3, type=int, 45 | help='random seed') 46 | parser.add_argument("--max_seq_length", default=128, type=int, 47 | help="maximum number of tokens allowed in each sentence") 48 | parser.add_argument("--do_train", action='store_true', 49 | help="Whether to run training.") 50 | parser.add_argument("--model_selection_scoring", default='strict-f_score-1', type=str, 51 | help="""The scoring methos used to select model on dev dataset 52 | only support strict-f_score-n, relax-f_score-n (n is 0.5, 1, or 2)""") 53 | parser.add_argument("--do_predict", action='store_true', 54 | help="Whether to run prediction on the test set.") 55 | parser.add_argument("--use_crf", action='store_true', 56 | help="Whether to use crf layer as classifier.") 57 | parser.add_argument("--do_lower_case", action='store_true', 58 | help="Set this flag if you are using an uncased model.") 59 | parser.add_argument("--train_batch_size", default=8, type=int, 60 | help="The batch size for training.") 61 | parser.add_argument("--eval_batch_size", default=8, type=int, 62 | help="The batch size for eval.") 63 | parser.add_argument('--train_steps', type=int, default=-1, 64 | help="Number of trianing steps between two evaluations on the dev set; if <0 then evaluate after each epoch") 65 | 
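# note: the effective batch size is train_batch_size * gradient_accumulation_steps,
# since gradients are accumulated for that many steps before each optimizer update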
parser.add_argument("--learning_rate", default=1e-5, type=float, 66 | help="The initial learning rate for Adam.") 67 | parser.add_argument("--num_train_epochs", default=10, type=float, 68 | help="Total number of training epochs to perform.") 69 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 70 | help="Number of updates steps to accumulate before performing a backward/update pass.") 71 | parser.add_argument("--do_warmup", action='store_true', 72 | help='Whether to apply warmup strategy in optimizer.') 73 | parser.add_argument("--warmup_ratio", default=0.1, type=float, 74 | help="Linear warmup over warmup_ratio.") 75 | parser.add_argument("--weight_decay", default=0.0, type=float, 76 | help="Weight deay if we apply some.") 77 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 78 | help="Epsilon for Adam optimizer.") 79 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 80 | help="Max gradient norm.") 81 | parser.add_argument("--max_num_checkpoints", default=3, type=int, 82 | help="max number of checkpoints saved during training, old checkpoints will be removed.") 83 | parser.add_argument("--log_file", default=None, 84 | help="where to save the log information") 85 | parser.add_argument("--log_lvl", default="i", type=str, 86 | help="d=DEBUG; i=INFO; w=WARNING; e=ERROR") 87 | parser.add_argument("--progress_bar", action='store_true', 88 | help="show progress during the training in tqdm") 89 | parser.add_argument("--early_stop", default=-1, type=int, 90 | help="""The training will stop after num of epoch without performance improvement. If set to 0 or -1, then not use early stop.""") 91 | 92 | # fp16 and distributed training 93 | parser.add_argument('--fp16', action='store_true', 94 | help="Whether to use 16-bit float precision instead of 32-bit") 95 | # parser.add_argument("--fp16_opt_level", type=str, default="O1", 96 | # help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
97 | # "See details at https://nvidia.github.io/apex/amp.html") 98 | # parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") 99 | # parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") 100 | # parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") 101 | 102 | global_args = parser.parse_args() 103 | 104 | # create logger 105 | logger = TransformerNERLogger(global_args.log_file, global_args.log_lvl).get_logger() 106 | global_args.logger = logger 107 | 108 | # set and check cuda (we recommend to set up CUDA device in shell) 109 | # os.environ['CUDA_VISIBLE_DEVICES'] = global_args.cuda_ids 110 | global_args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 111 | logger.info("Task will use cuda device: GPU_{}.".format(torch.cuda.current_device()) if torch.cuda.device_count() else 'Task will use CPU.') 112 | 113 | # if args.tokenizer_name and args.config_name are not specially set, set them as pretrained_model 114 | if not global_args.tokenizer_name: 115 | global_args.tokenizer_name = global_args.pretrained_model 116 | logger.warning("set tokenizer as {}".format(global_args.tokenizer_name)) 117 | 118 | if not global_args.config_name: 119 | global_args.config_name = global_args.pretrained_model 120 | logger.warning("set config as {}".format(global_args.config_name)) 121 | 122 | if global_args.do_predict and not global_args.predict_output_file: 123 | raise RuntimeError("Running prediction but predict output file is not set.") 124 | 125 | try: 126 | run_task(global_args) 127 | except Exception as ex: 128 | logger.error(format_exc()) 129 | 130 | 131 | if __name__ == '__main__': 132 | main() 133 | -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/relation_extraction.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import numpy as np 4 | import random 5 | from utils import TransformerLogger 6 | from task import TaskRunner 7 | from pathlib import Path 8 | from data_processing.io_utils import save_text 9 | import traceback 10 | 11 | 12 | def set_seed(gargs): 13 | random.seed(gargs.seed) 14 | np.random.seed(gargs.seed) 15 | torch.manual_seed(gargs.seed) 16 | 17 | 18 | def app(gargs): 19 | set_seed(gargs) 20 | 21 | # do_eval is used with do_train in most cases for 5-CV 22 | if gargs.do_eval and not gargs.do_train: 23 | raise RuntimeError("Evaluation mode (do_eval) is only available when do_train is used.\n" 24 | "You may want to use do_predict instead.") 25 | 26 | # make it case in-sensitive 27 | gargs.model_type = gargs.model_type.lower() 28 | task_runner = TaskRunner(gargs) 29 | task_runner.task_runner_default_init() 30 | 31 | if gargs.do_train: 32 | if Path(gargs.new_model_dir).exists() and not gargs.overwrite_model_dir: 33 | raise RuntimeError("{} is exist and overwrite this dir is not permitted.".format(gargs.new_model_dir)) 34 | 35 | # training 36 | try: 37 | task_runner.train() 38 | except Exception as ex: 39 | gargs.logger.error("Training error:\n{}".format(traceback.format_exc())) 40 | traceback.print_exc() 41 | raise RuntimeError() 42 | 43 | if gargs.do_predict: 44 | # run prediction 45 | try: 46 | preds = task_runner.predict() 47 | except Exception as ex: 48 | gargs.logger.error("Prediction error:\n{}".format(traceback.format_exc())) 49 | raise 
RuntimeError(traceback.format_exc()) 50 | 51 | pred_res = "\n".join([str(pred) for pred in preds]) 52 | 53 | # predict_output_file must be a file, we will create parent dir automatically 54 | Path(gargs.predict_output_file).parent.mkdir(parents=True, exist_ok=True) 55 | save_text(pred_res, gargs.predict_output_file) 56 | 57 | 58 | if __name__ == '__main__': 59 | parser = argparse.ArgumentParser() 60 | # parse arguments 61 | parser.add_argument("--model_type", default='bert', type=str, required=True, 62 | help="valid values: bert, roberta, albert or xlnet") 63 | parser.add_argument("--data_format_mode", default=0, type=int, 64 | help="valid values: 0: sep mode - [CLS]S1[SEP]S2[SEP]; 1: uni mode - [CLS]S1S2[SEP]") 65 | parser.add_argument("--classification_scheme", default=2, type=int, 66 | help="special tokens used for classification. " 67 | "Valid values: " 68 | "0: [CLS]; 1: [CLS], [S1], [S2]; 2: [CLS], [S1], [S2], [E1], [E2]; 3: [S1], [S2]") 69 | parser.add_argument("--pretrained_model", type=str, 70 | help="The pretrained model file or directory for fine tuning.") 71 | parser.add_argument("--data_dir", type=str, required=True, 72 | help="The input data directory. Should have at least a file named train.tsv") 73 | parser.add_argument("--new_model_dir", type=str, required=True, 74 | help="directory for saving new model checkpoints (keep latest n only)") 75 | parser.add_argument("--predict_output_file", type=str, default=None, 76 | help="predicted results output file.") 77 | parser.add_argument('--overwrite_model_dir', action='store_true', 78 | help="Overwrite the content of the new model directory") 79 | parser.add_argument("--seed", default=1234, type=int, 80 | help='random seed') 81 | parser.add_argument("--max_seq_length", default=512, type=int, 82 | help="maximum number of tokens allowed in each sentence") 83 | parser.add_argument("--cache_data", action='store_true', 84 | help="Whether to cache the features after tokenization (save training initialization time)") 85 | parser.add_argument("--data_file_header", default=True, type=bool, 86 | help="flag used to define whether the data tsv file has header or not. " 87 | "If has header, we will skip the first line") 88 | parser.add_argument("--do_train", action='store_true', 89 | help="Whether to run training.") 90 | parser.add_argument("--do_eval", action='store_true', 91 | help="Whether to run evaluation on dev. (require dev.tsv)") 92 | parser.add_argument("--do_predict", action='store_true', 93 | help="Whether to run prediction on the test set. 
(require test.tsv)") 94 | parser.add_argument("--do_lower_case", action='store_true', 95 | help="Set this flag if you are using an uncased model.") 96 | parser.add_argument("--train_batch_size", default=8, type=int, 97 | help="The batch size for training.") 98 | parser.add_argument("--eval_batch_size", default=8, type=int, 99 | help="The batch size for eval.") 100 | parser.add_argument("--learning_rate", default=1e-5, type=float, 101 | help="The initial learning rate for Adam.") 102 | parser.add_argument("--num_train_epochs", default=10, type=int, 103 | help="Total number of training epochs to perform.") 104 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 105 | help="Number of updates steps to accumulate before performing a backward/update pass.") 106 | parser.add_argument("--do_warmup", action='store_true', 107 | help='Whether to apply warmup strategy in optimizer.') 108 | parser.add_argument("--warmup_ratio", default=0.1, type=float, 109 | help="Linear warmup over warmup_ratio.") 110 | parser.add_argument("--weight_decay", default=0.0, type=float, 111 | help="Weight deay if we apply some.") 112 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 113 | help="Epsilon for Adam optimizer.") 114 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 115 | help="Max gradient norm.") 116 | parser.add_argument("--max_num_checkpoints", default=0, type=int, 117 | help="max number of checkpoints saved during training, old checkpoints will be removed." 118 | "if 0, then only save the last one at the end of training") 119 | parser.add_argument("--log_file", default=None, 120 | help="where to save the log information") 121 | parser.add_argument("--log_lvl", default="i", type=str, 122 | help="d=DEBUG; i=INFO; w=WARNING; e=ERROR") 123 | parser.add_argument("--log_step", default=1000, type=int, 124 | help="logging after how many steps of training. If < 0, no log during training") 125 | parser.add_argument("--num_core", default=1, type=int, 126 | help="how many cores used for multiple process for data generation") 127 | parser.add_argument("--non_relation_label", default="NonRel", type=str, 128 | help="The label used for representing " 129 | "candidate entity pairs that is not a true relation (negative sample)") 130 | parser.add_argument("--progress_bar", action='store_true', 131 | help="show progress during the training in tqdm") 132 | parser.add_argument('--fp16', action='store_true', 133 | help="Whether to use 16-bit float precision instead of 32-bit") 134 | parser.add_argument("--fp16_opt_level", type=str, default="O1", 135 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
136 | "See details at https://nvidia.github.io/apex/amp.html") 137 | 138 | args = parser.parse_args() 139 | 140 | # other setup 141 | args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 142 | args.logger = TransformerLogger(logger_file=args.log_file, logger_level=args.log_lvl).get_logger() 143 | app(args) 144 | -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/data_processing/post_processing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Post processing 3 | 4 | Using this script to merge the system prediction with the entities 5 | The output format will be in BRAT 6 | 7 | We will automatically align the predictions to entity pairs and file ids 8 | The results will be write out with the entity information into a new file 9 | We will not copy the original text to the results output dir 10 | """ 11 | import argparse 12 | from pathlib import Path 13 | import numpy as np 14 | from io_utils import load_text, save_text, pkl_load 15 | from collections import defaultdict 16 | from data_format_conf import NON_RELATION_TAG, BRAT_REL_TEMPLATE 17 | import traceback 18 | 19 | # import logger from upper level dir 20 | import os 21 | import sys 22 | sys.path.append(Path(os.path.abspath(__file__)).parent.parent.as_posix()) 23 | from utils import TransformerLogger 24 | 25 | 26 | def load_mappings(map_file): 27 | maps = [] 28 | text = load_text(map_file) 29 | for idx, line in enumerate(text.strip().split("\n")): 30 | if idx == 0: 31 | continue 32 | info = line.split("\t") 33 | maps.append(info[-3:]) 34 | 35 | return maps 36 | 37 | 38 | def load_predictions(result_file): 39 | results = [] 40 | text = load_text(result_file) 41 | for each in text.strip().split("\n"): 42 | results.append(each.strip()) 43 | 44 | return results 45 | 46 | 47 | def map_results(res): 48 | mapped_preds = defaultdict(list) 49 | prev_fid = "no previous file id" 50 | rel_idx = 1 51 | 52 | for each in res: 53 | fid, rt, arg1, arg2 = each 54 | if prev_fid != fid: 55 | prev_fid = fid 56 | rel_idx = 1 57 | brat_res = BRAT_REL_TEMPLATE.format(rel_idx, rt, arg1, arg2) 58 | mapped_preds[fid].append(brat_res) 59 | rel_idx += 1 60 | 61 | return mapped_preds 62 | 63 | 64 | def output_results(mapped_predictions, entity_data_dir, output_dir): 65 | entity_data_dir = Path(entity_data_dir) 66 | 67 | output_dir = Path(output_dir) 68 | output_dir.mkdir(parents=True, exist_ok=True) 69 | 70 | for fid in entity_data_dir.glob("*.ann"): 71 | fid_key = fid.stem 72 | ofn = output_dir / "{}.ann".format(fid_key) 73 | entities = load_text(fid).strip() 74 | if fid_key in mapped_predictions: 75 | rels = mapped_predictions[fid_key] 76 | rels = "\n".join(rels) 77 | outputs = "\n".join([entities, rels]) 78 | save_text(outputs, ofn) 79 | else: 80 | save_text(entities, ofn) 81 | 82 | 83 | def combine_maps_predictions_mul(args): 84 | comb_map_pred = [] 85 | 86 | for mf, pf in zip(args.test_data_file, args.predict_result_file): 87 | maps = load_mappings(mf) 88 | preds = load_predictions(pf) 89 | llp = len(preds) 90 | llm = len(maps) 91 | assert llp == llm, \ 92 | f"prediction results and mappings should have same amount data, but got preds: {llp} and maps: {llm}" 93 | for m, rel_type in zip(maps, preds): 94 | if rel_type == NON_RELATION_TAG: 95 | continue 96 | arg1, arg2, fid = m 97 | comb_map_pred.append((fid, rel_type, arg1, arg2)) 98 | 99 | comb_map_pred.sort(key=lambda x: x[0]) 100 | return comb_map_pred 101 | 102 | 103 | 
def load_mappings_bin(map_file): 104 | maps = [] 105 | text = load_text(map_file) 106 | for idx, line in enumerate(text.strip().split("\n")): 107 | if idx == 0: 108 | continue 109 | info = line.split("\t") 110 | maps.append(info[-5:]) 111 | 112 | return maps 113 | 114 | 115 | def combine_maps_predictions_bin(args): 116 | if not args.type_map: 117 | raise RuntimeError("no type maps (entity-relation) provided. See help.") 118 | type_maps = pkl_load(args.type_map) 119 | 120 | comb_map_pred = [] 121 | 122 | for mf, pf in zip(args.test_data_file, args.predict_result_file): 123 | maps = load_mappings_bin(mf) 124 | preds = load_predictions(pf) 125 | llp = len(preds) 126 | llm = len(maps) 127 | assert llp == llm, \ 128 | f"prediction results and mappings should have same amount data, but got preds: {llp} and maps: {llm}" 129 | for m, rel_type in zip(maps, preds): 130 | if rel_type == NON_RELATION_TAG: 131 | continue 132 | en_type_1, en_type_2, arg1, arg2, fid = m 133 | real_rel_type = type_maps[(en_type_1, en_type_2)] 134 | comb_map_pred.append((fid, real_rel_type, arg1, arg2)) 135 | 136 | comb_map_pred.sort(key=lambda x: x[0]) 137 | return comb_map_pred 138 | 139 | 140 | def app(args): 141 | lltf = len(args.test_data_file) 142 | llpf = len(args.predict_result_file) 143 | 144 | args.logger.info("mode: {}; predict file: {}; output: {}".format( 145 | args.mode, 146 | args.predict_result_file, 147 | args.brat_result_output_dir 148 | )) 149 | 150 | try: 151 | assert lltf == llpf 152 | except AssertionError as ex: 153 | args.logger.error( 154 | f"test and prediction file number should be same but get test: {lltf} and preduction {llpf}.") 155 | raise RuntimeError( 156 | f"test and prediction file number should be same but get test: {lltf} and preduction {llpf}.") 157 | 158 | if args.mode == "mul": 159 | combined_results = combine_maps_predictions_mul(args) 160 | elif args.mode == "bin": 161 | combined_results = combine_maps_predictions_bin(args) 162 | else: 163 | args.logger.error("expect mode to be mul or bin but get {}".format(args.mode)) 164 | raise RuntimeError("expect mode to be mul or bin but get {}".format(args.mode)) 165 | 166 | try: 167 | combined_results = map_results(combined_results) 168 | output_results(combined_results, args.entity_data_dir, args.brat_result_output_dir) 169 | except Exception as ex: 170 | args.logger.error(traceback.print_exc()) 171 | 172 | 173 | if __name__ == '__main__': 174 | parser = argparse.ArgumentParser() 175 | # parse arguments 176 | """ 177 | To input multiple test data and prediction files, using following syntax in terminal; 178 | You need to make sure the files order between test and prediction is correct 179 | 180 | bash: 181 | python post_processing.py --test_data_file tf1.txt --test_data_file tf2.txt --predict_result_file res1.txt 182 | --predict_result_file res2.txx 183 | 184 | in the program: 185 | args.test_data_file = ['tf1.txt', 'tf2.txt'] 186 | args.predict_result_file = ['res1.txt', 'res2.txt'] 187 | 188 | if use bin model, you need a map file to map positive relation to its relation type. 
189 | We use entity type pair as key to conduct this mapping 190 | example: 191 | (ADE, Drug): Drug-ADE 192 | """ 193 | parser.add_argument("--mode", type=str, default='mul', required=True, 194 | help="we have two mode for binary (bin) and multiple (mul) classes classification") 195 | parser.add_argument("--type_map", type=str, default=None, 196 | help="a map of entity pair types to relation types (only use when mode is bin)") 197 | parser.add_argument("--test_data_file", type=str, nargs='+', required=True, 198 | help="The test data file in which we need to read the maps; available to accept multiple files") 199 | parser.add_argument("--entity_data_dir", type=str, required=True, 200 | help="The annotation files with all the entities") 201 | parser.add_argument("--predict_result_file", nargs='+', type=str, required=True, 202 | help="prediction results; available to accept multiple files") 203 | parser.add_argument("--brat_result_output_dir", type=str, required=True, 204 | help="prediction results") 205 | parser.add_argument("--log_file", default="./log.txt", 206 | help="where to save the log information") 207 | pargs = parser.parse_args() 208 | 209 | pargs.logger = TransformerLogger(logger_file=pargs.log_file, 210 | logger_level='i').get_logger() 211 | 212 | app(pargs) 213 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/README.md: -------------------------------------------------------------------------------- 1 | # Clinical Transformer NER 2 | 3 | ## Aim 4 | The package is the implementation of a transformer based NER system for clinical information extraction task. We aim to provide a simple and quick tool for researchers to conduct clinical NER without comprehensive knowledge of transformers. We also implemented a strategy to handle the sequence with length longer than the general transformer limits (512 tokens) without truncating any tokens. 5 | 6 | ## Current available models 7 | - BERT (base, large, mimiciii-pretrained) 8 | - RoBERTa (base, large, mimiciii-pretrained) 9 | - ALBERT (base, large, xlarge, xxlarge, mimiciii-pretrained) 10 | - ELECTRA (base, large, mimiciii-pretrained) 11 | - DistilBERT (base) 12 | - XLNet (base, large, mimiciii-pretrained) 13 | - Longformer (allenai/longformer-base-4096, allenai/longformer-large-4096) 14 | - DeBERTa (microsoft/deberta-base, microsoft/deberta-large, microsoft/deberta-xlarge) 15 | > note: 1. all mimic-pretrained models are based on base transformer architecture (Download is available in the section MIMIC-III pre-trained models); 2. 
DeBERTa is not support xlarge-v2 due to tokenizer change in original implementation 16 | 17 | ## Usage and example 18 | - Training and test with BIO 19 | 20 | ```shell script 21 | # set GPU 22 | export CUDA_VISIBLE_DEVICES=0 23 | 24 | # use bert 25 | python src/run_transformer_ner.py \ 26 | --model_type bert \ 27 | --pretrained_model bert-base-uncased \ 28 | --data_dir ./test_data/conll-2003 \ 29 | --new_model_dir ./new_bert_ner_model \ 30 | --overwrite_model_dir \ 31 | --predict_output_file ./bert_pred.txt \ 32 | --max_seq_length 256 \ 33 | --save_model_core \ 34 | --do_train \ 35 | --do_predict \ 36 | --model_selection_scoring strict-f_score-1 \ 37 | --do_lower_case \ 38 | --train_batch_size 8 \ 39 | --eval_batch_size 8 \ 40 | --train_steps 500 \ 41 | --learning_rate 1e-5 \ 42 | --num_train_epochs 1 \ 43 | --gradient_accumulation_steps 1 \ 44 | --do_warmup \ 45 | --seed 13 \ 46 | --warmup_ratio 0.1 \ 47 | --max_num_checkpoints 3 \ 48 | --log_file ./log.txt \ 49 | --progress_bar \ 50 | --early_stop 3 51 | ``` 52 | 53 | - Test on multiple files and convert bio to brat format 54 | 55 | ```shell script 56 | ##### note ###### 57 | # In the script below, you are asked to provide a preprocessed_text_dir which contains all the preprocessed file. 58 | # 59 | # If you only use the BIO format for output (you have to remove --data_has_offset_information flag 60 | # and set --do_format flag to 0), and the data format will be the format exactly as the conll-2003 dataset. 61 | # 62 | # If you need BRAT or BioC format as output (as the example script), then you have to add offset information 63 | # to the BIO data to indicate where each word is located in the raw text. 64 | # We suggest you to follow the format below: 65 | # 66 | # The original sentences: "Name: John Doe\nAge: 18" 67 | # The two sentences after preprocesing "Name : John Doe\nAge : 18" 68 | # 69 | # then, you can convert the data into BIO format similar as the Conll-2003 as 70 | # """ 71 | # Name 0 4 0 4 O 72 | # : 4 5 5 6 O 73 | # John 6 10 7 11 B-name 74 | # Doe 11 14 12 15 I-name 75 | # 76 | # Age 15 18 16 19 O 77 | # : 18 19 19 20 O 78 | # 18 20 22 22 24 B-age 79 | # 80 | # For test purposes, you do not need to assign a real BIO label for each word, 81 | # you can just simple assign "O" to all of them. 82 | # It will not influence the prediction results since the predictions will be converted to brat/BioC, 83 | # and you need to use those for evaluation. 84 | # """ 85 | # 86 | # The first two numbers are the offsets of a word in the original text and the following 87 | # two numbers are the offsets of a word in the preprocessed text. 88 | # If you do not need to perform any preprocessing, then you have to set the second set of offsets as the first one. 
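# (In the example above, "John" is at 6-10 in the raw text but 7-11 in the preprocessed
# text because preprocessing inserted a space before ":"; if you do no preprocessing,
# the two offset pairs are identical.)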
89 | ################# 90 | 91 | export CUDA_VISIBLE_DEVICES=0 92 | 93 | # config and tokenizer information can be found in the pretrained model dir 94 | # use format 1 for BRAT, 2 for BioC, 0 as default for BIO 95 | python ./src/run_transformer_batch_prediction.py \ 96 | --model_type bert \ 97 | --pretrained_model \ 98 | --raw_text_dir \ 99 | --preprocessed_text_dir \ 100 | --output_dir \ 101 | --max_seq_length 128 \ 102 | --do_lower_case \ 103 | --eval_batch_size 8 \ 104 | --log_file ./log.txt\ 105 | --do_format 1 \ 106 | --do_copy \ 107 | --data_has_offset_information 108 | 109 | #### 110 | # note: If you use do_format, then we have two outputs: 111 | # 1) all bio outputs in output_dir; 112 | # 2) 2) we create a formatted output dir (this dir's name is output_dir's name with a suffix of '_formatted_output') for the formatted # outputs (brat format if you set do_format=1). If you set --do_copy, we will copy the .txt files to the formatted output dir, otherwise we only put .ann files in the formatted output dir. 113 | #### 114 | ``` 115 | 116 | ## Wiki for all parameters 117 | [wiki](https://github.com/uf-hobi-informatics-lab/ClinicalTransformerNER/wiki/Parameters) 118 | 119 | ## Organization 120 | - Department of Health Outcomes and Biomedical Informatics, College of Medicine, University of Florida 121 | 122 | ## Authors 123 | - Xi Yang (alexgre@ufl.edu) 124 | - Jiang Bian (bianjiang@ufl.edu) 125 | - Yonghui Wu (yonghui.wu@ufl.edu) 126 | 127 | ## Contact 128 | - If you have any questions, please raise an issue in the GitHub 129 | 130 | ## Reference 131 | please cite our paper: 132 | > Xi Yang, Jiang Bian, William R Hogan, Yonghui Wu, Clinical concept extraction using transformers, Journal of the American Medical Informatics Association, ocaa189, https://doi.org/10.1093/jamia/ocaa189 133 | 134 | ``` 135 | @article{10.1093/jamia/ocaa189, 136 | author = {Yang, Xi and Bian, Jiang and Hogan, William R and Wu, Yonghui}, 137 | title = "{Clinical concept extraction using transformers}", 138 | journal = {Journal of the American Medical Informatics Association}, 139 | year = {2020}, 140 | month = {10}, 141 | abstract = "{The goal of this study is to explore transformer-based models (eg, Bidirectional Encoder Representations from Transformers [BERT]) for clinical concept extraction and develop an open-source package with pretrained clinical models to facilitate concept extraction and other downstream natural language processing (NLP) tasks in the medical domain.We systematically explored 4 widely used transformer-based architectures, including BERT, RoBERTa, ALBERT, and ELECTRA, for extracting various types of clinical concepts using 3 public datasets from the 2010 and 2012 i2b2 challenges and the 2018 n2c2 challenge. We examined general transformer models pretrained using general English corpora as well as clinical transformer models pretrained using a clinical corpus and compared them with a long short-term memory conditional random fields (LSTM-CRFs) mode as a baseline. Furthermore, we integrated the 4 clinical transformer-based models into an open-source package.The RoBERTa-MIMIC model achieved state-of-the-art performance on 3 public clinical concept extraction datasets with F1-scores of 0.8994, 0.8053, and 0.8907, respectively. Compared to the baseline LSTM-CRFs model, RoBERTa-MIMIC remarkably improved the F1-score by approximately 4\\% and 6\\% on the 2010 and 2012 i2b2 datasets. This study demonstrated the efficiency of transformer-based models for clinical concept extraction. 
Our methods and systems can be applied to other clinical tasks. The clinical transformer package with 4 pretrained clinical models is publicly available at https://github.com/uf-hobi-informatics-lab/ClinicalTransformerNER. We believe this package will improve current practice on clinical concept extraction and other tasks in the medical domain.}", 142 | issn = {1527-974X}, 143 | doi = {10.1093/jamia/ocaa189}, 144 | url = {https://doi.org/10.1093/jamia/ocaa189}, 145 | note = {ocaa189}, 146 | eprint = {https://academic.oup.com/jamia/advance-article-pdf/doi/10.1093/jamia/ocaa189/34055422/ocaa189.pdf}, 147 | } 148 | ``` 149 | 150 | ## MIMIC-III pre-trained models 151 | - https://transformer-models.s3.amazonaws.com/mimiciii_albert_10e_128b.zip 152 | - https://transformer-models.s3.amazonaws.com/mimiciii_bert_10e_128b.zip 153 | - https://transformer-models.s3.amazonaws.com/mimiciii_electra_5e_128b.zip 154 | - https://transformer-models.s3.amazonaws.com/mimiciii_roberta_10e_128b.zip 155 | - https://transformer-models.s3.amazonaws.com/mimiciii_xlnet_5e_128b.zip 156 | - https://transformer-models.s3.amazonaws.com/mimiciii_deberta_10e_128b.tar.gz 157 | - https://transformer-models.s3.amazonaws.com/mimiciii_longformer_5e_128b.zip 158 | > note: all model pretraining tasks were done with the scripts at https://github.com/huggingface/transformers/tree/master/examples/language-modeling with a few customization. 159 | -------------------------------------------------------------------------------- /NLPreprocessing/annotation2BIO.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script aims to convert BRAT format data into BIO format data for NER 3 | Entities will be mapped from their original offsets to the new offsets after sentence tokenization 4 | Two sentences are separated by a empty line 5 | entities and relations information are also provided in json format 6 | """ 7 | 8 | import os 9 | import sys 10 | import logging 11 | from text_process.sentence_tokenization import SentenceBoundaryDetection 12 | logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.DEBUG) 13 | logger = logging.getLogger(__file__) 14 | # logger.disabled = True 15 | MIMICIII_PATTERN = "\[\*\*|\*\*\]" 16 | 17 | 18 | def __ann_info(ann): 19 | en_info = ann.split(" ") 20 | return en_info[0], int(en_info[1]), int(en_info[-1]) 21 | 22 | 23 | def __rel_info(rel_id, rel, rep): 24 | info = rel.split(" ") 25 | assert len(info) == 3, f"{rel_id}\t{rel} is not a valid relation" 26 | 27 | arg1 = info[1].split(":")[1] 28 | arg2 = info[2].split(":")[1] 29 | rel_type = info[0] 30 | 31 | if rep: 32 | rel_type = rel_type.replace("-", "_") # format rel_type replace - with _ 33 | 34 | return rel_type, arg1, arg2 35 | 36 | 37 | def read_annotation_brat(ann_file, rep=False): 38 | """ 39 | load annotation data 40 | entity_id2index_map -> {'T1': 0} 41 | entites -> ('T1', 'anticoagulant medications', 'Drug', (1000, 1025)) 42 | relations -> ('Route-Drug', 'T3', 'T2') 43 | """ 44 | # map the entity id (e.g., T1) to its index in entities list 45 | entity_id2index_map = dict() 46 | entites = [] 47 | relations = [] 48 | with open(ann_file, "r") as f: 49 | for line in f: 50 | line = line.strip() 51 | if not line: 52 | continue 53 | anns = line.split("\t") 54 | ann_id = anns[0] 55 | if ann_id.startswith("T"): 56 | t_type = anns[-1] 57 | # for each in __ann_info(anns[1]): 58 | # entites.append((t_type, each[0], each[1])) 59 | entity_words, offset_s, offset_e = __ann_info(anns[1]) 60 | 
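# e.g. for the BRAT line "T1\tDrug 1000 1025\tanticoagulant medications":
# anns[-1] holds the surface text and __ann_info(anns[1]) returns ("Drug", 1000, 1025),
# so each stored entity below is (text, entity_type, (offset_start, offset_end))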
entites.append((t_type, entity_words, (offset_s, offset_e))) 61 | entity_id2index_map[ann_id] = len(entites) - 1 62 | elif ann_id.startswith("R"): 63 | relations.append(__rel_info(ann_id, anns[1], rep)) 64 | 65 | # sort entities list 66 | # entites = sorted(entites, key=lambda x: x[2][1]) 67 | 68 | return entity_id2index_map, entites, relations 69 | 70 | 71 | def pre_processing(abs_file_path, deid_pattern=None, word_level=True, replace_number=False): 72 | sent_tokenizer = SentenceBoundaryDetection() 73 | 74 | if replace_number and not word_level: 75 | logger.info("sentence level tokenization") 76 | return sent_tokenizer.sent_tokenizer(replace_number) 77 | 78 | if deid_pattern: 79 | sent_tokenizer.set_deid_pattern(deid_pattern) 80 | 81 | sent_tokenizer.set_input_file(abs_file_path) 82 | 83 | logger.info(f"word level tokenization with replace_number set to {replace_number}") 84 | 85 | return sent_tokenizer.sent_word_tokenization_and_mapping(replace_number) 86 | 87 | 88 | def __remove_overlap_entity(sorted_entities): 89 | valid_en = [] 90 | for idx, en in enumerate(sorted_entities): 91 | if idx == 0: 92 | valid_en.append(en) 93 | continue 94 | pre_en = sorted_entities[idx-1] 95 | c_s = en[2][0] 96 | c_e = en[2][1] 97 | p_s = pre_en[2][0] 98 | p_e = pre_en[2][1] 99 | if c_s > p_e: 100 | valid_en.append(en) 101 | return valid_en 102 | 103 | 104 | def generate_BIO(sents, entities, file_id="", no_overlap=False, record_pos=False, tag_types=None, 105 | exclude_tag_types=None): 106 | """ 107 | assign annotation information to each token 108 | if two token have overlapped offsets, the second one will be discarded 109 | if define tag_types (iterable type), only the types in the tag_types list will be labeled to the corpus 110 | if define exclude_tag_types (iterable type), the tags will not be annotated 111 | """ 112 | nsents = [] 113 | if file_id: 114 | logger.info(f"process {file_id} file") 115 | 116 | entities = sorted(entities, key=lambda x: x[2][0]) 117 | 118 | if tag_types: 119 | entities = list(filter(lambda x: x[1] in tag_types, entities)) 120 | 121 | if exclude_tag_types: 122 | entities = list(filter(lambda x: x[1] not in exclude_tag_types, entities)) 123 | 124 | if no_overlap: 125 | entities = __remove_overlap_entity(entities) 126 | 127 | entities_iter = iter(entities) 128 | entity = next(entities_iter, None) 129 | for i, sent in enumerate(sents): 130 | nsent = [] 131 | for j, token in enumerate(sent): 132 | if record_pos: 133 | token.append((i, j)) 134 | if not entity: 135 | token.append('O') 136 | else: 137 | # token: ('Admission', (0, 9), (0, 9)) 138 | offset_start = token[1][0] 139 | offset_end = token[1][1] 140 | en_s = entity[2][0] 141 | en_e = entity[2][1] 142 | en_type = entity[1] 143 | if offset_start < en_s and offset_end < en_e: 144 | token.append('O') 145 | elif offset_start == en_s: 146 | token.append("-".join(['B', en_type])) 147 | if offset_end >= en_e: 148 | entity = next(entities_iter, None) 149 | elif offset_start > en_s and offset_end < en_e: 150 | token.append("-".join(['I', en_type])) 151 | elif offset_start > en_s and offset_end == en_e: 152 | token.append("-".join(['I', en_type])) 153 | entity = next(entities_iter, None) 154 | else: 155 | # check entity position and token position 156 | logger.warning(f"{entity} offset is overlapped with previous entity; current tok not overlap") 157 | entity = next(entities_iter, None) 158 | if not entity: 159 | token.append('O') 160 | continue 161 | if offset_start > en_e: 162 | # logger.warning(f"{entity} offset is overlapped with 
previous entity; current tok not overlap") 163 | # entity = next(entities_iter, None) 164 | en_s = entity[2][0] 165 | en_e = entity[2][1] 166 | en_type = entity[1] 167 | if offset_end <= en_s: 168 | token.append('O') 169 | else: 170 | if offset_start == en_s: 171 | token.append("-".join(['B', en_type])) 172 | if offset_end >= en_e: 173 | entity = next(entities_iter, None) 174 | else: 175 | logger.error(f"{token}\t{entity} not matched by their offsets.") 176 | token.append('O') 177 | entity = next(entities_iter, None) 178 | else: 179 | # logger.warning(f"{entity} offset is overlapped with previous entity; current tok not overlap") 180 | # entity = next(entities_iter, None) 181 | en_s = entity[2][0] 182 | en_e = entity[2][1] 183 | en_type = entity[1] 184 | if offset_start == en_s: 185 | token.append("-".join(['B', en_type])) 186 | if offset_end >= en_e: 187 | entity = next(entities_iter, None) 188 | elif offset_end < en_s: 189 | token.append('O') 190 | else: 191 | logger.error(f"{token}\t{entity} not matched by their offsets.") 192 | # token.append("-".join(['B', en_type])) 193 | token.append('O') 194 | entity = next(entities_iter, None) 195 | nsent.append(token) 196 | nsents.append(nsent) 197 | 198 | sent_bound_range = dict() # key: sent id; value: boundary range 199 | for i, each in enumerate(nsents): 200 | try: 201 | sent_start_index = each[0][1][0] 202 | sent_end_index = each[-1][1][1] 203 | sent_bound_range[i] = (sent_start_index, sent_end_index) 204 | except Exception as ex: 205 | if i != len(nsents) - 1: 206 | raise RuntimeError(f'The {i}th sentence is an empty sentence') 207 | 208 | # if record_pos: 209 | # nsents = [w for e in nsents for w in e] 210 | 211 | return nsents, sent_bound_range 212 | 213 | 214 | def __flat(data, to_str=False): 215 | flatted = [] 216 | 217 | for each in data: 218 | if isinstance(each, list) or isinstance(each, tuple): 219 | for e in each: 220 | flatted.append(e) 221 | else: 222 | flatted.append(each) 223 | 224 | if to_str: 225 | flatted = list(map(lambda x: str(x), flatted)) 226 | 227 | return flatted 228 | 229 | 230 | def BIOdata_to_file(file_name, sents, sep=" "): 231 | # the data must be list of list 232 | assert isinstance(sents, list), "the data object must be list and generated from generate_BIO()." 
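# output is CoNLL-style: each token is written as
# "word orig_start orig_end new_start new_end BIO-tag" joined by sep,
# with a blank line between sentences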
233 | with open(file_name, "w") as fw: 234 | # 'anticoagulant', (1000, 1013), (976, 989), 'B-Drug' 235 | for sent in sents: 236 | for word in sent: 237 | word = __flat(word, to_str=True) 238 | # word.append("\n") 239 | fw.write(sep.join(word)+"\n") 240 | fw.write("\n") 241 | 242 | 243 | def load_mapping_file(mapping_file, sep=" "): 244 | with open(mapping_file, "r") as f: 245 | txt = f.read().strip() 246 | sents = txt.split("\n\n") 247 | nsents = [] 248 | for sent in sents: 249 | words = sent.split("\n") 250 | for word in words: 251 | info = word.strip().split(sep) 252 | ninfo = list(map(lambda x: int(x) if x.isdigit() else x, info)) 253 | nsents.append(ninfo) 254 | 255 | mapping_dict = {(each[-2], each[-1]): each for each in nsents} 256 | 257 | return nsents, mapping_dict 258 | 259 | 260 | def __find_B_tag(word_seq, c_index): 261 | for k in range(c_index, -1, -1): 262 | c_tag = word_seq[k][-1].split("-")[0] 263 | if c_tag == 'B': 264 | return k 265 | elif c_tag == 'O': 266 | raise RuntimeError(f'check {word_seq[k]} since the label should be either I or B not O') 267 | raise RuntimeError("No B-tag has been labeled in the data.") 268 | 269 | 270 | def window_sliding_sample_creation(bio_data, window_size): 271 | pass 272 | 273 | 274 | def test(): 275 | pass 276 | 277 | if __name__ == '__main__': 278 | test() -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import CrossEntropyLoss 4 | from utils import TransformerLogger 5 | from transformers.modeling_utils import SequenceSummary 6 | from transformers import (BertForSequenceClassification, BertModel, 7 | XLNetForSequenceClassification, XLNetModel, 8 | RobertaForSequenceClassification, RobertaModel, 9 | AlbertForSequenceClassification, AlbertModel, 10 | LongformerForSequenceClassification, LongformerModel, 11 | DebertaForSequenceClassification, DebertaModel, 12 | PreTrainedModel) 13 | from model_utils import StableDropout 14 | 15 | 16 | logger = TransformerLogger(logger_level='i').get_logger() 17 | 18 | 19 | class BaseModel(PreTrainedModel): 20 | 21 | def __init__(self, config): 22 | super().__init__(config) 23 | 24 | self.spec_tag1, self.spec_tag2, self.spec_tag3, self.spec_tag4 = config.tags 25 | self.scheme = config.scheme 26 | self.num_labels = config.num_labels 27 | self.loss_fct = CrossEntropyLoss() 28 | 29 | self.drop_out = StableDropout(config.hidden_dropout_prob) 30 | 31 | if self.scheme == 1: 32 | self.classifier_dim = config.hidden_size * 3 33 | elif self.scheme == 2: 34 | self.classifier_dim = config.hidden_size * 5 35 | elif self.scheme == 3: 36 | self.classifier_dim = config.hidden_size * 2 37 | else: 38 | self.classifier_dim = config.hidden_size 39 | 40 | self.base_classifier = nn.Linear(self.classifier_dim, self.num_labels) 41 | 42 | @staticmethod 43 | def special_tag_representation(seq_output, input_ids, special_tag): 44 | spec_idx = (input_ids == special_tag).nonzero(as_tuple=False) 45 | 46 | temp = [] 47 | for idx in spec_idx: 48 | temp.append(seq_output[idx[0], idx[1], :]) 49 | tags_rep = torch.stack(temp, dim=0) 50 | 51 | return tags_rep 52 | 53 | def output2logits(self, pooled_output, seq_output, input_ids): 54 | if self.scheme == 1: 55 | seq_tags = [] 56 | for each_tag in [self.spec_tag1, self.spec_tag3]: 57 | seq_tags.append(self.special_tag_representation(seq_output, input_ids, each_tag)) 58 | 
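# scheme 1 concatenates the pooled [CLS] vector with the hidden states taken at the
# [s1] and [s2] entity-marker positions, so the classifier input is hidden_size * 3
# (matching classifier_dim set in __init__)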
new_pooled_output = torch.cat((pooled_output, *seq_tags), dim=1) 59 | elif self.scheme == 2: 60 | seq_tags = [] 61 | for each_tag in [self.spec_tag1, self.spec_tag2, self.spec_tag3, self.spec_tag4]: 62 | seq_tags.append(self.special_tag_representation(seq_output, input_ids, each_tag)) 63 | new_pooled_output = torch.cat((pooled_output, *seq_tags), dim=1) 64 | elif self.scheme == 3: 65 | seq_tags = [] 66 | for each_tag in [self.spec_tag1, self.spec_tag3]: 67 | seq_tags.append(self.special_tag_representation(seq_output, input_ids, each_tag)) 68 | new_pooled_output = torch.cat(seq_tags, dim=1) 69 | else: 70 | new_pooled_output = pooled_output 71 | 72 | logits = self.base_classifier(self.drop_out(new_pooled_output)) 73 | 74 | return logits 75 | 76 | def calc_loss(self, logits, outputs, labels): 77 | new_outputs = (logits,) + outputs[2:] 78 | loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 79 | new_outputs = (loss,) + new_outputs 80 | 81 | return new_outputs 82 | 83 | 84 | class BertForRelationIdentification(BertForSequenceClassification, BaseModel): 85 | def __init__(self, config): 86 | super().__init__(config) 87 | self.bert = BertModel(config) 88 | self.init_weights() 89 | 90 | def forward(self, 91 | input_ids=None, 92 | attention_mask=None, 93 | token_type_ids=None, 94 | position_ids=None, 95 | head_mask=None, 96 | inputs_embeds=None, 97 | labels=None, 98 | output_attentions=None, 99 | **kwargs): 100 | 101 | outputs = self.bert( 102 | input_ids, 103 | attention_mask=attention_mask, 104 | token_type_ids=token_type_ids, 105 | position_ids=position_ids, 106 | head_mask=head_mask 107 | ) 108 | 109 | pooled_output = outputs[1] 110 | seq_output = outputs[0] 111 | logits = self.output2logits(pooled_output, seq_output, input_ids) 112 | 113 | return self.calc_loss(logits, outputs, labels) 114 | 115 | 116 | class RoBERTaForRelationIdentification(RobertaForSequenceClassification, BaseModel): 117 | def __init__(self, config): 118 | super().__init__(config) 119 | self.roberta = RobertaModel(config) 120 | self.init_weights() 121 | 122 | def forward(self, 123 | input_ids=None, 124 | attention_mask=None, 125 | token_type_ids=None, 126 | position_ids=None, 127 | head_mask=None, 128 | inputs_embeds=None, 129 | labels=None, 130 | output_attentions=None, 131 | output_hidden_states=None, 132 | **kwargs): 133 | 134 | outputs = self.roberta( 135 | input_ids, 136 | attention_mask=attention_mask, 137 | token_type_ids=token_type_ids, 138 | position_ids=position_ids, 139 | head_mask=head_mask, 140 | output_attentions=output_attentions, 141 | output_hidden_states=output_hidden_states 142 | ) 143 | 144 | pooled_output = outputs[1] 145 | seq_output = outputs[0] 146 | logits = self.output2logits(pooled_output, seq_output, input_ids) 147 | 148 | return self.calc_loss(logits, outputs, labels) 149 | 150 | 151 | class AlbertForRelationIdentification(AlbertForSequenceClassification, BaseModel): 152 | def __init__(self, config): 153 | super().__init__(config) 154 | self.albert = AlbertModel(config) 155 | self.init_weights() 156 | 157 | def forward(self, 158 | input_ids=None, 159 | attention_mask=None, 160 | token_type_ids=None, 161 | position_ids=None, 162 | head_mask=None, 163 | inputs_embeds=None, 164 | labels=None, 165 | output_attentions=None, 166 | output_hidden_states=None, 167 | **kwargs): 168 | 169 | outputs = self.albert( 170 | input_ids=input_ids, 171 | attention_mask=attention_mask, 172 | token_type_ids=token_type_ids, 173 | position_ids=position_ids, 174 | head_mask=head_mask, 175 | 
inputs_embeds=inputs_embeds, 176 | output_attentions=output_attentions, 177 | output_hidden_states=output_hidden_states 178 | ) 179 | 180 | pooled_output = outputs[1] 181 | seq_output = outputs[0] 182 | logits = self.output2logits(pooled_output, seq_output, input_ids) 183 | 184 | return self.calc_loss(logits, outputs, labels) 185 | 186 | 187 | class XLNetForRelationIdentification(XLNetForSequenceClassification, BaseModel): 188 | def __init__(self, config): 189 | super().__init__(config) 190 | self.transformer = XLNetModel(config) 191 | self.sequence_summary = SequenceSummary(config) 192 | self.init_weights() 193 | 194 | def forward(self, 195 | input_ids=None, 196 | attention_mask=None, 197 | mems=None, 198 | perm_mask=None, 199 | target_mapping=None, 200 | token_type_ids=None, 201 | input_mask=None, 202 | head_mask=None, 203 | inputs_embeds=None, 204 | use_cache=True, 205 | labels=None, 206 | output_attentions=None, 207 | output_hidden_states=None, 208 | **kwargs): 209 | 210 | outputs = self.transformer( 211 | input_ids, 212 | attention_mask=attention_mask, 213 | mems=mems, 214 | perm_mask=perm_mask, 215 | target_mapping=target_mapping, 216 | token_type_ids=token_type_ids, 217 | input_mask=input_mask, 218 | head_mask=head_mask, 219 | inputs_embeds=inputs_embeds, 220 | use_cache=use_cache, 221 | output_attentions=output_attentions, 222 | output_hidden_states=output_hidden_states, 223 | **kwargs) 224 | 225 | seq_output = outputs[0] 226 | pooled_output = self.sequence_summary(seq_output) 227 | logits = self.output2logits(pooled_output, seq_output, input_ids) 228 | 229 | return self.calc_loss(logits, outputs, labels) 230 | 231 | 232 | class LongFormerForRelationIdentification(LongformerForSequenceClassification, BaseModel): 233 | def __init__(self, config): 234 | super().__init__(config) 235 | self.longformer = LongformerModel(config) 236 | self.init_weights() 237 | 238 | def forward(self, 239 | input_ids=None, 240 | attention_mask=None, 241 | global_attention_mask=None, 242 | token_type_ids=None, 243 | position_ids=None, 244 | inputs_embeds=None, 245 | labels=None, 246 | output_attentions=None, 247 | output_hidden_states=None, 248 | **kwargs): 249 | 250 | outputs = self.longformer( 251 | input_ids, 252 | attention_mask=attention_mask, 253 | global_attention_mask=global_attention_mask, 254 | token_type_ids=token_type_ids, 255 | position_ids=position_ids, 256 | inputs_embeds=inputs_embeds, 257 | output_attentions=output_attentions, 258 | output_hidden_states=output_hidden_states 259 | ) 260 | 261 | pooled_output = outputs[1] 262 | seq_output = outputs[0] 263 | logits = self.output2logits(pooled_output, seq_output, input_ids) 264 | 265 | return self.calc_loss(logits, outputs, labels) 266 | 267 | 268 | class DebertaForRelationIdentification(DebertaForSequenceClassification, BaseModel): 269 | def __init__(self, config): 270 | from model_utils import ContextPooler 271 | super().__init__(config) 272 | self.deberta = DebertaModel(config) 273 | self.pooler = ContextPooler(config) 274 | 275 | self.init_weights() 276 | 277 | def forward( 278 | self, 279 | input_ids=None, 280 | attention_mask=None, 281 | token_type_ids=None, 282 | position_ids=None, 283 | inputs_embeds=None, 284 | labels=None, 285 | output_attentions=None, 286 | output_hidden_states=None, 287 | return_dict=None, 288 | ): 289 | outputs = self.deberta( 290 | input_ids, 291 | token_type_ids=token_type_ids, 292 | attention_mask=attention_mask, 293 | position_ids=position_ids, 294 | inputs_embeds=inputs_embeds, 295 | 
output_attentions=output_attentions, 296 | output_hidden_states=output_hidden_states, 297 | return_dict=return_dict, 298 | ) 299 | 300 | seq_output = outputs[0] 301 | pooled_output = self.pooler(seq_output) 302 | logits = self.output2logits(pooled_output, seq_output, input_ids) 303 | 304 | return self.calc_loss(logits, outputs, labels) 305 | -------------------------------------------------------------------------------- /ClinicalTransformerNER/src/eval_scripts/old_bio_eval.py: -------------------------------------------------------------------------------- 1 | ### 2 | #

Title:
3 | # Create Date: 21:23:36 01/28/18
4 | # Copyright: College of Medicine
5 | # Organization: University of Florida
6 | # @author Yonghui Wu
7 | # @version 1.0
8 | # Description:

9 | ## 10 | # from create_log import create_logger 11 | 12 | from __future__ import print_function 13 | 14 | 15 | def read_from_file(ifn): 16 | with open(ifn, "r") as f: 17 | text = f.read() 18 | return text 19 | 20 | 21 | class PRF: 22 | def __init__(self): 23 | self.true=0 24 | self.false=0 25 | 26 | 27 | class BioEval: 28 | def __init__(self, ifn, log_name=None): 29 | self.ifn=ifn 30 | self.acc=PRF() 31 | self.all_strict=PRF() 32 | self.all_relax=PRF() 33 | self.cate_strict={} 34 | self.cate_relax={} 35 | 36 | self.gold_all=0 37 | self.gold_cate={} 38 | # self.entities=[] 39 | self.log_name = log_name 40 | 41 | def eval_fn(self): 42 | text=read_from_file(self.ifn).strip().lower() 43 | secs=text.split('\n\n') 44 | for sec in secs: 45 | sec=sec.strip() 46 | lines=sec.split('\n') 47 | bio=[] 48 | for line in lines: 49 | words=line.split(None) 50 | #words.append(words[-1]) 51 | bio.append(words) 52 | self.handle(bio) 53 | self.prf() 54 | 55 | def feed_bio(self,bio): 56 | self.handle(bio) 57 | 58 | def train_msg(self): 59 | stt="Entities: " 60 | for k, v in self.gold_cate.items(): 61 | stt=stt+k+":"+str(v)+" " 62 | if (self.acc.true+self.acc.false) > 0: 63 | acc=float(self.acc.true)/(self.acc.true+self.acc.false) 64 | else: 65 | acc=0.0 66 | if (self.all_strict.true+self.all_strict.false) > 0 and self.gold_all>0: 67 | pre = float(self.all_strict.true)/(self.all_strict.true+self.all_strict.false) 68 | rec = float(self.all_strict.true)/self.gold_all 69 | if pre+rec>0.0: 70 | f1=2*pre*rec/(pre+rec) 71 | else: 72 | f1=0.0 73 | else: 74 | pre=0.0 75 | rec=0.0 76 | f1=0.0 77 | 78 | #all_relex 79 | if (self.all_relax.true+self.all_relax.false) > 0 and self.gold_all>0: 80 | rpre = float(self.all_relax.true)/(self.all_relax.true+self.all_relax.false) 81 | rrec = float(self.all_relax.true)/self.gold_all 82 | if (rpre+rrec) > 0.0: 83 | rf1=2*rpre*rrec/(rpre+rrec) 84 | else: 85 | rf1=0.0 86 | else: 87 | rpre=0.0 88 | rrec=0.0 89 | rf1=0.0 90 | 91 | return([stt,f1,pre,rec,rf1,rpre,rrec,acc]) 92 | 93 | def prf(self): 94 | # print "Total %s entities " % self.gold_all 95 | log_info = "Total %s entities " % self.gold_all + "\n" 96 | for k,v in self.gold_cate.items(): 97 | # print " %s : %s" % (k,v) 98 | log_info += " %s : %s\n" % (k,v) 99 | 100 | acc=float(self.acc.true)/(self.acc.true+self.acc.false) 101 | # print "\nAccuracy : %s" % acc 102 | log_info += "\nAccuracy : %s\n" % acc 103 | 104 | pre = float(self.all_strict.true)/(self.all_strict.true+self.all_strict.false) 105 | rec = float(self.all_strict.true)/self.gold_all 106 | try: 107 | f1=2*pre*rec/(pre+rec) 108 | except ZeroDivisionError: 109 | f1 = 0.0 110 | 111 | # print "\n\nStrict score ----- " 112 | log_info += "\n\nStrict score ----- \n" 113 | # print 'precision : %s , recall : %s , f1 : %s' % (pre,rec,f1) 114 | log_info += 'precision : %s , recall : %s , f1 : %s\n' % (pre,rec,f1) 115 | # print 'find : %s , true : %s , false : %s' % (self.all_strict.true+self.all_strict.false,self.all_strict.true,self.all_strict.false) 116 | log_info += 'find : %s , true : %s , false : %s \n' % (self.all_strict.true+self.all_strict.false, 117 | self.all_strict.true,self.all_strict.false) 118 | #all_relex 119 | pre = float(self.all_relax.true)/(self.all_relax.true+self.all_relax.false) 120 | rec = float(self.all_relax.true)/self.gold_all 121 | try: 122 | f1=2*pre*rec/(pre+rec) 123 | except ZeroDivisionError: 124 | f1 = 0.0 125 | 126 | # print "\nRelax score -----" 127 | log_info += "\nRelax score -----\n" 128 | # print 'precision : %s , recall : %s , f1 : %s' % 
(pre,rec,f1) 129 | log_info += 'precision : %s , recall : %s , f1 : %s\n' % (pre,rec,f1) 130 | # print 'find : %s , true : %s , false : %s' % (self.all_relax.true+self.all_relax.false,self.all_relax.true,self.all_relax.false) 131 | log_info += 'find : %s , true : %s , false : %s \n' % (self.all_relax.true+self.all_relax.false, 132 | self.all_relax.true,self.all_relax.false) 133 | ##category score 134 | # print "\nstrict score by cate -----" 135 | log_info += "\nstrict score by cate -----\n" 136 | for k,v in self.cate_strict.items(): 137 | pre = float(v.true)/(v.true+v.false) 138 | if k not in self.gold_cate: 139 | rec=0.0 140 | f1=0.0 141 | else: 142 | rec = float(v.true)/self.gold_cate[k] 143 | try: 144 | f1 = 2 * pre * rec / (pre + rec) 145 | except ZeroDivisionError: 146 | f1 = 0.0 147 | 148 | # print "Cate : %s, precision : %s , recall : %s , f1 : %s" % (k,pre,rec,f1) 149 | log_info += "Cate : %s, precision : %s , recall : %s , f1 : %s\n" % (k,pre,rec,f1) 150 | # print 'find : %s , true : %s , false : %s' % (v.true+v.false,v.true,v.false) 151 | log_info += 'find : %s , true : %s , false : %s\n' % (v.true+v.false,v.true,v.false) 152 | 153 | # print "\nrelax score by cate -----" 154 | log_info += "\nrelax score by cate -----\n" 155 | for k,v in self.cate_relax.items(): 156 | pre = float(v.true)/(v.true+v.false) 157 | if k not in self.gold_cate: 158 | rec = 0.0 159 | f1 = 0.0 160 | else: 161 | rec = float(v.true)/self.gold_cate[k] 162 | try: 163 | f1 = 2 * pre * rec / (pre + rec) 164 | except ZeroDivisionError: 165 | f1 = 0.0 166 | 167 | # print "Cate : %s, precision : %s , recall : %s , f1 : %s" % (k,pre,rec,f1) 168 | log_info += "Cate : %s, precision : %s , recall : %s , f1 : %s\n" % (k,pre,rec,f1) 169 | # print 'find : %s , true : %s , false : %s' % (v.true+v.false,v.true,v.false) 170 | log_info += 'find : %s , true : %s , false : %s\n' % (v.true+v.false,v.true,v.false) 171 | 172 | print(log_info) 173 | # if self.log_name: 174 | # logger = create_logger(self.log_name, "--evaluation--") 175 | # logger.info(log_info) 176 | 177 | def same(self,bio,starti,endi): 178 | ''' 179 | whether the ner (starti : endi) is exactly match 180 | ''' 181 | flag=True 182 | pcate=bio[starti][-1][2:] 183 | if bio[starti][-2].startswith("i-"): 184 | cate=bio[starti][-2][2:] 185 | if cate != pcate: 186 | flag=False 187 | else: 188 | #check starti-1 189 | if starti -1 >= 0 and bio[starti-1][-2] == "i-"+cate or bio[starti-1][-2] == "b-"+cate: 190 | flag=False 191 | if flag: 192 | for i in range(starti+1,endi): 193 | if bio[i][-2] != "i-"+cate: 194 | flag=False 195 | if flag:# check endi 196 | if endi < len(bio) and bio[endi][-2] == "i-"+cate: 197 | flag=False 198 | elif bio[starti][-2].startswith("b-"): 199 | cate=bio[starti][-2][2:] 200 | if cate != pcate: 201 | flag=False 202 | # do not need check starti -1 203 | if flag: 204 | for i in range(starti+1,endi): 205 | if bio[i][-2] != "i-"+cate: 206 | flag=False 207 | if flag:# check endi 208 | if endi < len(bio) and bio[endi][-2] == "i-"+cate: 209 | flag=False 210 | else: 211 | flag=False 212 | 213 | return flag 214 | 215 | def overlap(self,bio,starti,endi): 216 | flag=False 217 | for i in range(starti,endi): 218 | if len(bio[i][-2])> 2 and bio[i][-1][2:] == bio[i][-2][2:]: 219 | flag=True 220 | break 221 | return flag 222 | 223 | def add_tp_strict(self,cate): 224 | self.all_strict.true=self.all_strict.true+1 225 | self.all_relax.true=self.all_relax.true+1 226 | if cate not in self.cate_strict: 227 | self.cate_strict[cate]=PRF() 228 | 
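# a strict (exact-boundary) match also counts toward the relaxed (overlap) scores,
# so the per-category strict and relax counters below are both incremented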
self.cate_strict[cate].true=self.cate_strict[cate].true+1 229 | if cate not in self.cate_relax: 230 | self.cate_relax[cate]=PRF() 231 | self.cate_relax[cate].true=self.cate_relax[cate].true+1 232 | 233 | def add_tp_overlap(self,cate): 234 | self.all_relax.true=self.all_relax.true+1 235 | if cate not in self.cate_relax: 236 | self.cate_relax[cate]=PRF() 237 | self.cate_relax[cate].true=self.cate_relax[cate].true+1 238 | # treat as false by strict 239 | self.all_strict.false=self.all_strict.false+1 240 | if cate not in self.cate_strict: 241 | self.cate_strict[cate]=PRF() 242 | self.cate_strict[cate].false=self.cate_strict[cate].false+1 243 | 244 | def add_nolap(self,cate): 245 | self.all_strict.false=self.all_strict.false+1 246 | self.all_relax.false=self.all_relax.false+1 247 | 248 | if cate not in self.cate_strict: 249 | self.cate_strict[cate]=PRF() 250 | self.cate_strict[cate].false=self.cate_strict[cate].false+1 251 | 252 | if cate not in self.cate_relax: 253 | self.cate_relax[cate]=PRF() 254 | self.cate_relax[cate].false=self.cate_relax[cate].false+1 255 | 256 | def handle(self,bio): 257 | llen=len(bio) 258 | 259 | #accumulate accuracy data 260 | for i in range(llen): 261 | if bio[i][-1].strip() == bio[i][-2].strip(): 262 | self.acc.true=self.acc.true+1 263 | else: 264 | self.acc.false=self.acc.false+1 265 | 266 | i=0 267 | # handle system prediction 268 | while i < llen: 269 | if bio[i][-1] == 'o': 270 | i=i+1 271 | else: 272 | # find the start and end pos 273 | starti=i 274 | endi=i+1 275 | cate=bio[starti][-1][2:].strip() 276 | while endi= s_s and e_s <= s_e and e_e >s_e : 291 | print("entity is in two sentence") 292 | if e_s >= s_s and e_s <= s_e: 293 | return k 294 | 295 | 296 | def extract_entity_comb_for_relation(e2idx, entities, rels, sent_bound): 297 | #'T1': 0 298 | #'meropenem', 'Drug', (4534, 4543) 299 | #('Strength-Drug', 'T5', 'T39') 300 | rn = defaultdict(list) 301 | rl = [] 302 | for rel in rels: 303 | rtype = rel[0] 304 | en1 = rel[1] 305 | en2 = rel[2] 306 | en1_type = entities[e2idx[en1]][1] 307 | en2_type = entities[e2idx[en2]][1] 308 | rn[rtype].append((en1_type, en2_type)) 309 | en1_pos = entities[e2idx[en1]][2] 310 | e1_n = en_sent_id(en1_pos, sent_bound) 311 | en2_pos = entities[e2idx[en2]][2] 312 | e2_n = en_sent_id(en2_pos, sent_bound) 313 | rl.append(abs(e1_n-e2_n)) 314 | return rn, rl 315 | 316 | 317 | def to_tsv(data, fn): 318 | header = "\t".join([str(i+1) for i in range(len(data[0]))]) 319 | with open(fn, "w") as f: 320 | f.write(f"{header}\n") 321 | for each in data: 322 | d = "\t".join([str(e) for e in each]) 323 | f.write(f"{d}\n") 324 | 325 | 326 | def to_5_cv(data, ofd): 327 | if not os.path.isdir(ofd): 328 | os.mkdir(ofd) 329 | 330 | np.random.seed(13) 331 | np.random.shuffle(data) 332 | 333 | dfs = np.array_split(data, 5) 334 | a = [0,1,2,3,4] 335 | for each in combinations(a, 4): 336 | b = list(set(a) - set(each))[0] 337 | n = dfs[b] 338 | m = [] 339 | for k in each: 340 | m.extend(dfs[k]) 341 | if not os.path.isdir(os.path.join(ofd, f"sample{b}")): 342 | os.mkdir(os.path.join(ofd, f"sample{b}")) 343 | 344 | to_tsv(m, os.path.join(ofd, f"sample{b}", "train.tsv")) 345 | to_tsv(n, os.path.join(ofd, f"sample{b}", "dev.tsv")) 346 | 347 | 348 | def all_in_one(*dd, dn="2018n2c2", do_train=True): 349 | data = [] 350 | for d in dd: 351 | for k, v in d.items(): 352 | for each in v: 353 | data.append(each[1:]) 354 | 355 | output_path = f"../temp/{dn}_aio_th{CUTOFF}" 356 | p = Path(output_path) 357 | p.mkdir(parents=True, exist_ok=True) 358 | 359 | if 
do_train: 360 | to_tsv(data, p/"train.tsv") 361 | if OUTPUT_CV: 362 | to_5_cv(data, p.as_posix()) 363 | else: 364 | to_tsv(data, p/"test.tsv") 365 | 366 | 367 | def all_in_unique(*dd, dn="2018n2c2", do_train=True): 368 | for idx in range(CUTOFF+1): 369 | data = [] 370 | for d in dd: 371 | for k, v in d.items(): 372 | for each in v: 373 | if k == idx: 374 | data.append(each[1:]) 375 | 376 | output_path = f"../temp/{dn}_aiu_th{CUTOFF}" 377 | p = Path(output_path) / f"cutoff_{idx}" 378 | p.mkdir(parents=True, exist_ok=True) 379 | if do_train: 380 | to_tsv(data, p/"train.tsv") 381 | if OUTPUT_CV: 382 | to_5_cv(data, p.as_posix()) 383 | else: 384 | to_tsv(data, p/"test.tsv") 385 | 386 | 387 | # general pre-defined special tags 388 | EN1_START = "[s1]" 389 | EN1_END = "[e1]" 390 | EN2_START = "[s2]" 391 | EN2_END = "[e2]" 392 | NEG_REL = "NonRel" 393 | # max valid cross sentence distance 394 | CUTOFF = 1 395 | # output 5-fold cross validation data 396 | OUTPUT_CV = False 397 | # do binary classification (if false, then we do multiclass classification) 398 | DO_BIN = False 399 | 400 | sdoh_valid_comb = { 401 | ('Tobacco_use', 'Substance_use_status'), ('Substance_use_status', 'Smoking_type'), 402 | ('Substance_use_status', 'Smoking_freq_ppd'), ('Substance_use_status', 'Smoking_freq_py'), 403 | ('Substance_use_status', 'Smoking_freq_qy'), ('Substance_use_status', 'Smoking_freq_sy'), 404 | ('Substance_use_status', 'Smoking_freq_other'), ('Alcohol_use', 'Substance_use_status'), 405 | ('Substance_use_status', 'Alcohol_freq'), ('Substance_use_status', 'Alcohol_type'), 406 | ('Substance_use_status', 'Alcohol_other'), ('Drug_use', 'Substance_use_status'), 407 | ('Substance_use_status', 'Drug_freq'), ('Substance_use_status', 'Drug_type'),('Substance_use_status', 'Drug_other'), ('Sex_act', 'Sdoh_status'), 408 | ('Sex_act', 'Partner'), ('Sex_act', 'Protection'), 409 | ('Sex_act', 'Sex_act_other'), ('Occupation', 'Employment_status'), 410 | ('Occupation', 'Employment_location'), ('Gender', 'Sdoh_status'),('Social_cohesion', 'Social_method'), ('Social_method', 'Sdoh_status'), 411 | ('Physical_act', 'Sdoh_status'), ('Physical_act', 'Sdoh_freq'), 412 | ('Living_supply', 'Sdoh_status'), ('Abuse', 'Sdoh_status'), 413 | ('Transportation', 'Sdoh_status'), ('Health_literacy', 'Sdoh_status'), 414 | ('Financial_constrain', 'Sdoh_status'), ('Social_cohesion', 'Sdoh_status'), 415 | ('Social_cohesion', 'Sdoh_freq'), ('Gender', 'Sdoh_status'), 416 | ('Race', 'Sdoh_status'), ('Ethnicity', 'Sdoh_status'), 417 | ('Living_Condition', 'Sdoh_status') 418 | } 419 | 420 | test_root=f'../temp/{output_name}_formatted_output' 421 | preds = create_test_samples(test_root, None, sdoh_valid_comb) 422 | all_in_one(preds, dn=output_name, do_train=False) 423 | 424 | 425 | 426 | 427 | 428 | -------------------------------------------------------------------------------- /ClinicalTransformerRelationExtraction/src/data_utils.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from config import MODEL_REQUIRE_SEGMENT_ID, SPEC_TAGS, TOKENIZER_USE_FOUR_SPECIAL_TOKs 3 | import csv 4 | from pathlib import Path 5 | import torch 6 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset 7 | import re 8 | from tqdm import tqdm 9 | from functools import partial 10 | from concurrent.futures import ProcessPoolExecutor 11 | import numpy as np 12 | 13 | 14 | class InputExample(object): 15 | """A single training/test example for simple sequence classification.""" 16 
| 17 | def __init__(self, guid, text_a, text_b=None, label=None): 18 | """Constructs a InputExample. 19 | 20 | Args: 21 | guid: Unique id for the example. 22 | text_a: string. The not tokenized text of the first sequence. For single 23 | sequence tasks, only this sequence must be specified. 24 | text_b: (Optional) string. The not tokenized text of the second sequence. 25 | Only must be specified for sequence pair tasks. 26 | label: (Optional) string. The label of the example. This should be 27 | specified for train and dev examples, but not for test examples. 28 | """ 29 | self.guid = guid 30 | self.text_a = text_a 31 | self.text_b = text_b 32 | self.label = label 33 | 34 | def __str__(self): 35 | s = "" 36 | for k, v in self.__dict__.items(): 37 | s += "{}={}\n".format(k, v) 38 | return s 39 | 40 | 41 | class InputFeatures(object): 42 | """A single set of features of data.""" 43 | 44 | def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None): 45 | self.input_ids = input_ids 46 | self.attention_mask = attention_mask 47 | self.token_type_ids = token_type_ids 48 | self.label = label 49 | 50 | def __str__(self): 51 | s = "" 52 | for k, v in self.__dict__.items(): 53 | s += "{}={}\n".format(k, v) 54 | return s 55 | 56 | 57 | def convert_examples_to_relation_extraction_features( 58 | examples, label2idx, tokenizer, max_length=128): 59 | """This function is the same as transformers.glue_convert_examples_to_features""" 60 | features = [] 61 | 62 | for idx, example in enumerate(tqdm(examples)): 63 | text_a, text_b = example.text_a, example.text_b 64 | 65 | tokens_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_a)) 66 | 67 | if text_b: 68 | tokens_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_b)) 69 | else: 70 | tokens_b = None 71 | 72 | inputs = tokenizer.encode_plus( 73 | tokens_a, tokens_b, pad_to_max_length=True, max_length=max_length, truncation=False) 74 | 75 | label = label2idx[example.label] 76 | feature = InputFeatures(**inputs, label=label) 77 | features.append(feature) 78 | 79 | if idx < 3: 80 | print("###exampel###\nguide: {}\ntext: {}\ntoken ids: {}\nmasks: {}\nlabel: {}\n########".format( 81 | example.guid, 82 | example.text_a + " " + example.text_b, 83 | feature.input_ids, 84 | feature.attention_mask, 85 | feature.label)) 86 | 87 | return features 88 | 89 | 90 | def features2tensors(features, logger=None): 91 | tensor_input_ids = [] 92 | tensor_attention_masks = [] 93 | tensor_token_type_ids = [] 94 | tensor_label_ids = [] 95 | 96 | for idx, feature in enumerate(features): 97 | if logger and idx < 3: 98 | logger.info("Feature{}:\n{}\n".format(idx + 1, feature)) 99 | 100 | tensor_input_ids.append(feature.input_ids) 101 | tensor_attention_masks.append(feature.attention_mask) 102 | tensor_label_ids.append(feature.label) 103 | 104 | if feature.token_type_ids: 105 | tensor_token_type_ids.append(feature.token_type_ids) 106 | 107 | tensor_input_ids = torch.tensor(tensor_input_ids, dtype=torch.long) 108 | tensor_attention_masks = torch.tensor(tensor_attention_masks, dtype=torch.long) 109 | tensor_label_ids = torch.tensor(tensor_label_ids, dtype=torch.long) 110 | tensor_token_type_ids = torch.tensor(tensor_token_type_ids, dtype=torch.long) if tensor_token_type_ids \ 111 | else torch.zeros(tensor_attention_masks.shape) 112 | 113 | return TensorDataset(tensor_input_ids, tensor_attention_masks, tensor_token_type_ids, tensor_label_ids) 114 | 115 | 116 | def relation_extraction_data_loader(dataset, batch_size=2, task='train', logger=None): 117 | 
""" 118 | task has two levels: 119 | train for training using RandomSampler 120 | test for evaluation and prediction using SequentialSampler 121 | 122 | if set auto to True we will default call convert_features_to_tensors, 123 | so features can be directly passed into the function 124 | """ 125 | dataset = features2tensors(dataset, logger=logger) 126 | 127 | if task == 'train': 128 | sampler = RandomSampler(dataset) 129 | elif task == 'test': 130 | sampler = SequentialSampler(dataset) 131 | else: 132 | raise ValueError('task argument only support train or test but get {}'.format(task)) 133 | 134 | data_loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size, pin_memory=True) 135 | 136 | return data_loader 137 | 138 | 139 | def batch_to_model_input(batch, model_type="bert", device=torch.device("cpu")): 140 | return {"input_ids": batch[0].to(device), 141 | "attention_mask": batch[1].to(device), 142 | "labels": batch[3].to(device), 143 | "token_type_ids": batch[2].to(device) if model_type in MODEL_REQUIRE_SEGMENT_ID else None} 144 | 145 | 146 | class DataProcessor(object): 147 | """Base class for data converters for sequence classification data sets.""" 148 | 149 | def __init__(self, data_dir=None, max_seq_len=128, num_core=1, header=True, tokenizer_type='bert'): 150 | if data_dir: 151 | self.data_dir = Path(data_dir) 152 | else: 153 | self.data_dir = data_dir 154 | 155 | self.tokenizer = None 156 | self.max_seq_len = max_seq_len 157 | self.num_core = num_core 158 | self.header = header 159 | self.tokenizer_type = tokenizer_type 160 | self.total_special_token_num = 3 161 | 162 | def __str__(self): 163 | rep = [f"key: {k}; val: {v}" for k, v in self.__dict__.items()] 164 | return "\n".join(rep) 165 | 166 | def set_data_dir(self, data_dir): 167 | self.data_dir = Path(data_dir) 168 | 169 | def set_tokenizer(self, tokenizer): 170 | self.tokenizer = tokenizer 171 | 172 | def set_max_seq_len(self, max_seq_len): 173 | self.max_seq_len = max_seq_len 174 | 175 | def set_tokenizer_type(self, tokenizer_type): 176 | self.tokenizer_type = tokenizer_type 177 | 178 | def set_num_core(self, num_core): 179 | self.num_core = num_core 180 | 181 | def set_header(self, header): 182 | self.header = header 183 | 184 | def get_train_examples(self, filename=None): 185 | """See base class.""" 186 | input_file_name = self.data_dir / filename if filename else self.data_dir / "train.tsv" 187 | 188 | return self._create_examples( 189 | self._read_tsv(input_file_name), "train") 190 | 191 | def get_dev_examples(self, filename=None): 192 | """See base class.""" 193 | input_file_name = self.data_dir / filename if filename else self.data_dir / "dev.tsv" 194 | 195 | return self._create_examples( 196 | self._read_tsv(input_file_name), "dev") 197 | 198 | def get_test_examples(self, filename=None): 199 | """See base class.""" 200 | input_file_name = self.data_dir / filename if filename else self.data_dir / "test.tsv" 201 | 202 | return self._create_examples( 203 | self._read_tsv(input_file_name), "test") 204 | 205 | def get_labels(self, train_file=None, label_file=None): 206 | """ 207 | Gets the list of labels for this data set. 208 | 1. use labels in train file for indexing 209 | In all different formats, the first column always should be label 210 | 2. 
146 | class DataProcessor(object):
147 |     """Base class for data converters for sequence classification data sets."""
148 | 
149 |     def __init__(self, data_dir=None, max_seq_len=128, num_core=1, header=True, tokenizer_type='bert'):
150 |         if data_dir:
151 |             self.data_dir = Path(data_dir)
152 |         else:
153 |             self.data_dir = data_dir
154 | 
155 |         self.tokenizer = None
156 |         self.max_seq_len = max_seq_len
157 |         self.num_core = num_core
158 |         self.header = header
159 |         self.tokenizer_type = tokenizer_type
160 |         self.total_special_token_num = 3
161 | 
162 |     def __str__(self):
163 |         rep = [f"key: {k}; val: {v}" for k, v in self.__dict__.items()]
164 |         return "\n".join(rep)
165 | 
166 |     def set_data_dir(self, data_dir):
167 |         self.data_dir = Path(data_dir)
168 | 
169 |     def set_tokenizer(self, tokenizer):
170 |         self.tokenizer = tokenizer
171 | 
172 |     def set_max_seq_len(self, max_seq_len):
173 |         self.max_seq_len = max_seq_len
174 | 
175 |     def set_tokenizer_type(self, tokenizer_type):
176 |         self.tokenizer_type = tokenizer_type
177 | 
178 |     def set_num_core(self, num_core):
179 |         self.num_core = num_core
180 | 
181 |     def set_header(self, header):
182 |         self.header = header
183 | 
184 |     def get_train_examples(self, filename=None):
185 |         """See base class."""
186 |         input_file_name = self.data_dir / filename if filename else self.data_dir / "train.tsv"
187 | 
188 |         return self._create_examples(
189 |             self._read_tsv(input_file_name), "train")
190 | 
191 |     def get_dev_examples(self, filename=None):
192 |         """See base class."""
193 |         input_file_name = self.data_dir / filename if filename else self.data_dir / "dev.tsv"
194 | 
195 |         return self._create_examples(
196 |             self._read_tsv(input_file_name), "dev")
197 | 
198 |     def get_test_examples(self, filename=None):
199 |         """See base class."""
200 |         input_file_name = self.data_dir / filename if filename else self.data_dir / "test.tsv"
201 | 
202 |         return self._create_examples(
203 |             self._read_tsv(input_file_name), "test")
204 | 
205 |     def get_labels(self, train_file=None, label_file=None):
206 |         """
207 |         Gets the list of labels for this data set. Labels can come from two places:
208 |         1. the train file, used for indexing
209 |            (in every supported format, the first column should always be the label)
210 |         2. a label index file:
211 |            a plain-text file with one unique label per line
212 |         """
213 |         if label_file:
214 |             with open(label_file, "r") as f:
215 |                 unique_labels = [e.strip() for e in f.read().strip().split("\n")]
216 |         elif label_file is None and train_file:
217 |             lines = self._read_tsv(train_file)
218 |             unique_labels = set()
219 |             for (i, line) in enumerate(lines):
220 |                 unique_labels.add(line[0])
221 |         elif label_file is None and train_file is None and self.data_dir:
222 |             lines = self._read_tsv(self.data_dir / "train.tsv")
223 |             unique_labels = set()
224 |             for (i, line) in enumerate(lines):
225 |                 unique_labels.add(line[0])
226 |         else:
227 |             raise RuntimeError("Cannot find files to generate labels. "
228 |                                "You need one of label_file, train_file (full path) or data_dir set up.")
229 | 
230 |         label2idx = {k: v for v, k in enumerate(unique_labels)}
231 |         idx2label = {v: k for k, v in label2idx.items()}
232 | 
233 |         return unique_labels, label2idx, idx2label
234 | 
235 |     def _create_examples(self, lines, set_type):
236 |         """Creates examples for the training and dev sets."""
237 |         raise NotImplementedError(
238 |             "You must use RelationDataFormatSepProcessor or RelationDataFormatUniProcessor.")
239 | 
240 |     @staticmethod
241 |     def _read_tsv(input_file, header=True, quotechar=None):
242 |         """Reads a tab separated value file."""
243 |         lines = []
244 | 
245 |         with open(input_file, "r", encoding="utf-8") as f:
246 |             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
247 |             for line in reader:
248 |                 lines.append(line)
249 |             if header:
250 |                 lines = lines[1:]
251 | 
252 |         return lines
253 | 
254 | 
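# --- editor's illustrative sketch (not part of the numbered source) -------------------------
# What get_labels() returns for a tiny tab-separated file. The layout assumed here is
# label<TAB>sent1<TAB>sent2 with a header row, matching _read_tsv and the processors below;
# the label names and sentences are made up for the example.
import csv
import tempfile
from pathlib import Path

tmp_dir = Path(tempfile.mkdtemp())
with open(tmp_dir / "train.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["label", "sent1", "sent2"])    # header row (header=True by default)
    writer.writerow(["NonRel", "[s1] aspirin [e1] given .", "no [s2] rash [e2] noted ."])
    writer.writerow(["Drug-ADE", "[s1] heparin [e1] started .", "then [s2] bleeding [e2] occurred ."])

processor = DataProcessor(data_dir=tmp_dir)         # the base class is enough for get_labels()
unique_labels, label2idx, idx2label = processor.get_labels()
print(label2idx)    # e.g. {'NonRel': 0, 'Drug-ADE': 1}; order follows set iteration
# --- end of sketch ---------------------------------------------------------------------------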
255 | class RelationDataFormatSepProcessor(DataProcessor):
256 |     """
257 |     data format (two text columns):
258 |         [CLS] sent1 [SEP] sent2 [SEP] : BERT
259 |         sent1 sent2 : RoBERTa, LongFormer
260 |     (the special tokens are added by the tokenizer via encode_plus)
261 |     """
262 | 
263 |     def _create_examples_helper(self, lines_idx, set_type, total_special_toks):
264 |         start_idx, lines = lines_idx
265 |         examples = []
266 |         for (i, line) in enumerate(tqdm(lines)):
267 |             guid = "{}_{}_{}".format(set_type, start_idx, i)
268 |             text_a = line[1]
269 |             text_b = line[2]
270 |             label = line[0]
271 |             # if the text after tokenization is longer than max_seq_len there are two options:
272 |             #   1. skip such cases
273 |             #   2. use a truncation strategy
274 |             # we adopt option (2) here, implemented in _process_seq_len
275 |             text_a, text_b = self._process_seq_len(text_a, text_b, total_special_toks=total_special_toks)
276 |             examples.append(
277 |                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
278 |         return examples
279 | 
280 |     def _create_examples(self, lines, set_type):
281 |         """Creates examples for the training and dev sets."""
282 | 
283 |         if self.tokenizer_type in TOKENIZER_USE_FOUR_SPECIAL_TOKs:
284 |             self.total_special_token_num = 4
285 | 
286 |         if self.num_core < 2:
287 |             # single process; may be slow on a large dataset (see the multi-process branch below)
288 |             examples = self._create_examples_helper((0, lines), set_type, self.total_special_token_num)
289 |         else:
290 |             # multi-process; assumes any csv header row has already been removed by _read_tsv
291 |             # use multiple cores when there are many long sentences to truncate,
292 |             # otherwise a single process is usually faster
293 |             examples = []
294 |             array_lines = np.array_split(lines, self.num_core)
295 |             with ProcessPoolExecutor(max_workers=self.num_core) as exe:
296 |                 for each in exe.map(partial(self._create_examples_helper,
297 |                                             set_type=set_type,
298 |                                             total_special_toks=self.total_special_token_num),
299 |                                     enumerate(array_lines)):
300 |                     examples.extend(each)
301 | 
302 |         return examples
303 | 
304 |     @staticmethod
305 |     def _truncate_helper(text):
306 |         tokens = text.split(" ")
307 |         spec_tag_idx1, spec_tag_idx2 = [idx for (idx, tk) in enumerate(tokens) if tk.lower() in SPEC_TAGS]
308 |         start_idx, end_idx = 0, len(tokens) - 1
309 |         truncate_space_head = spec_tag_idx1 - start_idx
310 |         truncate_space_tail = end_idx - spec_tag_idx2
311 | 
312 |         if truncate_space_head == truncate_space_tail == 0:
313 |             return text
314 | 
315 |         if truncate_space_head > truncate_space_tail:
316 |             tokens.pop(0)
317 |         else:
318 |             tokens.pop(-1)
319 | 
320 |         return " ".join(tokens)
321 | 
322 |     def _process_seq_len(self, text_a, text_b, total_special_toks=3):
323 |         """
324 |         This function truncates sequence pairs whose tokenized length > max_seq_len.
325 |         Truncation strategy:
326 |         1. find the indexes of the special entity tags
327 |         2. count the distance from the leading word to the first tag (1) and from the second tag to the last word (2):
328 |                first -1- tag1 entity tag2 -2- last
329 |         3. pick the longer of the two distances; if (1) remove the first token, if (2) remove the last token
330 |         4. repeat, alternating between text_a and text_b, until the pair fits within max_seq_len
331 |         """
332 |         flag = True
333 | 
334 |         while len(self.tokenizer.tokenize(text_a) + self.tokenizer.tokenize(text_b)) \
335 |                 > (self.max_seq_len - total_special_toks):
336 | 
337 |             if flag:
338 |                 text_a = self._truncate_helper(text_a)
339 |             else:
340 |                 text_b = self._truncate_helper(text_b)
341 | 
342 |             flag = not flag
343 | 
344 |         return text_a, text_b
345 | 
346 | 
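# --- editor's illustrative sketch (not part of the numbered source) -------------------------
# What one step of the truncation strategy does. _truncate_helper drops a single token from
# whichever end of the sentence is farther from its entity marker; _process_seq_len above
# calls it repeatedly, alternating between text_a and text_b, until the tokenized pair fits
# within max_seq_len. The sentence below is made up for the example.
text = "on review of systems the patient denies fever [s1] chest pain [e1] or cough today"
print(RelationDataFormatSepProcessor._truncate_helper(text))
# -> "review of systems the patient denies fever [s1] chest pain [e1] or cough today"
#    ("on" is removed because the head side is farther from its marker than the tail side)
# --- end of sketch ---------------------------------------------------------------------------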
347 | class RelationDataFormatUniProcessor(DataProcessor):
348 |     """
349 |     data format:
350 |         [CLS] sent1 sent2 [SEP]
351 |     """
352 | 
353 |     def _create_examples_helper(self, lines_idx, set_type, total_special_toks):
354 |         examples = []
355 |         start_idx, lines = lines_idx
356 |         for (i, line) in enumerate(lines):
357 |             guid = "%s-%s-%s" % (set_type, start_idx, i)
358 |             text_a = line[1]
359 |             text_a_1 = line[2]
360 |             text_a = " ".join([text_a, text_a_1])
361 |             label = line[0]
362 |             # if the text after tokenization is longer than max_seq_len there are two options:
363 |             #   1. skip such cases
364 |             #   2. use a truncation strategy (truncate from both sides) - adopted here
365 |             text_a = self._process_seq_len(text_a)
366 | 
367 |             examples.append(
368 |                 InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
369 | 
370 |         return examples
371 | 
372 |     def _create_examples(self, lines, set_type):
373 |         """Creates examples for the training and dev sets."""
374 | 
375 |         if self.tokenizer_type in TOKENIZER_USE_FOUR_SPECIAL_TOKs:
376 |             self.total_special_token_num = 4
377 | 
378 |         if self.num_core < 2:
379 |             # single process
380 |             examples = self._create_examples_helper((0, lines), set_type, self.total_special_token_num)
381 |         else:
382 |             # multi-process
383 |             examples = []
384 |             array_lines = np.array_split(lines, self.num_core)
385 |             with ProcessPoolExecutor(max_workers=self.num_core) as exe:
386 |                 for each in exe.map(partial(self._create_examples_helper,
387 |                                             set_type=set_type,
388 |                                             total_special_toks=self.total_special_token_num),
389 |                                     enumerate(array_lines)):
390 |                     examples.extend(each)
391 | 
392 |         return examples
393 | 
394 |     def _process_seq_len(self, text_a):
395 |         """
396 |         see RelationDataFormatSepProcessor._process_seq_len for details
397 |         """
398 |         while len(self.tokenizer.tokenize(text_a)) > (self.max_seq_len - 2):
399 |             w1 = text_a.split(" ")
400 |             t1, t2, t3, t4 = [idx for (idx, w) in enumerate(w1) if w.lower() in SPEC_TAGS]
401 |             ss1, mid1, se1 = 0, (len(w1) - 1) // 2, len(w1) - 1
402 | 
403 |             a1 = t1 - ss1
404 |             b1 = se1 - t4
405 |             c1 = mid1 - t2
406 |             d1 = t3 - mid1
407 |             m_idx = max(a1, b1, c1, d1)
408 |             if a1 == m_idx:
409 |                 w1.pop(0)
410 |             elif b1 == m_idx:
411 |                 w1.pop(-1)
412 |             elif c1 == m_idx:
413 |                 w1.pop((t2 + c1 // 2))
414 |             else:
415 |                 w1.pop((t3 - d1 // 2))
416 | 
417 |             text_a = " ".join(w1)
418 | 
419 |         return text_a
420 | 
--------------------------------------------------------------------------------
/ClinicalTransformerNER/src/common_utils/bio_prf_eval.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | ############################################################################################################
3 | # the performance attribute is a dict with the structure:
4 | # {'category': {'relax': {'xx': {'f_score': 0.85714,
5 | #                                'precision': 0.75,
6 | #                                'recall': 1.0},
7 | #                         'yy': {'f_score': 0.8,
8 | #                                'precision': 1.0,
9 | #                                'recall': 0.6667}},
10 | #               'strict': {'xx': {'f_score': 0.8571,
11 | #                                 'precision': 0.75,
12 | #                                 'recall': 1.0},
13 | #                          'yy': {'f_score': 0.4,
14 | #                                 'precision': 0.5,
15 | #                                 'recall': 0.3333}}},
16 | #  'overall': {'acc': 0.7857,
17 | #              'relax': {'f_score': 0.8334,
18 | #                        'precision': 0.8334,
19 | #                        'recall': 0.8334},
20 | #              'strict': {'f_score': 0.6667,
21 | #                         'precision': 0.6667,
22 | #                         'recall': 0.6667}}}
23 | 
24 | # entity counts are also available via the BioEval.counts attribute (or get_counts()),
25 | # which returns a dictionary:
26 | # {'expect': {'overall': 6, 'xx': 3, 'yy': 3},
27 | #  'prediction': {'relax': {'overall': {'false': 1, 'total': 6, 'true': 5},
28 | #                           'xx': {'false': 1, 'total': 4, 'true': 3},
29 | #                           'yy': {'false': 0, 'total': 2, 'true': 2}},
30 | #                 'strict': {'overall': {'false': 2, 'total': 6, 'true': 4},
31 | #                            'xx': {'false': 1, 'total': 4, 'true': 3},
32 | #                            'yy': {'false': 1, 'total': 2, 'true': 1}}}}
33 | #
34 | # see test() for use cases
35 | ############################################################################################################
36 | 
37 | 
38 | from common_utils.common_io import load_bio_file_into_sents
39 | from itertools import chain
40 | from collections import defaultdict
41 | 
from common_utils.common_log import create_logger 42 | from math import pow 43 | from pathlib import Path 44 | import argparse 45 | 46 | 47 | class PRF: 48 | def __init__(self): 49 | self.true = 0 50 | self.false = 0 51 | 52 | def add_true_case(self): 53 | self.true += 1 54 | 55 | def add_false_case(self): 56 | self.false += 1 57 | 58 | def get_true_false_counts(self): 59 | return self.true, self.false 60 | 61 | def __str__(self): 62 | return str(self.__dict__) 63 | 64 | 65 | class BioEval: 66 | def __init__(self): 67 | self.logger = create_logger('BioEval') 68 | self.acc = PRF() 69 | # prediction 70 | self.all_strict = PRF() 71 | self.all_relax = PRF() 72 | self.cat_strict = defaultdict(PRF) 73 | self.cat_relax = defaultdict(PRF) 74 | # gold standard 75 | self.gs_all = 0 76 | self.gs_cat = defaultdict(int) 77 | self.performance = dict() 78 | self.counts = dict() 79 | self.beta = 1 80 | self.label_not_for_eval = {'o'} 81 | 82 | def reset(self): 83 | self.acc = PRF() 84 | self.all_strict = PRF() 85 | self.all_relax = PRF() 86 | self.cat_strict = defaultdict(PRF) 87 | self.cat_relax = defaultdict(PRF) 88 | self.gs_all = 0 89 | self.gs_cat = defaultdict(int) 90 | self.performance = dict() 91 | self.counts = dict() 92 | 93 | def set_beta_for_f_score(self, beta): 94 | self.logger.warning("Using beta={} for calculating F-score".format(beta)) 95 | self.beta = beta 96 | 97 | def set_logger(self, logger): 98 | self.logger = logger 99 | 100 | def add_labels_not_for_eval(self, *labels): 101 | for each in labels: 102 | self.label_not_for_eval.add(each.lower()) 103 | 104 | def __calc_prf(self, tp, fp, tp_tn): 105 | """ 106 | Using this function to calculate F-beta score, beta=1 is f_score-score, set beta=2 favor recall, and set beta=0.5 favor precision. 107 | Using set_beta_for_f_score function to change beta value. 
108 | """ 109 | tp_fp = tp + fp 110 | pre = 1.0 * tp / tp_fp if tp_fp > 0 else 0.0 111 | rec = 1.0 * tp / tp_tn if tp_tn > 0 else 0.0 112 | beta2 = pow(self.beta, 2) 113 | f_beta = (1 + beta2) * pre * rec / (beta2 * pre + rec) if (pre + rec) > 0 else 0.0 114 | return pre, rec, f_beta 115 | 116 | def __measure_performance(self): 117 | self.performance['overall'] = dict() 118 | 119 | acc_true_num, acc_false_num = self.acc.get_true_false_counts() 120 | total_acc_num = acc_true_num + acc_false_num 121 | # calc acc 122 | overall_acc = round(1.0 * acc_true_num / total_acc_num, 4) if total_acc_num > 0 else 0.0 123 | self.performance['overall']['acc'] = overall_acc 124 | 125 | strict_true_counts, strict_false_counts = self.all_strict.get_true_false_counts() 126 | strict_pre, strict_rec, strict_f_score = self.__calc_prf(strict_true_counts, strict_false_counts, self.gs_all) 127 | self.performance['overall']['strict'] = dict() 128 | self.performance['overall']['strict']['precision'] = strict_pre 129 | self.performance['overall']['strict']['recall'] = strict_rec 130 | self.performance['overall']['strict']['f_score'] = strict_f_score 131 | 132 | relax_true_counts, relax_false_counts = self.all_relax.get_true_false_counts() 133 | relax_pre, relax_rec, relax_f_score = self.__calc_prf(relax_true_counts, relax_false_counts, self.gs_all) 134 | self.performance['overall']['relax'] = dict() 135 | self.performance['overall']['relax']['precision'] = relax_pre 136 | self.performance['overall']['relax']['recall'] = relax_rec 137 | self.performance['overall']['relax']['f_score'] = relax_f_score 138 | 139 | self.performance['category'] = dict() 140 | self.performance['category']['strict'] = dict() 141 | for k, v in self.cat_strict.items(): 142 | self.performance['category']['strict'][k] = dict() 143 | stc, sfc = v.get_true_false_counts() 144 | p, r, f = self.__calc_prf(stc, sfc, self.gs_cat[k]) 145 | self.performance['category']['strict'][k]['precision'] = p 146 | self.performance['category']['strict'][k]['recall'] = r 147 | self.performance['category']['strict'][k]['f_score'] = f 148 | 149 | self.performance['category']['relax'] = dict() 150 | for k, v in self.cat_relax.items(): 151 | self.performance['category']['relax'][k] = dict() 152 | rtc, rfc = v.get_true_false_counts() 153 | p, r, f = self.__calc_prf(rtc, rfc, self.gs_cat[k]) 154 | self.performance['category']['relax'][k]['precision'] = p 155 | self.performance['category']['relax'][k]['recall'] = r 156 | self.performance['category']['relax'][k]['f_score'] = f 157 | 158 | def __measure_counts(self): 159 | # gold standard 160 | self.counts['expect'] = dict() 161 | self.counts['expect']['overall'] = self.gs_all 162 | for k, v in self.gs_cat.items(): 163 | self.counts['expect'][k] = v 164 | # prediction 165 | self.counts['prediction'] = {'strict': dict(), 'relax': dict()} 166 | # strict 167 | strict_true_counts, strict_false_counts = self.all_strict.get_true_false_counts() 168 | self.counts['prediction']['strict']['overall'] = dict() 169 | self.counts['prediction']['strict']['overall']['total'] = strict_true_counts + strict_false_counts 170 | self.counts['prediction']['strict']['overall']['true'] = strict_true_counts 171 | self.counts['prediction']['strict']['overall']['false'] = strict_false_counts 172 | for k, v in self.cat_strict.items(): 173 | t, f = v.get_true_false_counts() 174 | self.counts['prediction']['strict'][k] = dict() 175 | self.counts['prediction']['strict'][k]['total'] = t + f 176 | self.counts['prediction']['strict'][k]['true'] = t 177 | 
self.counts['prediction']['strict'][k]['false'] = f 178 | # relax 179 | relax_true_counts, relax_false_counts = self.all_relax.get_true_false_counts() 180 | self.counts['prediction']['relax']['overall'] = dict() 181 | self.counts['prediction']['relax']['overall']['total'] = relax_true_counts + relax_false_counts 182 | self.counts['prediction']['relax']['overall']['true'] = relax_true_counts 183 | self.counts['prediction']['relax']['overall']['false'] = relax_false_counts 184 | for k, v in self.cat_relax.items(): 185 | t, f = v.get_true_false_counts() 186 | self.counts['prediction']['relax'][k] = dict() 187 | self.counts['prediction']['relax'][k]['total'] = t + f 188 | self.counts['prediction']['relax'][k]['true'] = t 189 | self.counts['prediction']['relax'][k]['false'] = f 190 | 191 | @staticmethod 192 | def __strict_match(gs, pred, s_idx, e_idx, en_type): 193 | if e_idx < len(gs) and gs[e_idx] == f"i-{en_type}": 194 | # check token after end in GS is not continued entity token 195 | return False 196 | elif gs[s_idx] != f"b-{en_type}" or pred[s_idx] != f"b-{en_type}": 197 | # force first token to be B- 198 | return False 199 | # check every token in span is the same 200 | for idx in range(s_idx, e_idx): 201 | if gs[idx] != pred[idx]: 202 | return False 203 | return True 204 | 205 | @staticmethod 206 | def __relax_match(gs, pred, s_idx, e_idx, en_type): 207 | # we adopt the partial match strategy which is very loose compare to right-left or approximate match 208 | for idx in range(s_idx, e_idx): 209 | gs_cate = gs[idx].split("-")[-1] 210 | pred_bound, pred_cate = pred[idx].split("-") 211 | if gs_cate == pred_cate == en_type: 212 | return True 213 | return False 214 | 215 | @staticmethod 216 | def __check_evaluated_already(gs_dict, cate, start_idx, end_idx): 217 | for k, v in gs_dict.items(): 218 | c, s, e = k 219 | if not (e < start_idx or s > end_idx) and c == cate: 220 | if v == 0: 221 | return True 222 | else: 223 | gs_dict[k] -= 1 224 | return False 225 | return False 226 | 227 | def __process_bio(self, gs_bio, pred_bio): 228 | # measure acc 229 | for w_idx, (gs_word, pred_word) in enumerate(zip(gs_bio, pred_bio)): 230 | # measure acc 231 | if gs_word == pred_word: 232 | self.acc.add_true_case() 233 | else: 234 | self.acc.add_false_case() 235 | 236 | # process gold standard 237 | llen = len(gs_bio) 238 | gs_dict = defaultdict(int) 239 | cur_idx = 0 240 | while cur_idx < llen: 241 | if gs_bio[cur_idx].strip() in self.label_not_for_eval: 242 | cur_idx += 1 243 | else: 244 | start_idx = cur_idx 245 | end_idx = start_idx + 1 246 | _, cate = gs_bio[start_idx].strip().split('-') 247 | while end_idx < llen and gs_bio[end_idx].strip() == f"i-{cate}": 248 | end_idx += 1 249 | self.gs_all += 1 250 | self.gs_cat[cate] += 1 251 | gs_dict[(cate, start_idx, end_idx)] += 1 252 | cur_idx = end_idx 253 | # process predictions 254 | cur_idx = 0 255 | while cur_idx < llen: 256 | if pred_bio[cur_idx].strip() in self.label_not_for_eval: 257 | cur_idx += 1 258 | else: 259 | start_idx = cur_idx 260 | end_idx = start_idx + 1 261 | _, cate = pred_bio[start_idx].strip().split("-") 262 | while end_idx < llen and pred_bio[end_idx].strip() == f"i-{cate}": 263 | end_idx += 1 264 | if self.__strict_match(gs_bio, pred_bio, start_idx, end_idx, cate): 265 | self.all_strict.add_true_case() 266 | self.cat_strict[cate].add_true_case() 267 | self.all_relax.add_true_case() 268 | self.cat_relax[cate].add_true_case() 269 | elif self.__relax_match(gs_bio, pred_bio, start_idx, end_idx, cate): 270 | if 
self.__check_evaluated_already(gs_dict, cate, start_idx, end_idx):
271 |                         cur_idx = end_idx
272 |                         continue
273 |                     self.all_strict.add_false_case()
274 |                     self.cat_strict[cate].add_false_case()
275 |                     self.all_relax.add_true_case()
276 |                     self.cat_relax[cate].add_true_case()
277 |                 else:
278 |                     self.all_strict.add_false_case()
279 |                     self.cat_strict[cate].add_false_case()
280 |                     self.all_relax.add_false_case()
281 |                     self.cat_relax[cate].add_false_case()
282 |                 cur_idx = end_idx
283 | 
284 |     def eval_file(self, gs_file, pred_file):
285 |         self.logger.info("processing gold standard file: {} and prediction file: {}".format(gs_file, pred_file))
286 |         pred_bio_sents = load_bio_file_into_sents(pred_file, do_lower=True)
287 |         gs_bio_sents = load_bio_file_into_sents(gs_file, do_lower=True)
288 |         # process bio data
289 |         # check that the two data sets have the same number of sentences
290 |         assert len(gs_bio_sents) == len(pred_bio_sents), \
291 |             "gold standard and prediction have different numbers of sentences: gs: {}; pred: {}".format(len(gs_bio_sents), len(pred_bio_sents))
292 |         # measure performance
293 |         for s_idx, (gs_sent, pred_sent) in enumerate(zip(gs_bio_sents, pred_bio_sents)):
294 |             # check that the two sentences have the same number of words
295 |             assert len(gs_sent) == len(pred_sent), \
296 |                 "In the {}th sentence, the word counts are different; gs: {}; pred: {}".format(s_idx, gs_sent, pred_sent)
297 |             gs_sent = list(map(lambda x: x[-1], gs_sent))
298 |             pred_sent = list(map(lambda x: x[-1], pred_sent))
299 |             self.__process_bio(gs_sent, pred_sent)
300 |         # compute the evaluation metrics
301 |         self.__measure_performance()
302 |         self.__measure_counts()
303 | 
304 |     def eval_mem(self, gs, pred, do_flat=False):
305 |         # flatten sentences into a single sequence; the inputs are then assumed to be 1-dimensional (labels only)
306 |         if do_flat:
307 |             self.logger.warning('Sentences have been flattened to 1 dim.')
308 |             gs = list(chain(*gs))
309 |             pred = list(chain(*pred))
310 |             gs = list(map(lambda x: x.lower(), gs))
311 |             pred = list(map(lambda x: x.lower(), pred))
312 |             self.__process_bio(gs, pred)
313 |         else:
314 |             for sidx, (gs_s, pred_s) in enumerate(zip(gs, pred)):
315 |                 gs_s = list(map(lambda x: x.lower(), gs_s))
316 |                 pred_s = list(map(lambda x: x.lower(), pred_s))
317 |                 self.__process_bio(gs_s, pred_s)
318 | 
319 |         self.__measure_performance()
320 |         self.__measure_counts()
321 | 
322 |     def get_performance(self):
323 |         return self.performance
324 | 
325 |     def get_counts(self):
326 |         return self.counts
327 | 
328 |     def show_evaluation(self, digits=4):
329 |         if len(self.performance) == 0:
330 |             raise RuntimeError('call eval_file() or eval_mem() first to compute the performance attribute')
331 | 
332 |         cate = self.performance['category']['strict'].keys()
333 | 
334 |         headers = ['precision', 'recall', 'f1']
335 |         width = max(max([len(c) for c in cate]), len('overall'), digits)
336 |         head_fmt = '{:>{width}s} ' + ' {:>9}' * len(headers)
337 | 
338 |         report = head_fmt.format(u'', *headers, width=width)
339 |         report += '\n\nstrict\n'
340 | 
341 |         row_fmt = '{:>{width}s} ' + ' {:>9.{digits}f}' * 3 + '\n'
342 |         for c in cate:
343 |             precision = self.performance['category']['strict'][c]['precision']
344 |             recall = self.performance['category']['strict'][c]['recall']
345 |             f1 = self.performance['category']['strict'][c]['f_score']
346 |             report += row_fmt.format(c, *[precision, recall, f1], width=width, digits=digits)
347 | 
348 |         report += '\nrelax\n'
349 | 
350 |         for c in cate:
351 |             precision = self.performance['category']['relax'][c]['precision']
352 |             recall = self.performance['category']['relax'][c]['recall']
353 |             f1 =
self.performance['category']['relax'][c]['f_score'] 354 | report += row_fmt.format(c, *[precision, recall, f1], width=width, digits=digits) 355 | 356 | report += '\n\noverall\n' 357 | report += 'acc: ' + str(self.performance['overall']['acc']) 358 | report += '\nstrict\n' 359 | report += row_fmt.format('', *[self.performance['overall']['strict']['precision'], 360 | self.performance['overall']['strict']['recall'], 361 | self.performance['overall']['strict']['f_score']], width=width, digits=digits) 362 | 363 | report += '\nrelax\n' 364 | report += row_fmt.format('', *[self.performance['overall']['relax']['precision'], 365 | self.performance['overall']['relax']['recall'], 366 | self.performance['overall']['relax']['f_score']], width=width, digits=digits) 367 | return report 368 | --------------------------------------------------------------------------------
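A minimal, hypothetical usage sketch for BioEval (an editor's addition, not part of the repository). It assumes ClinicalTransformerNER/src is on PYTHONPATH so that common_utils.bio_prf_eval and its create_logger dependency resolve; the entity types and BIO label sequences below are toy data chosen so that the ADE span is a relax-level hit but a strict-level miss.

from common_utils.bio_prf_eval import BioEval

gs = [
    ["B-Drug", "I-Drug", "O", "B-ADE", "O"],
    ["O", "B-Drug", "O", "O", "O"],
]
pred = [
    ["B-Drug", "I-Drug", "O", "B-ADE", "I-ADE"],   # ADE span: relax hit, strict miss
    ["O", "B-Drug", "O", "O", "O"],
]

evaluator = BioEval()
evaluator.eval_mem(gs, pred)           # labels are lower-cased internally
print(evaluator.show_evaluation())     # strict and relax P/R/F per category, plus overall
print(evaluator.get_counts())          # expected vs predicted entity counts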