├── requirements.txt
├── emrqa_download_image.jpg
├── generation
│   ├── .idea
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── generation.iml
│   ├── i2b2_smoking
│   │   └── smoking-answers.py
│   ├── i2b2_relations
│   │   ├── problem_classfiers.py
│   │   ├── common_names.txt
│   │   ├── matching_notes.csv
│   │   └── relations-answers.py
│   ├── combine_data
│   │   └── combine_answers.py
│   ├── i2b2_obesity
│   │   └── obesity-answers.py
│   └── i2b2_medications
│       └── medication-answers.py
├── .gitignore
├── evaluation
│   ├── paraphrase-analysis.py
│   ├── template-analysis.py
│   └── basic-stats.py
├── main.py
└── README.md

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
nltk
xmltodict

--------------------------------------------------------------------------------
/emrqa_download_image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/panushri25/emrQA/HEAD/emrqa_download_image.jpg

--------------------------------------------------------------------------------
/generation/.idea/misc.xml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

# Compiler outputs #
*.a
*.bat
*.exe
*.json
*.pyc
temp_risk.txt
squad_format.py

# Directories #
output
baselines
i2b2
.idea

# Packages #
*.gz
*.iso
*.jar

--------------------------------------------------------------------------------
/generation/.idea/modules.xml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/generation/.idea/generation.iml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/evaluation/paraphrase-analysis.py:
--------------------------------------------------------------------------------
import csv
import os
import nltk
from nltk.metrics import *
from nltk.translate.bleu_score import sentence_bleu
import argparse
import itertools
import random
import numpy as np

parser = argparse.ArgumentParser()
parser.add_argument('--templates_dir', default='/home/anusri/Desktop/emrQA/templates', help='Directory containing template files in the given format')

args = parser.parse_args()
csv_reader = list(csv.reader(open(os.path.join(args.templates_dir, "templates-all.csv"))))

def scoring_method(qtuple, method):

    if method == "jaccard_score":
        set1 = set(nltk.word_tokenize(qtuple[0]))
        set2 = set(nltk.word_tokenize(qtuple[1]))
        # note: jaccard_distance is a distance, not a similarity (lower means more similar)
        score = jaccard_distance(set1, set2)

    if method == "bleu_score":
        (reference, candidate) = qtuple
        # sentence_bleu expects a list of tokenized references and a tokenized candidate
        score = sentence_bleu([nltk.word_tokenize(reference)], nltk.word_tokenize(candidate))

    return score

if __name__ == "__main__":

    method = "bleu_score"
    #method = "jaccard_score"
    unique_logical_forms = []
    total_questions = []
    total_scores = []

    for line in csv_reader[1:]:

        question = line[2].strip()
        logical_form = line[3].strip()

        question = question.replace("|medication| or |medication|", "|medication|")
        question = question.replace("|problem| or |problem|", "|problem|")
        question = question.replace("|test| or |test|", "|test|")
        question = question.replace("|test| |test| |test|", "|test|")
        question = question.replace("\t", "")
        logical_form = logical_form.replace("\t", "").replace("|medication|", "|treatment|")
        if logical_form not in unique_logical_forms:
            unique_logical_forms.append(logical_form)

        paraphrase_questions = question.split("##")
        random.shuffle(paraphrase_questions)
        total_questions.extend(list(set(paraphrase_questions)))

        question_tuples = list(itertools.product([paraphrase_questions[0]], paraphrase_questions[1:]))
        scores = []
        for qtuple in question_tuples:
            if qtuple[0] == qtuple[1]:
                continue
            score = scoring_method(qtuple, method)
            scores.append(score)

        if len(scores) != 0:
            min_value = min(scores)
            max_value = max(scores)

        total_scores.extend(scores)

    ## total questions divided by total question types

    print("Average paraphrases per question", len(total_questions) * 1.0 / len(unique_logical_forms))
    print("Average of " + method + " of paraphrases", np.mean(np.array(total_scores)))
    print("Standard deviation of " + method + " of paraphrases", np.std(np.array(total_scores)))

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from subprocess import check_call
import sys
import os
import csv

PYTHON = sys.executable

#################################### set the full file paths ###############################################

i2b2_relations_challenge_directory = "i2b2/relations/"
i2b2_medications_challenge_directory = "i2b2/medication/"
i2b2_heart_disease_risk_challenge_directory = "i2b2/heart-disease-risk/"
i2b2_obesity_challenge_directory = "i2b2/obesity/"
i2b2_smoking_challenge_directory = "i2b2/smoking/"
i2b2_coreference_challenge_directory = "i2b2/coreference"

templates_directory = "templates/templates-all.csv"

#################################### make output directory if it does not already exist #########################

cwd = os.getcwd()
model_dir = "output/"
if not os.path.exists(os.path.join(cwd, model_dir)):
    os.makedirs(model_dir)

output_directory = os.path.join(cwd, model_dir)  ## you can modify this to change the output directory path ##

###########################################################################################################

## prepend the challenge directory paths to matching_notes.csv (skipped if the paths were already added) ##

matching_notes = os.path.join("generation/i2b2_relations/", "matching_notes.csv")
match_file = open(matching_notes)
csvreader = csv.reader(match_file)
matching_files = list(csvreader)  # relation, coreference
new_file = []
new_file.append(matching_files[0])
flag = 0
for file in matching_files[1:]:
    if i2b2_relations_challenge_directory in file[0]:
        flag = 1
        break
    new_file.append([os.path.join(i2b2_relations_challenge_directory, file[0]), os.path.join(i2b2_coreference_challenge_directory, file[1])])

if flag == 0:
    ofile = open(matching_notes, "w")
    filewriter = csv.writer(ofile, delimiter="\t")

    for val in new_file:
        filewriter.writerow(val)

    ofile.close()

################################### run the generation scripts #######################################


cmd = "{python} generation/i2b2_medications/medication-answers.py --i2b2_dir={i2b2_dir} --templates_dir={templates_dir} --output_dir={output_dir}".format(python=PYTHON, i2b2_dir=i2b2_medications_challenge_directory, templates_dir=templates_directory, output_dir=output_directory)
print(cmd)
check_call(cmd, shell=True)


cmd = "{python} generation/i2b2_relations/relations-answers.py --i2b2_dir={i2b2_dir} --templates_dir={templates_dir} --output_dir={output_dir}".format(python=PYTHON, i2b2_dir=i2b2_relations_challenge_directory, templates_dir=templates_directory, output_dir=output_directory)
print(cmd)
check_call(cmd, shell=True)


cmd = "{python} generation/i2b2_heart_disease_risk/risk-answers.py --i2b2_dir={i2b2_dir} --templates_dir={templates_dir} --output_dir={output_dir}".format(python=PYTHON, i2b2_dir=i2b2_heart_disease_risk_challenge_directory, templates_dir=templates_directory, output_dir=output_directory)
print(cmd)
check_call(cmd, shell=True)


cmd = "{python} generation/i2b2_smoking/smoking-answers.py --i2b2_dir={i2b2_dir} --templates_dir={templates_dir} --output_dir={output_dir}".format(python=PYTHON, i2b2_dir=i2b2_smoking_challenge_directory, templates_dir=templates_directory, output_dir=output_directory)
print(cmd)
check_call(cmd, shell=True)


cmd = "{python} generation/i2b2_obesity/obesity-answers.py --i2b2_dir={i2b2_dir} --templates_dir={templates_dir} --output_dir={output_dir}".format(python=PYTHON, i2b2_dir=i2b2_obesity_challenge_directory, templates_dir=templates_directory, output_dir=output_directory)
print(cmd)
check_call(cmd, shell=True)

################## combine all the output files and generate the output in normal format ####################

cmd = "{python} generation/combine_data/combine_answers.py --output_dir={output_dir}".format(python=PYTHON, output_dir=output_directory)
print(cmd)
check_call(cmd, shell=True)

##################### convert normal output to squad format ##################################


######################### basic analysis of the dataset #######################################

'''
cmd = "{python} evaluation/basic-stats.py --output_dir={output_dir}".format(python=PYTHON, output_dir=output_directory)
print(cmd)
check_call(cmd, shell=True)
'''
--------------------------------------------------------------------------------
/generation/i2b2_smoking/smoking-answers.py:
--------------------------------------------------------------------------------
import xmltodict
import csv
import json
import argparse
import os
parser = argparse.ArgumentParser()
parser.add_argument('--i2b2_dir', default='', help='Directory containing i2b2 smoking challenge files')
parser.add_argument('--templates_dir', default='', help='Directory containing template files in the given format')
parser.add_argument('--output_dir', default='', help='Directory to store the output')
args = parser.parse_args()


###################################################### SET FILE PATHS ##################################################################

templates_file = args.templates_dir
i2b2_file_paths = args.i2b2_dir

ql_output = os.path.join(args.output_dir, "smoking-ql.csv")
qa_output = os.path.join(args.output_dir, "smoking-qa.json")
file_names = ["smokers_surrogate_test_all_groundtruth_version2.xml", "smokers_surrogate_train_all_version2.xml"]

######################################################## CODE #########################################################################

def ReadFile():
    file_path = i2b2_file_paths

    status = []
    for file_name in file_names:
        file = file_path + file_name
        with open(file) as fd:
            XML = xmltodict.parse(fd.read())
        idx = 0
        for key in XML["ROOT"]["RECORD"]:
            idx += 1

            patient_id = key["@ID"]
            answer_class = key["SMOKING"]["@STATUS"]
            patient_note = key["TEXT"]

            status.append([patient_id, answer_class, patient_note])
    return status


def MakeJSONOutput(smoking_data, json_out, status, filewriter_forlform):

    smoking_out = {"paragraphs": [], "title": "smoking"}

    for state in status:
        patient_id = state[0]
        patient_note = state[2]

        out = {"note_id": patient_id, "context": patient_note, "qas": []}

        for row in smoking_data:
            question = row[2].strip()
            form = row[3].strip()
            answer_type = row[4]

            if question == "":
                continue

            question_list = question.split("##")
            for q in question_list:
                filewriter_forlform.writerow([q, form, q, form])

            if answer_type == "smoke_class":

                ## list(zip(...)) keeps the ids JSON-serializable under Python 3 ##
                out["qas"].append({"answers": [{"answer_start": "", "text": state[1], "evidence": "", "evidence_start": ""}],
                                   "id": [list(zip(question_list, question_list)), form], "question": question_list})


        smoking_out["paragraphs"].append(out)


    with open(json_out, 'w') as outfile:
        json.dump(smoking_out, outfile)

if __name__ == "__main__":

    ### Read i2b2 files, one smoking status per clinical note ###

    status = ReadFile()

    ### File to read templates ###

    filereader = list(csv.reader(open(templates_file)))

    ## read only templates relevant to the smoking challenge ##

    smoking_lines = []
    for line in filereader[1:]:
        if line[0] != "smoking":
            continue
        smoking_lines.append(line)

    ofile = open(ql_output, "w")
    filewriter_forlform = csv.writer(ofile, delimiter="\t")
    ## four columns per row, matching the combined data-ql.csv header ##
    filewriter_forlform.writerow(["Question", "Logical Form", "QTemplate", "LTemplate"])

    MakeJSONOutput(smoking_lines, qa_output, status, filewriter_forlform)
    #MakeQuestion(smoking_lines,out_file,status)



'''
def MakeQuestion(smoking_data,out_file,status):

    ofile = open(out_file,"w")
    ofilewriter = csv.writer(ofile)

    values = ["Question", "Answer" , "Answer line in note", "Note ID", "Difference in QA lines"]
    ofilewriter.writerow(values)

    for row in smoking_data:
        #print(row)
        question = row[1].strip()
        #print(row)
        answer_type = row[3]

        if answer_type == "smoke_class":
            for state in status:
                values = [question, state[1],"",state[0],""]
                patient_id = status[0]
                patient_note = status[2]

                ofilewriter.writerow(values)
        elif answer_type == "None":
            #return []
            pass
        else:
            print(answer_type)

'''
--------------------------------------------------------------------------------
/evaluation/template-analysis.py:
--------------------------------------------------------------------------------
import json
import csv
import os
import numpy as np
import collections
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--templates_dir', default='/home/anusri/Desktop/emrQA/templates', help='Directory containing template files in the given format')
args = parser.parse_args()

relations = ["reveals", "relates", "causes", "given", "conducted", "improves", "worsens"]
Functions = ["CheckRange", "CheckIfNull", "sortBy"]
attributes = ["date", "result", "onsetdate", "startdate", "QuitDate", "PackPerDay", "status", "abnormalResultFlag", "adherence", "enddate", "IsTobaccoUser", "sig",
              "YearsOfUse", "diagnosisdate", "dosage"]
attribute_values_defined = ["pending", "currentDate"]

csv_reader = list(csv.reader(open(os.path.join(args.templates_dir, "templates-all.csv"))))
answer = "no"

question_lforms = []
for line in csv_reader[1:]:

    dataset = line[0]
    if dataset == "relations":
        check = line[5]
    else:
        check = line[4]

    ## analyze all logical forms, or only the ones used with answers ##

    if answer == "yes":
        if check != "none":
            if (line[2], line[3]) not in question_lforms:
                question_lforms.append((line[2], line[3]))
    else:
        if (line[2], line[3]) not in question_lforms:
            question_lforms.append((line[2], line[3]))


########################################################################################################
lforms = []
for (question_list, lform) in question_lforms:
    #print(lform)
    if lform not in lforms:
        lforms.append(lform.replace("\t", "").replace("|medication|", "|treatment|"))

##########################################################################################################
#print(len(lforms))


lform_vocab = []
for lform in lforms:
    lform = lform.replace("-", " - ").replace("1", "").replace("2", "").replace("/", " / ").replace("<", " < ").replace(">", " > ").replace("(", " ( ").replace(")", " ) ").replace("[", " [ ").replace("]", " ] ").replace("{", " { ").replace("}", " } ").replace("=", " = ").replace(",", " , ")
    ## sanity-check that the brackets in each logical form are balanced ##
    if lform.count("(") != lform.count(")"):
        print("(")
        print(lform)
    if lform.count("{") != lform.count("}"):
        print("{")
        print(lform)
    if lform.count("[") != lform.count("]"):
        print('[')
        print(lform)


    tokens = [tok for tok in lform.split(" ") if tok != ""]
    lform_vocab += tokens

vocab_counter = collections.Counter(lform_vocab)
Events = []
arguments = []
arithmetic = []
brackets = []
punctuations = []
attribute_values = []
Event_Combination = []
Relations_Combination = []

for vocab in vocab_counter:
    if "Event" in vocab:
        Events.append(vocab)
    elif vocab in relations + Functions + attributes + attribute_values_defined:
        pass
    elif "." in vocab:
        attribute_values.append(vocab)
    elif vocab in [">", "<", "=", "Y", "N", "x", "-"]:
        arithmetic.append(vocab)
    elif vocab in ["OR", "AND"]:
        Event_Combination.append(vocab)
    elif vocab in ["/"]:
        Relations_Combination.append(vocab)
    elif vocab in ["(", ")", "[", "]", "{", "}"]:
        brackets.append(vocab)
    elif "|" in vocab:
        arguments.append(vocab)
    elif "," in vocab:
        punctuations.append(vocab)
    else:
        pass



arithmetic_questions = []
question_with_relation = []
medical_domain_qs = []
date_questions = []
time_questions = []
trend_question = []
events_used = {}
multiple_events = []
Lab_Questions = []
indefinite_evidence = []
event_confirmation = []
current = []
property = 0.0
past = []
more_than_one = 0.0
attribute_questions = 0.0
event_questions = 0.0
medical_question = 0.0

for event in Events:
    events_used[event] = 0

for lform in lforms:
    #print(lform)
    lform = lform.replace("-", " - ").replace("1", "").replace("2", "").replace("/", " / ").replace("<", " < ").replace(">", " > ").replace("(", " ( ").replace(")", " ) ").replace("[", " [ ").replace("]", " ] ").replace("{", " { ").replace("}", " } ").replace("=", " = ").replace(",", " , ")

    if "( x )" in lform:
        #print(lform)
        event_questions += 1

    if "= " in lform:
        #print(lform)
        attribute_questions += 1

    if "." in lform:
        #print(lform)
        medical_question += 1

    tokens = [tok for tok in lform.split(" ") if tok != ""]

    rel = set(tokens).intersection(set(relations))

    if len(set(["CheckRange", ">", "<"]).intersection(tokens)) != 0:
        #print(lform)
        arithmetic_questions.append(lform)

    if len(rel) == 0:
        if "[" not in tokens:
            indefinite_evidence.append(lform)
        else:
            out = list((set(Events)).intersection(set(tokens)))  ## Event Property Questions
            for e in out:
                events_used[e] += 1
            property += 1
    else:
        question_with_relation.append(lform)

    if len(rel) > 0:
        more_than_one += 1

print("Arithmetic questions", len(arithmetic_questions) * 100.0 / len(lforms))
print("One or more relations", 100.0 * more_than_one / len(lforms))
print("Coarse questions", 100.0 * event_questions / len(lforms))
print("Fine questions", 100.0 * attribute_questions / len(lforms))
print("Medical questions", 100.0 * medical_question / len(lforms))

## medical

## coarse

## fine
--------------------------------------------------------------------------------
/generation/i2b2_relations/problem_classfiers.py:
--------------------------------------------------------------------------------
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords

## Open common names for use in the concept_is_CommonNoun function ##
file = open("generation/i2b2_relations/common_names.txt")  ## you can use any set of common nouns to filter; here we treat the top 500 high-frequency words occurring in our templates as common nouns ##
data = file.readlines()
file.close()
common_nouns = [line.strip() for line in data]

## Get stop words ##

stopWords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
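
## Illustration (not executed) of how the two classifiers below behave on typical
## i2b2 concept strings; exact outputs depend on the NLTK POS-tagger models
## installed, so treat these as expected values rather than guarantees:
##
##   concept_is_CommonNoun("the problem")          -> 1  ("problem" is in common_names.txt)
##   concept_is_CommonNoun("atrial fibrillation")  -> 0  (medical term, not a common noun)
##   concept_is_PastTense("nausea resolved")       -> 1  (last token tagged VBD/VBN)
##   concept_is_PastTense("shortness of breath")   -> 0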
15 | 16 | ## Functions For Use ## 17 | 18 | def concept_is_CommonNoun(concept): 19 | ''' 20 | Return 1 if the concept is a common noun 21 | :param concept: 22 | :return: 23 | ''' 24 | 25 | tags = nltk.pos_tag(nltk.word_tokenize(concept)) 26 | [words, tag] = zip(*tags) 27 | 28 | words = list(words) 29 | 30 | nouns = [] 31 | if tag[0] in ["DT", "PRP", "PRP$"]: 32 | words[0] = "" 33 | for idx in range(1, len(tag)): 34 | if words[idx] in stopWords: 35 | continue 36 | nouns.append(words[idx]) 37 | else: 38 | for idx in range(len(tag)): 39 | if words[idx] in stopWords: 40 | continue 41 | nouns.append(words[idx]) 42 | 43 | flag = 0 44 | for noun in nouns: 45 | if (lemmatizer.lemmatize(noun) in common_nouns) or (noun in common_nouns): 46 | flag = 1 47 | else: 48 | flag = 0 49 | break 50 | 51 | ''' 52 | if flag == 1: 53 | print(" ".join(words).strip(), tags) 54 | ''' 55 | return flag 56 | 57 | def concept_is_PastTense(concept): 58 | ''' 59 | Return 1 if the concept ends in past tense 60 | :param concept: 61 | :return: 62 | ''' 63 | text = nltk.word_tokenize(concept) 64 | tagged = nltk.pos_tag(text) 65 | 66 | tense = {} 67 | tense["future"] = len([word for word in tagged[-1:] if word[1] == "MD"]) 68 | tense["present"] = len([word for word in tagged[-1:] if word[1] in ["VBP", "VBZ", "VBG"]]) 69 | tense["past"] = len([word for word in tagged[-1:] if word[1] in ["VBD", "VBN"]]) 70 | 71 | if tense["past"] > 0: 72 | flag = 1 73 | else: 74 | flag = 0 75 | 76 | return flag 77 | 78 | ''' 79 | import sys 80 | sys.path.insert(0, '/home/anusri/Desktop/IBM/GetUMLS/QuickUMLS') 81 | import quickumls 82 | matcher = quickumls.QuickUMLS("/home/anusri/Desktop/IBM/GetUMLS/installation") 83 | 84 | ## Get UMLS semantic mapping ## 85 | sfile = open("/home/anusri/Desktop/IBM/GetUMLS/QuickUMLS/SemanticTypes_2013AA.txt") 86 | data = sfile.readlines() 87 | sfile.close() 88 | mapping = {} 89 | for line in data: 90 | words = line.split("|") 91 | short_type = words[1] 92 | full_type = words[0] 93 | mapping[short_type] = full_type 94 | 95 | def concept_is_Disease(concept): 96 | #if concept_is_CommonNoun(concept) == 1: 97 | # return 0 98 | 99 | SemanticTypes = CheckSemanticType(concept) 100 | 101 | otype = disease 102 | for (word,wtype) in SemanticTypes: 103 | for type in wtype: 104 | if (type in otype): 105 | return 1 106 | 107 | 108 | return 0 109 | def concept_is_Symptom(concept): 110 | # if concept_is_CommonNoun(concept) == 1: 111 | # return 0 112 | 113 | SemanticTypes = CheckSemanticType(concept) 114 | for (word, wtype) in SemanticTypes: 115 | for type in wtype: 116 | if (type in symptoms): 117 | return 1 118 | 119 | return 0 120 | def concept_is_MentalDisease(concept): 121 | # if concept_is_CommonNoun(concept) == 1: 122 | # return 0 123 | 124 | SemanticTypes = CheckSemanticType(concept) 125 | 126 | 127 | for (word, wtype) in SemanticTypes: 128 | for type in wtype: 129 | if (type in mental_disease): 130 | return 1 131 | 132 | return 0 133 | def concept_is_VirusBacterium(concept): 134 | # if concept_is_CommonNoun(concept) == 1: 135 | # return 0 136 | 137 | SemanticTypes = CheckSemanticType(concept) 138 | 139 | for (word, wtype) in SemanticTypes: 140 | for type in wtype: 141 | if type in bacteria: 142 | return 1 143 | 144 | return 0 145 | def concept_is_Injury(concept): 146 | # if concept_is_CommonNoun(concept) == 1: 147 | # return 0 148 | 149 | SemanticTypes = CheckSemanticType(concept) 150 | 151 | 152 | for (word, wtype) in SemanticTypes: 153 | for type in wtype: 154 | if (type in injury): 155 | return 1 156 | 157 | return 0 
158 | def concept_is_Abnormality(concept): 159 | # if concept_is_CommonNoun(concept) == 1: 160 | # return 0 161 | 162 | SemanticTypes = CheckSemanticType(concept) 163 | 164 | 165 | for (word, wtype) in SemanticTypes: 166 | for type in wtype: 167 | if (type in abnormality): 168 | return 1 169 | 170 | return 0 171 | def concept_is_AbnormalTestResult(concept): 172 | # if concept_is_CommonNoun(concept) == 1: 173 | # return 0 174 | 175 | SemanticTypes = CheckSemanticType(concept) 176 | 177 | 178 | for (word, wtype) in SemanticTypes: 179 | for type in wtype: 180 | if (type in lab_result): 181 | return 1 182 | 183 | return 0 184 | def CheckSemanticType(text): 185 | types = [] 186 | out = matcher.match(text, best_match=True, ignore_syntax=False) 187 | for words in out: 188 | word = words[0]["ngram"] 189 | temp = [] 190 | for type in list(words[0]["semtypes"]): 191 | temp.append(mapping[type]) 192 | types.append((word,temp)) 193 | return types 194 | 195 | ## Functions for script check ## 196 | 197 | #TenseFilter() 198 | 199 | 200 | def determine_tense_input(sentance): 201 | text = nltk.word_tokenize(sentance) 202 | tagged = nltk.pos_tag(text) 203 | 204 | tense = {} 205 | tense["future"] = len([word for word in tagged[-1:] if word[1] == "MD"]) 206 | tense["present"] = len([word for word in tagged[-1:] if word[1] in ["VBP", "VBZ", "VBG"]]) 207 | tense["past"] = len([word for word in tagged[-1:] if word[1] in ["VBD", "VBN"]]) 208 | return tense 209 | 210 | def TenseFilter(): 211 | 212 | file = open("problem-concept.txt") 213 | data = file.readlines() 214 | file.close() 215 | 216 | concepts = [line.strip() for line in data] 217 | 218 | past = [] 219 | future = [] 220 | 221 | for concept in concepts: 222 | tense = determine_tense_input(concept) 223 | if tense["past"] > 0: 224 | past.append(concept) 225 | if tense["future"] > 0: 226 | future.append(concept) 227 | 228 | #for word in past: 229 | # term = word.strip().split(" ") 230 | # if len(term) > 1: 231 | # term = term[-1] 232 | # else: 233 | # term = term[0] 234 | # print(term) 235 | # print(word,en.verb.present(term)) 236 | 237 | print(past) 238 | print(future) 239 | 240 | #FilterCommonNouns() 241 | 242 | ''' -------------------------------------------------------------------------------- /evaluation/basic-stats.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.tokenize.stanford import StanfordTokenizer 3 | import os 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import nltk 7 | from random import * 8 | from nltk import sent_tokenize 9 | from nltk import word_tokenize 10 | import random 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--output_dir', default='/home/anusri/Desktop/emrQA/output/', help='Directory to store the output') 15 | 16 | args = parser.parse_args() 17 | 18 | #os.environ['STANFORD_PARSER'] = '/home/anusri/Desktop/codes_submission/packages/stanford-jars/' 19 | #os.environ['STANFORD_MODELS'] = '/home/anusri/Desktop/codes_submission/packages/stanford-jars' 20 | #tokenizer = StanfordTokenizer("/home/anusri/Desktop/codes_submission/packages/stanford-jars/stanford-postagger.jar") 21 | #from matplotlib2tikz import save as tikz_save 22 | 23 | def LengthStatistics(list_values): 24 | 25 | metrics = {} 26 | Total_values= len(list_values) 27 | Total_Tokens = 0.0 28 | #print(Total_values) 29 | for question in list_values: 30 | words = word_tokenize(question.strip()) 31 | words = [word for word in words if word != ""] 32 | 
Total_Tokens += len(words) 33 | 34 | Avg_token_length = Total_Tokens / Total_values 35 | metrics["question_length"] = Total_values 36 | metrics["avg_question_length"] = Avg_token_length 37 | 38 | return (Total_values, Avg_token_length) 39 | 40 | 41 | problem = [] 42 | treatments = [] 43 | tests = [] 44 | 45 | if __name__ == '__main__': 46 | 47 | data_file = os.path.join(args.output_dir,"data.json") 48 | datasets = json.load(open(data_file), encoding="latin-1") 49 | 50 | all_questions = [] 51 | all_clinical_notes = [] 52 | 53 | total_clinical_notes = 0 54 | number_of_answers_per_question = {} 55 | num_classes = 0.0 56 | classes = [] 57 | total_evidences = [] 58 | 59 | 60 | for dataset in datasets["data"]: 61 | 62 | 63 | 64 | print("Processing dataset",dataset["title"]) 65 | 66 | for note in dataset["paragraphs"]: 67 | total_clinical_notes += 1 68 | 69 | if " ".join(note["context"]) not in all_clinical_notes: 70 | all_clinical_notes.extend([" ".join(note["context"])]) 71 | else: 72 | continue 73 | 74 | for questions in note["qas"]: 75 | 76 | all_answers = [] 77 | evidences = [] 78 | 79 | all_questions.append(list(set(questions["question"]))) # all questions 80 | 81 | for answer in questions["answers"]: 82 | 83 | if dataset["title"] in ["obesity", "smoking"] : 84 | #print(answer["text"]) 85 | classes.append(answer["text"]) 86 | continue 87 | #for txt in answer["text"]: 88 | # if txt not in all_answers: 89 | # all_answers.append(txt) 90 | else: 91 | if answer["answer_start"][0] != "": 92 | if answer["answer_start"] not in all_answers: 93 | all_answers.append(answer["answer_start"]) ## all answers 94 | #print(questions["question"][0], answer["answer_start"],answer["evidence"]) 95 | evidences.append(answer["evidence"]) 96 | 97 | total_evidences.extend(evidences) 98 | 99 | ## distribution of evidences per question type 100 | 101 | ground_truth = all_answers 102 | total_answers = len(ground_truth) 103 | if total_answers not in number_of_answers_per_question: 104 | number_of_answers_per_question[total_answers] = 0 105 | number_of_answers_per_question[total_answers] += 1 106 | 107 | 108 | print("Total Clinical Notes", total_clinical_notes, len(all_clinical_notes)) 109 | total_question = len(all_questions) 110 | totals = 0 111 | questions_list = [] 112 | for value in all_questions: 113 | totals += len(value) 114 | questions_list.extend(value) 115 | 116 | ## Average Question Length ## 117 | 118 | print("Total Number Of Questions",totals) 119 | print("Total number of question types", total_question) 120 | stats_questions = LengthStatistics(questions_list) 121 | print("Average question length",stats_questions[1]) 122 | 123 | ## Average Evidence Length ## 124 | 125 | stats_evidences = LengthStatistics(total_evidences) 126 | print("Average evidence length",stats_evidences[1]) 127 | 128 | ## Average Note Length ## 129 | 130 | stats_evidences = LengthStatistics(all_clinical_notes) 131 | print("Average clinical note length", stats_evidences[1]) 132 | 133 | ## Average number of questions per note ## 134 | 135 | print("Average Number of questions per note", totals/total_clinical_notes) 136 | print("Average number of question types per note", total_question/total_clinical_notes) 137 | 138 | ## Average number of evidences per question ## 139 | 140 | total__num_answers = 0 141 | for value in number_of_answers_per_question: 142 | if value == 0: 143 | print(number_of_answers_per_question[value]) 144 | else: 145 | total__num_answers += value*number_of_answers_per_question[value] 146 | 147 | num_classes = 
len(set(classes)) 148 | print("Average number of evidences", float(total__num_answers) / total_question) 149 | print("Percentage with one evidences",number_of_answers_per_question[1]*100.0/total_question) 150 | print("range in number of evidences",min(number_of_answers_per_question.keys()),max(number_of_answers_per_question.keys())) 151 | print("total number of classes in obesity and smoking datasets", num_classes) 152 | 153 | ################# more stats ignore for now ###################### 154 | 155 | # indefinite_evidence_type = [] 156 | # forms_in_data = [] 157 | 158 | #print(indefinite_evidence_type) 159 | #print("indefinite",len(num_answers)*100.0/total_question) 160 | #print(min(num_answers),max(num_answers)) 161 | #plt.figure(2) 162 | #plt.xlabel("Number of evidences greater than 1") 163 | #plt.ylabel("Frequency") 164 | #plt.title("Formula Size Bins") 165 | #plt.hist(num_answers, bins=3) 166 | #plt.show() 167 | #tikz_save('evidences-hist.tex') 168 | 169 | #print(number_of_answers_per_question) 170 | #stats_clinincal_notes = LengthStatistics(all_clinical_notes) 171 | #print("Total Clinincal Notes",stats_clinincal_notes[0]) 172 | #print("Average Clinincal Note length", stats_clinincal_notes[1]) 173 | 174 | #print(number_of_answers_per_question[0]) 175 | #print(number_of_answers_per_question[1]) 176 | #print(number_of_answers_per_question) 177 | ## Plot the distribution of number of answer 178 | #print(number_of_answers_per_question) 179 | 180 | #x = np.arange(len(number_of_answers_per_question)-1) 181 | #plt.bar(x,list(np.array(number_of_answers_per_question.values().remove(number_of_answers_per_question[1])))) 182 | #plt.xticks(x, number_of_answers_per_question.keys().remove(1)) 183 | #plt.show() 184 | 185 | -------------------------------------------------------------------------------- /generation/combine_data/combine_answers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import random 4 | import argparse 5 | import os 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--output_dir', default='/home/anusri/Desktop/emrQA/output/', help='Directory of output files') 9 | 10 | args = parser.parse_args() 11 | 12 | ###################################################### SET FILE PATHS ################################################################## 13 | 14 | medications = json.load(open(os.path.join(args.output_dir,"medication-qa.json"))) 15 | relations = json.load(open(os.path.join(args.output_dir,"relations-qa.json")), encoding="latin-1") 16 | risk = json.load(open(os.path.join(args.output_dir,"risk-qa.json"))) 17 | smoking = json.load(open(os.path.join(args.output_dir,"smoking-qa.json"))) 18 | obesity = json.load(open(os.path.join(args.output_dir,"obesity-qa.json"))) 19 | 20 | 21 | ######################################################## CODE ######################################################################### 22 | 23 | data = [medications, relations, risk, smoking, obesity] 24 | #data = [relations] 25 | data_out = {"data": data} 26 | json_out = os.path.join(args.output_dir,"data.json") 27 | with open(json_out, 'w') as outfile: 28 | json.dump(data_out, outfile, encoding="latin-1") 29 | 30 | total_clinical_notes = 0 31 | all_questions = [] 32 | all_clinical_notes = [] 33 | for dataset in data: 34 | 35 | for note in dataset["paragraphs"]: 36 | total_clinical_notes += 1 37 | if " ".join(note["context"]) not in all_clinical_notes: 38 | all_clinical_notes.extend([" ".join(note["context"])]) 39 | 
else: 40 | #print("repeat") 41 | continue 42 | 43 | for questions in note["qas"]: 44 | #print(questions["question"]) 45 | all_questions.append(list(set(questions["question"]))) # all questions 46 | 47 | out = [] 48 | count = {} 49 | print("Total Clinical Notes", len(all_clinical_notes)) 50 | total_question = len(all_questions) 51 | totals = 0 52 | questions_list = [] 53 | for value in all_questions: 54 | #print(value) 55 | if type(value) != list: 56 | print("error") 57 | if len(value[0]) == 1: 58 | print(value) 59 | #out.append([len(value[0]),len(value),"\t".join(value)]) 60 | #if len(value) not in count: 61 | # count[len(value)] = [] 62 | totals += len(value) 63 | questions_list.extend(value) 64 | 65 | ''' 66 | print(len(count)) 67 | new_list = sorted(out, key=lambda x: x[1], reverse=True) 68 | 69 | ofile = open("testing","w") 70 | for val in new_list: 71 | ofile.write("\t".join(map(str,val))) 72 | ofile.write("\n") 73 | 74 | ofile.close() 75 | ''' 76 | ## Average Question Length ## 77 | 78 | print("Total Number Of Questions", totals) 79 | print("Total number of question types", total_question) 80 | 81 | ################################################################################################################################## 82 | 83 | medications = os.path.join(args.output_dir,"medication-ql.csv") 84 | relations = os.path.join(args.output_dir,"relations-ql.csv") 85 | risk = os.path.join(args.output_dir,"risk-ql.csv") 86 | smoking = os.path.join(args.output_dir,"smoking-ql.csv") 87 | obesity = os.path.join(args.output_dir,"obesity-ql.csv") 88 | 89 | data = [medications, relations, risk, smoking, obesity] 90 | 91 | unique = set() 92 | 93 | 94 | for file_path in data: 95 | file = open(file_path) 96 | filereader = list(csv.reader(file)) 97 | 98 | for line in filereader[1:]: 99 | unique.add(tuple(line)) 100 | #if random.randint(1,100) < 10: 101 | #print(line) 102 | 103 | values = list(unique) 104 | 105 | print("Total number of QL forms", len(values)) 106 | 107 | final_out = os.path.join(args.output_dir,"data-ql.csv") 108 | ofile = open(final_out, "w") 109 | writer = csv.writer(ofile, delimiter="\t") 110 | writer.writerow(["Question", "Logical Form", "QTemplate", "LTemplate"]) 111 | 112 | for val in values: 113 | writer.writerow(val) 114 | 115 | ofile.close() 116 | 117 | 118 | ''' 119 | 120 | datasets = json.load(open("data.json")) 121 | for dataset in datasets: 122 | print(dataset["title"]) 123 | 124 | for ClinicalNote in dataset["paragraphs"]: 125 | 126 | NoteText = "\n".join(ClinicalNote["context"]) 127 | 128 | for questions in ClinicalNote["qas"]: 129 | 130 | paraphrase_questions = questions["question"] 131 | print(paraphrase_questions) 132 | for answer in questions["answers"]: 133 | 134 | answer_text = answer["text"] 135 | answer_start = answer["answer_start"] ## [start_line,start_token] from NoteText 136 | evidence = answer["evidence"] ## The evidence here is question line + answer line (the evidence we use as ground truth is start_line from answer_start) 137 | 138 | print(answer_text,answer_start,evidence) 139 | 140 | ''' 141 | ''' 142 | use_evidence_model = "True" 143 | 144 | paras = [] 145 | idx = 0 146 | for note in medications["paragraphs"]: 147 | 148 | if medications["title"] == "risk-dataset": 149 | 150 | text = "\n".join(note["context"]) 151 | para = {"context": text, "qas": []} 152 | 153 | for questions in note["qas"]: 154 | idx += 1 ## Take care of this 155 | question = {"question": questions["question"], "answers": [], "id": idx} 156 | 157 | if use_evidence_model == 
"True": 158 | for answer in questions["answers"]: 159 | question["answers"].append({"text": answer["evidence"], "answer_start": answer["answer_start"][0]}) ## the answer line 160 | else: 161 | for answer in questions["answers"]: 162 | question["answers"].append({"text": answer["text"], "answer_start": answer["answer_start"][1]}) ## the answer text 163 | else: 164 | 165 | text = "".join(note["context"]) 166 | line_lenth = [len(line) for line in note["context"]] 167 | para = {"context": text, "qas": []} 168 | 169 | for questions in note["qas"]: 170 | idx += 1 171 | print(questions["id"]) 172 | question = {"question": questions["question"], "answers": [], "id": idx} 173 | for answer in questions["answers"]: 174 | 175 | if use_evidence_model == "True": 176 | try: ## evidence and evidence start token 177 | question["answers"].append({"text":note["context"][answer["answer_start"][0]-1],"answer_start":sum(line_lenth[answer[:answer["answer_start"][0]-1]])}) 178 | except: 179 | unique = [] 180 | for num in list(map(lambda x: x - 1, answer["evidence_start"])): 181 | if num not in unique: 182 | unique.append(num) 183 | question["answers"].append({"text":note["context"][num],"answer_start":sum(line_lenth[:num])}) 184 | else: 185 | try: ## answer and answer start token 186 | question["answers"].append({"text": answer["text"], 187 | "answer_start": sum( 188 | line_lenth[answer[:answer["answer_start"][0] - 1]])+answer["answer_start"][1]}) 189 | except: 190 | unique = [] 191 | for num in list(map(lambda x: x - 1, answer["evidence_start"])): 192 | if num not in unique: 193 | unique.append(num) 194 | question["answers"].append( 195 | {"text": note["context"][num], "answer_start": sum(line_lenth[:num])}) 196 | 197 | 198 | para["qas"].append(question) 199 | 200 | paras.append(para) 201 | 202 | medications_new = {"paragraphs": paras, "title": "medications"} 203 | 204 | #file = open("file.json", "w") 205 | data = {} 206 | data["data"] = [medications_new] 207 | output = {'qids': [], 'questions': [], 'answers': [], 208 | 'contexts': [], 'qid2cid': []} 209 | for article in data["data"]: 210 | for paragraph in article['paragraphs']: 211 | output['contexts'].append(paragraph['context']) 212 | for qa in paragraph['qas']: 213 | output['qids'].append(qa['id']) 214 | #print(qa["question"]) 215 | output['questions'].append(qa['question']) 216 | output['qid2cid'].append(len(output['contexts']) - 1) 217 | if 'answers' in qa: 218 | output['answers'].append(qa['answers']) 219 | #print(qa['answers']) 220 | 221 | json_out = "data_squad_format.json" 222 | with open(json_out, 'w') as outfile: 223 | json.dump(data, outfile, encoding="utf-8") 224 | 225 | ''' -------------------------------------------------------------------------------- /generation/i2b2_relations/common_names.txt: -------------------------------------------------------------------------------- 1 | patient 2 | affected_role 3 | patient_role 4 | trial 5 | trial_run 6 | test 7 | tryout 8 | psychometric_test 9 | examination 10 | exam 11 | hour_angle 12 | ha 13 | problem 14 | job 15 | trouble 16 | stopping_point 17 | finale 18 | finis 19 | finish 20 | conclusion 21 | death 22 | end 23 | final_stage 24 | shoemaker's_last 25 | cobbler's_last 26 | department_of_energy 27 | energy_department 28 | energy 29 | doe 30 | medicine 31 | medication 32 | medicament 33 | medicinal_drug 34 | pickings 35 | history 36 | account 37 | chronicle 38 | story 39 | return 40 | issue 41 | takings 42 | proceeds 43 | yield 44 | payoff 45 | consequence 46 | effect 47 | outcome 48 | result 49 | 
event 50 | solution 51 | answer 52 | resolution 53 | solvent 54 | resultant 55 | final_result 56 | termination 57 | resultant_role 58 | treatment 59 | intervention 60 | discussion 61 | discourse 62 | wherefore 63 | past 64 | past_times 65 | yesteryear 66 | past_tense 67 | washington 68 | evergreen_state 69 | wa 70 | holocene 71 | holocene_epoch 72 | recent_epoch 73 | electric_current 74 | stream 75 | flow 76 | platinum 77 | pt 78 | atomic_number_78 79 | mention 80 | reference 81 | citation 82 | cite 83 | acknowledgment 84 | credit 85 | quotation 86 | honorable_mention 87 | family 88 | household 89 | house 90 | home 91 | menage 92 | family_unit 93 | class 94 | category 95 | family_line 96 | folk 97 | kinfolk 98 | kinsfolk 99 | sept 100 | phratry 101 | kin 102 | kinsperson 103 | syndicate 104 | crime_syndicate 105 | mob 106 | fellowship 107 | dose 108 | dosage 109 | venereal_disease 110 | vd 111 | venereal_infection 112 | social_disease 113 | cupid's_itch 114 | cupid's_disease 115 | venus's_curse 116 | sexually_transmitted_disease 117 | std 118 | acid 119 | back_breaker 120 | battery-acid 121 | dot 122 | elvis 123 | loony_toons 124 | lucy_in_the_sky_with_diamonds 125 | pane 126 | superman 127 | window_pane 128 | zen 129 | diagnosis 130 | time 131 | clip 132 | clock_time 133 | fourth_dimension 134 | meter 135 | metre 136 | prison_term 137 | sentence 138 | number_one 139 | number_1 140 | commencement 141 | outset 142 | get-go 143 | start 144 | kickoff 145 | starting_time 146 | showtime 147 | offset 148 | first_base 149 | first_gear 150 | low_gear 151 | presumption 152 | precondition 153 | scope 154 | range 155 | reach 156 | orbit 157 | compass 158 | ambit 159 | mountain_range 160 | range_of_mountains 161 | chain 162 | mountain_chain 163 | chain_of_mountains 164 | image 165 | range_of_a_function 166 | grasp 167 | stove 168 | kitchen_range 169 | cooking_stove 170 | value 171 | economic_value 172 | time_value 173 | note_value 174 | chemical_reaction 175 | reaction 176 | response 177 | fountainhead 178 | type 179 | character 180 | case 181 | cause 182 | reason 183 | grounds 184 | campaign 185 | crusade 186 | drive 187 | movement 188 | effort 189 | causal_agent 190 | causal_agency 191 | lawsuit 192 | suit 193 | causa 194 | iodine 195 | iodin 196 | i 197 | atomic_number_53 198 | ace 199 | unity 200 | tendency 201 | trend 202 | course 203 | drift 204 | vogue 205 | style 206 | specialist 207 | specializer 208 | specialiser 209 | medical_specialist 210 | positive_degree 211 | date 212 | day_of_the_month 213 | escort 214 | appointment 215 | engagement 216 | particular_date 217 | veteran 218 | veteran_soldier 219 | vet 220 | ex-serviceman 221 | old-timer 222 | oldtimer 223 | old_hand 224 | warhorse 225 | old_stager 226 | stager 227 | use 228 | usage 229 | utilization 230 | utilisation 231 | employment 232 | exercise 233 | function 234 | purpose 235 | role 236 | consumption 237 | economic_consumption 238 | usance 239 | use_of_goods_and_services 240 | habit 241 | manipulation 242 | enjoyment 243 | checkup 244 | medical_checkup 245 | medical_examination 246 | medical_exam 247 | health_check 248 | show 249 | display 250 | appearance 251 | smoke 252 | smoking 253 | woman 254 | adult_female 255 | charwoman 256 | char 257 | cleaning_woman 258 | cleaning_lady 259 | womanhood 260 | fair_sex 261 | list 262 | listing 263 | tilt 264 | inclination 265 | lean 266 | spring 267 | springiness 268 | lab 269 | laboratory 270 | research_lab 271 | research_laboratory 272 | science_lab 273 | science_laboratory 274 | symptom 
275 | side 276 | face 277 | side_of_meat 278 | position 279 | slope 280 | incline 281 | ground 282 | intellect 283 | rationality 284 | reasonableness 285 | impact 286 | wallop 287 | impingement 288 | encroachment 289 | evaluation 290 | rating 291 | valuation 292 | department 293 | section 294 | indication 295 | indicant 296 | denotation 297 | reading 298 | record 299 | phonograph_record 300 | disk 301 | disc 302 | platter 303 | track_record 304 | record_book 305 | book 306 | criminal_record 307 | measurement 308 | measure 309 | mensuration 310 | evidence 311 | graph 312 | graphical_record 313 | light-emitting_diode 314 | report 315 | study 316 | written_report 317 | news_report 318 | write_up 319 | report_card 320 | composition 321 | paper 322 | theme 323 | reputation 324 | etiology 325 | aetiology 326 | unit_of_measurement 327 | unit 328 | social_unit 329 | building_block 330 | work 331 | piece_of_work 332 | workplace 333 | oeuvre 334 | body_of_work 335 | heights 336 | senior_high_school 337 | senior_high 338 | highschool 339 | high_school 340 | high_gear 341 | hush 342 | stillness 343 | distillery 344 | manner 345 | mode 346 | way 347 | fashion 348 | modality 349 | mood 350 | musical_mode 351 | modal_value 352 | year 353 | twelvemonth 354 | yr 355 | times 356 | multiplication 357 | cover 358 | set 359 | exercise_set 360 | stage_set 361 | circle 362 | band 363 | lot 364 | bent 365 | solidification 366 | seth 367 | readiness 368 | exterior 369 | drug 370 | prescription 371 | prescription_drug 372 | prescription_medicine 373 | ethical_drug 374 | baseline 375 | service_line 376 | status 377 | condition 378 | tin 379 | canful 380 | can_buoy 381 | buttocks 382 | nates 383 | arse 384 | butt 385 | backside 386 | bum 387 | buns 388 | fundament 389 | hindquarters 390 | hind_end 391 | keister 392 | posterior 393 | prat 394 | rear 395 | rear_end 396 | rump 397 | stern 398 | seat 399 | tail 400 | tail_end 401 | tooshie 402 | tush 403 | bottom 404 | fanny 405 | ass 406 | toilet 407 | commode 408 | crapper 409 | pot 410 | potty 411 | stool 412 | throne 413 | lavatory 414 | lav 415 | john 416 | privy 417 | bathroom 418 | exploitation 419 | victimization 420 | victimisation 421 | the_like 422 | the_likes_of 423 | ilk 424 | ar 425 | master_of_education 426 | startle 427 | jump 428 | starting_line 429 | scratch 430 | scratch_line 431 | head_start 432 | david_low 433 | sir_david_low 434 | sir_david_alexander_cecil_low 435 | complication 436 | ramification 437 | complicatedness 438 | knottiness 439 | tortuousness 440 | visit 441 | sojourn 442 | remote_control 443 | remote 444 | child 445 | kid 446 | youngster 447 | minor 448 | shaver 449 | nipper 450 | small_fry 451 | tiddler 452 | tike 453 | tyke 454 | fry 455 | baby 456 | veterinary_surgeon 457 | hazard 458 | jeopardy 459 | peril 460 | risk 461 | endangerment 462 | danger 463 | risk_of_infection 464 | risk_of_exposure 465 | stallion 466 | fleshiness 467 | obesity 468 | corpulency 469 | findings 470 | determination 471 | change 472 | alteration 473 | modification 474 | variety 475 | search 476 | hunt 477 | lookup 478 | mixer 479 | hospital 480 | standard 481 | criterion 482 | touchstone 483 | summary 484 | sum-up 485 | detail 486 | item 487 | particular_proposition 488 | room 489 | elbow_room 490 | values 491 | meet 492 | control 493 | control_condition 494 | dominance 495 | ascendance 496 | ascendence 497 | ascendancy 498 | ascendency 499 | command 500 | mastery 501 | controller 502 | need 503 | demand 504 | want 505 | motivation 506 | motive 507 | 
indigence 508 | penury 509 | pauperism 510 | pauperization 511 | world_health_organization 512 | travel 513 | change_of_location 514 | locomotion 515 | numbers 516 | book_of_numbers 517 | numbers_pool 518 | numbers_game 519 | numbers_racket 520 | number 521 | figure 522 | act 523 | routine 524 | turn 525 | bit 526 | phone_number 527 | telephone_number 528 | identification_number 529 | practice 530 | pattern 531 | drill 532 | practice_session 533 | recitation 534 | praxis 535 | effects 536 | personal_effects 537 | impression 538 | essence 539 | burden 540 | core 541 | gist 542 | force 543 | fill 544 | woof 545 | weft 546 | pick 547 | infusion 548 | extract 549 | excerpt 550 | excerption 551 | selection 552 | management 553 | direction 554 | ii 555 | deuce 556 | transcript 557 | copy 558 | written_matter 559 | kind 560 | sort 561 | form 562 | multiple 563 | convention 564 | rule 565 | formula 566 | incidence 567 | relative_incidence 568 | plan 569 | program 570 | programme 571 | design 572 | architectural_plan 573 | nosology 574 | diagnostics 575 | discovery 576 | breakthrough 577 | iii 578 | trio 579 | threesome 580 | tierce 581 | leash 582 | troika 583 | triad 584 | trine 585 | trinity 586 | ternion 587 | triplet 588 | tercet 589 | terzetto 590 | trey 591 | deuce-ace 592 | stop 593 | halt 594 | stoppage 595 | stopover 596 | layover 597 | arrest 598 | check 599 | hitch 600 | stay 601 | stop_consonant 602 | plosive_consonant 603 | plosive_speech_sound 604 | plosive 605 | period 606 | point 607 | full_stop 608 | full_point 609 | diaphragm 610 | catch 611 | blockage 612 | block 613 | closure 614 | occlusion 615 | procedure 616 | process 617 | operation 618 | subroutine 619 | subprogram 620 | exhibit 621 | presentation 622 | video_display 623 | conditions 624 | weather 625 | weather_condition 626 | atmospheric_condition 627 | stipulation 628 | circumstance 629 | consideration 630 | shape 631 | term 632 | experimental_condition 633 | medical_specialty 634 | practice_of_medicine 635 | music 636 | recommendation 637 | testimonial 638 | good_word 639 | passport 640 | semen 641 | seed 642 | seminal_fluid 643 | ejaculate 644 | cum 645 | footing 646 | basis 647 | base 648 | foundation 649 | groundwork 650 | cornerstone 651 | name 652 | gens 653 | public_figure 654 | epithet 655 | fillet 656 | chart 657 | elevated_railroad 658 | el 659 | contraindication 660 | virginia 661 | old_dominion 662 | old_dominion_state 663 | va 664 | department_of_veterans_affairs 665 | degree 666 | grade 667 | level 668 | tier 669 | stage 670 | spirit_level 671 | horizontal_surface 672 | layer 673 | stratum 674 | floor 675 | storey 676 | person 677 | someone 678 | somebody 679 | mortal 680 | soul 681 | rich_person 682 | wealthy_person 683 | sum 684 | totality 685 | aggregate 686 | amount 687 | prevention 688 | bar 689 | campaigner 690 | candidate 691 | nominee 692 | prospect 693 | admission 694 | admittance 695 | entrance_fee 696 | admission_charge 697 | admission_fee 698 | admission_price 699 | price_of_admission 700 | entrance_money 701 | entree 702 | access 703 | accession 704 | night 705 | nighttime 706 | dark 707 | nox 708 | duration 709 | continuance 710 | length 711 | diet 712 | vital_organ 713 | vitals 714 | facility 715 | installation 716 | adeptness 717 | adroitness 718 | deftness 719 | quickness 720 | method 721 | appraisal 722 | assessment 723 | judgment 724 | judgement 725 | maximum 726 | upper_limit 727 | utmost 728 | uttermost 729 | intercession 730 | interference 731 | interposition 732 | follow-up 733 | 
followup
reexamination
review
whitethorn
english_hawthorn
crataegus_laevigata
crataegus_oxycantha
details
inside_information
contingent

--------------------------------------------------------------------------------
/generation/i2b2_obesity/obesity-answers.py:
--------------------------------------------------------------------------------
import xmltodict
import csv
import json
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('--i2b2_dir', default='', help='Directory containing i2b2 obesity challenge files')
parser.add_argument('--templates_dir', default='', help='Directory containing template files in the given format')
parser.add_argument('--output_dir', default='', help='Directory to store the output')
args = parser.parse_args()

###################################################### SET FILE PATHS ##################################################################

templates_file = args.templates_dir
obesity_file_path = args.i2b2_dir

file_names = ["obesity_standoff_annotations_test.xml", "obesity_standoff_annotations_training.xml"]
note_names = ["obesity_patient_records_test.xml", "obesity_patient_records_training.xml"]

ql_output = os.path.join(args.output_dir, "obesity-ql.csv")
#print(ql_output)
qa_json_out = os.path.join(args.output_dir, "obesity-qa.json")

######################################################## CODE #########################################################################

def ReadFile():

    file_path = obesity_file_path

    Patient = {}  # note_id is the key, with a dictionary as the value

    ## read the clinical note text ##

    for note_name in note_names:
        file = file_path + note_name
        with open(file) as fd:
            XML = xmltodict.parse(fd.read())

        for doc in XML["root"]["docs"]["doc"]:
            doc_id = doc["@id"]
            note_text = doc["text"]

            if doc_id not in Patient:
                Patient[doc_id] = {}
                Patient[doc_id]["text"] = note_text

    ## read the intuitive and textual disease judgments ##

    for file_name in file_names:
        file = file_path + file_name
        with open(file) as fd:
            XML = xmltodict.parse(fd.read())

        intuitive = XML["diseaseset"]["diseases"][0]["disease"]
        textual = XML["diseaseset"]["diseases"][1]["disease"]

        #print(intuitive)
        for idx in range(len(intuitive)):

            disease_name = intuitive[idx]["@name"]
            intuitive_docs_list = intuitive[idx]["doc"]

            for pidx in range(len(intuitive_docs_list)):

                idoc_id = intuitive_docs_list[pidx]["@id"]
                ijudgment = intuitive_docs_list[pidx]["@judgment"]

                if idoc_id not in Patient:
                    Patient[idoc_id] = {}
                if disease_name not in Patient[idoc_id]:
                    Patient[idoc_id][disease_name] = ijudgment

        for idx in range(len(textual)):

            disease_name = textual[idx]["@name"]
            textual_docs_list = textual[idx]["doc"]

            for pidx in range(len(textual_docs_list)):

                tdoc_id = textual_docs_list[pidx]["@id"]
                tjudgment = textual_docs_list[pidx]["@judgment"]

                try:
                    ijudgment = Patient[tdoc_id][disease_name]
                    ## flag disagreements between the intuitive and textual judgments ##
                    if ijudgment != tjudgment and tjudgment != "U" and tjudgment != "Q":
                        print(ijudgment, tjudgment, disease_name, tdoc_id)
                except:
                    try:
                        Patient[tdoc_id][disease_name] = tjudgment
                    except:
                        Patient[tdoc_id] = {disease_name: tjudgment}
                    continue


    return Patient

def 
MakeJSONOut(obesity_data,json_out,Patient): 96 | 97 | 98 | obesity_out = {"paragraphs": [], "title": "obesity"} 99 | 100 | for note_id in Patient: 101 | Y_class = [] 102 | U_class = [] 103 | Q_class = [] 104 | N_class = [] 105 | patient_note = Patient[note_id]["text"] 106 | out = {"note_id": note_id, "context": patient_note, "qas": []} 107 | unique_questions = [] 108 | 109 | for problem in Patient[note_id]: 110 | if problem == "text": 111 | continue 112 | if Patient[note_id][problem] == "Y": 113 | Y_class.append(problem) 114 | elif Patient[note_id][problem] == "N": 115 | N_class.append(problem) 116 | elif Patient[note_id][problem] == "U": 117 | U_class.append(problem) 118 | elif Patient[note_id][problem] == "Q": 119 | Q_class.append(problem) 120 | else: 121 | print(Patient[note_id][problem]) 122 | 123 | ###### not doing on all questions ##### 124 | 125 | for row in obesity_data: 126 | question = row[2].strip() 127 | 128 | if question == "": 129 | continue 130 | lform = row[3] 131 | answer_type = row[4] 132 | question = question.replace("\t", "") 133 | lform = lform.replace("\t", "") 134 | orginal = question 135 | 136 | if answer_type == "problems": 137 | for idx in range(len(Y_class)): 138 | problem = Y_class[idx] 139 | question = orginal 140 | 141 | if problem == "Obesity": 142 | qwords = question.split("|") 143 | qwords[1] = problem 144 | lform_new = lform.replace("|problem|",problem) 145 | qwords = [word.strip() for word in qwords] 146 | final_question = " ".join(qwords) 147 | Answer = Y_class[0:idx] + Y_class[idx + 1:] 148 | else: 149 | question = orginal.replace("|problem|", problem) 150 | lform_new = lform.replace("|problem|", problem) 151 | filewriter_forlform.writerow([question] + [lform_new] + [question] + [lform]) 152 | continue 153 | 154 | ans_list = [] 155 | for ans in Answer: 156 | ans_list.append({"answer_start": "", "text": ans, "evidence": "", "evidence_start": ""}) 157 | #print(final_question) 158 | answer = {"answers": ans_list, "id": [[final_question,final_question],lform], "question": [final_question]} 159 | out["qas"].append(answer) 160 | 161 | filewriter_forlform.writerow([question] + [lform_new] + [question] + [lform]) 162 | 163 | elif answer_type == "yes/no" and "|problem|" in question: 164 | answers = ["yes", "no", "UNK"] 165 | jdx = -1 166 | question_template = question.split("##") 167 | #print(question) 168 | for temp in [Y_class, N_class, U_class]: 169 | jdx += 1 170 | for problem in temp: 171 | 172 | #if problem.lower() != "obesity": 173 | # continue 174 | 175 | orginal_lform = lform 176 | question_lits = question.replace("|problem|",problem).split("##") 177 | lform_new = lform.replace("|problem|", problem) 178 | #print(question_lits) 179 | idx = 0 180 | if question_lits not in unique_questions: 181 | unique_questions.append(question_lits) 182 | 183 | for q in question_lits: 184 | filewriter_forlform.writerow([q] + [lform_new] + [question_template[idx]] + [orginal_lform]) 185 | idx += 1 186 | 187 | Answer = [answers[jdx]] 188 | ans_list = [] 189 | for ans in Answer: 190 | ans_list.append({"answer_start": "", "text": ans, "evidence": "", "evidence_start": ""}) 191 | 192 | answer = {"answers": ans_list, "id": [zip(question_lits,question_template),orginal_lform], "question": question_lits} 193 | 194 | out["qas"].append(answer) 195 | else: 196 | print(answer_type) 197 | 198 | obesity_out["paragraphs"].append(out) 199 | 200 | with open(json_out, 'w') as outfile: 201 | json.dump(obesity_out, outfile) 202 | 203 | 204 | if __name__=="__main__": 205 | 206 | ofile = 
open(ql_output, "w") 207 | filewriter_forlform = csv.writer(ofile, delimiter="\t") 208 | filewriter_forlform.writerow(["Question", "Logical Form"]) 209 | 210 | ### Read i2b2 files ### 211 | 212 | Patient = ReadFile() 213 | 214 | ### File to read templates ### 215 | 216 | qfile = open(templates_file) 217 | read_data = list(csv.reader(qfile)) 218 | 219 | ## read only templates relevant to obesity challenge ## 220 | 221 | obesity_data = [] 222 | for line in read_data[1:]: 223 | if line[0] != "obesity": 224 | continue 225 | obesity_data.append(line) 226 | 227 | 228 | MakeJSONOut(obesity_data,qa_json_out,Patient) 229 | #MakeQuestion(questions_file,out_file,Patient) 230 | 231 | 232 | ''' 233 | def MakeQuestion(questions_file,out_file,Patient): 234 | 235 | qfile = open(questions_file) 236 | read_data = list(csv.reader(qfile, delimiter="\t")) 237 | 238 | ofile = open(out_file, "w") 239 | ofilewriter = csv.writer(ofile) 240 | 241 | values = ["Question", "Answer", "Answer line in note", "Note ID", "Difference in QA lines"] 242 | ofilewriter.writerow(values) 243 | 244 | 245 | for note_id in Patient: 246 | Y_class = [] 247 | U_class = [] 248 | Q_class = [] 249 | N_class = [] 250 | for problem in Patient[note_id]: 251 | if Patient[note_id][problem] == "Y": 252 | Y_class.append(problem) 253 | elif Patient[note_id][problem] == "N": 254 | N_class.append(problem) 255 | elif Patient[note_id][problem] == "U": 256 | U_class.append(problem) 257 | elif Patient[note_id][problem] == "Q": 258 | Q_class.append(problem) 259 | else: 260 | print(Patient[note_id][problem]) 261 | 262 | 263 | for row in read_data[1:4]: 264 | question = row[1].strip() 265 | if question == "": 266 | continue 267 | #print(row) 268 | answer_type = row[3] 269 | question_in = row[0] #question_concept_type 270 | 271 | if answer_type == "problems": 272 | for idx in range(len(Y_class)): 273 | problem = Y_class[idx] 274 | qwords = question.split("|") 275 | qwords[1] = problem 276 | qwords = [word.strip() for word in qwords] 277 | final_question = " ".join(qwords) 278 | Answer = Y_class[0:idx]+Y_class[idx+1:] 279 | ofilewriter.writerow([final_question," ".join(Answer), "", note_id, ""]) 280 | elif answer_type == "yes/no" and question_in == "problem": 281 | answers = ["yes","no",""] 282 | jdx = -1 283 | for temp in [Y_class,N_class,U_class]: 284 | jdx += 1 285 | for idx in range(len(temp)): 286 | problem = temp[idx] 287 | qwords = question.split("|") 288 | qwords[1] = problem 289 | qwords = [word.strip() for word in qwords] 290 | final_question = " ".join(qwords) 291 | Answer = answers[jdx] 292 | ofilewriter.writerow([final_question,Answer, "", note_id, ""]) 293 | elif answer_type == "yes/no" and question_in == "None": 294 | try: 295 | if Patient[note_id]["Obesity"] == "Y": 296 | ofilewriter.writerow([question, "yes", "", note_id, ""]) 297 | if Patient[note_id]["Obesity"] == "N": 298 | ofilewriter.writerow([question, "no", "", note_id, ""]) 299 | if Patient[note_id]["Obesity"] == "U": 300 | ofilewriter.writerow([question, "", "", note_id, ""]) 301 | except: 302 | print(Patient[note_id].keys()) 303 | else: 304 | print(answer_type,question_in) 305 | ''' -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # emrQA: A Large Corpus for Question Answering on Electronic Medical Records 2 | The page and codes are ready for use. We are excited to announce that this data 3 | will now be hosted directly under the i2b2 license !! 
So you can directly
4 | download the dataset from the i2b2 website instead of generating it from the scripts!
5 | For downloading, please refer to the instructions under the download tab below.
6 | For later versions of emrQA and recent updates, contact Preethi Raghavan (praghav@us.ibm.com).
7 |
8 | - This repo contains code for the paper
9 | Anusri Pampari, Preethi Raghavan, Jennifer Liang and Jian Peng,
10 | [emrQA: A Large Corpus for Question Answering on Electronic Medical Records][paper-link],
11 | In Conference on Empirical Methods in Natural Language Processing (EMNLP) 2018, Brussels, Belgium.
12 | - General queries/thoughts are addressed in the discussion section below.
13 | - To automatically download emrQA from i2b2 instead of generating it with the scripts, follow the instructions listed below.
14 | - For later versions of emrQA, contact Preethi Raghavan (praghav@us.ibm.com).
15 | - Please contact [Anusri Pampari][anusri-home] (\@stanford.edu) for suggestions and comments. More instructions about reporting bugs are detailed below.
16 |
17 | ## Quick Links
18 |
19 | - [About](#question-answering-on-electronic-medical-records)
20 | - [Download Dataset](#download-dataset)
21 | - [Requirements](#requirements)
22 | - [Data Generation](#emrqa-generation)
23 | - [Data Analysis](#emrqa-analysis)
24 | - [Baselines](#baselines)
25 | - [Discussion](#discussion)
26 | - [Report a bug](#dataset-bugs)
27 | - [Current works using emrQA](#current-works-using-emrqa)
28 |
29 | ## Question Answering on Electronic Medical Records (EMR)
30 |
31 | In this work, we address the lack of any publicly available EMR Question Answering (QA) corpus by creating a large-scale dataset, emrQA, using a novel semi-automated generation framework that requires minimal expert involvement and re-purposes existing annotations available for other clinical NLP tasks. To briefly summarize the generation process: (1) we collect questions from experts, (2) convert them to templates by replacing entities with placeholders, (3) have experts annotate the templates with logical form templates, and then (4) use annotations from existing NLP tasks (based on the information in the logical forms) to populate the placeholders in the templates and generate answers (a minimal sketch of steps (2) and (4) is given after the next paragraph). For our purpose, we use existing NLP task annotations from the [i2b2 Challenge datasets][i2b2-datasets]. We refer the reader to the paper for a more detailed overview of the generation framework.
32 |
33 | This repository includes the question and logical form templates provided by our experts and the code for generating the emrQA dataset from these templates and the i2b2 challenge datasets. Note that this work is a refactored and extended version of the original dataset described in the paper.
34 |
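To make steps (2) and (4) concrete, the sketch below instantiates a question/logical-form template pair by filling its `|placeholder|` slots (the convention used in `templates-all.csv`) with an entity drawn from an annotation. The template strings and the entity value here are hypothetical illustrations, not rows from the actual template file.

```python
# Hypothetical question and logical form templates using the |placeholder| convention.
question_template = "Why is the patient on |medication|?"
lform_template = "MedicationEvent (|medication|) [reason]"

# Hypothetical entity, e.g. drawn from an i2b2 medication annotation.
entity = "aspirin"

# Step (4): populate the placeholders to obtain a concrete question and logical form.
question = question_template.replace("|medication|", entity)
lform = lform_template.replace("|medication|", entity)

print(question)  # Why is the patient on aspirin?
print(lform)     # MedicationEvent (aspirin) [reason]
```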
35 | Some statistics of the current version of the generated data:
36 |
37 | | Datasets | QA pairs | QL pairs | #Clinical Notes |
38 | | :------: | :------: | :------: | :----: |
39 | | i2b2 relations (concepts, relations, assertions)| 1,322,789 | 1,008,205 | 425 |
40 | | i2b2 medications | 226,128 | 190,169 | 261 |
41 | | i2b2 heart disease risk | 49,897 | 35,777 | 119 |
42 | | i2b2 smoking | 4,518 | 14 | 502 |
43 | | i2b2 obesity | 354,503 | 336 | 1,118 |
44 | | **emrQA (total)** | **1,957,835** | **1,225,369** | **2,425** |
45 |
46 | **UPDATES:**
47 | ```
48 | 29th November 2018: We are excited to announce that this data will now be hosted directly under the i2b2 license !! So you can directly download the dataset from the i2b2 website instead of generating it from the scripts.
49 | 27th August 2018: Extended the i2b2 obesity question-answer pairs to obesity comorbidities.
50 | 20th August 2018: Added QA pairs generated from i2b2 relations (assertions).
51 | 27th June 2018: Dataset as described in the paper.
52 | ```
53 |
54 | ## Download Dataset
55 |
56 | emrQA is available for download here: https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/ (you'll need to sign the agreement and request the data; it comes through the same day). You'll find it listed under Community Annotations Downloads as follows:
57 |
58 | ![Image](emrqa_download_image.jpg)
59 |
60 | Please note that the download link title in the image has a typo. The link says that the question answers were generated from the 2014 Heart Disease risk factors data, but they were generated from all the datasets listed in the table above (medications, smoking, obesity, heart disease and relations). So ignore the title and go ahead and download the entire dataset from the link. i2b2/n2c2 will soon fix this typo.
61 |
62 | ## Requirements
63 |
64 | To generate emrQA, first download the NLP datasets from the [i2b2 Challenges][i2b2-datasets], accessible to everyone subject to a license agreement. You will need to download and extract all the datasets corresponding to a given challenge (e.g. the 2009 Medications Challenge) to a directory named `i2b2` in the main folder (the contents of this folder are elaborated below in the discussion section for your reference). Once completed, check the path location in `main.py`. In our work, we have currently made use of all the challenge datasets except the 2012 Temporal Relations Challenge. A future extension of the dataset that includes this challenge will soon be available.
65 |
66 | The generation scripts in the repo require Python 2.7. Run the following commands to clone the repository and install the requirements for emrQA:
67 |
68 | ```bash
69 | git clone https://github.com/emrqa/emrQA.git
70 | cd emrQA; pip install -r requirements.txt
71 | ```
72 |
73 |
74 | ## emrQA Generation
75 |
76 | Run `python main.py` to generate the question-answer pairs in a JSON format and the question-logical form pairs in a CSV format. The input to these scripts is a csv file (`templates-all.csv`) located in the `templates/` directory. By default the script creates an `output/` directory to store all the generated files. You can access the combined question-answer dataset as `data.json` and the question-logical form data as `data-ql.csv`. You can also access the intermediate datasets generated for every i2b2 challenge (e.g. `medications-qa.json` and `medication-ql.csv` generated from the 2009 medications challenge annotations).
77 |
78 |
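For a quick first look at the generated files, a minimal sketch along the following lines loads one question-answer pair and one question-logical form row. It assumes the default `output/` location and the file layouts documented below; whether the combined CSV keeps a header row should be verified against your generated output.

```python
import csv
import json

# One QA pair from the combined corpus (JSON layout documented below).
with open("output/data.json") as f:
    data = json.load(f)
paragraph = data["data"][0]["paragraphs"][0]
qa = paragraph["qas"][0]
print(paragraph["note_id"])
print(qa["question"][0])          # one paraphrase of the question
print(qa["answers"][0]["text"])   # the answer entity (or a list, see "complex" below)

# One question-logical form row (tab-separated).
with open("output/data-ql.csv") as f:
    rows = csv.reader(f, delimiter="\t")
    next(rows)    # skip the header row written by the per-challenge scripts
    print(next(rows))
```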
79 | A thorough discussion of the output format of these files is presented below.
80 |
81 | #### Input: Templates (CSV) Format
82 |
83 | Each row in the csv file has the following format:
84 |
85 | ```
86 | "dataset" \t "question templates" \t "logical form templates" \t "answer type" \t "sub-answer-type"
87 | ```
88 |
89 | A brief explanation of how the following fields are used in `main.py`:
90 |
91 | ```
92 | dataset: The i2b2 challenge dataset annotations to be used for the templates in that row. This field should be one of the following values: medications, relations, risk, smoking or obesity.
93 |
94 | question templates: All the question paraphrase templates, provided as a single string separated by ##.
95 |
96 | logical form templates: The logical form template that the experts annotated for the question templates.
97 |
98 | answer type: The type of the expected answer (e.g. problems, yes/no, dosage), which determines how the answers are populated from the annotations.
99 |
100 | sub-answer-type:
101 | ```
102 | #### Output: Question-Answer (JSON) Format
103 |
104 | The json files in the `output/` directory have the following format:
105 |
106 | ```
107 | data.json
108 | └── "data"
109 |     └── [i]
110 |         ├── "paragraphs"
111 |         │   └── [j]
112 |         │       ├── "note_id": "clinical note id"
113 |         │       ├── "context": "clinical note text"
114 |         │       └── "qas"
115 |         │           └── [k]
116 |         │               ├── "answers"
117 |         │               │   └── [l]
118 |         │               │       ├── "answer_start"
119 |         │               │       │   └── [m]
120 |         │               │       │       ├── integer (line number in clinical note to find the answer entity)
121 |         │               │       │       └── integer (token position in line to find the answer entity)
122 |         │               │       │
123 |         │               │       ├── "text": "answer entity"
124 |         │               │       │
125 |         │               │       ├── "evidence": "evidence line to support the answer entity"
126 |         │               │       │
127 |         │               │       ├── "answer_entity_type": takes the value "single", "empty" or "complex" (refer to the discussion for more details)
128 |         │               │       │
129 |         │               │       └── "evidence_start": integer (line number in clinical note to find the evidence line)
130 |         │               │
131 |         │               ├── "id"
132 |         │               │   └── [n]
133 |         │               │       ├── [o]
134 |         │               │       │   ├── "paraphrase question"
135 |         │               │       │   └── "paraphrase question-template"
136 |         │               │       │
137 |         │               │       └── "logical-form-template"
138 |         │               │
139 |         │               └── "question"
140 |         │                   └── [p]
141 |         │                       └── "paraphrase question"
142 |         │
143 |         └── "title": "i2b2 challenge name"
144 |
145 | ```
146 |
155 |
156 | #### Output: Question-Logical Form (CSV) Format
157 |
158 | Each row in the csv file has the following format:
159 |
160 | ```
161 | "question" \t "logical-form" \t "question-template" \t "logical-form-template"
162 | ```
163 |
164 | ## emrQA Analysis
165 |
166 | #### Basic statistics
167 |
168 | To run the script that computes basic statistics of the dataset, such as the average question length:
169 |
170 | ```bash
171 | python evaluation/basic-stats.py --output_dir output/
172 | ```
173 |
174 | #### Paraphrase analysis
175 |
176 | To run the script that computes (1) the average number of paraphrase templates and (2) the Jaccard and BLEU scores of the paraphrase templates:
177 |
178 | ```bash
179 | python evaluation/paraphrase-analysis.py --templates_dir templates/
180 | ```
181 |
182 | #### Logical form template analysis
183 |
184 | To run the script that filters logical form templates with specific properties:
185 |
186 | ```bash
187 | python evaluation/template-analysis.py --templates_dir templates/
188 | ```
189 | ## Discussion
190 |
191 | ##### What is the "answer_entity_type" field for?
192 |
193 | The "answer_entity_type" field in `data.json` takes one of the following values:
194 |
195 | 1) "empty": This indicates that the "text" field is an empty string, which means that there is no specific entity to look for in the evidence line.
196 |
197 | 2) "single": This indicates that the "text" field contains a single entity that can be found in the evidence line and answers the question by itself.
198 |
199 | 3) "complex": This indicates that the "text" field is a list of entities, all of which are needed together to form a single answer. Here the evidence lines and answer_start values (line start and token start) are lists with one entry per entity.
200 |
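A consumer of `data.json` can branch on this field; here is a minimal sketch (field names as documented in the JSON format above):

```python
def answer_entities(answer):
    """Return the list of entity strings for one answer dict from data.json (sketch)."""
    etype = answer.get("answer_entity_type")
    if etype == "empty":
        # No entity to extract; the evidence line itself carries the answer.
        return []
    if etype == "complex":
        # "text" is a list of entities that jointly form a single answer.
        return list(answer["text"])
    # "single": one entity that can be found in the evidence line.
    return [answer["text"]]
```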
201 | ##### Why do I see “coronary artery” instead of “coronary artery disease” in the question? Why is the entity used in the question not complete?
202 |
203 | We have a preprocessing step before using the i2b2 annotations in the questions. This is because the annotations themselves are noisy and can include generic concepts.
204 |
205 | For example,
206 |
207 | "Minor disease", "her disease", "her dominant CAD" - these are all annotated as problems. So we remove/clean them in a preprocessing step using rules that check for generic words in the annotation. As a result of this, we get "coronary artery" instead of "coronary artery disease".
208 |
209 | ##### How is the "context" field related to the clinical notes text?
210 |
211 | In the i2b2 medications, relations, smoking and obesity challenges, every patient has a single clinical note, which is used directly in the "context" field.
212 |
213 | For the i2b2 heart disease risk dataset we have 4-5 longitudinal clinical notes per patient, named as follows: "note_id-01.txt", "note_id-02.txt", ..., "note_id-05.txt". Each of these files corresponds to notes from a particular day, and they are already in timeline order.
214 | We combine all these ".txt" files (in the given order), separated by "\n", and use the result in the "context" field. The note_id part of the file name is used in the "note_id" field. If you wish to break the context down into individual notes, you can refer to the "note_id" field and, in reverse, locate the note_id-01.txt, note_id-02.txt contents within the "context" field.
215 |
216 | ##### The QA pairs generated from the i2b2 smoking and i2b2 obesity challenges are different. How?
217 |
218 | For the QA pairs generated from these datasets we have neither an evidence line nor a specific entity to look for. Instead, the "text" field here is the class information provided in these two challenges, and the entire "context" field can be seen as evidence. Please refer to the corresponding challenges for more information about the classes.
219 |
220 | ##### The answer evidence is not a complete sentence. Why?
221 |
222 | The annotations used from the i2b2 datasets (except heart disease risk) have both token span and line number annotations. Clinical notes in these datasets are split at the newline character and assigned a line number. Our evidence line is simply the line in the clinical note corresponding to a particular i2b2 annotation's line number. Since the i2b2 heart disease risk annotations have only token spans without any line number annotations, we break the clinical notes at the newline character and take the line containing the token span as our evidence line (a small recovery sketch is given after the list below).
223 |
224 | - When clinical notes are split at the newline character, the start/end of the evidence line may not align with a complete sentence in the clinical note. To avoid this, we tried using a sentence splitter instead of the newline character to determine our evidence lines, but existing sentence splitters such as NLTK's do even worse at breaking clinical note sentences because of their noisy, ungrammatical structure.
225 | - Clinical notes are noisy, so some of the evidence lines may not have complete context or may not be grammatically correct.
226 |
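Given this convention, an evidence line can be recovered from a note with a sketch like the one below. It assumes the newline-split, line-numbered layout described above; note that some of the generated files store the "context" as a single string and others as a list of lines, and that i2b2 line numbers are 1-indexed, so verify both assumptions against your generated output.

```python
def evidence_line(context, evidence_start):
    """Recover an evidence line from a note's "context" field (sketch).

    context: the note text, either one string or a list of lines
    evidence_start: the "evidence_start" line number from data.json
    """
    lines = context.split("\n") if isinstance(context, str) else context
    return lines[evidence_start - 1]  # i2b2 line numbers are 1-indexed; verify
```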
227 | ##### i2b2 datasets directory structure
228 |
229 | The i2b2 challenge datasets used to generate the current emrQA version were downloaded in August 2017. Since the structure of the i2b2 datasets themselves could change, we thought it might be useful to document our i2b2 repository structure.
230 |
231 | The scripts in this repository parse the following i2b2 directory structure:
232 |
233 | ```
234 |
235 | ├── "i2b2" (download the datasets in a single folder)
236 |     ├── "smoking" (download 2006 smoking challenge datasets here)
237 |     │   │
238 |     │   ├── "smokers_surrogate_test_all_groundtruth_version2.xml"
239 |     │   └── "smokers_surrogate_train_all_version2.xml"
240 |     │
241 |     ├── "obesity" (download 2008 obesity challenge datasets here)
242 |     │   │
243 |     │   ├── "obesity_standoff_annotations_test.xml"
244 |     │   ├── "obesity_standoff_annotations_training.xml"
245 |     │   ├── "obesity_patient_records_test.xml"
246 |     │   └── "obesity_patient_records_training.xml"
247 |     │
248 |     ├── "medication" (download 2009 medication challenge datasets here)
249 |     │   │
250 |     │   ├── "train.test.released.8.17.09/" (folder containing all clinical notes)
251 |     │   ├── "annotations_ground_truth/converted.noduplicates.sorted/" (folder path with medication annotations)
252 |     │   └── "training.ground.truth/" (folder path with medication annotations)
253 |     │
254 |     ├── "relations" (download 2010 relations challenge datasets here)
255 |     │   │
256 |     │   ├── "concept_assertion_relation_training_data/partners/txt/" (folder path containing clinical notes)
257 |     │   ├── "concept_assertion_relation_training_data/beth/txt/" (folder path containing clinical notes)
258 |     │   ├── "test_data/txt/" (folder path containing clinical notes)
259 |     │   ├── "concept_assertion_relation_training_data/partners/rel/" (folder path with relation annotations)
260 |     │   ├── "concept_assertion_relation_training_data/beth/rel/" (folder path with relation annotations)
261 |     │   ├── "test_data/rel/" (folder path with relation annotations)
262 |     │   ├── "concept_assertion_relation_training_data/partners/ast/" (folder path with assertion annotations)
263 |     │   ├── "concept_assertion_relation_training_data/beth/ast/" (folder path with assertion annotations)
264 |     │   └── "test_data/ast/" (folder path with assertion annotations)
265 |     │
266 |     ├── "coreference" (download 2011 coreference challenge datasets here)
267 |     │   │
268 |     │   ├── "Beth_Train" (folder with the following subfolders: "chains", "concepts", "docs", "pairs")
269 |     │   ├── "Partners_Train" (folder with the following subfolders: "chains", "concepts", "docs", "pairs")
270 |     │   └── "i2b2_Test" (folder with "i2b2_Beth_Test" and "i2b2_Partners_Test" containing "chains" and "concepts" subfolders)
271 |     │
272 |     └── "heart-disease-risk" (download 2014 heart disease risk factors challenge datasets here)
273 |         │
274 |         └── "training-RiskFactors-Complete-Set1/" (folder path with files containing annotations and clinical notes)
275 |
276 |
277 | ```
278 |
279 |
280 |
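Before running `python main.py`, a quick sanity check along these lines can catch a misplaced download (folder names as in the tree above; adjust them if the paths set in `main.py` differ):

```python
import os

# Expected top-level challenge folders inside i2b2/ (see the tree above).
expected = ["smoking", "obesity", "medication", "relations",
            "coreference", "heart-disease-risk"]

missing = [name for name in expected if not os.path.isdir(os.path.join("i2b2", name))]
if missing:
    raise SystemExit("Missing i2b2 challenge folders: %s" % ", ".join(missing))
print("i2b2 directory layout looks complete.")
```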
281 | ## Dataset Bugs
282 |
283 | ##### I see a bug in the dataset. What should I do?
284 |
285 | For later versions of emrQA and recent updates, contact Preethi Raghavan (praghav@us.ibm.com).
286 |
287 | Please contact [Anusri Pampari][anusri-home] (\@stanford.edu) for any bugs. The more details you provide about the bug, the easier and quicker it will be to debug. You can help with the following information:
288 |
289 | ```
290 | i2b2 dataset name
291 | example note_id, and how many notes are affected by this bug if possible
292 | whether there is a trend in the type of questions (a particular question template) where this bug occurs
293 | an example instance of the bug in detail
294 | ```
295 |
296 | Opening a public issue might go against the i2b2 license agreement, so it is important that you mail me the bug. Thank you for understanding. I will try my best to reply at the earliest.
297 |
298 | ## Current works using emrQA
299 |
300 | For a full and updated list, please refer to the entire list published [here][citation-list].
301 |
302 | - [Neural Mask Generator: Learning to Generate Adaptive Word Maskings for Language Model Adaptation][NMG]
303 | - [Improved Pretraining for Domain-specific Contextual Embedding Models][pretraining]
304 | - [Calibrating Structured Output Predictors for Natural Language Processing][calibration]
305 | - [Annotating and Characterizing Clinical Sentences with Explicit Why-QA Cues][why-qa]
306 | - [Entity-Enriched Neural Models for Clinical Question Answering][entity]
307 | - [Evaluation of Dataset Selection for Pre-Training and Fine-Tuning Transformer Language Models for Clinical Question Answering][eval]
308 | - [CliniQG4QA: Generating Diverse Questions for Domain Adaptation of Clinical Question Answering][da]
309 | - [How You Ask Matters: The Effect of Paraphrastic Questions to BERT Performance on a Clinical SQuAD Dataset][para1]
310 | - [Advancing Seq2seq with Joint Paraphrase Learning][para2]
311 | - [Clinical Reading Comprehension: A Thorough Analysis of the emrQA Dataset][crc] - Though this work provides interesting analysis of some sub-parts of emrQA, we have concerns that a dataset bias in their analysis drives the stated conclusions. We think that emrQA readers should be aware of this bias, and we convey it through a letter posted [here][letter].
312 |
313 | [NMG]:https://arxiv.org/abs/2010.02705
314 | [para1]:https://www.aclweb.org/anthology/2020.clinicalnlp-1.13.pdf
315 | [para2]:https://www.aclweb.org/anthology/2020.clinicalnlp-1.30.pdf
316 | [da]: https://arxiv.org/pdf/2010.16021.pdf
317 | [eval]: https://www.aclweb.org/anthology/2020.lrec-1.679.pdf
318 | [entity]: https://arxiv.org/abs/2005.06587
319 | [why-qa]: https://www.aclweb.org/anthology/W19-1913.pdf
320 | [letter]: https://docs.google.com/document/d/1IeOqKPy3qzUEvpuSMy0Tvg7rjfYoAkn1ueplC5RXjpA/edit?usp=sharing
321 | [crc]: https://arxiv.org/abs/2005.00574
322 | [pretraining]: https://arxiv.org/pdf/2004.02288.pdf
323 | [calibration]: https://arxiv.org/pdf/2004.04361.pdf
324 | [citation-list]: https://scholar.google.com/scholar?cites=14819103415098730167&as_sdt=2005&sciodt=0,5&hl=en
325 | [i2b2-datasets]: https://www.i2b2.org/NLP/DataSets/
326 | [anusri-home]: https://www.linkedin.com/in/anusri-pampari-594bb5126/
327 | [drqa]: https://github.com/facebookresearch/DrQA
328 | [paper-link]: http://aclweb.org/anthology/D18-1258
329 |
330 |
331 |
332 | -------------------------------------------------------------------------------- /generation/i2b2_medications/medication-answers.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from os import listdir 4 | from os.path import isfile, join 5 | import json 6 | import random 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--i2b2_dir', default='', help='Directory containing i2b2 medications challenge files') 11 | parser.add_argument('--templates_dir', default='', help='Directory containing template files in the given format') 12 | parser.add_argument('--output_dir', default='', help='Directory to store the output') 13 | 14 | args = parser.parse_args() 15 | 16 | 17 | ###################################################### SET FILE PATHS ################################################################## 18 | 19 | ## i2b2 file 
paths ## 20 | 21 | DosageFilePath = [ os.path.join(args.i2b2_dir,"annotations_ground_truth/converted.noduplicates.sorted/"), os.path.join(args.i2b2_dir,"training.ground.truth/")] 22 | 23 | MedicationClinicalNotes = [os.path.join(args.i2b2_dir,"train.test.released.8.17.09/")] 24 | 25 | ## template file path ## 26 | 27 | template_file_path = args.templates_dir 28 | 29 | ## output file paths ## 30 | 31 | ql_output = os.path.join(args.output_dir,"medication-ql.csv") 32 | medications_qa_output_json = os.path.join(args.output_dir,"medication-qa.json") 33 | 34 | 35 | ######################################################## CODE ######################################################################### 36 | 37 | class GenerateQA(): 38 | 39 | DosageFilePath = DosageFilePath 40 | MedicationClinicalNotes = MedicationClinicalNotes 41 | 42 | def __init__(self): 43 | 44 | self.ReadMedicationData() 45 | self.ReadTemplates() 46 | 47 | ######################### Read i2b2 file functions ################################### 48 | 49 | def ReadMedicationData(self): 50 | 51 | ## based on format of the i2b2 files. please refer to the i2b2 medications challenge documentation for details ### 52 | 53 | abbs = {"m": "medication", "do": "dosage", "mo": "mode", "f": "frequency", "du": "duration", "r": "problem", 54 | "e": "event", "t": "temporal", "c": "certainty", "ln": "list"} 55 | exception = ["list", "event", "temporal", 56 | "certainty"] ## very few annotations are tagged with these, hence we willl ignore them. 57 | 58 | self.MedicationData = [] 59 | ClinicalNotes = {} 60 | 61 | ## read the clinical notes ## 62 | for paths in self.MedicationClinicalNotes: 63 | files = [f for f in listdir(paths) if isfile(join(paths, f))] 64 | for file in files: 65 | remote_file = open(paths + file) 66 | ClinicalNotes[file.strip()] = remote_file.readlines() 67 | 68 | ## read the annotations per clinical note (parse the files) ## 69 | 70 | annotations_span = [] 71 | for paths in self.DosageFilePath: 72 | files = [f for f in listdir(paths) if isfile(join(paths, f))] 73 | for file in files: 74 | remote_file = open(paths + file) 75 | 76 | note_id = file.split(".")[0] 77 | note_id = note_id.split("_")[0] 78 | # print(file) 79 | dictionary = {note_id: []} 80 | PatientNote = ClinicalNotes[note_id] ## access the corresponding clinical note. 
81 | flag = 0 82 | for line in remote_file: 83 | med_list = {} 84 | line = line.replace("|||", "||") 85 | words = line.split("||") 86 | 87 | for word in words: 88 | term = word.split("=") 89 | try: 90 | type = abbs[term[0].strip()] ## check if all of them lie within the given annotation list 91 | except: 92 | print(paths + file) 93 | flag = 1 94 | break 95 | 96 | full_annotation = "=".join(term[1:]) 97 | index = [pos for pos, char in enumerate(full_annotation) if char == "\""] 98 | pos1 = int(index[0]) 99 | pos2 = int(index[-1]) 100 | 101 | annotation = full_annotation[pos1 + 1:pos2] 102 | indxs = full_annotation[pos2 + 1:].split(",") 103 | 104 | line_in_note = "" 105 | start_line = None 106 | if annotation == "nm" or type in exception: 107 | med_list[type] = [annotation, line_in_note, start_line] 108 | continue 109 | 110 | # print(word,annotation,indxs) 111 | # print(indxs) 112 | for indx in indxs: 113 | indx = indx.strip() 114 | out = indx.split(" ") 115 | 116 | start_line = out[0].split(":")[0] 117 | start_token = out[0].split(":")[1] 118 | end_line = out[1].split(":")[0] 119 | end_token = out[1].split(":")[1] 120 | 121 | line_in_note += "".join(PatientNote[int(start_line) - 1:int(end_line)]) 122 | 123 | # if int(end_line) > int(start_line): 124 | # print(type) 125 | # print(line) 126 | # print(end_line,start_line) 127 | 128 | ## some end line number are greater than start line numbers. annotation line_in_note can span upto 3 lines 129 | ## annotation can be discontinous set of tokens 130 | 131 | med_list[type] = [annotation, line_in_note, start_line, start_token] 132 | 133 | # if start_line != end_line: 134 | # print(int(end_line)-int(start_line)) 135 | # print(line_in_note) 136 | 137 | dictionary[note_id].append(med_list) 138 | 139 | remote_file.close() 140 | 141 | if flag == 0: 142 | if (dictionary, PatientNote) not in self.MedicationData: 143 | self.MedicationData.append((dictionary, PatientNote)) 144 | 145 | 146 | # print(annotations_span) 147 | 148 | ######################## Main program functions ########################################## 149 | 150 | def ReadTemplates(self): 151 | 152 | self.medications_out = {"paragraphs": [], "title": "medication"} 153 | self.logical_out = [] 154 | 155 | ########################################## Set File Paths ############################################## 156 | 157 | 158 | ### File to write Question-Logical Forms ## 159 | 160 | ofile = open(ql_output, "w") 161 | self.filewriter_forlform = csv.writer(ofile, delimiter="\t") 162 | self.filewriter_forlform.writerow(["Question", "Logical Form"]) 163 | 164 | ### File to read templates ### 165 | 166 | file = open(template_file_path) 167 | filereader = list(csv.reader(file)) 168 | 169 | ## read only templates relevant to medications challenge ## 170 | 171 | med_lines = [] 172 | for line in filereader[1:]: 173 | if line[0] != "medication" and line[0] != "medications": 174 | continue 175 | med_lines.append(line) 176 | 177 | ########################################## Main Function Call ############################################## 178 | 179 | for (dictionary,PatientNote) in self.MedicationData: 180 | for note_id in dictionary: 181 | out_patient = {"note_id": note_id, "context": PatientNote, "qas": []} 182 | 183 | med_list = dictionary[note_id] ## extract all the annotations given per note ## 184 | 185 | ## create one to many mappings, to use them for QA. 
Coreference not resolved ## 186 | 187 | self.MakeMedicationRelationMappings(med_list) 188 | 189 | flag = 0 190 | self.unique_questions = [] 191 | question_id = 0 192 | for line in med_lines: 193 | ## do +1 for the new format ## 194 | question = line[2].strip() 195 | logical_form = line[3].strip() 196 | answertype = line[4].split(",") 197 | answertype = [type.strip() for type in answertype] 198 | 199 | 200 | #question = question.replace("|problem| or |problem|","|problem|") 201 | question = question.replace("|medication| or |medication|", "|medication|") 202 | question = question.replace("|problem| or |problem|", "|problem|") 203 | question = question.replace("|test| or |test|", "|test|") 204 | question = question.replace("|test| |test| |test|", "|test|") 205 | question = question.replace("\t", "") 206 | logical_form = logical_form.replace("\t", "") 207 | 208 | if question.strip() == "": 209 | continue 210 | 211 | answer_out = self.MakeMedicationQLA(question,logical_form,answertype,med_list,flag,note_id,PatientNote,question_id) 212 | 213 | if len(answer_out) != 0: 214 | #for answer in answer_out: 215 | #print(answer["id"]) 216 | out_patient["qas"].extend(answer_out) 217 | self.medications_out["paragraphs"].append(out_patient) 218 | 219 | ################################################################# Dump JSON ########################################### 220 | 221 | json_out = medications_qa_output_json 222 | with open(json_out, 'w') as outfile: 223 | json.dump(self.medications_out, outfile, ensure_ascii=False) ## storage format same as SQUAD 224 | 225 | #json_out = medications_ql_output_json 226 | #with open(json_out, 'w') as outfile: 227 | # json.dump(self.logical_out, outfile, ensure_ascii=False) ## storage format, question logical_form question_id logicalfrom_id source 228 | 229 | def MakeMedicationQLA(self, question_list, logical_form_template, answertype, med_list, flag, note_id, PatientNote, question_id): 230 | 231 | answer_out = [] 232 | 233 | ## save a copy of the orginals ## 234 | intial_question_list = question_list.split("##") 235 | intial_template = logical_form_template 236 | orginal_logical_form_template = logical_form_template.strip() 237 | 238 | ## check for errors in templates and gather all the placeholders in the templates (placeholders stored in rwords) ## 239 | ## semantic types of placeholders ## 240 | 241 | dup_rwords_list = self.CheckForErrors(intial_question_list, orginal_logical_form_template) 242 | if dup_rwords_list == None: 243 | return answer_out 244 | 245 | for med_annotations in med_list: ## Medlist is a list of dictionaries (each dict is a medication and its attributes) 246 | 247 | flag = 0 248 | logical_form_template = orginal_logical_form_template 249 | if len(dup_rwords_list) != 1: ## sanity check 250 | print("Check Question_Logical Form Mapping") 251 | print(dup_rwords_list, intial_question_list) 252 | print(logical_form_template) 253 | return answer_out 254 | else: 255 | dup_rwords = dup_rwords_list[0] 256 | 257 | rwords = list(dup_rwords) 258 | line_num = [] 259 | line_token = [] 260 | question_line = [] 261 | quest_list_nar = [] 262 | 263 | answer = [] 264 | 265 | ### checking if placeholder values to be used in question is "nm" (not mentioned), if yes set flag to 1 ## 266 | 267 | if rwords != ["time"]: 268 | for idx in range(len(rwords)): 269 | if rwords[idx] == "treatment": 270 | rwords[idx] = "medication" 271 | 272 | if med_annotations[rwords[idx]][0] == "nm": 273 | flag = 1 274 | break 275 | else: 276 | 
line_num.append(int(med_annotations[rwords[idx]][2])) 277 | line_token.append(int(med_annotations[rwords[idx]][3])) 278 | question_line.append(med_annotations[rwords[idx]][1]) 279 | rwords[idx] = med_annotations[rwords[idx]][0] 280 | quest_list_nar.append(med_annotations["list"][0]) 281 | 282 | ## Generate question, logical form and answer only if flag is 0 ## 283 | 284 | if flag == 0: 285 | [paraphrase_questions, tuple_orginal, logical_form] = self.MakeMedicationQL(rwords, 286 | intial_question_list, 287 | logical_form_template, 288 | dup_rwords) 289 | [answer, answer_line, result_num, result_token, list_nar] = self.MakeAnswer(quest_list_nar, answertype, 290 | med_annotations, 291 | question_line, line_num, 292 | line_token) 293 | else: 294 | continue 295 | # return answer_out #### bug fixed ## 296 | 297 | if len(answer) != 0: 298 | 299 | if answertype == ["medication", 'dosage']: 300 | entity_type = "complex" 301 | elif answertype == ["yes"]: 302 | entity_type = "empty" 303 | else: 304 | entity_type = "single" 305 | 306 | unique_paras = set(paraphrase_questions) 307 | if unique_paras not in self.unique_questions: ## redundancy check: checking if these set of questions are unique for every clinical note ## 308 | 309 | self.unique_questions.append(unique_paras) 310 | question_id += 1 311 | ans_list = [] 312 | for idx in range(len(answer)): 313 | 314 | start_line = result_num[idx] 315 | start_token = result_token[idx] 316 | 317 | val = {"answer_start": [start_line, start_token], "text": answer[idx], 318 | "evidence": answer_line[idx], "evidence_start": result_num[idx], "answer_entity_type": entity_type} 319 | 320 | if val not in ans_list: 321 | ans_list.append(val) 322 | 323 | ## ""evidence"" in the dictionary above is currently just the answer line in the note. 
You can also consider question line and answer line from note as evidence in that uncomment below code and use it accordingly # 324 | 325 | ''' 326 | 327 | ## maximum distance between the question line and answer line ## 328 | perms = list(itertools.product(result_num+line_num, result_num+line_num)) 329 | diffs = [abs(val1 - val2) for (val1, val2) in perms] 330 | difference = max(diffs) 331 | 332 | Note_val = "#".join(answer_line) 333 | list_nar = ",".join(list_nar) 334 | 335 | ## evidence per answer ## 336 | evidence_answer = [] 337 | evidence_start = [] 338 | evidence_temp_line = answer_line 339 | evidence_temp_start = result_num 340 | for pdx in range(len(evidence_temp_line)): 341 | if evidence_temp_line[pdx] not in evidence_answer: 342 | evidence_answer.append(evidence_temp_line[pdx]) 343 | evidence_start.append(evidence_temp_start[pdx]) 344 | 345 | val = {"answer_start": [start_line, start_token], "text": answer[idx], 346 | "evidence": evidence_answer, 347 | "evidence_start": evidence_start} 348 | 349 | if qa_csv_write: 350 | self.filewriter.writerow( 351 | ["##".join(list(unique_paras))] + [logical_form] + [",".join(set(answer))] + [Note_val] + [note_id + "_MedicationsChallenge"] + [difference] + [list_nar]) 352 | 353 | 354 | ''' 355 | 356 | answer_temp = {"answers": ans_list, "id": [tuple_orginal, intial_template], 357 | "question": list(unique_paras)} 358 | answer_out.append(answer_temp) 359 | 360 | return answer_out 361 | 362 | ######################## Main Utility Functions ###################################### 363 | 364 | def MakeMedicationRelationMappings(self,med_list): 365 | 366 | self.map_meds_to_reasons = {} 367 | self.map_meds_to_dosages = {} 368 | self.map_meds_to_frequency = {} 369 | self.map_reasons_to_meds = {} 370 | self.map_meds_to_durations = {} 371 | self.medications_all = {} 372 | 373 | 374 | for med_annotations in med_list: 375 | 376 | if med_annotations["medication"][0] not in self.medications_all: 377 | self.medications_all[med_annotations["medication"][0]] = [med_annotations["medication"]] 378 | #print(med_annotations["medication"]) 379 | 380 | if med_annotations["medication"][0] not in self.map_meds_to_dosages: 381 | self.map_meds_to_dosages[med_annotations["medication"][0]] = [] 382 | 383 | if med_annotations["medication"][0] not in self.map_meds_to_frequency: 384 | self.map_meds_to_frequency[med_annotations["medication"][0]] = [] 385 | 386 | if med_annotations["medication"][0] not in self.map_meds_to_reasons: 387 | self.map_meds_to_reasons[med_annotations["medication"][0]] = [] 388 | 389 | if med_annotations["problem"][0] != "nm": 390 | if med_annotations["problem"][0] not in self.map_reasons_to_meds: 391 | self.map_reasons_to_meds[med_annotations["problem"][0]] = [] 392 | 393 | if med_annotations["medication"][0] not in self.map_meds_to_durations: 394 | self.map_meds_to_durations[med_annotations["medication"][0]] = [] 395 | 396 | if med_annotations["dosage"][0] != "nm": 397 | #if med_annotations["event"] == "" 398 | if med_annotations["dosage"]+med_annotations["list"] not in self.map_meds_to_dosages[med_annotations["medication"][0]]: 399 | self.map_meds_to_dosages[med_annotations["medication"][0]].append(med_annotations["dosage"]+med_annotations["list"]) 400 | if med_annotations["problem"][0] != "nm": 401 | self.map_meds_to_reasons[med_annotations["medication"][0]].append(med_annotations["problem"]+med_annotations["list"]) 402 | if med_annotations["problem"][0] != "nm": 403 | 
self.map_reasons_to_meds[med_annotations["problem"][0]].append(med_annotations["medication"]+med_annotations["list"]) 404 | if med_annotations["frequency"][0] != "nm": 405 | self.map_meds_to_frequency[med_annotations["medication"][0]].append(med_annotations["frequency"]+med_annotations["list"]) 406 | if med_annotations["duration"][0] != "nm": 407 | self.map_meds_to_durations[med_annotations["medication"][0]].append(med_annotations["duration"]+med_annotations["list"]) 408 | 409 | def MakeMedicationQL(self, rwords, question_list, logical_form_template, dup_rwords): 410 | 411 | intial_template = logical_form_template 412 | paraphrase_questions = [] 413 | tuple_orginal = [] 414 | 415 | if rwords == ["time"]: 416 | time = str(random.randint(2, 5)) + random.choice([" years", " weeks"]) 417 | for question in question_list: 418 | original = question 419 | question = question.replace("|time|", time) 420 | logical_form_template = logical_form_template.replace("|time|", time) 421 | rwords = [] 422 | dup_rwords = [] 423 | paraphrase_questions.append(question) 424 | tuple_orginal.append((question, original)) 425 | else: 426 | 427 | ############################ make questions ############################################ 428 | 429 | for question in question_list: 430 | orginal = question 431 | idx = 0 432 | done = [] 433 | for types in list(dup_rwords): 434 | # temp = qwords 435 | index = question.find("|" + types + "|") 436 | if index == -1 and types not in done: 437 | print(question, "|" + types + "|", done) 438 | question = question.replace("|" + types + "|", rwords[idx]) 439 | done.append(types) 440 | idx += 1 441 | tuple_orginal.append((question, orginal)) 442 | paraphrase_questions.append(question) 443 | 444 | ###################################### Make Logical Form ################################# 445 | 446 | ## tab ## 447 | idx = 0 448 | done = [] 449 | for types in list(dup_rwords): 450 | logical_form_template.replace("|treatment|", "|medication") 451 | index = logical_form_template.find("|" + types + "|") 452 | if index == -1 and types not in done: 453 | print(logical_form_template, "|" + types + "|", done, types) 454 | done.append(types) 455 | 456 | logical_form_template = logical_form_template.replace("|" + types + "|", rwords[idx]) 457 | idx += 1 458 | 459 | logical_form = logical_form_template 460 | 461 | ### Writing question-logical form ## 462 | 463 | for (question, orginal) in tuple_orginal: 464 | self.filewriter_forlform.writerow([question] + [logical_form.strip()] + [orginal.strip()] + [intial_template]) 465 | 466 | return [paraphrase_questions, tuple_orginal, logical_form] 467 | 468 | def MakeAnswer(self, quest_list_nar, answertype, med_annotations, question_list,line_num,line_token): 469 | 470 | result_num = [] 471 | result_token = [] 472 | answer_line = [] 473 | list_nar = quest_list_nar 474 | answer = [] 475 | 476 | idx = 0 477 | if answertype[idx] == "yes": 478 | 479 | ### the question line is evidence for yes or no questions ## 480 | #answer = ["yes"]*len(question_list) 481 | answer = [""] * len(question_list) 482 | answer_line.extend(question_list) 483 | result_num.extend(line_num) 484 | #result_token.extend(line_token) 485 | result_token = [""] * len(question_list) 486 | list_nar.extend(quest_list_nar) 487 | elif answertype == ["problem"]: 488 | for listr in self.map_meds_to_reasons[med_annotations["medication"][0]]: 489 | answer += [listr[0]] 490 | answer_line.append(listr[1]) 491 | result_num.append(int(listr[2])) 492 | result_token.append(int(listr[3])) 493 | 
list_nar.append(listr[3]) 494 | elif answertype == ["frequency"]: 495 | # print("frequency") 496 | for listr in self.map_meds_to_frequency[med_annotations["medication"][0]]: 497 | answer += [listr[0]] 498 | answer_line.append(listr[1]) 499 | result_num.append(int(listr[2])) 500 | result_token.append(int(listr[3])) 501 | list_nar.append(listr[3]) 502 | elif answertype == ["dosage"]: 503 | for med in [med_annotations["medication"][0]]: 504 | for listr in self.map_meds_to_dosages[med]: 505 | answer += [listr[0]] 506 | answer_line.append(listr[1]) 507 | result_num.append(int(listr[2])) 508 | result_token.append(int(listr[3])) 509 | list_nar.append(listr[3]) 510 | elif answertype == ["medication"]: 511 | for listr in self.map_reasons_to_meds[med_annotations["problem"][0]]: 512 | answer += [listr[0]] 513 | answer_line.append(listr[1]) 514 | result_num.append(int(listr[2])) 515 | result_token.append(int(listr[3])) 516 | list_nar.append(listr[3]) 517 | elif answertype == ["medication", 'dosage']: 518 | meds = self.map_reasons_to_meds[med_annotations["problem"][0]] 519 | for med in meds: 520 | #dos = ",".join([x[0] for x in self.map_meds_to_dosages[med[0]]]) 521 | #answer += ["( " + med[0] + ", " + dos + ")"] 522 | 523 | answer.append([med[0]]) 524 | answer_line.append([med[1]]) 525 | result_num.append([int(med[2])]) 526 | result_token.append([int(med[3])]) 527 | list_nar.append([med[3]]) 528 | 529 | 530 | for x in self.map_meds_to_dosages[med[0]]: 531 | #if x[1] not in answer_line[-1]: 532 | answer[-1].extend([x[0]]) 533 | answer_line[-1].extend([x[1]]) 534 | result_num[-1].extend([int(x[2])]) 535 | result_token[-1].extend([int(x[3])]) 536 | list_nar[-1].extend([x[4]]) 537 | 538 | #print("new medicine") 539 | #print(answer[-1]) 540 | #print(result_num[-1]) 541 | #print(result_token[-1]) 542 | #print(answer_line[-1]) 543 | #result_num[-1].extend([int(x[2]) for x in self.map_meds_to_dosages[med[0]] if int(x[2]) not in result_num[-1]]) 544 | #result_token[-1].extend([int(x[3]) for x in self.map_meds_to_dosages[med[0]]]) 545 | #list_nar.extend([x[3] for x in self.map_meds_to_dosages[med[0]]]) 546 | 547 | elif answertype == ["duration"]: 548 | for listr in self.map_meds_to_durations[med_annotations["medication"][0]]: 549 | answer += [listr[0]] 550 | answer_line.append(listr[1]) 551 | result_num.append(int(listr[2])) 552 | result_token.append(int(listr[3])) 553 | list_nar.append(listr[3]) 554 | elif answertype == ["medications_all"]: 555 | for medication_name in self.medications_all: 556 | listr = self.medications_all[medication_name][0] 557 | answer += [listr[0]] 558 | answer_line.append(listr[1]) 559 | result_num.append(int(listr[2])) 560 | result_token.append(int(listr[3])) 561 | list_nar.append(listr[3]) 562 | elif answertype == ["none"]: 563 | pass 564 | else: 565 | print(answertype) 566 | answer = [] 567 | 568 | return [answer,answer_line, result_num, result_token, list_nar] 569 | 570 | ######################## Supporting Utility Functions ###################################### 571 | 572 | def CheckForErrors(self, question_list, logical_form_template): 573 | 574 | ## gather all the placeholders in the templates ## 575 | 576 | dup_rwords_list = [] 577 | unique_templates = [] 578 | qwords_list = [] 579 | 580 | ## check if all the questions paraphrases have the same placeholders ## 581 | 582 | for question in question_list: 583 | if question.strip() == "": 584 | continue 585 | question = question.replace("|medication| or |medication|", "|medication|") 586 | question = question.replace("|problem| or 
|problem|", "|problem|") 587 | question = question.replace("|test| or |test|", "|test|") 588 | question = question.replace("|test| |test| |test|", "|test|") 589 | question = question.strip() 590 | 591 | if question not in unique_templates: 592 | unique_templates.append(question) 593 | else: 594 | continue 595 | 596 | qwords = question.split("|") 597 | dup_rwords = qwords[1:len(qwords):2] 598 | qwords_list.append(qwords) 599 | 600 | if len(dup_rwords_list) == 0: 601 | dup_rwords_list = [set(dup_rwords)] 602 | else: 603 | if set(dup_rwords) not in dup_rwords_list: 604 | print("Error Out Of Context Question:") 605 | print(question, logical_form_template, question_list) 606 | return None 607 | 608 | ## Check if the placeholders in logical forms are same as the placeholders in question ## 609 | 610 | lwords = logical_form_template.split("|") 611 | dup_lrwords = lwords[1:len(lwords):2] 612 | if set(dup_lrwords) not in dup_rwords_list: 613 | print("Error Out Of Context Question-Logical Form Pairs:") 614 | print(question_list, logical_form_template) 615 | return None 616 | 617 | return dup_rwords_list 618 | 619 | if __name__=="__main__": 620 | GenerateQA() -------------------------------------------------------------------------------- /generation/i2b2_relations/matching_notes.csv: -------------------------------------------------------------------------------- 1 | Relations Coreference 2 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/974381789.txt i2b2/coreference/Partners_Train/docs/clinical-473.txt 3 | i2b2/relations/test_data/0373.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-592.txt 4 | i2b2/relations/test_data/0285.txt i2b2/coreference/Beth_Train/docs/clinical-482.txt 5 | i2b2/relations/test_data/0014.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-283.txt 6 | i2b2/relations/test_data/0310.txt i2b2/coreference/Partners_Train/docs/clinical-321.txt 7 | i2b2/relations/test_data/0005.txt i2b2/coreference/Beth_Train/docs/clinical-132.txt 8 | i2b2/relations/test_data/0174.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-166.txt 9 | i2b2/relations/test_data/0474.txt i2b2/coreference/Partners_Train/docs/clinical-516.txt 10 | i2b2/relations/test_data/0178.txt i2b2/coreference/Partners_Train/docs/clinical-171.txt 11 | i2b2/relations/test_data/0385.txt i2b2/coreference/Beth_Train/docs/clinical-607.txt 12 | i2b2/relations/test_data/0461.txt i2b2/coreference/Partners_Train/docs/clinical-491.txt 13 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/614746156.txt i2b2/coreference/Partners_Train/docs/clinical-781.txt 14 | i2b2/relations/test_data/0097.txt i2b2/coreference/Beth_Train/docs/clinical-247.txt 15 | i2b2/relations/test_data/0357.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-572.txt 16 | i2b2/relations/test_data/0464.txt i2b2/coreference/Beth_Train/docs/clinical-732.txt 17 | i2b2/relations/test_data/0455.txt i2b2/coreference/Partners_Train/docs/clinical-481.txt 18 | i2b2/relations/test_data/0025.txt i2b2/coreference/Beth_Train/docs/clinical-157.txt 19 | i2b2/relations/test_data/0472.txt i2b2/coreference/Partners_Train/docs/clinical-408.txt 20 | i2b2/relations/test_data/0069.txt i2b2/coreference/Beth_Train/docs/clinical-212.txt 21 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-74.txt i2b2/coreference/Beth_Train/docs/clinical-248.txt 22 | i2b2/relations/test_data/0277.txt i2b2/coreference/Beth_Train/docs/clinical-472.txt 23 | i2b2/relations/test_data/0093.txt 
i2b2/coreference/Beth_Train/docs/clinical-242.txt 24 | i2b2/relations/test_data/0010.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-6.txt 25 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-24.txt i2b2/coreference/Beth_Train/docs/clinical-68.txt 26 | i2b2/relations/test_data/0074.txt i2b2/coreference/Partners_Train/docs/clinical-66.txt 27 | i2b2/relations/test_data/0058.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-293.txt 28 | i2b2/relations/test_data/0457.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-717.txt 29 | i2b2/relations/test_data/0428.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-436.txt 30 | i2b2/relations/test_data/0329.txt i2b2/coreference/Beth_Train/docs/clinical-537.txt 31 | i2b2/relations/test_data/0418.txt i2b2/coreference/Beth_Train/docs/clinical-652.txt 32 | i2b2/relations/test_data/0460.txt i2b2/coreference/Beth_Train/docs/clinical-722.txt 33 | i2b2/relations/test_data/0401.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-627.txt 34 | i2b2/relations/test_data/0381.txt i2b2/coreference/Beth_Train/docs/clinical-602.txt 35 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/622086964.txt i2b2/coreference/Partners_Train/docs/clinical-786.txt 36 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/412141256.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-711.txt 37 | i2b2/relations/test_data/0442.txt i2b2/coreference/Beth_Train/docs/clinical-692.txt 38 | i2b2/relations/test_data/0266.txt i2b2/coreference/Partners_Train/docs/clinical-271.txt 39 | i2b2/relations/test_data/0358.txt i2b2/coreference/Partners_Train/docs/clinical-371.txt 40 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-121.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-767.txt 41 | i2b2/relations/test_data/0365.txt i2b2/coreference/Beth_Train/docs/clinical-582.txt 42 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-177.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-23.txt 43 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/627258104.txt i2b2/coreference/Partners_Train/docs/clinical-791.txt 44 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/959086752.txt i2b2/coreference/Partners_Train/docs/clinical-107.txt 45 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/655358166_WGH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-806.txt 46 | i2b2/relations/test_data/0294.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-301.txt 47 | i2b2/relations/test_data/0346.txt i2b2/coreference/Partners_Train/docs/clinical-361.txt 48 | i2b2/relations/test_data/0253.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-442.txt 49 | i2b2/relations/test_data/0436.txt i2b2/coreference/Beth_Train/docs/clinical-682.txt 50 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/188543380.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-591.txt 51 | i2b2/relations/test_data/0057.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-197.txt 52 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-66.txt i2b2/coreference/Beth_Train/docs/clinical-213.txt 53 | i2b2/relations/test_data/0342.txt i2b2/coreference/Partners_Train/docs/clinical-348.txt 54 | i2b2/relations/test_data/0397.txt i2b2/coreference/Beth_Train/docs/clinical-622.txt 55 | 
i2b2/relations/test_data/0145.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-307.txt 56 | i2b2/relations/test_data/0081.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-227.txt 57 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-32.txt i2b2/coreference/Beth_Train/docs/clinical-108.txt 58 | i2b2/relations/test_data/0210.txt i2b2/coreference/Partners_Train/docs/clinical-206.txt 59 | i2b2/relations/test_data/0289.txt i2b2/coreference/Beth_Train/docs/clinical-487.txt 60 | i2b2/relations/test_data/0026.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-16.txt 61 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/156406283.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-576.txt 62 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-106.txt i2b2/coreference/Beth_Train/docs/clinical-747.txt 63 | i2b2/relations/test_data/0345.txt i2b2/coreference/Beth_Train/docs/clinical-557.txt 64 | i2b2/relations/test_data/0125.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-282.txt 65 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-176.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-18.txt 66 | i2b2/relations/test_data/0050.txt i2b2/coreference/Partners_Train/docs/clinical-46.txt 67 | i2b2/relations/test_data/0218.txt i2b2/coreference/Partners_Train/docs/clinical-216.txt 68 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/723989226.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-2.txt 69 | i2b2/relations/test_data/0314.txt i2b2/coreference/Partners_Train/docs/clinical-326.txt 70 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/814743340_RWH.txt i2b2/coreference/Partners_Train/docs/clinical-32.txt 71 | i2b2/relations/test_data/0098.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-96.txt 72 | i2b2/relations/test_data/0354.txt i2b2/coreference/Partners_Train/docs/clinical-366.txt 73 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/289811204.txt i2b2/coreference/Partners_Train/docs/clinical-641.txt 74 | i2b2/relations/test_data/0333.txt i2b2/coreference/Beth_Train/docs/clinical-542.txt 75 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-80.txt i2b2/coreference/Beth_Train/docs/clinical-253.txt 76 | i2b2/relations/test_data/0406.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-416.txt 77 | i2b2/relations/test_data/0134.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-318.txt 78 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-29.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-93.txt 79 | i2b2/relations/test_data/0325.txt i2b2/coreference/Beth_Train/docs/clinical-532.txt 80 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-21.txt i2b2/coreference/Beth_Train/docs/clinical-53.txt 81 | i2b2/relations/test_data/0070.txt i2b2/coreference/Partners_Train/docs/clinical-61.txt 82 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-109.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-762.txt 83 | i2b2/relations/test_data/0415.txt i2b2/coreference/Beth_Train/docs/clinical-647.txt 84 | i2b2/relations/test_data/0445.txt i2b2/coreference/Beth_Train/docs/clinical-697.txt 85 | i2b2/relations/test_data/0078.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-71.txt 86 | i2b2/relations/test_data/0065.txt 
i2b2/coreference/Beth_Train/docs/clinical-207.txt 87 | i2b2/relations/test_data/0390.txt i2b2/coreference/Partners_Train/docs/clinical-358.txt 88 | i2b2/relations/test_data/0309.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-512.txt 89 | i2b2/relations/test_data/0338.txt i2b2/coreference/Partners_Train/docs/clinical-356.txt 90 | i2b2/relations/test_data/0198.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-191.txt 91 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-16.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-3.txt 92 | i2b2/relations/test_data/0101.txt i2b2/coreference/Beth_Train/docs/clinical-252.txt 93 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-52.txt i2b2/coreference/Beth_Train/docs/clinical-173.txt 94 | i2b2/relations/test_data/0374.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-391.txt 95 | i2b2/relations/test_data/0162.txt i2b2/coreference/Partners_Train/docs/clinical-156.txt 96 | i2b2/relations/test_data/0113.txt i2b2/coreference/Beth_Train/docs/clinical-267.txt 97 | i2b2/relations/test_data/0122.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-308.txt 98 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/915093496_RWH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-92.txt 99 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-28.txt i2b2/coreference/Beth_Train/docs/clinical-88.txt 100 | i2b2/relations/test_data/0029.txt i2b2/coreference/Beth_Train/docs/clinical-162.txt 101 | i2b2/relations/test_data/0467.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-393.txt 102 | i2b2/relations/test_data/0038.txt i2b2/coreference/Partners_Train/docs/clinical-31.txt 103 | i2b2/relations/test_data/0061.txt i2b2/coreference/Beth_Train/docs/clinical-202.txt 104 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-37.txt i2b2/coreference/Beth_Train/docs/clinical-128.txt 105 | i2b2/relations/test_data/0062.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-298.txt 106 | i2b2/relations/test_data/0213.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-392.txt 107 | i2b2/relations/test_data/0269.txt i2b2/coreference/Beth_Train/docs/clinical-462.txt 108 | i2b2/relations/test_data/0476.txt i2b2/coreference/Partners_Train/docs/clinical-526.txt 109 | i2b2/relations/test_data/0254.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-333.txt 110 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-178.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-28.txt 111 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/320422564.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-656.txt 112 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-18.txt i2b2/coreference/Beth_Train/docs/clinical-38.txt 113 | i2b2/relations/test_data/0173.txt i2b2/coreference/Beth_Train/docs/clinical-342.txt 114 | i2b2/relations/test_data/0106.txt i2b2/coreference/Partners_Train/docs/clinical-101.txt 115 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/332803550.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-438.txt 116 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-25.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-73.txt 117 | i2b2/relations/test_data/0177.txt i2b2/coreference/Beth_Train/docs/clinical-347.txt 118 | 
i2b2/relations/test_data/0412.txt i2b2/coreference/Beth_Train/docs/clinical-642.txt 119 | i2b2/relations/test_data/0077.txt i2b2/coreference/Beth_Train/docs/clinical-222.txt 120 | i2b2/relations/test_data/0030.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-21.txt 121 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-53.txt i2b2/coreference/Beth_Train/docs/clinical-178.txt 122 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/337702516_WGH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-666.txt 123 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-51.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-168.txt 124 | i2b2/relations/test_data/0370.txt i2b2/coreference/Partners_Train/docs/clinical-386.txt 125 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/950452368.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-468.txt 126 | i2b2/relations/test_data/0282.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-291.txt 127 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/176318078_a.txt i2b2/coreference/Partners_Train/docs/clinical-581.txt 128 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-46.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-143.txt 129 | i2b2/relations/test_data/0138.txt i2b2/coreference/Partners_Train/docs/clinical-126.txt 130 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-19.txt i2b2/coreference/Beth_Train/docs/clinical-43.txt 131 | i2b2/relations/test_data/0434.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-441.txt 132 | i2b2/relations/test_data/0185.txt i2b2/coreference/Beth_Train/docs/clinical-357.txt 133 | i2b2/relations/test_data/0462.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-727.txt 134 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-81.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-258.txt 135 | i2b2/relations/test_data/0245.txt i2b2/coreference/Beth_Train/docs/clinical-432.txt 136 | i2b2/relations/test_data/0322.txt i2b2/coreference/Partners_Train/docs/clinical-336.txt 137 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-48.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-153.txt 138 | i2b2/relations/test_data/0169.txt i2b2/coreference/Beth_Train/docs/clinical-337.txt 139 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-68.txt i2b2/coreference/Beth_Train/docs/clinical-223.txt 140 | i2b2/relations/test_data/0117.txt i2b2/coreference/Beth_Train/docs/clinical-272.txt 141 | i2b2/relations/test_data/0158.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-151.txt 142 | i2b2/relations/test_data/0393.txt i2b2/coreference/Beth_Train/docs/clinical-617.txt 143 | i2b2/relations/test_data/0410.txt i2b2/coreference/Partners_Train/docs/clinical-373.txt 144 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/348301810.txt i2b2/coreference/Partners_Train/docs/clinical-676.txt 145 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/101407944_PUMC.txt i2b2/coreference/Partners_Train/docs/clinical-541.txt 146 | i2b2/relations/test_data/0229.txt i2b2/coreference/Beth_Train/docs/clinical-412.txt 147 | i2b2/relations/test_data/0261.txt i2b2/coreference/Beth_Train/docs/clinical-452.txt 148 | i2b2/relations/test_data/0301.txt 
i2b2/coreference/Beth_Train/docs/clinical-502.txt 149 | i2b2/relations/test_data/0427.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-667.txt 150 | i2b2/relations/test_data/0241.txt i2b2/coreference/Beth_Train/docs/clinical-427.txt 151 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-179.txt i2b2/coreference/Beth_Train/docs/clinical-33.txt 152 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/245096078.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-611.txt 153 | i2b2/relations/test_data/0473.txt i2b2/coreference/Partners_Train/docs/clinical-413.txt 154 | i2b2/relations/test_data/0205.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-382.txt 155 | i2b2/relations/test_data/0297.txt i2b2/coreference/Beth_Train/docs/clinical-497.txt 156 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-45.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-138.txt 157 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-71.txt i2b2/coreference/Beth_Train/docs/clinical-238.txt 158 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/910458031.txt i2b2/coreference/Partners_Train/docs/clinical-87.txt 159 | i2b2/relations/test_data/0422.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-383.txt 160 | i2b2/relations/test_data/0137.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-297.txt 161 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/270045381.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-631.txt 162 | i2b2/relations/test_data/0433.txt i2b2/coreference/Beth_Train/docs/clinical-677.txt 163 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/523704694.txt i2b2/coreference/Partners_Train/docs/clinical-756.txt 164 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-70.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-233.txt 165 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/493597270.txt i2b2/coreference/Partners_Train/docs/clinical-736.txt 166 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/424729395_DH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-716.txt 167 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-73.txt i2b2/coreference/Beth_Train/docs/clinical-243.txt 168 | i2b2/relations/test_data/0002.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-1.txt 169 | i2b2/relations/test_data/0419.txt i2b2/coreference/Partners_Train/docs/clinical-378.txt 170 | i2b2/relations/test_data/0141.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-302.txt 171 | i2b2/relations/test_data/0321.txt i2b2/coreference/Beth_Train/docs/clinical-527.txt 172 | i2b2/relations/test_data/0463.txt i2b2/coreference/Partners_Train/docs/clinical-496.txt 173 | i2b2/relations/test_data/0013.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-142.txt 174 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-56.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-193.txt 175 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/699905656_SC.txt i2b2/coreference/Partners_Train/docs/clinical-458.txt 176 | i2b2/relations/test_data/0362.txt i2b2/coreference/Partners_Train/docs/clinical-376.txt 177 | i2b2/relations/test_data/0290.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-296.txt 178 | 
i2b2/relations/test_data/0475.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-521.txt 179 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-123.txt i2b2/coreference/Beth_Train/docs/clinical-777.txt 180 | i2b2/relations/test_data/0129.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-287.txt 181 | i2b2/relations/test_data/0318.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-331.txt 182 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-84.txt i2b2/coreference/Beth_Train/docs/clinical-273.txt 183 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-33.txt i2b2/coreference/Beth_Train/docs/clinical-113.txt 184 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/641557794_WGH.txt i2b2/coreference/Partners_Train/docs/clinical-801.txt 185 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-140.txt i2b2/coreference/Beth_Train/docs/clinical-797.txt 186 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/351853846_WGH.txt i2b2/coreference/Partners_Train/docs/clinical-681.txt 187 | i2b2/relations/test_data/0237.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-422.txt 188 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/425680098_SC.txt i2b2/coreference/Partners_Train/docs/clinical-721.txt 189 | i2b2/relations/test_data/0378.txt i2b2/coreference/Partners_Train/docs/clinical-396.txt 190 | i2b2/relations/test_data/0105.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-257.txt 191 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/105732749.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-546.txt 192 | i2b2/relations/test_data/0089.txt i2b2/coreference/Beth_Train/docs/clinical-237.txt 193 | i2b2/relations/test_data/0306.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-316.txt 194 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/018636330_DH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-423.txt 195 | i2b2/relations/test_data/0425.txt i2b2/coreference/Partners_Train/docs/clinical-431.txt 196 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-143.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-812.txt 197 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-59.txt i2b2/coreference/Beth_Train/docs/clinical-203.txt 198 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-14.txt i2b2/coreference/Beth_Train/docs/clinical-792.txt 199 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-122.txt i2b2/coreference/Beth_Train/docs/clinical-772.txt 200 | i2b2/relations/test_data/0225.txt i2b2/coreference/Beth_Train/docs/clinical-407.txt 201 | i2b2/relations/test_data/0421.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-657.txt 202 | i2b2/relations/test_data/0018.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-288.txt 203 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/274230067_EH.txt i2b2/coreference/Partners_Train/docs/clinical-433.txt 204 | i2b2/relations/test_data/0045.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-182.txt 205 | i2b2/relations/test_data/0165.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-332.txt 206 | i2b2/relations/test_data/0330.txt i2b2/coreference/Partners_Train/docs/clinical-346.txt 207 | i2b2/relations/test_data/0022.txt 
i2b2/coreference/Partners_Train/docs/clinical-11.txt 208 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-36.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-123.txt 209 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-31.txt i2b2/coreference/Beth_Train/docs/clinical-103.txt 210 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-30.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-98.txt 211 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-108.txt i2b2/coreference/Beth_Train/docs/clinical-757.txt 212 | i2b2/relations/test_data/0302.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-311.txt 213 | i2b2/relations/test_data/0126.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-121.txt 214 | i2b2/relations/test_data/0118.txt i2b2/coreference/Partners_Train/docs/clinical-116.txt 215 | i2b2/relations/test_data/0334.txt i2b2/coreference/Partners_Train/docs/clinical-351.txt 216 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-26.txt i2b2/coreference/Beth_Train/docs/clinical-78.txt 217 | i2b2/relations/test_data/0317.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-522.txt 218 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/145980160.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-571.txt 219 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-144.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-817.txt 220 | i2b2/relations/test_data/0021.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-152.txt 221 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/879492218_YC.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-72.txt 222 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/837898389.txt i2b2/coreference/Partners_Train/docs/clinical-47.txt 223 | i2b2/relations/test_data/0217.txt i2b2/coreference/Beth_Train/docs/clinical-397.txt 224 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/638157550_SC.txt i2b2/coreference/Partners_Train/docs/clinical-796.txt 225 | i2b2/relations/test_data/0349.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-562.txt 226 | i2b2/relations/test_data/0233.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-417.txt 227 | i2b2/relations/test_data/0469.txt i2b2/coreference/Partners_Train/docs/clinical-398.txt 228 | i2b2/relations/test_data/0054.txt i2b2/coreference/Partners_Train/docs/clinical-51.txt 229 | i2b2/relations/test_data/0451.txt i2b2/coreference/Beth_Train/docs/clinical-707.txt 230 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-82.txt i2b2/coreference/Beth_Train/docs/clinical-263.txt 231 | i2b2/relations/test_data/0037.txt i2b2/coreference/Beth_Train/docs/clinical-172.txt 232 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/433651389.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-726.txt 233 | i2b2/relations/test_data/0209.txt i2b2/coreference/Beth_Train/docs/clinical-387.txt 234 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/517414339.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-751.txt 235 | i2b2/relations/test_data/0270.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-276.txt 236 | i2b2/relations/test_data/0090.txt i2b2/coreference/Partners_Train/docs/clinical-86.txt 237 | 
i2b2/relations/test_data/0142.txt i2b2/coreference/Partners_Train/docs/clinical-131.txt 238 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/130959255.txt i2b2/coreference/Partners_Train/docs/clinical-556.txt 239 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/194442600_RWH.txt i2b2/coreference/Partners_Train/docs/clinical-596.txt 240 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/134300717.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-561.txt 241 | i2b2/relations/test_data/0042.txt i2b2/coreference/Partners_Train/docs/clinical-36.txt 242 | i2b2/relations/test_data/0102.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-303.txt 243 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-34.txt i2b2/coreference/Beth_Train/docs/clinical-118.txt 244 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-105.txt i2b2/coreference/Beth_Train/docs/clinical-742.txt 245 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-55.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-188.txt 246 | i2b2/relations/test_data/0470.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-511.txt 247 | i2b2/relations/test_data/0130.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-313.txt 248 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/284487129.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-636.txt 249 | i2b2/relations/test_data/0181.txt i2b2/coreference/Beth_Train/docs/clinical-352.txt 250 | i2b2/relations/test_data/0041.txt i2b2/coreference/Beth_Train/docs/clinical-177.txt 251 | i2b2/relations/test_data/0353.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-567.txt 252 | i2b2/relations/test_data/0221.txt i2b2/coreference/Beth_Train/docs/clinical-402.txt 253 | i2b2/relations/test_data/0249.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-437.txt 254 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-17.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-8.txt 255 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-107.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-752.txt 256 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-27.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-83.txt 257 | i2b2/relations/test_data/0449.txt i2b2/coreference/Partners_Train/docs/clinical-471.txt 258 | i2b2/relations/test_data/0454.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-712.txt 259 | i2b2/relations/test_data/0189.txt i2b2/coreference/Beth_Train/docs/clinical-362.txt 260 | i2b2/relations/test_data/0153.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-317.txt 261 | i2b2/relations/test_data/0001.txt i2b2/coreference/Beth_Train/docs/clinical-127.txt 262 | i2b2/relations/test_data/0033.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-167.txt 263 | i2b2/relations/test_data/0161.txt i2b2/coreference/Beth_Train/docs/clinical-327.txt 264 | i2b2/relations/test_data/0416.txt i2b2/coreference/Partners_Train/docs/clinical-426.txt 265 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-13.txt i2b2/coreference/Beth_Train/docs/clinical-787.txt 266 | i2b2/relations/test_data/0193.txt i2b2/coreference/Beth_Train/docs/clinical-367.txt 267 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-142.txt 
i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-807.txt 268 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/825330116.txt i2b2/coreference/Partners_Train/docs/clinical-42.txt 269 | i2b2/relations/test_data/0286.txt i2b2/coreference/Partners_Train/docs/clinical-343.txt 270 | i2b2/relations/test_data/0350.txt i2b2/coreference/Partners_Train/docs/clinical-353.txt 271 | i2b2/relations/test_data/0437.txt i2b2/coreference/Partners_Train/docs/clinical-446.txt 272 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/405507617.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-701.txt 273 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-67.txt i2b2/coreference/Beth_Train/docs/clinical-218.txt 274 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/989519730_WGH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-122.txt 275 | i2b2/relations/test_data/0109.txt i2b2/coreference/Beth_Train/docs/clinical-262.txt 276 | i2b2/relations/test_data/0431.txt i2b2/coreference/Partners_Train/docs/clinical-388.txt 277 | i2b2/relations/test_data/0257.txt i2b2/coreference/Beth_Train/docs/clinical-447.txt 278 | i2b2/relations/test_data/0293.txt i2b2/coreference/Beth_Train/docs/clinical-492.txt 279 | i2b2/relations/test_data/0402.txt i2b2/coreference/Partners_Train/docs/clinical-368.txt 280 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/044687343_ELMVH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-466.txt 281 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-50.txt i2b2/coreference/Beth_Train/docs/clinical-163.txt 282 | i2b2/relations/test_data/0150.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-141.txt 283 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/212512774_WGH.txt i2b2/coreference/Partners_Train/docs/clinical-601.txt 284 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/262912613.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-626.txt 285 | i2b2/relations/test_data/0086.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-81.txt 286 | i2b2/relations/test_data/0149.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-312.txt 287 | i2b2/relations/test_data/0389.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-612.txt 288 | i2b2/relations/test_data/0121.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-277.txt 289 | i2b2/relations/test_data/0409.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-637.txt 290 | i2b2/relations/test_data/0369.txt i2b2/coreference/Beth_Train/docs/clinical-587.txt 291 | i2b2/relations/test_data/0341.txt i2b2/coreference/Beth_Train/docs/clinical-552.txt 292 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/596437842.txt i2b2/coreference/Partners_Train/docs/clinical-776.txt 293 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/920798564.txt i2b2/coreference/Partners_Train/docs/clinical-463.txt 294 | i2b2/relations/test_data/0053.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-192.txt 295 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-65.txt i2b2/coreference/Beth_Train/docs/clinical-208.txt 296 | i2b2/relations/test_data/0394.txt i2b2/coreference/Partners_Train/docs/clinical-411.txt 297 | i2b2/relations/test_data/0361.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-577.txt 298 | 
-------------------------------------------------------------------------------- /generation/i2b2_relations/relations-answers.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from os import listdir 3 | from os.path import isfile, join 4 | import nltk 5 | from nltk.stem import WordNetLemmatizer 6 | from nltk.corpus import wordnet as wn 7 | from problem_classfiers import concept_is_CommonNoun, concept_is_PastTense 8 | import json 9 | import sys 10 | reload(sys) 11 | sys.setdefaultencoding("ISO-8859-1") 12 | import random 13 | import argparse 14 | import os 15 | 16 | ## Resolve the use of medications and treatments 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--i2b2_dir', default='', help='Directory containing i2b2 relations challenge files') 20 | parser.add_argument('--templates_dir', default='', help='Directory containing template files in the given format') 21 | parser.add_argument('--output_dir', default='', help='Directory to store the output') 22 | args = parser.parse_args() 23 | 24 | ###################################################### SET FILE PATHS ################################################################## 25 | 26 | ## i2b2 file paths ## 27 | 28 | relations_folder = args.i2b2_dir 29 | 30 | FilePath = [ "concept_assertion_relation_training_data/partners/rel/", "concept_assertion_relation_training_data/beth/rel/", "test_data/rel/"] 31 | 32 | RelationsFilePath = [] 33 | 34 | for file in FilePath: 35 | RelationsFilePath.append(os.path.join(relations_folder,file)) 36 | 37 | FilePath = ["concept_assertion_relation_training_data/partners/txt/", "concept_assertion_relation_training_data/beth/txt/","test_data/txt/"] 38 | 39 | NoteFilePath = [] 40 | 41 | for file in FilePath: 42 | NoteFilePath.append(os.path.join(relations_folder,file)) 43 | 44 | FilePath = [ "concept_assertion_relation_training_data/partners/ast/", "concept_assertion_relation_training_data/beth/ast/", "test_data/ast/"] 45 | 46 | AstFilePath = [] 47 | for file in FilePath: 48 | AstFilePath.append(os.path.join(relations_folder,file)) 49 | 50 | ## template file path ## 51 | 52 | template_file_path = args.templates_dir 53 | 54 | ## matching notes in temporal, coreference and relations dataset ## 55 | 56 | matching_notes = os.path.join("generation/i2b2_relations/", "matching_notes.csv") 57 | 58 | ## output file paths ## 59 | 60 | #qa_output = "/home/anusri/Desktop/emrQA/output/relations-qa.csv" 61 | ql_output = os.path.join(args.output_dir,"relations-ql.csv") 62 | relations_qa_output_json = os.path.join(args.output_dir,"relations-qa.json") 63 | 64 | 65 | ### write to csv file for viz ## 66 | 67 | qa_csv_write = False 68 | ql_csv_write = True 69 | 70 | ######################################################## CODE ######################################################################### 71 | 72 | class GenerateRelationsQuestions(): 73 | 74 | def __init__(self): 75 | 76 | ## synsets to identify common nouns, will be used in preprocessing to remove generic i2b2 concepts ## 77 | 78 | self.similar = [] 79 | val = [wn.synsets('problem'), wn.synsets('test'), wn.synsets('procedure'), wn.synsets('disease'), 80 | wn.synsets('medication'), wn.synsets('treatment'), wn.synsets('surgery')] 81 | 82 | self.count_corefs = 0 83 | self.resolved_corefs = 0 84 | for out in val: 85 | for ss in out: 86 | self.similar.extend(ss.lemma_names()) 87 | 88 | ## set paths ## 89 | self.RelationsFilePath = RelationsFilePath 90 | self.NoteFilePath = NoteFilePath 91 |
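# the three path lists above mirror the i2b2 relations challenge release: rel/ holds the relation annotations, txt/ the raw clinical notes, and ast/ the assertion annotations, for each of the partners, beth and test_data subsets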
self.AstFilePath = AstFilePath 92 | 93 | self.ReadRelationsData() 94 | self.ReadAssertionsData() 95 | self.ReadTemplates() 96 | 97 | ######################### Read i2b2 file functions ################################### 98 | 99 | def ReadRelationsData(self): 100 | 101 | self.RelationsPerNote = {} 102 | 103 | self.ClinicalNotes = {} 104 | 105 | ## relations as seen in i2b2 relations challenge ### 106 | 107 | type = {"TeRP": ("test", "problem"), "TeCP": ("test", "problem"), "TrIP": ("treatment", "problem"), 108 | "TrWP": ("treatment", "problem"), 109 | "TrCP": ("treatment", "problem"), "TrAP": ("treatment", "problem"), "TrNAP": ("treatment", "problem"), 110 | "PIP": ("problem1", "problem2")} 111 | 112 | self.tr_status = {"TrIP": "improves", "TrWP": "worsens/not improves", "TrAP": "not known status", 113 | "TrCP": "causes"} 114 | 115 | ## read in all clinical notes ## 116 | for paths in self.NoteFilePath: 117 | files = [f for f in listdir(paths) if isfile(join(paths, f))] 118 | for file in files: 119 | remote_file = open(paths + file) 120 | Noteid = file.split(".")[0] 121 | self.ClinicalNotes[Noteid] = remote_file.readlines() 122 | 123 | ## read the file which shows the common notes in temporal, relations and coreference files from i2b2 challenge ## 124 | ## NOTE: This information is not available as a part of i2b2. This file is generated by using approximate methods (script provided).## 125 | 126 | match_file = open(matching_notes) 127 | csvreader = csv.reader(match_file) 128 | matching_files = list(csvreader) # relation, coreference 129 | 130 | Coreference_Note = {} 131 | self.CoreferenceCluster_to_Entity_map = {} 132 | self.Entity_to_CoreferenceCluster_map = {} 133 | 134 | ### Create coreference clusters for every type in every note and give each cluster an id. 
### 135 | 136 | for file in matching_files[1:]: 137 | file = file[0].split("\t") 138 | relation_note_id = file[0].split("/")[-1].split(".")[0] 139 | coreference_path = file[1] 140 | coreferences = self.ReadCoreference(coreference_path, self.ClinicalNotes[relation_note_id]) 141 | Coreference_Note[relation_note_id] = coreferences 142 | 143 | ## Create coreference clusters for every note ## 144 | self.CoreferenceCluster_to_Entity_map[relation_note_id] = {} 145 | self.Entity_to_CoreferenceCluster_map[relation_note_id] = {} 146 | for stype in coreferences: 147 | ## Create coreference clusters for every type (problem, test, treatment)## 148 | if stype not in self.CoreferenceCluster_to_Entity_map[relation_note_id]: 149 | self.CoreferenceCluster_to_Entity_map[relation_note_id][stype] = {} 150 | self.Entity_to_CoreferenceCluster_map[relation_note_id][stype] = {} 151 | 152 | cluster_id = 0 153 | for coref_list in coreferences[stype]: 154 | 155 | ## coref_list gets id given by cluster_id 156 | for concept in coref_list: 157 | if cluster_id not in self.CoreferenceCluster_to_Entity_map[relation_note_id][stype]: 158 | self.CoreferenceCluster_to_Entity_map[relation_note_id][stype][cluster_id] = [] 159 | 160 | self.CoreferenceCluster_to_Entity_map[relation_note_id][stype][cluster_id].append(concept) ## bug fixed ## 161 | self.Entity_to_CoreferenceCluster_map[relation_note_id][stype][concept] = cluster_id 162 | cluster_id += 1 163 | 164 | ############################################################################################################################# 165 | 166 | self.map_problems_to_test_revealed = {} 167 | self.map_tests_to_problem_revealed = {} 168 | self.map_problems_to_test_investigated = {} 169 | self.map_tests_to_problem_investigated = {} 170 | self.map_treatments_to_problem = {} 171 | self.map_problems_to_treatment = {} 172 | self.problems_to_badtreatment = {} 173 | self.allergic_treatments = {} 174 | self.treatments_status_to_problem = {} 175 | self.map_problems_to_treatment = {} 176 | self.badtreatments_to_problem = {} 177 | self.symptoms_to_problem = {} 178 | self.problems_to_symptom = {} 179 | 180 | for paths in self.RelationsFilePath: 181 | files = [f for f in listdir(paths) if isfile(join(paths, f))] 182 | for file in files: 183 | remote_file = open(paths + file) 184 | Noteid = file.split(".")[0] 185 | PatientNote = self.ClinicalNotes[Noteid] 186 | 187 | try: 188 | Coreferences = Coreference_Note[Noteid] 189 | except: 190 | Coreferences = {} 191 | 192 | Relations = {} 193 | 194 | for line in remote_file: 195 | line = line.replace("|||", "||") 196 | words = line.split("||") 197 | 198 | vals = [] 199 | for word in [words[0], words[2]]: 200 | term = word.split("=") 201 | full_annotation = "=".join(term[1:]) 202 | index = [pos for pos, char in enumerate(full_annotation) if char == "\""] 203 | pos1 = int(index[0]) 204 | pos2 = int(index[-1]) 205 | 206 | annotation = full_annotation[pos1 + 1:pos2] 207 | indxs = full_annotation[pos2 + 1:].split(",") 208 | 209 | line_in_note = "" 210 | start_line = None 211 | 212 | for indx in indxs: 213 | indx = indx.strip() 214 | out = indx.split(" ") 215 | start_line = out[0].split(":")[0] 216 | start_token = out[0].split(":")[1] 217 | end_line = out[1].split(":")[0] 218 | end_token = out[1].split(":")[1] 219 | 220 | line_in_note += "".join(PatientNote[int(start_line) - 1:int(end_line)]) 221 | 222 | vals.append((annotation, line_in_note, start_line, start_token)) 223 | 224 | relate = words[1].split("=")[1].split("\"")[1] 225 | 226 | val1 = vals[0] 227 | 
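# each entry of vals is one annotated concept of the relation, stored as an (annotation text, line in note, start line, start token) tuple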
val2 = vals[1] 228 | t1 = val1[0] 229 | t2 = val2[0] 230 | # print(relate) 231 | if relate not in Relations: 232 | Relations[relate] = [] 233 | 234 | ## preprocessing step done when generating questions and logical forms, removed from here ## 235 | 236 | ''' 237 | t1 = self.SimplePreProcess(val1[0]) 238 | t2 = self.SimplePreProcess(val2[0]) 239 | 240 | 241 | #print("yes") 242 | if t1 == None: 243 | self.CheckForCoreferences(val1, type[relate][0],Coreferences) 244 | if t2 == None: 245 | self.CheckForCoreferences(val2, type[relate][0], Coreferences) 246 | continue 247 | 248 | if t1 == None or t2 == None: 249 | ## Just use it because we don't want to miss the answers. 250 | continue 251 | 252 | # If at least one of the concepts is a common noun, ignore the relation 253 | ### Common Noun Check End### 254 | ''' 255 | val1 = (t1, type[relate][0], val1[1], val1[2], val1[3]) 256 | val2 = (t2, type[relate][1], val2[1], val2[2], val2[3]) 257 | 258 | if (val1, val2) not in Relations[relate]: 259 | Relations[relate].append((val1, val2)) 260 | 261 | self.MakeRelationMappings(val1, val2, relate, Noteid) 262 | 263 | self.RelationsPerNote[Noteid] = [Relations, PatientNote, Coreferences] 264 | 265 | ''' 266 | # for cluster_id in self.map_problems_to_test_investigated: 267 | # try: 268 | # out = self.map_problems_to_test_revealed[cluster_id] 269 | # print(self.map_problems_to_test_investigated[cluster_id]) 270 | # print(out) 271 | # print("\n") 272 | # except: 273 | # continue 274 | 275 | print(Relations.keys()) 276 | try: 277 | relation_investigated = Relations["TeCP"] 278 | relation_revealed = Relations["TeRP"] 279 | except: 280 | 281 | continue 282 | values = zip(*relation_revealed) 283 | for annotations in relation_investigated: 284 | try: 285 | index_val = list(values[0]).index(annotations[0][0]) 286 | except: 287 | continue 288 | 289 | for idx in index_val: 290 | print(annotations) 291 | print(values[2][idx]) 292 | ''' 293 | 294 | def ReadCoreference(self,coref_path,PatientNote): 295 | 296 | remote_file = open(coref_path.replace("docs","chains") + ".chains") 297 | coref_concepts = {} 298 | for line in remote_file: 299 | line = line.replace("|||", "||") 300 | words = line.split("||") 301 | 302 | vals = [] 303 | 304 | 305 | type = words[-1].replace("\"","").split("=")[-1].strip().replace("coref ","") 306 | if type not in coref_concepts and type != "person": 307 | coref_concepts[type] = [] 308 | if type == "person": 309 | continue 310 | for word in words[0:-1]: 311 | term = word.split("=") 312 | full_annotation = "=".join(term[1:]) 313 | index = [pos for pos, char in enumerate(full_annotation) if char == "\""] 314 | pos1 = int(index[0]) 315 | pos2 = int(index[-1]) 316 | 317 | annotation = full_annotation[pos1 + 1:pos2] 318 | indxs = full_annotation[pos2 + 1:].split(",") 319 | 320 | line_in_note = "" 321 | start_line = None 322 | 323 | 324 | for indx in indxs: 325 | indx = indx.strip() 326 | out = indx.split(" ") 327 | start_line = out[0].split(":")[0] 328 | start_token = out[0].split(":")[1] 329 | end_line = out[1].split(":")[0] 330 | end_token = out[1].split(":")[1] 331 | 332 | 333 | line_in_note += "".join(PatientNote[int(start_line) - 1:int(end_line)]) 334 | 335 | vals.append((annotation,line_in_note,start_line,start_token)) 336 | 337 | coref_concepts[type].append(vals) 338 | return coref_concepts 339 | 340 | def ReadAssertionsData(self): 341 | 342 | self.problem_status = {} 343 | 344 | for paths in self.AstFilePath: 345 | files = [f for f in listdir(paths) if
isfile(join(paths, f))] 346 | for file in files: 347 | remote_file = open(paths + file) 348 | Noteid = file.split(".")[0] 349 | PatientNote = self.ClinicalNotes[Noteid] 350 | 351 | if Noteid not in self.problem_status: 352 | self.problem_status[Noteid] = {} 353 | 354 | for line in remote_file: 355 | line = line.replace("|||", "||") 356 | words = line.split("||") 357 | 358 | vals = [] 359 | type = words[1].split("=")[1].split("\"")[1] 360 | status = words[2].split("=")[1].split("\"")[1] 361 | for word in [words[0]]: 362 | term = word.split("=") 363 | full_annotation = "=".join(term[1:]) 364 | index = [pos for pos, char in enumerate(full_annotation) if char == "\""] 365 | pos1 = int(index[0]) 366 | pos2 = int(index[-1]) 367 | 368 | annotation = full_annotation[pos1 + 1:pos2] 369 | indxs = full_annotation[pos2 + 1:].split(",") 370 | 371 | line_in_note = "" 372 | start_line = None 373 | 374 | annotation = self.SimplePreProcess(annotation) 375 | 376 | for indx in indxs: 377 | indx = indx.strip() 378 | out = indx.split(" ") 379 | start_line = out[0].split(":")[0] 380 | start_token = out[0].split(":")[1] 381 | end_line = out[1].split(":")[0] 382 | end_token = out[1].split(":")[1] 383 | 384 | line_in_note += "".join(PatientNote[int(start_line) - 1:int(end_line)]) 385 | 386 | if annotation == None: 387 | continue 388 | if type == "problem": 389 | if annotation not in self.problem_status[Noteid]: 390 | self.problem_status[Noteid][annotation] = [] 391 | self.problem_status[Noteid][annotation].append((status,line_in_note,start_line,start_token)) 392 | 393 | ######################## Main program functions ########################################## 394 | 395 | def ReadTemplates(self): 396 | 397 | self.relations_out = {"paragraphs": [], "title": "relations"} 398 | self.logical_out = [] 399 | 400 | ########################################## Set File Paths ############################################## 401 | 402 | ### File to write Question-Answers ## 403 | 404 | 405 | if qa_csv_write: 406 | ofile = open(qa_output, "w") 407 | self.filewriter = csv.writer(ofile, delimiter="\t") 408 | self.filewriter.writerow( 409 | ["Question", "Logical Form", "Answer", "Answer line in note", "Note ID", "Difference in QA lines"]) 410 | 411 | ### File to write Question-Logical Forms ## 412 | 413 | if ql_csv_write: 414 | ofile = open(ql_output, "w") 415 | self.filewriter_forlform = csv.writer(ofile, delimiter="\t") 416 | self.filewriter_forlform.writerow(["Question", "Logical Form"]) 417 | 418 | ### File to read templates ### 419 | 420 | file = open(template_file_path) 421 | filereader = list(csv.reader(file)) 422 | 423 | ## read only templates relevant to relations challenge ## 424 | 425 | rel_lines = [] 426 | for line in filereader[1:]: 427 | if line[0] != "relations": 428 | continue 429 | rel_lines.append(line) 430 | 431 | ########################################## Main Function Call ############################################## 432 | 433 | total_questions = 0 434 | for Noteid in self.RelationsPerNote: 435 | 436 | [Relations, PatientNote, Coreferences] = self.RelationsPerNote[Noteid] 437 | out_patient = {"note_id": Noteid, "context": PatientNote, "qas": []} 438 | self.unique_questions = [] 439 | 440 | for line in rel_lines: 441 | 442 | question = line[2].strip() 443 | logical_form = line[3].strip() 444 | helper = line[4].split(",") 445 | helper = [type.strip() for type in helper] 446 | answertype = line[5].strip() 447 | 448 | question = question.replace("|medication| or |medication|", "|medication|") ## added ## 449 | 
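# collapse templates in which the same placeholder appears more than once (e.g. "|problem| or |problem|") to a single slot, so each semantic type is filled exactly once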
question = question.replace("|problem| or |problem|", "|problem|") ## added ## 450 | question = question.replace("|test| or |test|", "|test|") ## added ## 451 | question = question.replace("|test| |test| |test|", "|test|") ## added ## 452 | question = question.replace("\t", "") 453 | logical_form = logical_form.replace("\t", "") 454 | 455 | 456 | if question.strip() == "": 457 | continue 458 | 459 | ## check for errors in templates and gather all the placeholders in the templates (placeholders stored in rwords) ## 460 | ## semantic types of placeholders ## 461 | 462 | types_to_replace = self.checking_for_errors(question, logical_form) 463 | 464 | if len(types_to_replace) != 0: 465 | types_to_replace = list(types_to_replace[0]) 466 | else: 467 | types_to_replace = [] 468 | 469 | answer_out = self.MakeLabTestQA(question, logical_form, types_to_replace, answertype, helper, Relations, Noteid, Coreferences) 470 | 471 | if len(answer_out) != 0: 472 | out_patient["qas"].extend(answer_out) 473 | 474 | total_questions += len(self.unique_questions) 475 | self.relations_out["paragraphs"].append(out_patient) 476 | 477 | print(total_questions) 478 | print(self.count_corefs) 479 | print(self.resolved_corefs) 480 | 481 | with open(relations_qa_output_json, 'w') as outfile: 482 | json.dump(self.relations_out, outfile, ensure_ascii=False) 483 | 484 | def MakeLabTestQA(self, question, logical_form, types_to_replace, answertype, helper, Relations, Noteid, Coreferences): 485 | 486 | original_question = question 487 | logical_form_template = logical_form 488 | answer_out = [] 489 | 490 | for relate in helper: 491 | 492 | if relate == "ast": 493 | 494 | questions_list = question.strip().split("##") 495 | 496 | ## fixed a bug, initially not included ## 497 | answer_out = self.HandleAssertionQA(Noteid, types_to_replace, questions_list, logical_form_template, Coreferences, answertype) ## fixed bug, initially was not including assertion data 498 | 499 | else: 500 | 501 | try: 502 | relevant_relations = Relations[relate] ## Get relations which satisfy the relate criteria 503 | except: 504 | continue 505 | 506 | for val1, val2 in relevant_relations: 507 | 508 | annotations = {val1[1]: (val1[0], val1[2], val1[3], val1[4]), 509 | val2[1]: (val2[0], val2[2], val2[3], val2[4])} 510 | 511 | ## check if there are placeholders in the question, call function to replace the placeholders ## 512 | 513 | if len(types_to_replace) != 0: 514 | questions_list = question.strip().split("##") 515 | out = self.MakeQuestion_new(types_to_replace, annotations, questions_list, logical_form_template, Coreferences, Noteid) 516 | if out == None: 517 | continue 518 | else: 519 | [question_list, logical_form, question_lines, question_start_line, question_start_token] = out 520 | else: 521 | ## if no placeholders, directly use the question ## 522 | [question_list, logical_form, question_lines, question_start_line, question_start_token]= [question.split("##"), logical_form_template, "", "", ""] 523 | 524 | ### Writing question - logical form ### 525 | 526 | paraphrase_questions = set(question_list) 527 | question_templates = original_question.split("##") 528 | 529 | if len(question_list) != len(question_templates): 530 | print(question_list) 531 | print(question_templates) 532 | 533 | unique_tup = list(set(zip(question_list, question_templates))) 534 | 535 | if ql_csv_write: 536 | 537 | for qidx in range(len(unique_tup)): 538 |
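# each row pairs the grounded question and logical form with the raw template strings they were generated from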
self.filewriter_forlform.writerow([unique_tup[qidx][0]] + [logical_form] + [unique_tup[qidx][1]] + [logical_form_template]) 539 | 540 | ##### Make answers for the successful questions #### 541 | 542 | [answer, answer_line, answer_start_line, answer_start_token] = self.AnswerSubFunction(answertype, val1, val2, Noteid, relate, question_lines, question_start_line, question_start_token) 543 | 544 | if len(answer) != 0: 545 | 546 | if paraphrase_questions not in self.unique_questions: 547 | 548 | self.unique_questions.append(paraphrase_questions) 549 | 550 | 551 | ans_list = [] 552 | for idx in range(len(answer)): 553 | 554 | start_line = answer_start_line[idx] 555 | start_token = answer_start_token[idx] 556 | 557 | if answertype == ["problems,status"]: 558 | #entity_type = "complex" 559 | entity_type = "empty" 560 | elif answer[idx] == "": 561 | entity_type = "empty" 562 | else: 563 | entity_type = "single" 564 | 565 | 566 | #if answer[idx] == "" and start_token != "": 567 | # print(paraphrase_questions) 568 | val = {"answer_start": [start_line, start_token], "text": answer[idx], "evidence": answer_line[idx], "evidence_start": start_line, "answer_entity_type": entity_type} 569 | if val not in ans_list: 570 | ans_list.append(val) 571 | 572 | 573 | ## "evidence" in the dictionary above is currently just the answer line in the note. If you also want to treat the question line in the note as evidence, uncomment the code below and use it accordingly ## 574 | 575 | ''' 576 | ## evidence per answer ## 577 | evidence_answer = [] 578 | evidence_start = [] 579 | evidence_temp_line = question_line + answer_line 580 | evidence_temp_start = question_start_line + answer_start_line 581 | for pdx in range(len(evidence_temp_line)): 582 | if evidence_temp_line[pdx] not in evidence_answer: 583 | evidence_answer.append(evidence_temp_line[pdx]) 584 | evidence_start.append(evidence_temp_start[pdx]) 585 | 586 | if answer[idx] == "yes" or answer[idx] == "no": 587 | start_line = "" 588 | start_token = "" 589 | else: 590 | start_line = answer_start_line[idx] 591 | start_token = answer_start_token[idx] 592 | 593 | val = {"answer_start": [start_line, start_token], "text": answer[idx],"evidence": evidence_answer,"evidence_start": evidence_start} 594 | # evidence will have q_line_answer_line 595 | 596 | if qa_csv_write: 597 | 598 | result_num = answer_start_line + question_start_line 599 | perms = list( 600 | itertools.product(result_num, result_num)) ## find different pairs of numbers ## 601 | diffs = [abs(val1 - val2) for (val1, val2) in perms] 602 | difference = max(diffs) 603 | 604 | Note_val = "#".join(list(set(evidence_temp_line))) 605 | 606 | self.filewriter.writerow( ["##".join(paraphrase_questions)] + [logical_form] + [",".join(answer)] + [Note_val] + [Noteid + "_RelationsChallenge"] + [difference]) 607 | ''' 608 | 609 | answer_temp = {"answers": ans_list,"id": [zip(question_list, question_templates), logical_form_template], "question": list(paraphrase_questions)} 610 | answer_out.append(answer_temp) 611 | 612 | return answer_out 613 | 614 | def HandleAssertionQA(self,Noteid,dup_rwords, question_list_templates, logical_form_template,Coreferences, answertype): 615 | types_to_replace = list(dup_rwords) 616 | answer_out = [] 617 | if len(dup_rwords) != 0: 618 | for problem in self.problem_status[Noteid]: 619 | answer = [] 620 | result_num = [] 621 | answer_line = [] 622 | result_token = [] 623 | 624 | logical_form = logical_form_template 625 | status = self.problem_status[Noteid][problem] 626 | rwords = 
list(dup_rwords) 627 | flag = 0 628 | for idx in range(len(rwords)): 629 | #print(problem) 630 | (t1,valid_list) = self.CheckIfConceptValid((problem,status[0][1],status[0][2],status[0][3]),rwords[idx], Coreferences ) 631 | if t1 == None: 632 | if valid_list != None: 633 | replace_annoation = random.choice(valid_list) 634 | rwords[idx] = replace_annoation 635 | else: 636 | flag = 1 637 | else: 638 | rwords[idx] = t1 639 | 640 | if flag == 1: 641 | continue 642 | 643 | new_question_list = [] 644 | 645 | ### Make Question ### 646 | for question in question_list_templates: 647 | done = [] 648 | idx = 0 649 | for types in list(types_to_replace): 650 | index = question.find("|" + types + "|") 651 | if index == -1 and types not in done: 652 | print(question, "|" + types + "|", done) 653 | question = question.replace("|" + types + "|", rwords[idx]) 654 | done.append(types) 655 | idx += 1 656 | #if question not in new_question_list: 657 | new_question_list.append(question) 658 | 659 | ## ### Make Logical Form ### 660 | idx = 0 661 | done = [] 662 | for types in list(types_to_replace): 663 | index = logical_form.find("|" + types + "|") 664 | if index == -1 and types not in done: 665 | print(logical_form, "|" + types + "|", done, types) 666 | done.append(types) 667 | logical_form = logical_form.replace("|" + types + "|", rwords[idx]) 668 | idx += 1 669 | 670 | for val in status: 671 | #print(val[0]) 672 | answer.append(val[0]) 673 | answer_line.append(val[1]) 674 | result_num.append(int(val[2])) 675 | result_token.append(int(val[3])) 676 | 677 | if answertype == "none": 678 | question_templates = question_list_templates 679 | unique_tup = list(set(zip(new_question_list, question_templates))) 680 | for qidx in range(len(unique_tup)): 681 | self.filewriter_forlform.writerow([unique_tup[qidx][0]] + [logical_form] + [unique_tup[qidx][1]] + [logical_form_template]) 682 | else: 683 | 684 | question_templates = question_list_templates 685 | if len(new_question_list) != len(question_templates): 686 | print(new_question_list) 687 | print(question_templates) 688 | unique_tup = list(set(zip(new_question_list, question_templates))) 689 | 690 | for qidx in range(len(unique_tup)): 691 | self.filewriter_forlform.writerow([unique_tup[qidx][0]] + [logical_form] + [unique_tup[qidx][1]] + [logical_form_template]) 692 | 693 | 694 | if len(answer) != 0: 695 | 696 | ''' 697 | perms = list(itertools.product(result_num, result_num)) 698 | diffs = [abs(val1 - val2) for (val1, val2) in perms] 699 | difference = max(diffs) 700 | question_templates = question_list_templates 701 | 702 | Note_val = "#".join(answer_line) 703 | ''' 704 | new_question_list = set(new_question_list) 705 | if new_question_list not in self.unique_questions: 706 | ''' 707 | if qa_csv_write: 708 | self.filewriter.writerow(["##".join(new_question_list)] + [logical_form] + [",".join(answer)] + [Note_val] + [Noteid + "_RelationsChallenge"] + [ difference]) 709 | ''' 710 | self.unique_questions.append(set(new_question_list)) 711 | 712 | ans_list = [] 713 | for idx in range(len(answer)): 714 | #print(answer[idx], result_num[idx], result_token[idx]) 715 | #val = {"answer_start": [result_num[idx], result_token[idx]], "text": answer[idx], "evidence": answer_line[idx], "evidence_start": result_num[idx]} 716 | val = {"answer_start": [result_num[idx], ""], "text": "", "evidence": answer_line[idx], "evidence_start": result_num[idx], "answer_entity_type": "empty"} 717 | if val not in ans_list: 718 | ans_list.append(val) 719 | 720 | # evidence will have 
q_line_answer_line 721 | answer_temp = {"answers": ans_list, "id": [zip(question_templates,new_question_list),logical_form_template], "question": list(set(new_question_list))} 722 | answer_out.append(answer_temp) 723 | 724 | 725 | 726 | return answer_out 727 | 728 | ######################## Main Utility Functions ###################################### 729 | 730 | def MakeRelationMappings(self, val1, val2, relate, Noteid): 731 | 732 | # print(self.Entity_to_CoreferenceCluster_map[Noteid]["problem"]) 733 | # print((val1[0],val1[2],val1[3],val1[4])) 734 | 735 | ## If val1 belongs to some cluster, map to that cluster; if not, map to the concept directly ## 736 | ## Not resolving coreference for answers at this point, so some answers may be redundant ### 737 | 738 | try: 739 | 740 | concept_cluster_1 = self.Entity_to_CoreferenceCluster_map[Noteid][val1[1].replace("1", "")][ 741 | (val1[0], val1[2], val1[3], val1[4])] 742 | # print(concept_cluster_1) 743 | except: 744 | concept_cluster_1 = val1[0] 745 | try: 746 | 747 | concept_cluster_2 = self.Entity_to_CoreferenceCluster_map[Noteid][val2[1].replace("2", "")][ 748 | (val2[0], val2[2], val2[3], val2[4])] 749 | 750 | # print(concept_cluster_2) 751 | except: 752 | concept_cluster_2 = val2[0] 753 | # print(concept_cluster_2) 754 | 755 | if Noteid not in self.map_problems_to_test_revealed: 756 | self.map_problems_to_test_revealed[Noteid] = {} 757 | self.map_tests_to_problem_revealed[Noteid] = {} 758 | self.map_problems_to_test_investigated[Noteid] = {} 759 | self.map_tests_to_problem_investigated[Noteid] = {} 760 | self.allergic_treatments[Noteid] = [] 761 | self.problems_to_badtreatment[Noteid] = {} 762 | self.treatments_status_to_problem[Noteid] = {} 763 | self.map_problems_to_treatment[Noteid] = {} 764 | self.badtreatments_to_problem[Noteid] = {} 765 | self.symptoms_to_problem[Noteid] = {} 766 | self.problems_to_symptom[Noteid] = {} 767 | 768 | if relate == "TeRP": 769 | 770 | ## Coreference checking serves as the semantic equivalence check ## 771 | 772 | if concept_cluster_1 not in self.map_problems_to_test_revealed[Noteid]: 773 | self.map_problems_to_test_revealed[Noteid][concept_cluster_1] = [] 774 | 775 | if concept_cluster_2 not in self.map_tests_to_problem_revealed[Noteid]: ## fixed: check the per-note dict ## 776 | self.map_tests_to_problem_revealed[Noteid][concept_cluster_2] = [] 777 | 778 | self.map_problems_to_test_revealed[Noteid][concept_cluster_1].append(val2) 779 | self.map_tests_to_problem_revealed[Noteid][concept_cluster_2].append(val1) 780 | 781 | if relate == "TeCP": 782 | 783 | ## Simply checking the name; ideally check semantically, or normalize with CUIs ## 784 | 785 | if concept_cluster_1 not in self.map_problems_to_test_investigated[Noteid]: 786 | self.map_problems_to_test_investigated[Noteid][concept_cluster_1] = [] 787 | 788 | if concept_cluster_2 not in self.map_tests_to_problem_investigated[Noteid]: ## fixed: check the per-note dict ## 789 | self.map_tests_to_problem_investigated[Noteid][concept_cluster_2] = [] 790 | 791 | self.map_problems_to_test_investigated[Noteid][concept_cluster_1].append(val2) 792 | self.map_tests_to_problem_investigated[Noteid][concept_cluster_2].append(val1) 793 | 794 | if relate == "TrNAP" or relate == "TrCP": 795 | 796 | if val1 not in self.allergic_treatments[Noteid]: 797 | self.allergic_treatments[Noteid].append(val1) 798 | 799 | if relate == "TrCP": 800 | 801 | if concept_cluster_1 not in self.problems_to_badtreatment[Noteid]: 802 | self.problems_to_badtreatment[Noteid][concept_cluster_1] = [] 803 | 804 | if concept_cluster_2 not in self.badtreatments_to_problem[Noteid]: 805 | 
self.badtreatments_to_problem[Noteid][concept_cluster_2] = [] 806 | 807 | self.problems_to_badtreatment[Noteid][concept_cluster_1].append(val2) 808 | self.badtreatments_to_problem[Noteid][concept_cluster_2].append(val1) 809 | 810 | if concept_cluster_1 not in self.map_problems_to_treatment[Noteid]: 811 | self.map_problems_to_treatment[Noteid][concept_cluster_1] = [] 812 | 813 | status = self.tr_status[relate] 814 | self.map_problems_to_treatment[Noteid][concept_cluster_1].append((val2, status)) 815 | 816 | if relate == "TrIP" or relate == "TrWP" or relate == "TrAP": 817 | 818 | if concept_cluster_2 not in self.treatments_status_to_problem[Noteid]: 819 | self.treatments_status_to_problem[Noteid][concept_cluster_2] = [] 820 | 821 | status = self.tr_status[relate] 822 | self.treatments_status_to_problem[Noteid][concept_cluster_2].append( 823 | (val1, status)) ## val1 is treatment 824 | 825 | if concept_cluster_1 not in self.map_problems_to_treatment[Noteid]: 826 | self.map_problems_to_treatment[Noteid][concept_cluster_1] = [] 827 | 828 | status = self.tr_status[relate] 829 | self.map_problems_to_treatment[Noteid][concept_cluster_1].append((val2, status)) 830 | 831 | if relate == "PIP": 832 | 833 | if concept_cluster_1 not in self.symptoms_to_problem[Noteid]: 834 | self.symptoms_to_problem[Noteid][concept_cluster_1] = [] 835 | 836 | if concept_cluster_2 not in self.problems_to_symptom[Noteid]: 837 | self.problems_to_symptom[Noteid][concept_cluster_2] = [] 838 | 839 | self.symptoms_to_problem[Noteid][concept_cluster_1].append(val2) 840 | self.problems_to_symptom[Noteid][concept_cluster_2].append(val1) 841 | 842 | def AnswerSubFunction(self, answertype, val1, val2, Noteid, relate, question_lines, question_start_line, question_start_token): 843 | 844 | try: 845 | concept_cluster_1 = self.Entity_to_CoreferenceCluster_map[Noteid][val1[1].replace("1", "")][ 846 | (val1[0], val1[2], val1[3], val1[4])] 847 | except: 848 | concept_cluster_1 = val1[0] 849 | try: 850 | concept_cluster_2 = self.Entity_to_CoreferenceCluster_map[Noteid][val2[1].replace("2", "")][ 851 | (val2[0], val2[2], val2[3], val2[4])] 852 | except: 853 | concept_cluster_2 = val2[0] 854 | 855 | answer = [] 856 | result_start_line = [] 857 | result_start_token = [] 858 | answer_line = [] 859 | 860 | ######################## rules for test answers ######################## 861 | if answertype == "yes/no" or answertype == "abnormal" or answertype == "yes": 862 | #answer = ["yes"]* len(question_lines) 863 | answer = [""] * len(question_lines) 864 | answer_line.extend(question_lines) 865 | result_start_line.extend(question_start_line) 866 | #result_start_token.extend(question_start_token) 867 | result_start_token = [""] * len(question_lines) 868 | elif answertype == "tests_investigated": 869 | tests = self.map_tests_to_problem_investigated[Noteid][concept_cluster_2] 870 | for test in tests: 871 | answer += [test[0]] 872 | answer_line.append(test[2]) 873 | result_start_line.append(int(test[3])) 874 | result_start_token.append(int(test[4])) 875 | elif answertype == "tests_revealed": 876 | tests = self.map_tests_to_problem_revealed[Noteid][concept_cluster_2] 877 | for test in tests: 878 | answer += [test[0]] 879 | answer_line.append(test[2]) 880 | result_start_line.append(int(test[3])) 881 | result_start_token.append(int(test[4])) 882 | elif answertype == "conducted_problem_revealed_problem": 883 | try: 884 | investigated_problems = self.map_problems_to_test_investigated[concept_cluster_1] 885 | for problem in investigated_problems: 886 | answer 
+= [problem[0]] 887 | # answer += ["conducted " + problem[0]] 888 | answer_line.append(problem[2]) 889 | result_start_line.append(int(problem[3])) 890 | result_start_token.append(int(problem[4])) 891 | except: 892 | pass 893 | try: 894 | revealed_problems = self.map_problems_to_test_revealed[concept_cluster_1] 895 | for problem in revealed_problems: 896 | # answer += ["revealed " + problem[0]] 897 | answer += [problem[0]] 898 | answer_line.append(problem[2]) 899 | result_start_line.append(int(problem[3])) 900 | result_start_token.append(int(problem[4])) 901 | except: 902 | pass 903 | elif answertype == "revealed_problem": 904 | try: 905 | revealed_problems = self.map_problems_to_test_revealed[concept_cluster_1] 906 | for problem in revealed_problems: 907 | answer += [problem[0]] 908 | answer_line.append(problem[2]) 909 | result_start_line.append(int(problem[3])) 910 | result_start_token.append(int(problem[4])) 911 | except: 912 | #answer = ["no"]*len(question_lines) 913 | answer = [""] * len(question_lines) 914 | answer_line.extend(question_lines) 915 | result_start_line.extend(question_start_line) 916 | #result_start_token.extend(question_start_token) 917 | result_start_token = [""] * len(question_lines) 918 | 919 | elif answertype == "problems_investigated": 920 | problems = self.map_problems_to_test_investigated[Noteid][concept_cluster_1] 921 | # print(problems) 922 | for problem in problems: 923 | answer += [problem[0]] 924 | answer_line.append(problem[2]) 925 | result_start_line.append(int(problem[3])) 926 | result_start_token.append(int(problem[4])) 927 | ########################################################################################################################################## 928 | elif answertype == "allergic_treatments": 929 | events = self.allergic_treatments[Noteid] 930 | 931 | for event in events: 932 | answer += [event[0]] 933 | answer_line.append(event[2]) 934 | result_start_line.append(int(event[3])) 935 | result_start_token.append(int(event[4])) 936 | elif answertype == "treatments, status": 937 | events = self.treatments_status_to_problem[Noteid][concept_cluster_2] 938 | 939 | for temp in events: 940 | (event, status) = temp 941 | ''' 942 | stemp = "" 943 | status = status.strip() 944 | if val2[0] in self.problem_status[Noteid]: 945 | out = self.problem_status[Noteid][val2[0]] 946 | if out[1] == question_line and out[2] == line_num: 947 | stemp = out[0] 948 | status += ", "+stemp 949 | ''' 950 | # answer += [event[0] + " (" + status + ")"] 951 | answer += [event[0]] 952 | answer_line.append(event[2]) 953 | result_start_line.append(int(event[3])) 954 | result_start_token.append(int(event[4])) 955 | elif answertype == "problems,status": 956 | try: 957 | events = self.map_problems_to_treatment[Noteid][concept_cluster_1] 958 | # print(events) 959 | if "causes" in zip(*events)[1] and "improves" in zip(*events)[1]: 960 | print(Noteid) 961 | for temp in events: 962 | (event, status) = temp 963 | #answer += [event[0] + " (" + status + ")"] 964 | #answer.append([event[0], status]) 965 | answer.append("") 966 | # answer += [event[0]] 967 | answer_line.append(event[2]) 968 | result_start_line.append(int(event[3])) 969 | result_start_token.append(int(event[4])) 970 | except: 971 | caused_problems = self.problems_to_badtreatment[Noteid][concept_cluster_1] 972 | 973 | for event in caused_problems: 974 | #answer += [event[0] + " (" + "caused" + ")"] 975 | #answer.append([event[0] , "caused"]) 976 | # answer += [event[0]] 977 | answer.append("") 978 | 
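# the answer text is deliberately left empty for this answer type; only the evidence line and its position in the note are kept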
                    answer_line.append(event[2])
                    result_start_line.append(int(event[3]))
                    result_start_token.append(int(event[4]))
        elif answertype == "no":
            # answer = ["no"] * len(question_lines)
            answer = [""] * len(question_lines)
            answer_line.extend(question_lines)
            result_start_line.extend(question_start_line)
            # result_start_token.extend(question_start_token)
            result_start_token = [""] * len(question_lines)
        elif answertype == "problems_check_conducted":
            events = self.map_problems_to_treatment[Noteid][concept_cluster_1]

            for temp in events:
                (event, status) = temp
                # answer += ["treatment:" + event[0]]
                answer += [event[0]]
                answer_line.append(event[2])
                result_start_line.append(int(event[3]))
                result_start_token.append(int(event[4]))
            try:
                treatment_entities_list = self.CoreferenceCluster_to_Entity_map["treatment"][concept_cluster_1]
                tests = self.map_problems_to_test_investigated[Noteid]
                for test in tests:
                    test_entities_list = self.CoreferenceCluster_to_Entity_map["test"][test]
                    new_set = set(test_entities_list).intersection(set(treatment_entities_list))
                    if len(new_set) != 0:
                        # values in map_problems_to_test_investigated are plain
                        # annotation tuples, with no status attached
                        events = self.map_problems_to_test_investigated[Noteid][test]
                        for event in events:
                            # answer += ["tests:" + event[0]]
                            answer += [event[0]]
                            answer_line.append(event[2])
                            result_start_line.append(int(event[3]))
                            result_start_token.append(int(event[4]))
                        break
            except KeyError:
                pass
        elif answertype == "problems":

            if relate == "TrCP":
                pass
                # events = self.problems_to_badtreatment[Noteid][concept_cluster_1]
                # for event in events:
                #     answer += [event[0]]
                #     answer_line.append(event[2])
                #     result_start_line.append(int(event[3]))
                #     result_start_token.append(int(event[4]))
            else:

                events = self.map_problems_to_treatment[Noteid][concept_cluster_1]

                for temp in events:
                    (event, status) = temp
                    answer += [event[0]]
                    answer_line.append(event[2])
                    result_start_line.append(int(event[3]))
                    result_start_token.append(int(event[4]))

        elif answertype == "treatments":
            events = self.treatments_status_to_problem[Noteid][concept_cluster_2]

            for temp in events:
                (event, status) = temp
                answer += [event[0]]
                answer_line.append(event[2])
                result_start_line.append(int(event[3]))
                result_start_token.append(int(event[4]))
        elif answertype == "problem1, treatment":

            try:
                events = self.badtreatments_to_problem[Noteid][concept_cluster_2]

                for event in events:
                    answer += [event[0]]
                    answer_line.append(event[2])
                    result_start_line.append(int(event[3]))
                    result_start_token.append(int(event[4]))
            except KeyError:
                pass
            '''
            try:
                events = self.problems_to_symptom[Noteid][concept_cluster_2]

                for event in events:
                    answer += [event[0]]
                    answer_line.append(event[2])
                    result_start_line.append(int(event[3]))
                    result_start_token.append(int(event[4]))
            except:
                print(relate, answertype)
                pass
            '''
        elif answertype == "problem1":

            events = self.problems_to_symptom[Noteid][concept_cluster_2]

            for event in events:
                answer += [event[0]]
                answer_line.append(event[2])
                result_start_line.append(int(event[3]))
                result_start_token.append(int(event[4]))

        elif answertype == "symptoms":

            events = self.symptoms_to_problem[Noteid][concept_cluster_1]

            for event in events:
                answer += [event[0]]
                answer_line.append(event[2])
                result_start_line.append(int(event[3]))
                result_start_token.append(int(event[4]))
        elif answertype == "none":
            answer = []
        else:
            print(answertype)
            answer = []

        return [answer, answer_line, result_start_line, result_start_token]

    def MakeQuestion_new(self, types_to_replace, annotations, question_list, logical_form_template, Coreferences, Noteid):

        new_question_list = []
        question_start_line = []
        question_start_token = []
        question_line = []

        rwords = list(types_to_replace)
        for idx in range(len(rwords)):
            question_start_line.append(int(annotations[rwords[idx]][2]))
            question_start_token.append(int(annotations[rwords[idx]][3]))
            question_line.append(annotations[rwords[idx]][1])

            (t1, valid_list) = self.CheckIfConceptValid(annotations[rwords[idx]], rwords[idx], Coreferences)
            if t1 is None:
                if valid_list is not None:
                    replace_annotation = random.choice(valid_list)  ### any of the valid mentions can be used for QL forms (more training data)
                    # print(annotations[rwords[idx]])
                    rwords[idx] = replace_annotation
                else:
                    return None
            else:
                rwords[idx] = t1

        for question in question_list:
            done = []
            idx = 0
            for types in list(types_to_replace):
                # temp = qwords
                index = question.find("|" + types + "|")
                if index == -1 and types not in done:
                    print(question, "|" + types + "|", done)
                question = question.replace("|" + types + "|", rwords[idx])
                done.append(types)
                idx += 1

            new_question_list.append(question)

        idx = 0
        done = []
        for types in list(types_to_replace):
            index = logical_form_template.find("|" + types + "|")
            if index == -1 and types not in done:
                print(logical_form_template, "|" + types + "|", done, types)
            done.append(types)
            logical_form_template = logical_form_template.replace("|" + types + "|", rwords[idx])
            idx += 1

        return [new_question_list, logical_form_template, question_line, question_start_line, question_start_token]

    ######################## Supporting Utility Functions ######################################

    # example raw concepts: the tremendous tumor burden, the cord compression,
    # gait weakness, stress incontinence, copd flare, a wide based gait
    # shuffling short steps, head computerized tomography scan
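    # A hypothetical worked example of MakeQuestion_new (the concept text, line
    # numbers, and logical form are illustrative, not drawn from the i2b2 data),
    # assuming the annotation tuples here are laid out as
    # (text, line_text, start_line, start_token). With
    #
    #   types_to_replace = ["problem"]
    #   annotations["problem"] = ("copd flare", "<note line text>", 10, 4)
    #   question_list = ["Does the patient have |problem|",
    #                    "Has the patient ever had |problem|"]
    #   logical_form_template = "ConditionEvent (|problem|)"
    #
    # the method would return roughly
    #
    #   [["Does the patient have copd flare",
    #     "Has the patient ever had copd flare"],
    #    "ConditionEvent (copd flare)",
    #    ["<note line text>"], [10], [4]]
    #
    # provided SimplePreProcess accepts "copd flare" as a valid concept.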
    def SimplePreProcess(self, word):

        if word == "":
            return None

        lemmatizer = WordNetLemmatizer()

        if concept_is_CommonNoun(word) == 1 or concept_is_PastTense(word) == 1:
            return None

        tag = nltk.pos_tag(nltk.word_tokenize(word))
        words = [w for (w, t) in tag]
        tags = [t for (w, t) in tag]

        # drop a leading determiner
        if tags[0] == "DT":
            words[0] = ""

        for idx in range(len(tags)):
            if lemmatizer.lemmatize(words[idx].lower()) in ["patient"]:
                words[idx] = ""
            if tags[idx] in ["PRP", "PRP$"]:
                if idx != 0 or " ".join(words[0:idx]).strip() != "":
                    words[idx] = "the"
                if idx == 0:
                    words[idx] = ""
            if " ".join(words[0:idx]).strip() != "" and tags[idx] in ["IN", "WDT"]:
                words[idx] = ""

        words = [word for word in words if word != "" and lemmatizer.lemmatize(word) not in self.similar]  ## check if it's okay to start with "further"
        if len(words) == 0:
            return None

        filtered = " ".join(words)  ## a syntactic parse could be used here to make sure the result is well formed
        tag = nltk.pos_tag(nltk.word_tokenize(filtered))
        words = [w for (w, t) in tag]
        tags = [t for (w, t) in tag]

        # keep the concept only if at least one content-word tag survives
        if len(set(["NN", "NNS", "JJR", "JJS", "JJ", "NNP", "NNPS", "VB", "VBG", "VBP", "VBZ"]).intersection(set(tags))) == 0:
            return None

        # events = word
        # fevent = []
        # out = events.split(" ")
        # for val in out:
        #     if (val.lower().find("patient") == -1):
        #         fevent.append(val)
        #
        # if len(fevent) == 0:
        #     return None
        #
        # events = " ".join(fevent)  # remove "patient" or any other common words
        #
        # exclude = set(string.punctuation)
        # s = ''.join(ch for ch in filtered if ch not in exclude)
        # print(filtered)
        return filtered

    def CheckForCoreferences(self, concept, concept_type, Coreferences):

        self.count_corefs += 1
        valid_list = []
        if concept_type == "problem1" or concept_type == "problem2":
            concept_type = "problem"
        try:
            coref_lists = Coreferences[concept_type]
        except KeyError:
            # print(concept_type, Coreferences.keys())
            return None

        for coref_list in coref_lists:
            if concept in coref_list:

                # print(concept[0], [mention[0] for mention in coref_list])
                for mention in coref_list:
                    sout = self.SimplePreProcess(mention[0])
                    # out_list = list(mention)
                    # out_list.append(sout)  ############ correct grammar or not #############
                    if sout is not None and sout not in valid_list:
                        valid_list.append(sout)
                # print(concept[0], valid_list)

        if len(valid_list) != 0:
            self.resolved_corefs += 1
            return valid_list
        else:

            return None

    def CheckIfConceptValid(self, val, concept_type, Coreferences):

        t1 = self.SimplePreProcess(val[0])
        valid_list = None

        ## currently we only look for a coreferent mention when the original concept text is
        ## not valid; the same lookup could be used to rewrite original concepts as well ###

        if t1 is None:
            valid_list = self.CheckForCoreferences(val, concept_type, Coreferences)
            # print(val[0], valid_list, Coreferences[concept_type])

        return (t1, valid_list)

    # If at least one of the concepts is a common noun, the relation is ignored.
    ### Common Noun Check End ###
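    # checking_for_errors below guards the template files: every "##"-separated
    # paraphrase of a question must use the same set of |placeholder| types as
    # its logical form. Splitting a template on "|" leaves the placeholder names
    # at the odd indices, e.g. (illustrative template, not from templates-all.csv):
    #
    #   "Has |test| ever revealed |problem|".split("|")
    #       -> ["Has ", "test", " ever revealed ", "problem", ""]
    #   qwords[1::2] -> ["test", "problem"]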
    def checking_for_errors(self, question_list, logical_form_template):

        question_list = question_list.split("##")
        qwords_list = []
        dup_rwords_list = []
        unique_templates = []

        # logical_form_template = logical_form_template.replace("|treatment|", "|medication|").strip()

        for question in question_list:
            if question.strip() == "":
                continue
            # question = question.replace("|medication| or |medication|", "|medication|")
            # question = question.replace("|treatment|", "|medication|").strip()
            if question not in unique_templates:
                unique_templates.append(question)
            else:
                continue

            qtemplate = question
            qwords = question.split("|")
            dup_rwords = qwords[1::2]

            qwords_list.append(qwords)

            if len(dup_rwords_list) == 0:
                dup_rwords_list = [set(dup_rwords)]
            else:
                if set(dup_rwords) not in dup_rwords_list:
                    print("Error, out-of-context question:")
                    print(question, logical_form_template, question_list)
                    return []

        lwords = logical_form_template.split("|")
        dup_lrwords = lwords[1::2]
        if set(dup_lrwords) not in dup_rwords_list:
            print("Error, out-of-context question / logical form pair:")
            print(question_list, logical_form_template)
            return []

        if len(dup_rwords_list) != 1:
            print("Check the question-to-logical-form mapping:")
            print(dup_rwords_list, question_list)
            print(logical_form_template)
            return []

        return dup_rwords_list


if __name__ == "__main__":
    GenerateRelationsQuestions()
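# Running this module directly is assumed to kick off the full question
# generation pass for the relations challenge; main.py presumably drives the
# same entry point after setting up the i2b2 directory paths and the output
# directory, e.g.:
#
#   python relations-answers.py
--------------------------------------------------------------------------------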