├── requirements.txt
├── emrqa_download_image.jpg
├── generation
│   ├── .idea
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── generation.iml
│   ├── i2b2_smoking
│   │   └── smoking-answers.py
│   ├── i2b2_relations
│   │   ├── problem_classfiers.py
│   │   ├── common_names.txt
│   │   ├── matching_notes.csv
│   │   └── relations-answers.py
│   ├── combine_data
│   │   └── combine_answers.py
│   ├── i2b2_obesity
│   │   └── obesity-answers.py
│   └── i2b2_medications
│       └── medication-answers.py
├── .gitignore
├── evaluation
│   ├── paraphrase-analysis.py
│   ├── template-analysis.py
│   └── basic-stats.py
├── main.py
└── README.md
/requirements.txt:
--------------------------------------------------------------------------------
1 | nltk
2 | xmltodict
3 |
--------------------------------------------------------------------------------
/emrqa_download_image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/panushri25/emrQA/HEAD/emrqa_download_image.jpg
--------------------------------------------------------------------------------
/generation/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" />
4 | </project>
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Compiler outputs #
3 | *.a
4 | *.bat
5 | *.exe
6 | *.json
7 | *.pyc
8 | temp_risk.txt
9 | squad_format.py
10 |
11 | # Directories #
12 | output
13 | baselines
14 | i2b2
15 | .idea
16 |
17 | # Packages #
18 | *.gz
19 | *.iso
20 | *.jar
21 |
--------------------------------------------------------------------------------
/generation/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/generation.iml" filepath="$PROJECT_DIR$/.idea/generation.iml" />
6 |     </modules>
7 |   </component>
8 | </project>
--------------------------------------------------------------------------------
/generation/.idea/generation.iml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <module type="PYTHON_MODULE" version="4">
3 |   <component name="NewModuleRootManager">
4 |     <content url="file://$MODULE_DIR$" />
5 |     <orderEntry type="inheritedJdk" />
6 |     <orderEntry type="sourceFolder" forTests="false" />
7 |   </component>
8 |   <component name="TestRunnerService">
9 |     <option name="PROJECT_TEST_RUNNER" value="Unittests" />
10 |   </component>
11 | </module>
12 |
--------------------------------------------------------------------------------
/evaluation/paraphrase-analysis.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | import nltk
4 | from nltk.metrics import *
5 | from nltk.translate.bleu_score import sentence_bleu
6 | import argparse
7 | import itertools
8 | import random
9 | import numpy as np
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--templates_dir', default='/home/anusri/Desktop/emrQA/templates', help='Directory containing template files in the given format')
13 |
14 | args = parser.parse_args()
15 | csv_reader = list(csv.reader(open(os.path.join(args.templates_dir,"templates-all.csv"))))
16 |
17 | def scoring_method(qtuple, method):
18 |     score = None
19 |     if method == "jaccard_score":
20 |         set1 = set(nltk.word_tokenize(qtuple[0]))
21 |         set2 = set(nltk.word_tokenize(qtuple[1]))
22 |         score = jaccard_distance(set1, set2)
23 | 
24 |     if method == "bleu_score":
25 |         (reference, candidate) = qtuple
26 |         ## sentence_bleu expects a list of tokenized references and a tokenized candidate ##
27 |         score = sentence_bleu([nltk.word_tokenize(reference)], nltk.word_tokenize(candidate))
28 |     return score
29 |
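## illustrative call, scoring one paraphrase pair with token-level BLEU:
##   scoring_method(("what is the dosage of |medication|",
##                   "what dose of |medication| is the patient taking"), "bleu_score")
## note: jaccard_distance is a distance (lower = more similar), while
## sentence_bleu is a similarity (higher = more similar), both in [0, 1]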
30 | if __name__=="__main__":
31 |
32 |     method = "bleu_score"
33 | #method = "jaccard_score"
34 | unique_logical_forms = []
35 | total_questions = []
36 | total_scores = []
37 |
38 | for line in csv_reader[1:]:
39 |
40 | question = line[2].strip()
41 | logical_form = line[3].strip()
42 |
43 | question = question.replace("|medication| or |medication|", "|medication|")
44 | question = question.replace("|problem| or |problem|", "|problem|")
45 | question = question.replace("|test| or |test|", "|test|")
46 | question = question.replace("|test| |test| |test|", "|test|")
47 | question = question.replace("\t", "")
48 | logical_form = logical_form.replace("\t", "").replace("|medication|","|treatment|")
49 | if logical_form not in unique_logical_forms:
50 | unique_logical_forms.append(logical_form)
51 |
52 | paraphrase_questions = question.split("##")
53 | random.shuffle(paraphrase_questions)
54 | total_questions.extend(list(set(paraphrase_questions)))
55 |
56 | question_tuples = list(itertools.product([paraphrase_questions[0]], paraphrase_questions[1:]))
57 | scores = []
58 | for qtuple in question_tuples:
59 | if qtuple[0] == qtuple[1]:
60 | continue
61 | scoring_tuple = scoring_method(qtuple, method)
62 | scores.append(scoring_tuple)
63 |
64 | if len(scores) != 0:
65 | min_value = min(scores)
66 | max_value = max(scores)
67 |
68 | total_scores.extend(scores)
69 |
70 | ## total questions by total question types
71 |
72 | print("Average paraphrases per question", len(total_questions)*1.0/len(unique_logical_forms))
73 | print("Average of "+ method+ " of paraphrases", np.mean(np.array(total_scores)))
74 | print("Standard deviation of " + method + " of paraphrases", np.std(np.array(total_scores)))
75 |
76 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from subprocess import check_call
2 | import sys
3 | import os
4 | import csv
5 |
6 | PYTHON = sys.executable
7 |
8 | #################################### set the full file paths ###############################################
9 |
10 | i2b2_relations_challenge_directory = "i2b2/relations/"
11 | i2b2_medications_challenge_directory = "i2b2/medication/"
12 | i2b2_heart_disease_risk_challenge_directory = "i2b2/heart-disease-risk/"
13 | i2b2_obesity_challenge_directory = "i2b2/obesity/"
14 | i2b2_smoking_challenge_directory = "i2b2/smoking/"
16 | i2b2_coreference_challenge_directory = "i2b2/coreference"
16 |
17 | templates_directory = "templates/templates-all.csv"
18 |
19 | #################################### make output directory if it does not already exist #########################
20 |
21 | cwd = os.getcwd()
22 | model_dir = "output/"
23 | if not os.path.exists(os.path.join(cwd,model_dir)):
24 | os.makedirs(model_dir)
25 |
26 | output_directory = os.path.join(cwd,model_dir) ## you can modify this to change the output directory path ##
27 |
28 | ###########################################################################################################
29 |
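## matching_notes.csv pairs each relations-challenge note with its
## coreference-challenge counterpart; on the first run the bare file names are
## prefixed with the challenge directories configured above, and the flag check
## below skips the rewrite once the paths are already prefixed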
30 | matching_notes = os.path.join("generation/i2b2_relations/", "matching_notes.csv")
31 | match_file = open(matching_notes)
32 | csvreader = csv.reader(match_file)
33 | matching_files = list(csvreader) # relation, coreference
34 | new_file = []
35 | new_file.append(matching_files[0])
36 | flag = 0
37 | for file in matching_files[1:]:
38 | if i2b2_relations_challenge_directory in file[0]:
39 | flag = 1
40 | break
41 |     new_file.append([os.path.join(i2b2_relations_challenge_directory,file[0]),os.path.join(i2b2_coreference_challenge_directory,file[1])])
42 |
43 | if flag == 0:
44 | ofile = open(matching_notes, "w")
45 | filewriter = csv.writer(ofile, delimiter="\t")
46 |
47 | for val in new_file:
48 | filewriter.writerow(val)
49 |
50 | ofile.close()
51 |
52 | ################################### run the generation scripts #######################################
53 |
54 |
55 | cmd = "{python} generation/i2b2_medications/medication-answers.py --i2b2_dir={i2b2_dir} --templates_dir={templates_dir} --output_dir={output_dir}".format(python=PYTHON, i2b2_dir=i2b2_medications_challenge_directory, templates_dir=templates_directory, output_dir=output_directory)
56 | print(cmd)
57 | check_call(cmd, shell=True)
58 |
59 |
60 | cmd = "{python} generation/i2b2_relations/relations-answers.py --i2b2_dir={i2b2_dir} --templates_dir={templates_dir} --output_dir={output_dir}".format(python=PYTHON, i2b2_dir=i2b2_relations_challenge_directory, templates_dir=templates_directory, output_dir=output_directory)
61 | print(cmd)
62 | check_call(cmd, shell=True)
63 |
64 |
65 | cmd = "{python} generation/i2b2_heart_disease_risk/risk-answers.py --i2b2_dir={i2b2_dir} --templates_dir={templates_dir} --output_dir={output_dir}".format(python=PYTHON, i2b2_dir=i2b2_heart_disease_risk_challenge_directory, templates_dir=templates_directory, output_dir=output_directory)
66 | print(cmd)
67 | check_call(cmd, shell=True)
68 |
69 |
70 | cmd = "{python} generation/i2b2_smoking/smoking-answers.py --i2b2_dir={i2b2_dir} --templates_dir={templates_dir} --output_dir={output_dir}".format(python=PYTHON, i2b2_dir=i2b2_smoking_challenge_directory, templates_dir=templates_directory, output_dir=output_directory)
71 | print(cmd)
72 | check_call(cmd, shell=True)
73 |
74 |
75 | cmd = "{python} generation/i2b2_obesity/obesity-answers.py --i2b2_dir={i2b2_dir} --templates_dir={templates_dir} --output_dir={output_dir}".format(python=PYTHON, i2b2_dir=i2b2_obesity_challenge_directory, templates_dir=templates_directory, output_dir=output_directory)
76 | print(cmd)
77 | check_call(cmd, shell=True)
78 |
79 | ################## combine all the output files and generate the output in normal format ####################
80 |
81 | cmd = "{python} generation/combine_data/combine_answers.py --output_dir={output_dir}".format(python=PYTHON, output_dir=output_directory)
82 | print(cmd)
83 | check_call(cmd, shell=True)
84 |
85 | ##################### convert normal output to squad format ##################################
86 |
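## squad_format.py is listed in .gitignore, so the conversion step is not wired
## in here; a minimal sketch is kept below as a comment. It assumes the
## data.json layout written by generation/combine_data/combine_answers.py
## (per-note "context" and "qas" with "question"/"answers" fields).
'''
import json

datasets = json.load(open(os.path.join(output_directory, "data.json")))
squad = {"data": []}
for dataset in datasets["data"]:
    article = {"title": dataset["title"], "paragraphs": []}
    for note in dataset["paragraphs"]:
        context = note["context"]
        if isinstance(context, list):  ## some generators store the note as a list of lines
            context = "\n".join(context)
    qas = []
        for qid, qa in enumerate(note["qas"]):
            qas.append({"id": "%s-%d" % (note["note_id"], qid),
                        "question": qa["question"][0],  ## first paraphrase
                        "answers": qa["answers"]})
        article["paragraphs"].append({"context": context, "qas": qas})
    squad["data"].append(article)

with open(os.path.join(output_directory, "data_squad_format.json"), "w") as outfile:
    json.dump(squad, outfile)
'''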
87 |
88 | ######################### basic analysis of the dataset #######################################
89 |
90 | '''
91 | cmd = "{python} evaluation/basic-stats.py --output_dir={output_dir}".format(python=PYTHON, output_dir=output_directory)
92 | print(cmd)
93 | check_call(cmd, shell=True)
94 | '''
--------------------------------------------------------------------------------
/generation/i2b2_smoking/smoking-answers.py:
--------------------------------------------------------------------------------
1 | import xmltodict
2 | import csv
3 | import json
4 | import argparse
5 | import os
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument('--i2b2_dir', default='', help='Directory containing i2b2 smoking challenge files')
8 | parser.add_argument('--templates_dir', default='', help='Directory containing template files in the given format')
9 | parser.add_argument('--output_dir', default='', help='Directory to store the output')
10 | args = parser.parse_args()
11 |
12 |
13 | ###################################################### SET FILE PATHS ##################################################################
14 |
15 | templates_file = args.templates_dir
16 | i2b2_file_paths = args.i2b2_dir
17 |
18 | ql_output = os.path.join(args.output_dir,"smoking-ql.csv")
19 | qa_output = os.path.join(args.output_dir,"smoking-qa.json")
20 | file_names = ["smokers_surrogate_test_all_groundtruth_version2.xml","smokers_surrogate_train_all_version2.xml"]
21 |
22 | ######################################################## CODE #########################################################################
23 |
24 | def ReadFile():
25 | file_path = i2b2_file_paths
26 |
27 | status = []
28 | for file_name in file_names:
29 | file = file_path + file_name
30 | with open(file) as fd:
31 | XML = xmltodict.parse(fd.read())
32 | idx = 0
33 | for key in XML["ROOT"]["RECORD"]:
34 | idx += 1
35 |
36 | patient_id = key["@ID"]
37 | answer_class = key["SMOKING"]["@STATUS"]
38 | patient_note = key["TEXT"]
39 |
40 | status.append([patient_id,answer_class,patient_note])
41 | return status
42 |
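## each status row is [patient_id, smoking_class, note_text]; the 2006 i2b2
## smoking challenge classes are CURRENT SMOKER, PAST SMOKER, SMOKER,
## NON-SMOKER and UNKNOWN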
43 |
44 | def MakeJSONOutput(smoking_data, json_out, status, filewriter_forlform):
45 |
46 | smoking_out = {"paragraphs": [], "title": "smoking"}
47 |
48 | for state in status:
49 | patient_id = state[0]
50 | patient_note = state[2]
51 |
52 | out = {"note_id": patient_id, "context": patient_note, "qas": []}
53 |
54 | for row in smoking_data:
55 | question = row[2].strip()
56 | form = row[3].strip()
57 | answer_type = row[4]
58 |
59 | if question == "":
60 | continue
61 |
62 | question_list = question.split("##")
63 | for q in question_list:
64 | filewriter_forlform.writerow([q, form, q, form])
65 |
66 | if answer_type == "smoke_class":
67 |
68 | out["qas"].append({"answers": [{"answer_start": "", "text": state[1], "evidence": "", "evidence_start": ""}],
69 |                     "id": [list(zip(question_list, question_list)), form], "question": question_list})
70 |
71 |
72 | smoking_out["paragraphs"].append(out)
73 |
74 |
75 | with open(json_out, 'w') as outfile:
76 | json.dump(smoking_out, outfile)
77 |
78 | if __name__=="__main__":
79 |
80 | ### Read i2b2 files, one status per clinical note ###
81 |
82 | status = ReadFile()
83 |
84 | ### File to read templates ###
85 |
86 | filereader = list(csv.reader(open(templates_file)))
87 |
88 | ## read only templates relevant to smoking challenge ##
89 |
90 | smoking_lines = []
91 | for line in filereader[1:]:
92 |         if line[0] != "smoking":
93 | continue
94 | smoking_lines.append(line)
95 |
96 | ofile = open(ql_output, "w")
97 | filewriter_forlform = csv.writer(ofile, delimiter="\t")
98 | filewriter_forlform.writerow(["Question", "Logical Form", "QTemplate", "LTemplate"])
99 |
100 | MakeJSONOutput(smoking_lines, qa_output, status, filewriter_forlform)
101 | #MakeQuestion(smoking_lines,out_file,status)
102 |
103 |
104 |
105 | '''
106 | def MakeQuestion(smoking_data,out_file,status):
107 |
108 | ofile = open(out_file,"w")
109 | ofilewriter = csv.writer(ofile)
110 |
111 | values = ["Question", "Answer" , "Answer line in note", "Note ID", "Difference in QA lines"]
112 | ofilewriter.writerow(values)
113 |
114 | for row in smoking_data:
115 | #print(row)
116 | question = row[1].strip()
117 | #print(row)
118 | answer_type = row[3]
119 |
120 | if answer_type == "smoke_class":
121 | for state in status:
122 | values = [question, state[1],"",state[0],""]
123 | patient_id = status[0]
124 | patient_note = status[2]
125 |
126 | ofilewriter.writerow(values)
127 | elif answer_type == "None":
128 | #return []
129 | pass
130 | else:
131 | print(answer_type)
132 |
133 | '''
--------------------------------------------------------------------------------
/evaluation/template-analysis.py:
--------------------------------------------------------------------------------
1 | import json
2 | import csv
3 | import os
4 | import numpy as np
5 | import collections
6 | import argparse
7 |
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('--templates_dir', default='/home/anusri/Desktop/emrQA/templates', help='Directory containing template files in the given format')
10 | args = parser.parse_args()
11 |
12 | relations = ["reveals", "relates","causes","given","conducted","improves","worsens"]
13 | Functions = ["CheckRange","CheckIfNull","sortBy"]
14 | attributes = ["date","result","onsetdate","startdate","QuitDate","PackPerDay","status","abnormalResultFlag","adherence","enddate","IsTobaccoUser","sig",
15 | "YearsOfUse","diagnosisdate","dosage"]
16 | attribute_values_defined = ["pending","currentDate"]
17 |
18 | csv_reader = list(csv.reader(open(os.path.join(args.templates_dir,"templates-all.csv"))))
19 | answer = "no"
20 |
21 | question_lforms = []
22 | for line in csv_reader[1:]:
23 |
24 | dataset = line[0]
25 | if dataset == "relations":
26 | check = line[5]
27 | else:
28 | check = line[4]
29 |
30 |     ## analyze all logical forms, or only the ones that have answers ##
31 |
32 | if answer == "yes":
33 | if check != "none":
34 | if (line[2],line[3]) not in question_lforms:
35 | question_lforms.append((line[2],line[3]))
36 | else:
37 | if (line[2],line[3]) not in question_lforms:
38 | question_lforms.append((line[2],line[3]))
39 |
40 |
41 | ########################################################################################################
42 | lforms = []
43 | for (question_list,lform) in question_lforms:
44 | #print(lform)
45 | if lform not in lforms:
46 | lforms.append(lform.replace("\t", "").replace("|medication|","|treatment|"))
47 |
48 | ##########################################################################################################
49 | #print(len(lforms))
50 |
51 |
52 | lform_vocab = []
53 | for lform in lforms:
54 | lform = lform.replace("-"," - ").replace("1","").replace("2","").replace("/"," / ").replace("<"," < ").replace(">"," > ").replace("("," ( ").replace(")"," ) ").replace("["," [ ").replace("]"," ] ").replace("{"," { ").replace("}"," } ").replace("="," = ").replace(",", " , ")
55 | if lform.count("(") != lform.count(")"):
56 | print("(")
57 | print(lform)
58 | if lform.count("{") != lform.count("}"):
59 | print("{")
60 | print(lform)
61 | if lform.count("[") != lform.count("]"):
62 | print('[')
63 | print(lform)
64 |
65 |
66 | tokens = [tok for tok in lform.split(" ") if tok != ""]
67 | lform_vocab += tokens
68 |
69 | vocab_counter = collections.Counter(lform_vocab)
70 | ## bins for categorizing the logical-form vocabulary ##
71 | ## (note: Functions is not reset here, since the membership test in the loop ##
72 | ## below needs the template function names defined at the top of the script) ##
73 | Events = []
74 | arguments = []
75 | arithmetic = []
76 | brackets = []
77 | punctuations = []
78 | attribute_values = []
79 | Event_Combination = []
80 | Relations_Combination = []
81 | 
82 | 
83 |
84 | for vocab in vocab_counter:
85 | if "Event" in vocab:
86 | Events.append(vocab)
87 | elif vocab in relations + Functions + attributes + attribute_values_defined:
88 | pass
89 | elif "." in vocab:
90 | attribute_values.append(vocab)
91 | elif vocab in [">","<","=","Y","N","x","-"]:
92 |         arithmetic.append(vocab)
93 | elif vocab in ["OR", "AND"]:
94 | Event_Combination.append(vocab)
95 | elif vocab in ["/"]:
96 |         Relations_Combination.append(vocab)
97 | elif vocab in ["(",")","[","]","{","}"]:
98 | brackets.append(vocab)
99 | elif "|" in vocab:
100 | arguments.append(vocab)
101 | elif "," in vocab:
102 | punctuations.append(vocab)
103 | else:
104 | pass
105 |
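## e.g. the logical form "MedicationEvent (|medication|) [dosage=x]" splits into
## ["MedicationEvent", "(", "|medication|", ")", "[", "dosage", "=", "x", "]"];
## "MedicationEvent" is binned under Events, "|medication|" under arguments,
## "=" and "x" under arithmetic, the parens/brackets under brackets, while
## "dosage" is filtered out by the attributes list (illustrative example)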
106 |
107 |
108 | arithmetic_questions = []
109 | question_with_relation = []
110 | medical_domain_qs = []
111 | date_questions = []
112 | time_questions = []
113 | trend_question = []
114 | events_used = {}
115 | multiple_events = []
116 | Lab_Questions = []
117 | indefinite_evidence = []
118 | event_confirmation = []
119 | current = []
120 | property_questions = 0.0
121 | past = []
122 | more_than_one = 0.0
123 | attribute_questions = 0.0
124 | event_questions = 0.0
125 | medical_question = 0.0
126 | 
127 |
128 | for event in Events:
129 | events_used[event] = 0
130 |
131 | for lform in lforms:
132 | #print(lform)
133 |     ## same token-splitting normalization as applied to the vocabulary above ##
134 |     lform = lform.replace("-", " - ").replace("1", "").replace("2", "").replace("/", " / ")
135 |     lform = lform.replace("<", " < ").replace(">", " > ").replace("(", " ( ").replace(")", " ) ")
136 |     lform = lform.replace("[", " [ ").replace("]", " ] ").replace("{", " { ").replace("}", " } ")
137 |     lform = lform.replace("=", " = ").replace(",", " , ")
138 | 
139 |
140 | if "( x )" in lform:
141 | #print(lform)
142 | event_questions += 1
143 |
144 |     if "= " in lform:
145 | #print(lform)
146 | attribute_questions += 1
147 |
148 | if "." in lform:
149 | #print(lform)
150 |         medical_question += 1
151 |
152 | tokens = [tok for tok in lform.split(" ") if tok != ""]
153 |
154 | rel = set(tokens).intersection(set(relations))
155 |
156 | if len(set(["CheckRange", ">", "<", ]).intersection(tokens)) != 0:
157 | #print(lform)
158 |         arithmetic_questions.append(lform)
159 |
160 | if len(rel) == 0:
161 | if "[" not in tokens:
162 | indefinite_evidence.append(lform)
163 | else:
164 | out = list((set(Events)).intersection(set(tokens))) ## Event Property Questions
165 | for e in out:
166 | events_used[e] += 1
167 |                 property_questions += 1
168 | else:
169 | question_with_relation.append(lform)
170 |
171 | if len(rel) > 0:
172 | more_than_one += 1
173 |
174 | print("Arithmetic questions",len(arithmetic_questions)*100.0/len(lforms))
175 | print("One or more than one relations", 100.0 * more_than_one/len(lforms))
176 | print("Coarse Questions",100.0*event_questions/len(lforms))
177 | print("Fine Questions",100.0*attribute_questions/len(lforms))
178 | print("Medical Questions",100.0*medical_question/len(lforms))
179 |
180 | ## medical
181 |
182 | ## coarse
183 |
184 | ## fine
185 |
--------------------------------------------------------------------------------
/generation/i2b2_relations/problem_classfiers.py:
--------------------------------------------------------------------------------
1 | from nltk.stem import WordNetLemmatizer
2 | import nltk
3 | from nltk.corpus import stopwords
4 |
5 | ## Open common names to use in is_common_noun function ##
6 | file = open("generation/i2b2_relations/common_names.txt") ## any set of common nouns can be used to filter; here the top 500 high-frequency words occurring in our templates serve as the common nouns ##
7 | data = file.readlines()
8 | file.close()
9 | common_nouns = [line.strip() for line in data]
10 |
11 | ## Get Stop words ##
12 |
13 | stopWords = set(stopwords.words('english'))
14 | lemmatizer = WordNetLemmatizer()
15 |
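## NLTK resources required: nltk.download('punkt'), nltk.download('stopwords'),
## nltk.download('wordnet'), nltk.download('averaged_perceptron_tagger')
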
16 | ## Functions For Use ##
17 |
18 | def concept_is_CommonNoun(concept):
19 | '''
20 | Return 1 if the concept is a common noun
21 | :param concept:
22 | :return:
23 | '''
24 |
25 | tags = nltk.pos_tag(nltk.word_tokenize(concept))
26 | [words, tag] = zip(*tags)
27 |
28 | words = list(words)
29 |
30 | nouns = []
31 | if tag[0] in ["DT", "PRP", "PRP$"]:
32 | words[0] = ""
33 | for idx in range(1, len(tag)):
34 | if words[idx] in stopWords:
35 | continue
36 | nouns.append(words[idx])
37 | else:
38 | for idx in range(len(tag)):
39 | if words[idx] in stopWords:
40 | continue
41 | nouns.append(words[idx])
42 |
43 | flag = 0
44 | for noun in nouns:
45 | if (lemmatizer.lemmatize(noun) in common_nouns) or (noun in common_nouns):
46 | flag = 1
47 | else:
48 | flag = 0
49 | break
50 |
51 | '''
52 | if flag == 1:
53 | print(" ".join(words).strip(), tags)
54 | '''
55 | return flag
56 |
57 | def concept_is_PastTense(concept):
58 | '''
59 | Return 1 if the concept ends in past tense
60 | :param concept:
61 | :return:
62 | '''
63 | text = nltk.word_tokenize(concept)
64 | tagged = nltk.pos_tag(text)
65 |
66 | tense = {}
67 | tense["future"] = len([word for word in tagged[-1:] if word[1] == "MD"])
68 | tense["present"] = len([word for word in tagged[-1:] if word[1] in ["VBP", "VBZ", "VBG"]])
69 | tense["past"] = len([word for word in tagged[-1:] if word[1] in ["VBD", "VBN"]])
70 |
71 | if tense["past"] > 0:
72 | flag = 1
73 | else:
74 | flag = 0
75 |
76 | return flag
77 |
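## illustrative behaviour (assuming NLTK's default tagger and lemmatizer):
##   concept_is_CommonNoun("the problem")                 -> 1  ("problem" is in common_names.txt)
##   concept_is_CommonNoun("atrial fibrillation")         -> 0
##   concept_is_PastTense("medication was discontinued")  -> 1  (last token tagged VBD/VBN)
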
78 | '''
79 | import sys
80 | sys.path.insert(0, '/home/anusri/Desktop/IBM/GetUMLS/QuickUMLS')
81 | import quickumls
82 | matcher = quickumls.QuickUMLS("/home/anusri/Desktop/IBM/GetUMLS/installation")
83 |
84 | ## Get UMLS semantic mapping ##
85 | sfile = open("/home/anusri/Desktop/IBM/GetUMLS/QuickUMLS/SemanticTypes_2013AA.txt")
86 | data = sfile.readlines()
87 | sfile.close()
88 | mapping = {}
89 | for line in data:
90 | words = line.split("|")
91 | short_type = words[1]
92 | full_type = words[0]
93 | mapping[short_type] = full_type
94 |
95 | def concept_is_Disease(concept):
96 | #if concept_is_CommonNoun(concept) == 1:
97 | # return 0
98 |
99 | SemanticTypes = CheckSemanticType(concept)
100 |
101 | otype = disease
102 | for (word,wtype) in SemanticTypes:
103 | for type in wtype:
104 | if (type in otype):
105 | return 1
106 |
107 |
108 | return 0
109 | def concept_is_Symptom(concept):
110 | # if concept_is_CommonNoun(concept) == 1:
111 | # return 0
112 |
113 | SemanticTypes = CheckSemanticType(concept)
114 | for (word, wtype) in SemanticTypes:
115 | for type in wtype:
116 | if (type in symptoms):
117 | return 1
118 |
119 | return 0
120 | def concept_is_MentalDisease(concept):
121 | # if concept_is_CommonNoun(concept) == 1:
122 | # return 0
123 |
124 | SemanticTypes = CheckSemanticType(concept)
125 |
126 |
127 | for (word, wtype) in SemanticTypes:
128 | for type in wtype:
129 | if (type in mental_disease):
130 | return 1
131 |
132 | return 0
133 | def concept_is_VirusBacterium(concept):
134 | # if concept_is_CommonNoun(concept) == 1:
135 | # return 0
136 |
137 | SemanticTypes = CheckSemanticType(concept)
138 |
139 | for (word, wtype) in SemanticTypes:
140 | for type in wtype:
141 | if type in bacteria:
142 | return 1
143 |
144 | return 0
145 | def concept_is_Injury(concept):
146 | # if concept_is_CommonNoun(concept) == 1:
147 | # return 0
148 |
149 | SemanticTypes = CheckSemanticType(concept)
150 |
151 |
152 | for (word, wtype) in SemanticTypes:
153 | for type in wtype:
154 | if (type in injury):
155 | return 1
156 |
157 | return 0
158 | def concept_is_Abnormality(concept):
159 | # if concept_is_CommonNoun(concept) == 1:
160 | # return 0
161 |
162 | SemanticTypes = CheckSemanticType(concept)
163 |
164 |
165 | for (word, wtype) in SemanticTypes:
166 | for type in wtype:
167 | if (type in abnormality):
168 | return 1
169 |
170 | return 0
171 | def concept_is_AbnormalTestResult(concept):
172 | # if concept_is_CommonNoun(concept) == 1:
173 | # return 0
174 |
175 | SemanticTypes = CheckSemanticType(concept)
176 |
177 |
178 | for (word, wtype) in SemanticTypes:
179 | for type in wtype:
180 | if (type in lab_result):
181 | return 1
182 |
183 | return 0
184 | def CheckSemanticType(text):
185 | types = []
186 | out = matcher.match(text, best_match=True, ignore_syntax=False)
187 | for words in out:
188 | word = words[0]["ngram"]
189 | temp = []
190 | for type in list(words[0]["semtypes"]):
191 | temp.append(mapping[type])
192 | types.append((word,temp))
193 | return types
194 |
195 | ## Functions for script check ##
196 |
197 | #TenseFilter()
198 |
199 |
200 | def determine_tense_input(sentance):
201 | text = nltk.word_tokenize(sentance)
202 | tagged = nltk.pos_tag(text)
203 |
204 | tense = {}
205 | tense["future"] = len([word for word in tagged[-1:] if word[1] == "MD"])
206 | tense["present"] = len([word for word in tagged[-1:] if word[1] in ["VBP", "VBZ", "VBG"]])
207 | tense["past"] = len([word for word in tagged[-1:] if word[1] in ["VBD", "VBN"]])
208 | return tense
209 |
210 | def TenseFilter():
211 |
212 | file = open("problem-concept.txt")
213 | data = file.readlines()
214 | file.close()
215 |
216 | concepts = [line.strip() for line in data]
217 |
218 | past = []
219 | future = []
220 |
221 | for concept in concepts:
222 | tense = determine_tense_input(concept)
223 | if tense["past"] > 0:
224 | past.append(concept)
225 | if tense["future"] > 0:
226 | future.append(concept)
227 |
228 | #for word in past:
229 | # term = word.strip().split(" ")
230 | # if len(term) > 1:
231 | # term = term[-1]
232 | # else:
233 | # term = term[0]
234 | # print(term)
235 | # print(word,en.verb.present(term))
236 |
237 | print(past)
238 | print(future)
239 |
240 | #FilterCommonNouns()
241 |
242 | '''
--------------------------------------------------------------------------------
/evaluation/basic-stats.py:
--------------------------------------------------------------------------------
1 | import json
2 | #from nltk.tokenize.stanford import StanfordTokenizer  ## only needed for the commented-out Stanford setup below
3 | import os
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | import nltk
7 | #from random import *  ## unused wildcard import; the random module is imported below
8 | from nltk import sent_tokenize
9 | from nltk import word_tokenize
10 | import random
11 | import argparse
12 |
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('--output_dir', default='/home/anusri/Desktop/emrQA/output/', help='Directory to store the output')
15 |
16 | args = parser.parse_args()
17 |
18 | #os.environ['STANFORD_PARSER'] = '/home/anusri/Desktop/codes_submission/packages/stanford-jars/'
19 | #os.environ['STANFORD_MODELS'] = '/home/anusri/Desktop/codes_submission/packages/stanford-jars'
20 | #tokenizer = StanfordTokenizer("/home/anusri/Desktop/codes_submission/packages/stanford-jars/stanford-postagger.jar")
21 | #from matplotlib2tikz import save as tikz_save
22 |
23 | def LengthStatistics(list_values):
24 | 
25 |     ## returns (item count, average number of tokens per item), tokenized
26 |     ## with NLTK's word_tokenize ##
27 | 
28 |     total_values = len(list_values)
29 |     total_tokens = 0.0
30 |     for question in list_values:
31 |         words = word_tokenize(question.strip())
32 |         words = [word for word in words if word != ""]
33 |         total_tokens += len(words)
34 | 
35 |     avg_token_length = total_tokens / total_values
36 | 
37 |     return (total_values, avg_token_length)
38 | 
39 |
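# e.g., with NLTK's default word tokenizer:
#   LengthStatistics(["what is his dosage of insulin ?"]) -> (1, 7.0)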
40 |
41 | problem = []
42 | treatments = []
43 | tests = []
44 |
45 | if __name__ == '__main__':
46 |
47 | data_file = os.path.join(args.output_dir,"data.json")
48 | datasets = json.load(open(data_file), encoding="latin-1")
49 |
50 | all_questions = []
51 | all_clinical_notes = []
52 |
53 | total_clinical_notes = 0
54 | number_of_answers_per_question = {}
55 | num_classes = 0.0
56 | classes = []
57 | total_evidences = []
58 |
59 |
60 | for dataset in datasets["data"]:
61 |
62 |
63 |
64 | print("Processing dataset",dataset["title"])
65 |
66 | for note in dataset["paragraphs"]:
67 | total_clinical_notes += 1
68 |
69 | if " ".join(note["context"]) not in all_clinical_notes:
70 | all_clinical_notes.extend([" ".join(note["context"])])
71 | else:
72 | continue
73 |
74 | for questions in note["qas"]:
75 |
76 | all_answers = []
77 | evidences = []
78 |
79 | all_questions.append(list(set(questions["question"]))) # all questions
80 |
81 | for answer in questions["answers"]:
82 |
83 | if dataset["title"] in ["obesity", "smoking"] :
84 | #print(answer["text"])
85 | classes.append(answer["text"])
86 | continue
87 | #for txt in answer["text"]:
88 | # if txt not in all_answers:
89 | # all_answers.append(txt)
90 | else:
91 | if answer["answer_start"][0] != "":
92 | if answer["answer_start"] not in all_answers:
93 | all_answers.append(answer["answer_start"]) ## all answers
94 | #print(questions["question"][0], answer["answer_start"],answer["evidence"])
95 | evidences.append(answer["evidence"])
96 |
97 | total_evidences.extend(evidences)
98 |
99 | ## distribution of evidences per question type
100 |
101 | ground_truth = all_answers
102 | total_answers = len(ground_truth)
103 | if total_answers not in number_of_answers_per_question:
104 | number_of_answers_per_question[total_answers] = 0
105 | number_of_answers_per_question[total_answers] += 1
106 |
107 |
108 | print("Total Clinical Notes", total_clinical_notes, len(all_clinical_notes))
109 | total_question = len(all_questions)
110 | totals = 0
111 | questions_list = []
112 | for value in all_questions:
113 | totals += len(value)
114 | questions_list.extend(value)
115 |
116 | ## Average Question Length ##
117 |
118 | print("Total Number Of Questions",totals)
119 | print("Total number of question types", total_question)
120 | stats_questions = LengthStatistics(questions_list)
121 | print("Average question length",stats_questions[1])
122 |
123 | ## Average Evidence Length ##
124 |
125 | stats_evidences = LengthStatistics(total_evidences)
126 | print("Average evidence length",stats_evidences[1])
127 |
128 | ## Average Note Length ##
129 |
130 | stats_evidences = LengthStatistics(all_clinical_notes)
131 | print("Average clinical note length", stats_evidences[1])
132 |
133 | ## Average number of questions per note ##
134 |
135 | print("Average Number of questions per note", totals*1.0/total_clinical_notes)
136 | print("Average number of question types per note", total_question*1.0/total_clinical_notes)
137 |
138 | ## Average number of evidences per question ##
139 |
140 | total_num_answers = 0
141 | for value in number_of_answers_per_question:
142 |     if value == 0:
143 |         print(number_of_answers_per_question[value])
144 |     else:
145 |         total_num_answers += value*number_of_answers_per_question[value]
146 | 
147 | num_classes = len(set(classes))
148 | print("Average number of evidences", float(total_num_answers) / total_question)
149 | print("Percentage with one evidence",number_of_answers_per_question[1]*100.0/total_question)
150 | print("range in number of evidences",min(number_of_answers_per_question.keys()),max(number_of_answers_per_question.keys()))
151 | print("total number of classes in obesity and smoking datasets", num_classes)
152 |
153 | ################# more stats ignore for now ######################
154 |
155 | # indefinite_evidence_type = []
156 | # forms_in_data = []
157 |
158 | #print(indefinite_evidence_type)
159 | #print("indefinite",len(num_answers)*100.0/total_question)
160 | #print(min(num_answers),max(num_answers))
161 | #plt.figure(2)
162 | #plt.xlabel("Number of evidences greater than 1")
163 | #plt.ylabel("Frequency")
164 | #plt.title("Formula Size Bins")
165 | #plt.hist(num_answers, bins=3)
166 | #plt.show()
167 | #tikz_save('evidences-hist.tex')
168 |
169 | #print(number_of_answers_per_question)
170 | #stats_clinincal_notes = LengthStatistics(all_clinical_notes)
171 | #print("Total Clinincal Notes",stats_clinincal_notes[0])
172 | #print("Average Clinincal Note length", stats_clinincal_notes[1])
173 |
174 | #print(number_of_answers_per_question[0])
175 | #print(number_of_answers_per_question[1])
176 | #print(number_of_answers_per_question)
177 | ## Plot the distribution of number of answer
178 | #print(number_of_answers_per_question)
179 |
180 | #x = np.arange(len(number_of_answers_per_question)-1)
181 | #plt.bar(x,list(np.array(number_of_answers_per_question.values().remove(number_of_answers_per_question[1]))))
182 | #plt.xticks(x, number_of_answers_per_question.keys().remove(1))
183 | #plt.show()
184 |
185 |
--------------------------------------------------------------------------------
/generation/combine_data/combine_answers.py:
--------------------------------------------------------------------------------
1 | import json
2 | import csv
3 | import random
4 | import argparse
5 | import os
6 |
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('--output_dir', default='/home/anusri/Desktop/emrQA/output/', help='Directory of output files')
9 |
10 | args = parser.parse_args()
11 |
12 | ###################################################### SET FILE PATHS ##################################################################
13 |
14 | medications = json.load(open(os.path.join(args.output_dir,"medication-qa.json")))
15 | relations = json.load(open(os.path.join(args.output_dir,"relations-qa.json")), encoding="latin-1")
16 | risk = json.load(open(os.path.join(args.output_dir,"risk-qa.json")))
17 | smoking = json.load(open(os.path.join(args.output_dir,"smoking-qa.json")))
18 | obesity = json.load(open(os.path.join(args.output_dir,"obesity-qa.json")))
19 |
20 |
21 | ######################################################## CODE #########################################################################
22 |
23 | data = [medications, relations, risk, smoking, obesity]
24 | #data = [relations]
25 | data_out = {"data": data}
26 | json_out = os.path.join(args.output_dir,"data.json")
27 | with open(json_out, 'w') as outfile:
28 | json.dump(data_out, outfile, encoding="latin-1")
29 |
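## data.json layout (illustrative, following the generator scripts):
## {"data": [{"title": ..., "paragraphs": [
##     {"note_id": ..., "context": ..., "qas": [
##         {"question": [<paraphrases>], "id": [...],
##          "answers": [{"answer_start": ..., "text": ...,
##                       "evidence": ..., "evidence_start": ...}]}]}]}]}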
30 | total_clinical_notes = 0
31 | all_questions = []
32 | all_clinical_notes = []
33 | for dataset in data:
34 |
35 | for note in dataset["paragraphs"]:
36 | total_clinical_notes += 1
37 | if " ".join(note["context"]) not in all_clinical_notes:
38 | all_clinical_notes.extend([" ".join(note["context"])])
39 | else:
40 | #print("repeat")
41 | continue
42 |
43 | for questions in note["qas"]:
44 | #print(questions["question"])
45 | all_questions.append(list(set(questions["question"]))) # all questions
46 |
47 | out = []
48 | count = {}
49 | print("Total Clinical Notes", len(all_clinical_notes))
50 | total_question = len(all_questions)
51 | totals = 0
52 | questions_list = []
53 | for value in all_questions:
54 | #print(value)
55 | if type(value) != list:
56 | print("error")
57 | if len(value[0]) == 1:
58 | print(value)
59 | #out.append([len(value[0]),len(value),"\t".join(value)])
60 | #if len(value) not in count:
61 | # count[len(value)] = []
62 | totals += len(value)
63 | questions_list.extend(value)
64 |
65 | '''
66 | print(len(count))
67 | new_list = sorted(out, key=lambda x: x[1], reverse=True)
68 |
69 | ofile = open("testing","w")
70 | for val in new_list:
71 | ofile.write("\t".join(map(str,val)))
72 | ofile.write("\n")
73 |
74 | ofile.close()
75 | '''
76 | ## Average Question Length ##
77 |
78 | print("Total Number Of Questions", totals)
79 | print("Total number of question types", total_question)
80 |
81 | ##################################################################################################################################
82 |
83 | medications = os.path.join(args.output_dir,"medication-ql.csv")
84 | relations = os.path.join(args.output_dir,"relations-ql.csv")
85 | risk = os.path.join(args.output_dir,"risk-ql.csv")
86 | smoking = os.path.join(args.output_dir,"smoking-ql.csv")
87 | obesity = os.path.join(args.output_dir,"obesity-ql.csv")
88 |
89 | data = [medications, relations, risk, smoking, obesity]
90 |
91 | unique = set()
92 |
93 |
94 | for file_path in data:
95 | file = open(file_path)
96 | filereader = list(csv.reader(file))
97 |
98 | for line in filereader[1:]:
99 | unique.add(tuple(line))
100 | #if random.randint(1,100) < 10:
101 | #print(line)
102 |
103 | values = list(unique)
104 |
105 | print("Total number of QL forms", len(values))
106 |
107 | final_out = os.path.join(args.output_dir,"data-ql.csv")
108 | ofile = open(final_out, "w")
109 | writer = csv.writer(ofile, delimiter="\t")
110 | writer.writerow(["Question", "Logical Form", "QTemplate", "LTemplate"])
111 |
112 | for val in values:
113 | writer.writerow(val)
114 |
115 | ofile.close()
116 |
117 |
118 | '''
119 |
120 | datasets = json.load(open("data.json"))
121 | for dataset in datasets:
122 | print(dataset["title"])
123 |
124 | for ClinicalNote in dataset["paragraphs"]:
125 |
126 | NoteText = "\n".join(ClinicalNote["context"])
127 |
128 | for questions in ClinicalNote["qas"]:
129 |
130 | paraphrase_questions = questions["question"]
131 | print(paraphrase_questions)
132 | for answer in questions["answers"]:
133 |
134 | answer_text = answer["text"]
135 | answer_start = answer["answer_start"] ## [start_line,start_token] from NoteText
136 | evidence = answer["evidence"] ## The evidence here is question line + answer line (the evidence we use as ground truth is start_line from answer_start)
137 |
138 | print(answer_text,answer_start,evidence)
139 |
140 | '''
141 | '''
142 | use_evidence_model = "True"
143 |
144 | paras = []
145 | idx = 0
146 | for note in medications["paragraphs"]:
147 |
148 | if medications["title"] == "risk-dataset":
149 |
150 | text = "\n".join(note["context"])
151 | para = {"context": text, "qas": []}
152 |
153 | for questions in note["qas"]:
154 | idx += 1 ## Take care of this
155 | question = {"question": questions["question"], "answers": [], "id": idx}
156 |
157 | if use_evidence_model == "True":
158 | for answer in questions["answers"]:
159 | question["answers"].append({"text": answer["evidence"], "answer_start": answer["answer_start"][0]}) ## the answer line
160 | else:
161 | for answer in questions["answers"]:
162 | question["answers"].append({"text": answer["text"], "answer_start": answer["answer_start"][1]}) ## the answer text
163 | else:
164 |
165 | text = "".join(note["context"])
166 | line_lenth = [len(line) for line in note["context"]]
167 | para = {"context": text, "qas": []}
168 |
169 | for questions in note["qas"]:
170 | idx += 1
171 | print(questions["id"])
172 | question = {"question": questions["question"], "answers": [], "id": idx}
173 | for answer in questions["answers"]:
174 |
175 | if use_evidence_model == "True":
176 | try: ## evidence and evidence start token
177 | question["answers"].append({"text":note["context"][answer["answer_start"][0]-1],"answer_start":sum(line_lenth[answer[:answer["answer_start"][0]-1]])})
178 | except:
179 | unique = []
180 | for num in list(map(lambda x: x - 1, answer["evidence_start"])):
181 | if num not in unique:
182 | unique.append(num)
183 | question["answers"].append({"text":note["context"][num],"answer_start":sum(line_lenth[:num])})
184 | else:
185 | try: ## answer and answer start token
186 | question["answers"].append({"text": answer["text"],
187 | "answer_start": sum(
188 | line_lenth[answer[:answer["answer_start"][0] - 1]])+answer["answer_start"][1]})
189 | except:
190 | unique = []
191 | for num in list(map(lambda x: x - 1, answer["evidence_start"])):
192 | if num not in unique:
193 | unique.append(num)
194 | question["answers"].append(
195 | {"text": note["context"][num], "answer_start": sum(line_lenth[:num])})
196 |
197 |
198 | para["qas"].append(question)
199 |
200 | paras.append(para)
201 |
202 | medications_new = {"paragraphs": paras, "title": "medications"}
203 |
204 | #file = open("file.json", "w")
205 | data = {}
206 | data["data"] = [medications_new]
207 | output = {'qids': [], 'questions': [], 'answers': [],
208 | 'contexts': [], 'qid2cid': []}
209 | for article in data["data"]:
210 | for paragraph in article['paragraphs']:
211 | output['contexts'].append(paragraph['context'])
212 | for qa in paragraph['qas']:
213 | output['qids'].append(qa['id'])
214 | #print(qa["question"])
215 | output['questions'].append(qa['question'])
216 | output['qid2cid'].append(len(output['contexts']) - 1)
217 | if 'answers' in qa:
218 | output['answers'].append(qa['answers'])
219 | #print(qa['answers'])
220 |
221 | json_out = "data_squad_format.json"
222 | with open(json_out, 'w') as outfile:
223 | json.dump(data, outfile, encoding="utf-8")
224 |
225 | '''
--------------------------------------------------------------------------------
/generation/i2b2_relations/common_names.txt:
--------------------------------------------------------------------------------
1 | patient
2 | affected_role
3 | patient_role
4 | trial
5 | trial_run
6 | test
7 | tryout
8 | psychometric_test
9 | examination
10 | exam
11 | hour_angle
12 | ha
13 | problem
14 | job
15 | trouble
16 | stopping_point
17 | finale
18 | finis
19 | finish
20 | conclusion
21 | death
22 | end
23 | final_stage
24 | shoemaker's_last
25 | cobbler's_last
26 | department_of_energy
27 | energy_department
28 | energy
29 | doe
30 | medicine
31 | medication
32 | medicament
33 | medicinal_drug
34 | pickings
35 | history
36 | account
37 | chronicle
38 | story
39 | return
40 | issue
41 | takings
42 | proceeds
43 | yield
44 | payoff
45 | consequence
46 | effect
47 | outcome
48 | result
49 | event
50 | solution
51 | answer
52 | resolution
53 | solvent
54 | resultant
55 | final_result
56 | termination
57 | resultant_role
58 | treatment
59 | intervention
60 | discussion
61 | discourse
62 | wherefore
63 | past
64 | past_times
65 | yesteryear
66 | past_tense
67 | washington
68 | evergreen_state
69 | wa
70 | holocene
71 | holocene_epoch
72 | recent_epoch
73 | electric_current
74 | stream
75 | flow
76 | platinum
77 | pt
78 | atomic_number_78
79 | mention
80 | reference
81 | citation
82 | cite
83 | acknowledgment
84 | credit
85 | quotation
86 | honorable_mention
87 | family
88 | household
89 | house
90 | home
91 | menage
92 | family_unit
93 | class
94 | category
95 | family_line
96 | folk
97 | kinfolk
98 | kinsfolk
99 | sept
100 | phratry
101 | kin
102 | kinsperson
103 | syndicate
104 | crime_syndicate
105 | mob
106 | fellowship
107 | dose
108 | dosage
109 | venereal_disease
110 | vd
111 | venereal_infection
112 | social_disease
113 | cupid's_itch
114 | cupid's_disease
115 | venus's_curse
116 | sexually_transmitted_disease
117 | std
118 | acid
119 | back_breaker
120 | battery-acid
121 | dot
122 | elvis
123 | loony_toons
124 | lucy_in_the_sky_with_diamonds
125 | pane
126 | superman
127 | window_pane
128 | zen
129 | diagnosis
130 | time
131 | clip
132 | clock_time
133 | fourth_dimension
134 | meter
135 | metre
136 | prison_term
137 | sentence
138 | number_one
139 | number_1
140 | commencement
141 | outset
142 | get-go
143 | start
144 | kickoff
145 | starting_time
146 | showtime
147 | offset
148 | first_base
149 | first_gear
150 | low_gear
151 | presumption
152 | precondition
153 | scope
154 | range
155 | reach
156 | orbit
157 | compass
158 | ambit
159 | mountain_range
160 | range_of_mountains
161 | chain
162 | mountain_chain
163 | chain_of_mountains
164 | image
165 | range_of_a_function
166 | grasp
167 | stove
168 | kitchen_range
169 | cooking_stove
170 | value
171 | economic_value
172 | time_value
173 | note_value
174 | chemical_reaction
175 | reaction
176 | response
177 | fountainhead
178 | type
179 | character
180 | case
181 | cause
182 | reason
183 | grounds
184 | campaign
185 | crusade
186 | drive
187 | movement
188 | effort
189 | causal_agent
190 | causal_agency
191 | lawsuit
192 | suit
193 | causa
194 | iodine
195 | iodin
196 | i
197 | atomic_number_53
198 | ace
199 | unity
200 | tendency
201 | trend
202 | course
203 | drift
204 | vogue
205 | style
206 | specialist
207 | specializer
208 | specialiser
209 | medical_specialist
210 | positive_degree
211 | date
212 | day_of_the_month
213 | escort
214 | appointment
215 | engagement
216 | particular_date
217 | veteran
218 | veteran_soldier
219 | vet
220 | ex-serviceman
221 | old-timer
222 | oldtimer
223 | old_hand
224 | warhorse
225 | old_stager
226 | stager
227 | use
228 | usage
229 | utilization
230 | utilisation
231 | employment
232 | exercise
233 | function
234 | purpose
235 | role
236 | consumption
237 | economic_consumption
238 | usance
239 | use_of_goods_and_services
240 | habit
241 | manipulation
242 | enjoyment
243 | checkup
244 | medical_checkup
245 | medical_examination
246 | medical_exam
247 | health_check
248 | show
249 | display
250 | appearance
251 | smoke
252 | smoking
253 | woman
254 | adult_female
255 | charwoman
256 | char
257 | cleaning_woman
258 | cleaning_lady
259 | womanhood
260 | fair_sex
261 | list
262 | listing
263 | tilt
264 | inclination
265 | lean
266 | spring
267 | springiness
268 | lab
269 | laboratory
270 | research_lab
271 | research_laboratory
272 | science_lab
273 | science_laboratory
274 | symptom
275 | side
276 | face
277 | side_of_meat
278 | position
279 | slope
280 | incline
281 | ground
282 | intellect
283 | rationality
284 | reasonableness
285 | impact
286 | wallop
287 | impingement
288 | encroachment
289 | evaluation
290 | rating
291 | valuation
292 | department
293 | section
294 | indication
295 | indicant
296 | denotation
297 | reading
298 | record
299 | phonograph_record
300 | disk
301 | disc
302 | platter
303 | track_record
304 | record_book
305 | book
306 | criminal_record
307 | measurement
308 | measure
309 | mensuration
310 | evidence
311 | graph
312 | graphical_record
313 | light-emitting_diode
314 | report
315 | study
316 | written_report
317 | news_report
318 | write_up
319 | report_card
320 | composition
321 | paper
322 | theme
323 | reputation
324 | etiology
325 | aetiology
326 | unit_of_measurement
327 | unit
328 | social_unit
329 | building_block
330 | work
331 | piece_of_work
332 | workplace
333 | oeuvre
334 | body_of_work
335 | heights
336 | senior_high_school
337 | senior_high
338 | highschool
339 | high_school
340 | high_gear
341 | hush
342 | stillness
343 | distillery
344 | manner
345 | mode
346 | way
347 | fashion
348 | modality
349 | mood
350 | musical_mode
351 | modal_value
352 | year
353 | twelvemonth
354 | yr
355 | times
356 | multiplication
357 | cover
358 | set
359 | exercise_set
360 | stage_set
361 | circle
362 | band
363 | lot
364 | bent
365 | solidification
366 | seth
367 | readiness
368 | exterior
369 | drug
370 | prescription
371 | prescription_drug
372 | prescription_medicine
373 | ethical_drug
374 | baseline
375 | service_line
376 | status
377 | condition
378 | tin
379 | canful
380 | can_buoy
381 | buttocks
382 | nates
383 | arse
384 | butt
385 | backside
386 | bum
387 | buns
388 | fundament
389 | hindquarters
390 | hind_end
391 | keister
392 | posterior
393 | prat
394 | rear
395 | rear_end
396 | rump
397 | stern
398 | seat
399 | tail
400 | tail_end
401 | tooshie
402 | tush
403 | bottom
404 | fanny
405 | ass
406 | toilet
407 | commode
408 | crapper
409 | pot
410 | potty
411 | stool
412 | throne
413 | lavatory
414 | lav
415 | john
416 | privy
417 | bathroom
418 | exploitation
419 | victimization
420 | victimisation
421 | the_like
422 | the_likes_of
423 | ilk
424 | ar
425 | master_of_education
426 | startle
427 | jump
428 | starting_line
429 | scratch
430 | scratch_line
431 | head_start
432 | david_low
433 | sir_david_low
434 | sir_david_alexander_cecil_low
435 | complication
436 | ramification
437 | complicatedness
438 | knottiness
439 | tortuousness
440 | visit
441 | sojourn
442 | remote_control
443 | remote
444 | child
445 | kid
446 | youngster
447 | minor
448 | shaver
449 | nipper
450 | small_fry
451 | tiddler
452 | tike
453 | tyke
454 | fry
455 | baby
456 | veterinary_surgeon
457 | hazard
458 | jeopardy
459 | peril
460 | risk
461 | endangerment
462 | danger
463 | risk_of_infection
464 | risk_of_exposure
465 | stallion
466 | fleshiness
467 | obesity
468 | corpulency
469 | findings
470 | determination
471 | change
472 | alteration
473 | modification
474 | variety
475 | search
476 | hunt
477 | lookup
478 | mixer
479 | hospital
480 | standard
481 | criterion
482 | touchstone
483 | summary
484 | sum-up
485 | detail
486 | item
487 | particular_proposition
488 | room
489 | elbow_room
490 | values
491 | meet
492 | control
493 | control_condition
494 | dominance
495 | ascendance
496 | ascendence
497 | ascendancy
498 | ascendency
499 | command
500 | mastery
501 | controller
502 | need
503 | demand
504 | want
505 | motivation
506 | motive
507 | indigence
508 | penury
509 | pauperism
510 | pauperization
511 | world_health_organization
512 | travel
513 | change_of_location
514 | locomotion
515 | numbers
516 | book_of_numbers
517 | numbers_pool
518 | numbers_game
519 | numbers_racket
520 | number
521 | figure
522 | act
523 | routine
524 | turn
525 | bit
526 | phone_number
527 | telephone_number
528 | identification_number
529 | practice
530 | pattern
531 | drill
532 | practice_session
533 | recitation
534 | praxis
535 | effects
536 | personal_effects
537 | impression
538 | essence
539 | burden
540 | core
541 | gist
542 | force
543 | fill
544 | woof
545 | weft
546 | pick
547 | infusion
548 | extract
549 | excerpt
550 | excerption
551 | selection
552 | management
553 | direction
554 | ii
555 | deuce
556 | transcript
557 | copy
558 | written_matter
559 | kind
560 | sort
561 | form
562 | multiple
563 | convention
564 | rule
565 | formula
566 | incidence
567 | relative_incidence
568 | plan
569 | program
570 | programme
571 | design
572 | architectural_plan
573 | nosology
574 | diagnostics
575 | discovery
576 | breakthrough
577 | iii
578 | trio
579 | threesome
580 | tierce
581 | leash
582 | troika
583 | triad
584 | trine
585 | trinity
586 | ternion
587 | triplet
588 | tercet
589 | terzetto
590 | trey
591 | deuce-ace
592 | stop
593 | halt
594 | stoppage
595 | stopover
596 | layover
597 | arrest
598 | check
599 | hitch
600 | stay
601 | stop_consonant
602 | plosive_consonant
603 | plosive_speech_sound
604 | plosive
605 | period
606 | point
607 | full_stop
608 | full_point
609 | diaphragm
610 | catch
611 | blockage
612 | block
613 | closure
614 | occlusion
615 | procedure
616 | process
617 | operation
618 | subroutine
619 | subprogram
620 | exhibit
621 | presentation
622 | video_display
623 | conditions
624 | weather
625 | weather_condition
626 | atmospheric_condition
627 | stipulation
628 | circumstance
629 | consideration
630 | shape
631 | term
632 | experimental_condition
633 | medical_specialty
634 | practice_of_medicine
635 | music
636 | recommendation
637 | testimonial
638 | good_word
639 | passport
640 | semen
641 | seed
642 | seminal_fluid
643 | ejaculate
644 | cum
645 | footing
646 | basis
647 | base
648 | foundation
649 | groundwork
650 | cornerstone
651 | name
652 | gens
653 | public_figure
654 | epithet
655 | fillet
656 | chart
657 | elevated_railroad
658 | el
659 | contraindication
660 | virginia
661 | old_dominion
662 | old_dominion_state
663 | va
664 | department_of_veterans_affairs
665 | degree
666 | grade
667 | level
668 | tier
669 | stage
670 | spirit_level
671 | horizontal_surface
672 | layer
673 | stratum
674 | floor
675 | storey
676 | person
677 | someone
678 | somebody
679 | mortal
680 | soul
681 | rich_person
682 | wealthy_person
683 | sum
684 | totality
685 | aggregate
686 | amount
687 | prevention
688 | bar
689 | campaigner
690 | candidate
691 | nominee
692 | prospect
693 | admission
694 | admittance
695 | entrance_fee
696 | admission_charge
697 | admission_fee
698 | admission_price
699 | price_of_admission
700 | entrance_money
701 | entree
702 | access
703 | accession
704 | night
705 | nighttime
706 | dark
707 | nox
708 | duration
709 | continuance
710 | length
711 | diet
712 | vital_organ
713 | vitals
714 | facility
715 | installation
716 | adeptness
717 | adroitness
718 | deftness
719 | quickness
720 | method
721 | appraisal
722 | assessment
723 | judgment
724 | judgement
725 | maximum
726 | upper_limit
727 | utmost
728 | uttermost
729 | intercession
730 | interference
731 | interposition
732 | follow-up
733 | followup
734 | reexamination
735 | review
736 | whitethorn
737 | english_hawthorn
738 | crataegus_laevigata
739 | crataegus_oxycantha
740 | details
741 | inside_information
742 | contingent
743 |
--------------------------------------------------------------------------------
/generation/i2b2_obesity/obesity-answers.py:
--------------------------------------------------------------------------------
1 | import xmltodict
2 | import csv
3 | import json
4 | import argparse
5 | import os
6 |
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument('--i2b2_dir', default='', help='Directory containing i2b2 obesity challenge files')
9 | parser.add_argument('--templates_dir', default='', help='Directory containing template files in the given format')
10 | parser.add_argument('--output_dir', default='', help='Directory to store the output')
11 | args = parser.parse_args()
12 |
13 | ###################################################### SET FILE PATHS ##################################################################
14 |
15 | templates_file = args.templates_dir
16 | obesity_file_path = args.i2b2_dir
17 |
18 | file_names = ["obesity_standoff_annotations_test.xml","obesity_standoff_annotations_training.xml"]
19 | note_names = ["obesity_patient_records_test.xml", "obesity_patient_records_training.xml"]
20 |
21 | ql_output = os.path.join(args.output_dir,"obesity-ql.csv")
22 | #print(ql_output)
23 | qa_json_out = os.path.join(args.output_dir,"obesity-qa.json")
24 |
25 | ######################################################## CODE #########################################################################
26 |
27 | def ReadFile():
28 |
29 | file_path = obesity_file_path
30 |
31 | Patient = {} #note_id is the key with a dictionary as value
32 |
33 | for note_name in note_names:
34 | file = file_path + note_name
35 | with open(file) as fd:
36 | XML = xmltodict.parse(fd.read())
37 |
38 | for doc in XML["root"]["docs"]["doc"]:
39 | doc_id = doc["@id"]
40 | note_text = doc["text"]
41 |
42 |
43 | if doc_id not in Patient:
44 | Patient[doc_id] = {}
45 | Patient[doc_id]["text"] = note_text
46 |
47 | for file_name in file_names:
48 | file = file_path + file_name
49 | with open(file) as fd:
50 | XML = xmltodict.parse(fd.read())
51 |
52 | intuitive = XML["diseaseset"]["diseases"][0]["disease"]
53 | textual = XML["diseaseset"]["diseases"][1]["disease"]
54 |
55 | #print(intuitive)
56 | for idx in range(len(intuitive)):
57 |
58 | disease_name = intuitive[idx]["@name"]
59 | intuitive_docs_list = intuitive[idx]["doc"]
60 |
61 | for pidx in range(len(intuitive_docs_list)):
62 |
63 | idoc_id = intuitive_docs_list[pidx]["@id"]
64 | ijudgment = intuitive_docs_list[pidx]["@judgment"]
65 |
66 | if idoc_id not in Patient:
67 | Patient[idoc_id] = {}
68 | if disease_name not in Patient[idoc_id]:
69 | Patient[idoc_id][disease_name] = ijudgment
70 |
71 | for idx in range(len(textual)):
72 |
73 | disease_name = textual[idx]["@name"]
74 | textual_docs_list = textual[idx]["doc"]
75 |
76 | for pidx in range(len(textual_docs_list)):
77 |
78 | tdoc_id = textual_docs_list[pidx]["@id"]
79 | tjudgment = textual_docs_list[pidx]["@judgment"]
80 |
81 | try:
82 | ijudgment = Patient[tdoc_id][disease_name]
83 | if ijudgment != tjudgment and tjudgment != "U" and tjudgment != "Q":
84 | print(ijudgment, tjudgment, disease_name, tdoc_id)
85 | except:
86 | try:
87 | Patient[tdoc_id][disease_name] = tjudgment
88 | except:
89 | Patient[tdoc_id] = {disease_name:tjudgment}
90 | continue
91 |
92 |
93 | return Patient
94 |
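## Patient maps note_id -> {"text": <note text>, <disease name>: <Y/N/Q/U>, ...};
## the judgment labels follow the i2b2 obesity challenge (present, absent,
## questionable, unmentioned), with intuitive and textual annotations merged above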
95 | def MakeJSONOut(obesity_data,json_out,Patient):
96 |
97 |
98 | obesity_out = {"paragraphs": [], "title": "obesity"}
99 |
100 | for note_id in Patient:
101 | Y_class = []
102 | U_class = []
103 | Q_class = []
104 | N_class = []
105 | patient_note = Patient[note_id]["text"]
106 | out = {"note_id": note_id, "context": patient_note, "qas": []}
107 | unique_questions = []
108 |
109 | for problem in Patient[note_id]:
110 | if problem == "text":
111 | continue
112 | if Patient[note_id][problem] == "Y":
113 | Y_class.append(problem)
114 | elif Patient[note_id][problem] == "N":
115 | N_class.append(problem)
116 | elif Patient[note_id][problem] == "U":
117 | U_class.append(problem)
118 | elif Patient[note_id][problem] == "Q":
119 | Q_class.append(problem)
120 | else:
121 | print(Patient[note_id][problem])
122 |
123 | ###### only "problems" and "yes/no" answer types are generated below #####
124 |
125 | for row in obesity_data:
126 | question = row[2].strip()
127 |
128 | if question == "":
129 | continue
130 | lform = row[3]
131 | answer_type = row[4]
132 | question = question.replace("\t", "")
133 | lform = lform.replace("\t", "")
134 | original = question
135 |
136 | if answer_type == "problems":
137 | for idx in range(len(Y_class)):
138 | problem = Y_class[idx]
139 | question = original
140 |
141 | if problem == "Obesity":
142 | qwords = question.split("|")
143 | qwords[1] = problem
144 | lform_new = lform.replace("|problem|",problem)
145 | qwords = [word.strip() for word in qwords]
146 | final_question = " ".join(qwords)
147 | Answer = Y_class[0:idx] + Y_class[idx + 1:]
148 | else:
149 | question = original.replace("|problem|", problem)
150 | lform_new = lform.replace("|problem|", problem)
151 | filewriter_forlform.writerow([question] + [lform_new] + [question] + [lform])
152 | continue
153 |
154 | ans_list = []
155 | for ans in Answer:
156 | ans_list.append({"answer_start": "", "text": ans, "evidence": "", "evidence_start": ""})
157 | #print(final_question)
158 | answer = {"answers": ans_list, "id": [[final_question,final_question],lform], "question": [final_question]}
159 | out["qas"].append(answer)
160 |
161 | filewriter_forlform.writerow([question] + [lform_new] + [question] + [lform])
162 |
163 | elif answer_type == "yes/no" and "|problem|" in question:
164 | answers = ["yes", "no", "UNK"]
165 | jdx = -1
166 | question_template = question.split("##")
167 | #print(question)
168 | for temp in [Y_class, N_class, U_class]:
169 | jdx += 1
170 | for problem in temp:
171 |
172 | #if problem.lower() != "obesity":
173 | # continue
174 |
175 | original_lform = lform
176 | question_list = question.replace("|problem|", problem).split("##")
177 | lform_new = lform.replace("|problem|", problem)
178 | #print(question_list)
179 | idx = 0
180 | if question_list not in unique_questions:
181 | unique_questions.append(question_list)
182 |
183 | for q in question_list:
184 | filewriter_forlform.writerow([q] + [lform_new] + [question_template[idx]] + [original_lform])
185 | idx += 1
186 |
187 | Answer = [answers[jdx]]
188 | ans_list = []
189 | for ans in Answer:
190 | ans_list.append({"answer_start": "", "text": ans, "evidence": "", "evidence_start": ""})
191 |
192 | answer = {"answers": ans_list, "id": [zip(question_list, question_template), original_lform], "question": question_list}
193 |
194 | out["qas"].append(answer)
195 | else:
196 | print(answer_type)
197 |
198 | obesity_out["paragraphs"].append(out)
199 |
200 | with open(json_out, 'w') as outfile:
201 | json.dump(obesity_out, outfile)
202 |
203 |
204 | if __name__=="__main__":
205 |
206 | ofile = open(ql_output, "w")
207 | filewriter_forlform = csv.writer(ofile, delimiter="\t")
208 | filewriter_forlform.writerow(["Question", "Logical Form"])
209 |
210 | ### Read i2b2 files ###
211 |
212 | Patient = ReadFile()
213 |
214 | ### File to read templates ###
215 |
216 | qfile = open(templates_file)
217 | read_data = list(csv.reader(qfile))
218 |
219 | ## read only templates relevant to obesity challenge ##
220 |
221 | obesity_data = []
222 | for line in read_data[1:]:
223 | if line[0] != "obesity":
224 | continue
225 | obesity_data.append(line)
226 |
227 |
228 | MakeJSONOut(obesity_data,qa_json_out,Patient)
229 | #MakeQuestion(questions_file,out_file,Patient)
230 |
231 |
232 | '''
233 | def MakeQuestion(questions_file,out_file,Patient):
234 |
235 | qfile = open(questions_file)
236 | read_data = list(csv.reader(qfile, delimiter="\t"))
237 |
238 | ofile = open(out_file, "w")
239 | ofilewriter = csv.writer(ofile)
240 |
241 | values = ["Question", "Answer", "Answer line in note", "Note ID", "Difference in QA lines"]
242 | ofilewriter.writerow(values)
243 |
244 |
245 | for note_id in Patient:
246 | Y_class = []
247 | U_class = []
248 | Q_class = []
249 | N_class = []
250 | for problem in Patient[note_id]:
251 | if Patient[note_id][problem] == "Y":
252 | Y_class.append(problem)
253 | elif Patient[note_id][problem] == "N":
254 | N_class.append(problem)
255 | elif Patient[note_id][problem] == "U":
256 | U_class.append(problem)
257 | elif Patient[note_id][problem] == "Q":
258 | Q_class.append(problem)
259 | else:
260 | print(Patient[note_id][problem])
261 |
262 |
263 | for row in read_data[1:4]:
264 | question = row[1].strip()
265 | if question == "":
266 | continue
267 | #print(row)
268 | answer_type = row[3]
269 | question_in = row[0] #question_concept_type
270 |
271 | if answer_type == "problems":
272 | for idx in range(len(Y_class)):
273 | problem = Y_class[idx]
274 | qwords = question.split("|")
275 | qwords[1] = problem
276 | qwords = [word.strip() for word in qwords]
277 | final_question = " ".join(qwords)
278 | Answer = Y_class[0:idx]+Y_class[idx+1:]
279 | ofilewriter.writerow([final_question," ".join(Answer), "", note_id, ""])
280 | elif answer_type == "yes/no" and question_in == "problem":
281 | answers = ["yes","no",""]
282 | jdx = -1
283 | for temp in [Y_class,N_class,U_class]:
284 | jdx += 1
285 | for idx in range(len(temp)):
286 | problem = temp[idx]
287 | qwords = question.split("|")
288 | qwords[1] = problem
289 | qwords = [word.strip() for word in qwords]
290 | final_question = " ".join(qwords)
291 | Answer = answers[jdx]
292 | ofilewriter.writerow([final_question,Answer, "", note_id, ""])
293 | elif answer_type == "yes/no" and question_in == "None":
294 | try:
295 | if Patient[note_id]["Obesity"] == "Y":
296 | ofilewriter.writerow([question, "yes", "", note_id, ""])
297 | if Patient[note_id]["Obesity"] == "N":
298 | ofilewriter.writerow([question, "no", "", note_id, ""])
299 | if Patient[note_id]["Obesity"] == "U":
300 | ofilewriter.writerow([question, "", "", note_id, ""])
301 | except:
302 | print(Patient[note_id].keys())
303 | else:
304 | print(answer_type,question_in)
305 | '''
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # emrQA: A Large Corpus for Question Answering on Electronic Medical Records
2 | The page and code are ready for use. We are excited to announce that this data
3 | is now hosted directly under the i2b2 license! You can directly
4 | download the dataset from the i2b2 website instead of generating it from the scripts;
5 | for download instructions, refer to the [Download Dataset](#download-dataset) section below.
6 | For later versions of emrQA and recent updates contact Preethi Raghavan (praghav@us.ibm.com).
7 |
8 | - This repo contains code for the paper
9 | Anusri Pampari, Preethi Raghavan, Jennifer Liang and Jian Peng,
10 | [emrQA: A Large Corpus for Question Answering on Electronic Medical Records][paper-link],
11 | In Conference on Empirical Methods in Natural Language Processing (EMNLP) 2018, Brussels, Belgium.
12 | - General queries/thoughts have been addressed in the discussion section below.
13 | - To automatically download emrQA from i2b2 instead of using the scripts to generate it, follow the instructions listed below.
15 | - Please contact [Anusri Pampari][anusri-home] (\@stanford.edu) for suggestions and comments. More instructions on reporting bugs are detailed below.
16 |
17 | ## Quick Links
18 |
19 | - [About](#question-answering-on-electronic-medical-records)
20 | - [Download Dataset](#download-dataset)
21 | - [Requirements](#requirements)
22 | - [Data Generation](#emrqa-generation)
23 | - [Data Analysis](#emrqa-analysis)
24 | - [Baselines](#baselines)
25 | - [Discussion](#discussion)
26 | - [Report a bug](#dataset-bugs)
27 | - [Current works using emrQA](#current-works-using-emrqa)
28 |
29 | ## Question Answering on Electronic Medical Records (EMR)
30 |
31 | In this work, we address the lack of any publicly available EMR Question Answering (QA) corpus by creating a large-scale dataset, emrQA, using a novel semi-automated generation framework that allows for minimal expert involvement and re-purposes existing annotations available for other clinical NLP tasks. To briefly summarize the generation process: (1) we collect questions from experts, (2) convert them to templates by replacing entities with placeholders, (3) have experts annotate the templates with logical form templates, and then (4) use annotations from existing NLP tasks (based on information in the logical forms) to populate the placeholders in the templates and generate answers. For our purpose, we use existing NLP task annotations from the [i2b2 Challenge datasets][i2b2-datasets]. We refer the reader to the paper for a more detailed overview of the generation framework.
32 |
33 | This repository includes the question and logical form templates provided by our experts and the code for generating the emrQA dataset from these templates and the i2b2 challenge datasets. Note that this work is a refactored and extended version of the original dataset described in the paper.
34 |
35 | Some statistics of the current version of the generated data:
36 |
37 | | Datasets | QA pairs | QL pairs | #Clinical Notes |
38 | | :------: | :------: | :------: | :----: |
39 | | i2b2 relations (concepts, relations, assertions)| 1,322,789 | 1,008,205 | 425 |
40 | | i2b2 medications | 226,128 | 190,169 | 261 |
41 | | i2b2 heart disease risk | 49,897 | 35,777 | 119 |
42 | | i2b2 smoking | 4,518 | 14 | 502 |
43 | | i2b2 obesity | 354,503 | 336 | 1,118 |
44 | | **emrQA (total)** | **1,957,835** | **1,225,369** | **2,425** |
45 |
46 | **UPDATES:**
47 | ```
48 | 29th November 2018: We are excited to announce that this data will now be hosted directly under the i2b2 license! So you can directly download the dataset from the i2b2 website instead of generating it from the scripts.
49 | 27th August 2018: Extended the i2b2 obesity question-answer pairs to obesity comorbidities.
50 | 20th August 2018: Added QA pairs generated from i2b2 relations (assertions).
51 | 27th Jun 2018: Dataset as described in the paper.
52 | ```
53 |
54 | ## Download Dataset
55 |
56 | emrQA is available for download here: https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/ (you'll need to sign the agreement and request the data; access typically comes through the same day). You'll find it listed under Community Annotations Downloads as follows:
57 |
58 | 
59 |
60 | Please note that the download link title in the image has a typo. The link says that the question answers were generated from the 2014 Heart Disease risk factors data, but they were generated from all the datasets listed in the table above (medications, smoking, obesity, heart disease and relations). So ignore the title and go ahead and download the entire dataset from the link. i2b2/n2c2 will soon fix this typo.
61 |
62 | ## Requirements
63 |
64 | To generate emrQA, first download the NLP datasets from the [i2b2 Challenges][i2b2-datasets], which are accessible to everyone subject to a license agreement. You will need to download and extract all the datasets corresponding to a given challenge (e.g., the 2009 Medications Challenge) into a directory named `i2b2` in the main folder (the contents of this folder are elaborated below in the discussion section for your reference). Once completed, check the path location in `main.py`. In our work, we have currently made use of all the challenge datasets except the 2012 Temporal Relations Challenge; future extensions of the dataset that include this challenge will be made available.
65 |
66 | The generation scripts in the repo require Python 2.7. Run the following commands to clone the repository and install the requirements for emrQA:
67 |
68 | ```bash
69 | git clone https://github.com/emrqa/emrQA.git
70 | cd emrQA; pip install -r requirements.txt
71 | ```
72 |
73 |
74 | ## emrQA Generation
75 |
76 | Run `python main.py` to generate the question-answer pairs in a JSON format and the question-logical form pairs in a CSV format. The input to these scripts is a csv file (`templates-all.csv`) located in the `templates/` directory. By default the script creates an `output/` directory to store all the generated files. You can access the combined question-answer dataset as `data.json` and the question-logical form data as `data-ql.csv`. You can also access the intermediate datasets generated for every i2b2 challenge (e.g. `medications-qa.json` and `medication-ql.csv` generated from the 2009 medications challenge annotations).
77 |
78 |
79 | A thorough discussion of the output format of these files is presented below.
80 |
81 | #### Input: Templates (CSV) Format
82 |
83 | Each row in the csv file has the following format:
84 |
85 | ```
86 | "dataset" \t "question templates" \t "logical form templates" \t "answer type" \t "sub-answer-type"
87 | ```
88 |
89 | A brief explanation of how the following fields are used in `main.py`:
90 |
91 | ```
92 | dataset: The i2b2 challenge dataset annotations to be used for the templates in that row. This field should be one of the following values: medications, relations, risk, smoking or obesity.
93 |
94 | question templates: All the question paraphrase templates, provided as a single string separated by ##.
95 |
96 | logical form templates: The logical form template annotated by experts for the question templates.
97 |
98 | answer type: The type of the expected answer (e.g., problems or yes/no); this determines how the answers are populated.
99 |
100 | sub-answer-type:
101 | ```
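For reference, a minimal sketch of how the generation scripts select the template rows for a single challenge (the column indices mirror the scripts under `generation/`):

```python
import csv

# Sketch: keep only the template rows whose dataset column matches one
# challenge, then unpack the fields the way the generation scripts do.
with open("templates/templates-all.csv") as f:
    rows = list(csv.reader(f))

obesity_rows = [row for row in rows[1:] if row[0] == "obesity"]
for row in obesity_rows:
    question_templates = row[2].strip().split("##")  # paraphrases separated by ##
    logical_form_template = row[3].strip()
    answer_type = row[4].strip()
```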
102 | #### Output: Question-Answer (JSON) Format
103 |
104 | The JSON files in the `output/` directory have the following format:
105 |
106 | ```
107 | data.json
108 | ├── "data"
109 | └── [i]
110 | ├── "paragraphs"
111 | │ └── [j]
112 | │ ├── "note_id": "clinical note id"
113 | │ ├── "context": "clinical note text"
114 | │ └── "qas"
115 | │ └── [k]
116 | │ ├── "answers"
117 | │ │ └── [l]
118 | │ │ ├── "answer_start"
119 | │ │ │ └── [m]
120 | │ │ │ ├── integer (line number in clinical note to find the answer entity)
121 | │ │ │ └── integer (token position in line to find the answer entity)
122 | │ │ │
123 | │ │ ├──"text": "answer entity"
124 | │ │ │
125 | │ │ ├──"evidence": "evidence line to support the answer entity "
126 | │ │ │
127 | │ │ ├──"answer_entity_type": takes the value "single" or "empty" or "complex" (refer to discussion for more details)
128 | │ │ │
129 | │ │ └── "evidence_start": integer (line number in clinical note to find the evidence line)
130 | │ │
131 | │ ├── "id"
132 | │ │ └─ [n]
133 | │ │ ├──[o]
134 | │ │ │ ├── "paraphrase question"
135 | │ │ │ └── "paraphrase question-template"
136 | │ │ │
137 | │ │ └── "logical-form-template"
138 | │ │
139 | │ └── "question"
140 | │ └──[p]
141 | │ └──"paraphrase question"
142 | │
143 | └── "title": "i2b2 challenge name"
144 |
145 | ```
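A minimal sketch of walking this structure, assuming the combined `output/data.json` produced by `main.py`:

```python
import json

with open("output/data.json") as f:
    emrqa = json.load(f)

for dataset in emrqa["data"]:                # one entry per i2b2 challenge
    print(dataset["title"])
    for paragraph in dataset["paragraphs"]:  # one entry per clinical note
        note_id = paragraph["note_id"]
        context = paragraph["context"]       # the clinical note
        for qa in paragraph["qas"]:
            paraphrases = qa["question"]     # list of paraphrase questions
            for answer in qa["answers"]:
                entity = answer["text"]
                evidence = answer["evidence"]
```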
146 |
147 |
155 |
156 | #### Output: Question-Logical Form (CSV) Format
157 |
158 | Each row in the csv file has the following format:
159 |
160 | ```
161 | "question" \t "logical-form" \t "question-template" \t "logical-form-template"
162 | ```
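A minimal sketch of reading these pairs, assuming the default `output/data-ql.csv` (the generation scripts prepend a header row):

```python
import csv

pairs = []
with open("output/data-ql.csv") as f:
    for row in csv.reader(f, delimiter="\t"):
        if not row or row[0] == "Question":  # skip the header row
            continue
        question, logical_form, question_template, lf_template = row
        pairs.append((question, logical_form))

print(len(pairs))
```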
163 |
164 | ## emrQA Analysis
165 |
166 | #### Basic statistics
167 |
168 | To run the script that computes basic statistics of the dataset, such as the average question length:
169 |
170 | ```bash
171 | python evaluation/basic-stats.py --output_dir output/
172 | ```
173 |
174 | #### Paraphrase analysis
175 |
176 | To run the script that computes (1) the average number of paraphrase templates and (2) the Jaccard and BLEU scores of the paraphrase templates:
177 |
178 | ```bash
179 | python evaluation/paraphrase-analysis.py --templates_dir templates/
180 | ```
181 |
182 | #### Logical form template analysis
183 |
184 | To run the script that filters logical form templates with specific properties:
185 |
186 | ```bash
187 | python evaluation/template-analysis.py --templates_dir templates/
188 | ```
189 | ## Discussion
190 |
191 | ##### What is the "answer_entity_type" field for?
192 |
193 | The "answer_entity_type" field in `data.json` takes the following values (a sketch of dispatching on it follows the list):
194 |
195 | 1) "empty": This indicates that the "text" field is an empty string, which means that there is no specific entity to look for in the evidence line.
196 |
197 | 2) "single": This indicates that the "text" field contains a single entity that can be found in the evidence line and can answer the question by itself.
198 |
199 | 3) "complex": This indicates that the "text" field is a list of entities, all of which are needed together to form a single answer. Here the evidence lines and answer_start values (line start and token start) are lists corresponding to the entities.
200 |
201 | ##### Why do I see “coronary artery” instead of “coronary artery disease” in the question? Why is the entity used in the question incomplete?
202 |
203 | We have a preprocessing step before using the i2b2 annotations in the question. This is because the annotations themselves are noisy and can include generic concepts within the annotations.
204 |
205 | For example,
206 |
207 | "Minor disease", "her disease", "her dominant CAD" - these are all annotated as problems, so we remove/clean them in a pre-processing step using rules that check for generic words in the annotation. As a result, we get "coronary artery" instead of "coronary artery disease".
208 |
209 | ##### How is the "context" field related to the clinical notes text?
210 |
211 | In the i2b2 medications, i2b2 relations, i2b2 smoking and i2b2 obesity challenges, every patient has a single clinical note, which is used directly in the "context" field.
212 |
213 | For the i2b2 heart disease risk dataset we have 4-5 longitudinal clinical notes per patient, named "note_id-01.txt", "note_id-02.txt", ..., "note_id-05.txt". Each of these files corresponds to the notes from a particular day, and the files are already in timeline order.
214 | We combine all these ".txt" files (in the given order), separated by "\n", and use the result in the "context" field. The note_id part of the file name is used in the "note_id" field. If you wish to break a context down into its individual notes, you can use the "note_id" field and, in reverse, find the note_id-01.txt, note_id-02.txt contents in the "context" field, as sketched below.
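For example, a minimal sketch of locating the individual note files inside the combined "context" (the `locate_notes` helper and the notes directory layout are assumptions; the 01-05 suffix range follows the naming above):

```python
import os

def locate_notes(note_id, context, notes_dir):
    # Hypothetical helper: find the span of each longitudinal note
    # inside the combined context, using the note_id-0k.txt naming.
    spans = {}
    for k in range(1, 6):
        name = "%s-%02d.txt" % (note_id, k)
        path = os.path.join(notes_dir, name)
        if not os.path.isfile(path):
            continue
        with open(path) as f:
            text = f.read()
        start = context.find(text)
        if start != -1:
            spans[name] = (start, start + len(text))
    return spans
```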
215 |
216 | ##### How are the QA pairs generated from the i2b2 smoking and i2b2 obesity challenges different?
217 |
218 | For the QA pairs generated from these datasets we have neither an evidence line nor a specific entity to look for. Instead, the "text" field here is the class information provided in these two challenges, and the entire "context" field can be seen as the evidence. Please refer to the corresponding challenges for more information about the classes.
219 |
220 | ##### The answer evidence is not a complete sentence. Why?
221 |
222 | The annotations used from the i2b2 datasets (except heart disease risk) have both token span and line number annotations. Clinical notes in these datasets are split at the newline character and assigned line numbers. Our evidence line is simply the line in the clinical note corresponding to a particular i2b2 annotation's line number. Since the i2b2 heart disease risk annotations have only token spans, without any line number annotations, we break the clinical notes at the newline character and take the line containing the token span as our evidence line (see the sketch below).
223 |
224 | - When clinical notes are split at the newline character, the start/stop of the evidence line may not overlap with a complete sentence in the clinical note. To avoid this we tried to use a sentence splitter instead of the newline character to determine our evidence lines, but existing sentence splitters such as NLTK's do even worse at breaking clinical note sentences because of their noisy, ungrammatical structure.
225 | - Clinical notes are noisy, so some of the evidence lines may not have complete context or may not be grammatically correct.
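Given this line-based scheme, a minimal sketch of recovering an evidence line from the "context" for a single-entity answer (assuming, as in the i2b2 annotations, 1-based line numbers and a string-valued context):

```python
def evidence_line(context, evidence_start):
    # Split the note at newlines, as the generation scripts do, and
    # index with the 1-based line number stored in "evidence_start".
    return context.split("\n")[evidence_start - 1]
```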
226 |
227 | ##### i2b2 datasets directory structure
228 |
229 | The i2b2 challenge datasets used to generate the current emrQA version were downloaded in August 2017. Since the structure of the i2b2 datasets themselves could change, we thought it might be useful to document our i2b2 repository structure.
230 |
231 | The scripts in this repository parse the following i2b2 directory structure:
232 |
233 | ```
234 |
235 | ├── "i2b2 (download the datsets in single folder)"
236 | ├── "smoking" (download 2006 smoking challenge datasets here)
237 | │ │
238 | │ ├── "smokers_surrogate_test_all_groundtruth_version2.xml"
239 | │ └── "smokers_surrogate_train_all_version2.xml"
240 | │
241 | ├── "obesity" (download 2008 obesity challenge datasets here)
242 | │ │
243 | │ ├── "obesity_standoff_annotations_test.xml"
244 | │ ├── "obesity_standoff_annotations_training.xml"
245 | │ ├── "obesity_patient_records_test.xml"
246 | │ └── "obesity_patient_records_training.xml"
247 | │
248 | ├── "medication" (download 2009 medication challenge datasets here)
249 | │ │
250 | │ ├── "train.test.released.8.17.09/" (folder containing all clinical notes)
251 | │ ├── "annotations_ground_truth/converted.noduplicates.sorted/" (folder path with medication annotations)
252 | │ └── "training.ground.truth/" (folder path with medication annotations)
253 | │
254 | ├── "relations" (download 2010 relation challenge datasets here)
255 | │ │
256 | │ ├── "concept_assertion_relation_training_data/partners/txt/" (folder path containing clinical notes)
257 | │ ├── "concept_assertion_relation_training_data/beth/txt/" (folder path containing clinical notes)
258 | │ ├── "test_data/txt/" (folder path containing clinical notes)
259 | │ ├── "concept_assertion_relation_training_data/partners/rel/" (folder path with relation annotations)
260 | │ ├── "concept_assertion_relation_training_data/beth/rel/" (folder path with relation annotations)
261 | │ ├── "test_data/rel/" (folder path with relation annotations)
262 | │ ├── "concept_assertion_relation_training_data/partners/ast/" (folder path with assertion annotations)
263 | │ ├── "concept_assertion_relation_training_data/beth/ast/" (folder path with assertion annotations)
264 | │ └── "test_data/ast/" (folder path with assertion annotations)
265 | │
266 | ├── "coreference" (download 2011 coreference challenge datasets here)
267 | │ │
268 | │ ├── "Beth_Train" (folder with the following subfolders "chains", "concepts", "docs", "pairs")
269 | │ ├── "Partners_Train" (folder with the following subfolders "chains", "concepts", "docs", "pairs")
270 | │ └── "i2b2_Test" (folder with "i2b2_Beth_Test" and "i2b2_Partners_Test" containing "chains" and "concepts" subfolders)
271 | │
272 | └── "heart-disease-risk" (download 2014 heart disease risk factprs challenge datasets here)
273 | │
274 | └── "training-RiskFactors-Complete-Set1/" (folder path with files containing annotations and clinical notes)
275 |
276 |
277 | ```
278 |
279 |
280 |
281 | ## Dataset Bugs
282 |
283 | ##### I see a bug in the dataset. What should I do?
284 |
285 | For later versions of emrQA and recent updates contact Preethi Raghavan (praghav@us.ibm.com).
286 |
287 | Please contact [Anusri Pampari][anusri-home] (\@stanford.edu) for any bugs. The more detail you provide about the bug, the easier and quicker it will be for me to debug it. You can help me with the following information:
288 |
289 | ```
290 | i2b2 dataset name
291 | example note_id, and how many notes are affected by this bug (if possible)
292 | whether there is a trend in the type of questions (a particular question template) where this bug occurs
293 | an example instance of the bug, in detail
294 | ```
295 |
296 | Opening a public issue might go against the i2b2 license agreement, so it is important that you mail me the bug. Thank you for understanding. I will try my best to reply as soon as possible.
297 |
298 | ## Current works using emrQA
299 |
300 | For a full and updated list please refer to the entire list published [here][citation-list].
301 |
302 | - [Neural Mask Generator: Learning to Generate Adaptive Word Maskings for Language Model Adaptation][NMG]
303 | - [Improved Pretraining for Domain-specific Contextual Embedding Models][pretraining]
304 | - [Calibrating Structured Output Predictors for Natural Language Processing][calibration]
305 | - [Annotating and Characterizing Clinical Sentences with Explicit Why-QA Cues][why-qa]
306 | - [Entity-Enriched Neural Models for Clinical Question Answering][entity]
307 | - [Evaluation of Dataset Selection for Pre-Training and Fine-Tuning Transformer Language Models for Clinical Question Answering][eval]
308 | - [CliniQG4QA: Generating Diverse Questions for Domain Adaptation of Clinical Question Answering][da]
309 | - [How You Ask Matters: The Effect of Paraphrastic Questions to BERT Performance on a Clinical SQuAD Dataset][para1]
310 | - [Advancing Seq2seq with Joint Paraphrase Learning][para2]
311 | - [Clinical Reading Comprehension: A Thorough Analysis of the emrQA Dataset][crc] - Though this work provides interesting analysis of some sub-parts of emrQA, we have concerns regarding the dataset bias considered in their analysis and the resulting conclusions. We think that emrQA readers should be aware of this bias, and we try to convey it through a letter posted [here][letter] for the readers.
312 |
313 | [NMG]:https://arxiv.org/abs/2010.02705
314 | [para1]:https://www.aclweb.org/anthology/2020.clinicalnlp-1.13.pdf
315 | [para2]:https://www.aclweb.org/anthology/2020.clinicalnlp-1.30.pdf
316 | [da]: https://arxiv.org/pdf/2010.16021.pdf
317 | [eval]: https://www.aclweb.org/anthology/2020.lrec-1.679.pdf
318 | [entity]: https://arxiv.org/abs/2005.06587
319 | [why-qa]: https://www.aclweb.org/anthology/W19-1913.pdf
320 | [letter]: https://docs.google.com/document/d/1IeOqKPy3qzUEvpuSMy0Tvg7rjfYoAkn1ueplC5RXjpA/edit?usp=sharing
321 | [crc]: https://arxiv.org/abs/2005.00574
322 | [pretraining]: https://arxiv.org/pdf/2004.02288.pdf
323 | [calibration]: https://arxiv.org/pdf/2004.04361.pdf
324 | [citation-list]: https://scholar.google.com/scholar?cites=14819103415098730167&as_sdt=2005&sciodt=0,5&hl=en
325 | [i2b2-datasets]: https://www.i2b2.org/NLP/DataSets/
326 | [anusri-home]: https://www.linkedin.com/in/anusri-pampari-594bb5126/
327 | [drqa]: https://github.com/facebookresearch/DrQA
328 | [paper-link]: http://aclweb.org/anthology/D18-1258
329 |
330 |
331 |
332 |
--------------------------------------------------------------------------------
/generation/i2b2_medications/medication-answers.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | from os import listdir
4 | from os.path import isfile, join
5 | import json
6 | import random
7 | import argparse
8 |
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--i2b2_dir', default='', help='Directory containing i2b2 medications challenge files')
11 | parser.add_argument('--templates_dir', default='', help='Directory containing template files in the given format')
12 | parser.add_argument('--output_dir', default='', help='Directory to store the output')
13 |
14 | args = parser.parse_args()
15 |
16 |
17 | ###################################################### SET FILE PATHS ##################################################################
18 |
19 | ## i2b2 file paths ##
20 |
21 | DosageFilePath = [ os.path.join(args.i2b2_dir,"annotations_ground_truth/converted.noduplicates.sorted/"), os.path.join(args.i2b2_dir,"training.ground.truth/")]
22 |
23 | MedicationClinicalNotes = [os.path.join(args.i2b2_dir,"train.test.released.8.17.09/")]
24 |
25 | ## template file path ##
26 |
27 | template_file_path = args.templates_dir
28 |
29 | ## output file paths ##
30 |
31 | ql_output = os.path.join(args.output_dir,"medication-ql.csv")
32 | medications_qa_output_json = os.path.join(args.output_dir,"medication-qa.json")
33 |
34 |
35 | ######################################################## CODE #########################################################################
36 |
37 | class GenerateQA():
38 |
39 | DosageFilePath = DosageFilePath
40 | MedicationClinicalNotes = MedicationClinicalNotes
41 |
42 | def __init__(self):
43 |
44 | self.ReadMedicationData()
45 | self.ReadTemplates()
46 |
47 | ######################### Read i2b2 file functions ###################################
48 |
49 | def ReadMedicationData(self):
50 |
51 | ## based on format of the i2b2 files. please refer to the i2b2 medications challenge documentation for details ###
52 |
53 | abbs = {"m": "medication", "do": "dosage", "mo": "mode", "f": "frequency", "du": "duration", "r": "problem",
54 | "e": "event", "t": "temporal", "c": "certainty", "ln": "list"}
55 | exception = ["list", "event", "temporal",
56 | "certainty"] ## very few annotations are tagged with these, hence we willl ignore them.
57 |
58 | self.MedicationData = []
59 | ClinicalNotes = {}
60 |
61 | ## read the clinical notes ##
62 | for paths in self.MedicationClinicalNotes:
63 | files = [f for f in listdir(paths) if isfile(join(paths, f))]
64 | for file in files:
65 | remote_file = open(join(paths, file))
66 | ClinicalNotes[file.strip()] = remote_file.readlines()
67 |
68 | ## read the annotations per clinical note (parse the files) ##
69 |
70 | annotations_span = []
71 | for paths in self.DosageFilePath:
72 | files = [f for f in listdir(paths) if isfile(join(paths, f))]
73 | for file in files:
74 | remote_file = open(join(paths, file))
75 |
76 | note_id = file.split(".")[0]
77 | note_id = note_id.split("_")[0]
78 | # print(file)
79 | dictionary = {note_id: []}
80 | PatientNote = ClinicalNotes[note_id] ## access the corresponding clinical note.
81 | flag = 0
82 | for line in remote_file:
83 | med_list = {}
84 | line = line.replace("|||", "||")
85 | words = line.split("||")
86 |
87 | for word in words:
88 | term = word.split("=")
89 | try:
90 | type = abbs[term[0].strip()] ## check if all of them lie within the given annotation list
91 | except KeyError:
92 | print(paths + file)
93 | flag = 1
94 | break
95 |
96 | full_annotation = "=".join(term[1:])
97 | index = [pos for pos, char in enumerate(full_annotation) if char == "\""]
98 | pos1 = int(index[0])
99 | pos2 = int(index[-1])
100 |
101 | annotation = full_annotation[pos1 + 1:pos2]
102 | indxs = full_annotation[pos2 + 1:].split(",")
103 |
104 | line_in_note = ""
105 | start_line = None
106 | if annotation == "nm" or type in exception:
107 | med_list[type] = [annotation, line_in_note, start_line]
108 | continue
109 |
110 | # print(word,annotation,indxs)
111 | # print(indxs)
112 | for indx in indxs:
113 | indx = indx.strip()
114 | out = indx.split(" ")
115 |
116 | start_line = out[0].split(":")[0]
117 | start_token = out[0].split(":")[1]
118 | end_line = out[1].split(":")[0]
119 | end_token = out[1].split(":")[1]
120 |
121 | line_in_note += "".join(PatientNote[int(start_line) - 1:int(end_line)])
122 |
123 | # if int(end_line) > int(start_line):
124 | # print(type)
125 | # print(line)
126 | # print(end_line,start_line)
127 |
128 | ## some end line numbers are greater than start line numbers. an annotation's line_in_note can span up to 3 lines
129 | ## an annotation can be a discontinuous set of tokens
130 |
131 | med_list[type] = [annotation, line_in_note, start_line, start_token]
132 |
133 | # if start_line != end_line:
134 | # print(int(end_line)-int(start_line))
135 | # print(line_in_note)
136 |
137 | dictionary[note_id].append(med_list)
138 |
139 | remote_file.close()
140 |
141 | if flag == 0:
142 | if (dictionary, PatientNote) not in self.MedicationData:
143 | self.MedicationData.append((dictionary, PatientNote))
144 |
145 |
146 | # print(annotations_span)
147 |
148 | ######################## Main program functions ##########################################
149 |
150 | def ReadTemplates(self):
151 |
152 | self.medications_out = {"paragraphs": [], "title": "medication"}
153 | self.logical_out = []
154 |
155 | ########################################## Set File Paths ##############################################
156 |
157 |
158 | ### File to write Question-Logical Forms ##
159 |
160 | ofile = open(ql_output, "w")
161 | self.filewriter_forlform = csv.writer(ofile, delimiter="\t")
162 | self.filewriter_forlform.writerow(["Question", "Logical Form"])
163 |
164 | ### File to read templates ###
165 |
166 | file = open(template_file_path)
167 | filereader = list(csv.reader(file))
168 |
169 | ## read only templates relevant to medications challenge ##
170 |
171 | med_lines = []
172 | for line in filereader[1:]:
173 | if line[0] != "medication" and line[0] != "medications":
174 | continue
175 | med_lines.append(line)
176 |
177 | ########################################## Main Function Call ##############################################
178 |
179 | for (dictionary,PatientNote) in self.MedicationData:
180 | for note_id in dictionary:
181 | out_patient = {"note_id": note_id, "context": PatientNote, "qas": []}
182 |
183 | med_list = dictionary[note_id] ## extract all the annotations given per note ##
184 |
185 | ## create one to many mappings, to use them for QA. Coreference not resolved ##
186 |
187 | self.MakeMedicationRelationMappings(med_list)
188 |
189 | flag = 0
190 | self.unique_questions = []
191 | question_id = 0
192 | for line in med_lines:
193 | ## do +1 for the new format ##
194 | question = line[2].strip()
195 | logical_form = line[3].strip()
196 | answertype = line[4].split(",")
197 | answertype = [type.strip() for type in answertype]
198 |
199 |
200 | #question = question.replace("|problem| or |problem|","|problem|")
201 | question = question.replace("|medication| or |medication|", "|medication|")
202 | question = question.replace("|problem| or |problem|", "|problem|")
203 | question = question.replace("|test| or |test|", "|test|")
204 | question = question.replace("|test| |test| |test|", "|test|")
205 | question = question.replace("\t", "")
206 | logical_form = logical_form.replace("\t", "")
207 |
208 | if question.strip() == "":
209 | continue
210 |
211 | answer_out = self.MakeMedicationQLA(question,logical_form,answertype,med_list,flag,note_id,PatientNote,question_id)
212 |
213 | if len(answer_out) != 0:
214 | #for answer in answer_out:
215 | #print(answer["id"])
216 | out_patient["qas"].extend(answer_out)
217 | self.medications_out["paragraphs"].append(out_patient)
218 |
219 | ################################################################# Dump JSON ###########################################
220 |
221 | json_out = medications_qa_output_json
222 | with open(json_out, 'w') as outfile:
223 | json.dump(self.medications_out, outfile, ensure_ascii=False) ## storage format same as SQUAD
224 |
225 | #json_out = medications_ql_output_json
226 | #with open(json_out, 'w') as outfile:
227 | # json.dump(self.logical_out, outfile, ensure_ascii=False) ## storage format, question logical_form question_id logicalfrom_id source
228 |
229 | def MakeMedicationQLA(self, question_list, logical_form_template, answertype, med_list, flag, note_id, PatientNote, question_id):
230 |
231 | answer_out = []
232 |
233 | ## save a copy of the originals ##
234 | initial_question_list = question_list.split("##")
235 | initial_template = logical_form_template
236 | original_logical_form_template = logical_form_template.strip()
237 |
238 | ## check for errors in templates and gather all the placeholders in the templates (placeholders stored in rwords) ##
239 | ## semantic types of placeholders ##
240 |
241 | dup_rwords_list = self.CheckForErrors(initial_question_list, original_logical_form_template)
242 | if dup_rwords_list is None:
243 | return answer_out
244 |
245 | for med_annotations in med_list: ## Medlist is a list of dictionaries (each dict is a medication and its attributes)
246 |
247 | flag = 0
248 | logical_form_template = original_logical_form_template
249 | if len(dup_rwords_list) != 1: ## sanity check
250 | print("Check Question_Logical Form Mapping")
251 | print(dup_rwords_list, initial_question_list)
252 | print(logical_form_template)
253 | return answer_out
254 | else:
255 | dup_rwords = dup_rwords_list[0]
256 |
257 | rwords = list(dup_rwords)
258 | line_num = []
259 | line_token = []
260 | question_line = []
261 | quest_list_nar = []
262 |
263 | answer = []
264 |
265 | ### checking if placeholder values to be used in question is "nm" (not mentioned), if yes set flag to 1 ##
266 |
267 | if rwords != ["time"]:
268 | for idx in range(len(rwords)):
269 | if rwords[idx] == "treatment":
270 | rwords[idx] = "medication"
271 |
272 | if med_annotations[rwords[idx]][0] == "nm":
273 | flag = 1
274 | break
275 | else:
276 | line_num.append(int(med_annotations[rwords[idx]][2]))
277 | line_token.append(int(med_annotations[rwords[idx]][3]))
278 | question_line.append(med_annotations[rwords[idx]][1])
279 | rwords[idx] = med_annotations[rwords[idx]][0]
280 | quest_list_nar.append(med_annotations["list"][0])
281 |
282 | ## Generate question, logical form and answer only if flag is 0 ##
283 |
284 | if flag == 0:
285 | [paraphrase_questions, tuple_orginal, logical_form] = self.MakeMedicationQL(rwords,
286 | initial_question_list,
287 | logical_form_template,
288 | dup_rwords)
289 | [answer, answer_line, result_num, result_token, list_nar] = self.MakeAnswer(quest_list_nar, answertype,
290 | med_annotations,
291 | question_line, line_num,
292 | line_token)
293 | else:
294 | continue
295 | # return answer_out #### bug fixed ##
296 |
297 | if len(answer) != 0:
298 |
299 | if answertype == ["medication", 'dosage']:
300 | entity_type = "complex"
301 | elif answertype == ["yes"]:
302 | entity_type = "empty"
303 | else:
304 | entity_type = "single"
305 |
306 | unique_paras = set(paraphrase_questions)
307 | if unique_paras not in self.unique_questions: ## redundancy check: checking if these set of questions are unique for every clinical note ##
308 |
309 | self.unique_questions.append(unique_paras)
310 | question_id += 1
311 | ans_list = []
312 | for idx in range(len(answer)):
313 |
314 | start_line = result_num[idx]
315 | start_token = result_token[idx]
316 |
317 | val = {"answer_start": [start_line, start_token], "text": answer[idx],
318 | "evidence": answer_line[idx], "evidence_start": result_num[idx], "answer_entity_type": entity_type}
319 |
320 | if val not in ans_list:
321 | ans_list.append(val)
322 |
323 | ## ""evidence"" in the dictionary above is currently just the answer line in the note. You can also consider question line and answer line from note as evidence in that uncomment below code and use it accordingly #
324 |
325 | '''
326 |
327 | ## maximum distance between the question line and answer line ##
328 | perms = list(itertools.product(result_num+line_num, result_num+line_num))
329 | diffs = [abs(val1 - val2) for (val1, val2) in perms]
330 | difference = max(diffs)
331 |
332 | Note_val = "#".join(answer_line)
333 | list_nar = ",".join(list_nar)
334 |
335 | ## evidence per answer ##
336 | evidence_answer = []
337 | evidence_start = []
338 | evidence_temp_line = answer_line
339 | evidence_temp_start = result_num
340 | for pdx in range(len(evidence_temp_line)):
341 | if evidence_temp_line[pdx] not in evidence_answer:
342 | evidence_answer.append(evidence_temp_line[pdx])
343 | evidence_start.append(evidence_temp_start[pdx])
344 |
345 | val = {"answer_start": [start_line, start_token], "text": answer[idx],
346 | "evidence": evidence_answer,
347 | "evidence_start": evidence_start}
348 |
349 | if qa_csv_write:
350 | self.filewriter.writerow(
351 | ["##".join(list(unique_paras))] + [logical_form] + [",".join(set(answer))] + [Note_val] + [note_id + "_MedicationsChallenge"] + [difference] + [list_nar])
352 |
353 |
354 | '''
355 |
356 | answer_temp = {"answers": ans_list, "id": [tuple_original, initial_template],
357 | "question": list(unique_paras)}
358 | answer_out.append(answer_temp)
359 |
360 | return answer_out
361 |
362 | ######################## Main Utility Functions ######################################
363 |
364 | def MakeMedicationRelationMappings(self,med_list):
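## Build one-to-many maps from the per-note annotations: medication -> dosages/frequencies/durations/reasons and problem -> medications; "nm" (not mentioned) values are skipped ##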
365 |
366 | self.map_meds_to_reasons = {}
367 | self.map_meds_to_dosages = {}
368 | self.map_meds_to_frequency = {}
369 | self.map_reasons_to_meds = {}
370 | self.map_meds_to_durations = {}
371 | self.medications_all = {}
372 |
373 |
374 | for med_annotations in med_list:
375 |
376 | if med_annotations["medication"][0] not in self.medications_all:
377 | self.medications_all[med_annotations["medication"][0]] = [med_annotations["medication"]]
378 | #print(med_annotations["medication"])
379 |
380 | if med_annotations["medication"][0] not in self.map_meds_to_dosages:
381 | self.map_meds_to_dosages[med_annotations["medication"][0]] = []
382 |
383 | if med_annotations["medication"][0] not in self.map_meds_to_frequency:
384 | self.map_meds_to_frequency[med_annotations["medication"][0]] = []
385 |
386 | if med_annotations["medication"][0] not in self.map_meds_to_reasons:
387 | self.map_meds_to_reasons[med_annotations["medication"][0]] = []
388 |
389 | if med_annotations["problem"][0] != "nm":
390 | if med_annotations["problem"][0] not in self.map_reasons_to_meds:
391 | self.map_reasons_to_meds[med_annotations["problem"][0]] = []
392 |
393 | if med_annotations["medication"][0] not in self.map_meds_to_durations:
394 | self.map_meds_to_durations[med_annotations["medication"][0]] = []
395 |
396 | if med_annotations["dosage"][0] != "nm":
397 | #if med_annotations["event"] == ""
398 | if med_annotations["dosage"]+med_annotations["list"] not in self.map_meds_to_dosages[med_annotations["medication"][0]]:
399 | self.map_meds_to_dosages[med_annotations["medication"][0]].append(med_annotations["dosage"]+med_annotations["list"])
400 | if med_annotations["problem"][0] != "nm":
401 | self.map_meds_to_reasons[med_annotations["medication"][0]].append(med_annotations["problem"]+med_annotations["list"])
402 | if med_annotations["problem"][0] != "nm":
403 | self.map_reasons_to_meds[med_annotations["problem"][0]].append(med_annotations["medication"]+med_annotations["list"])
404 | if med_annotations["frequency"][0] != "nm":
405 | self.map_meds_to_frequency[med_annotations["medication"][0]].append(med_annotations["frequency"]+med_annotations["list"])
406 | if med_annotations["duration"][0] != "nm":
407 | self.map_meds_to_durations[med_annotations["medication"][0]].append(med_annotations["duration"]+med_annotations["list"])
408 |
409 | def MakeMedicationQL(self, rwords, question_list, logical_form_template, dup_rwords):
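## Substitute annotation values for the placeholders in every question paraphrase and in the logical form template; write each (question, logical form) pair to the QL csv ##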
410 |
411 | initial_template = logical_form_template
412 | paraphrase_questions = []
413 | tuple_original = []
414 |
415 | if rwords == ["time"]:
416 | time = str(random.randint(2, 5)) + random.choice([" years", " weeks"])
417 | for question in question_list:
418 | original = question
419 | question = question.replace("|time|", time)
420 | logical_form_template = logical_form_template.replace("|time|", time)
421 | rwords = []
422 | dup_rwords = []
423 | paraphrase_questions.append(question)
424 | tuple_original.append((question, original))
425 | else:
426 |
427 | ############################ make questions ############################################
428 |
429 | for question in question_list:
430 | original = question
431 | idx = 0
432 | done = []
433 | for types in list(dup_rwords):
434 | # temp = qwords
435 | index = question.find("|" + types + "|")
436 | if index == -1 and types not in done:
437 | print(question, "|" + types + "|", done)
438 | question = question.replace("|" + types + "|", rwords[idx])
439 | done.append(types)
440 | idx += 1
441 | tuple_original.append((question, original))
442 | paraphrase_questions.append(question)
443 |
444 | ###################################### Make Logical Form #################################
445 |
447 | idx = 0
448 | done = []
449 | for types in list(dup_rwords):
450 | logical_form_template = logical_form_template.replace("|treatment|", "|medication|")
451 | index = logical_form_template.find("|" + types + "|")
452 | if index == -1 and types not in done:
453 | print(logical_form_template, "|" + types + "|", done, types)
454 | done.append(types)
455 |
456 | logical_form_template = logical_form_template.replace("|" + types + "|", rwords[idx])
457 | idx += 1
458 |
459 | logical_form = logical_form_template
460 |
461 | ### Writing question-logical form ##
462 |
463 | for (question, original) in tuple_original:
464 | self.filewriter_forlform.writerow([question] + [logical_form.strip()] + [original.strip()] + [initial_template])
465 |
466 | return [paraphrase_questions, tuple_original, logical_form]
467 |
468 | def MakeAnswer(self, quest_list_nar, answertype, med_annotations, question_list,line_num,line_token):
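## Collect the answer entities with their evidence lines and line/token offsets for the given answer type, using the mappings built in MakeMedicationRelationMappings ##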
469 |
470 | result_num = []
471 | result_token = []
472 | answer_line = []
473 | list_nar = quest_list_nar
474 | answer = []
475 |
476 | idx = 0
477 | if answertype[idx] == "yes":
478 |
479 | ### the question line is evidence for yes or no questions ##
480 | #answer = ["yes"]*len(question_list)
481 | answer = [""] * len(question_list)
482 | answer_line.extend(question_list)
483 | result_num.extend(line_num)
484 | #result_token.extend(line_token)
485 | result_token = [""] * len(question_list)
486 | list_nar.extend(quest_list_nar)
487 | elif answertype == ["problem"]:
488 | for listr in self.map_meds_to_reasons[med_annotations["medication"][0]]:
489 | answer += [listr[0]]
490 | answer_line.append(listr[1])
491 | result_num.append(int(listr[2]))
492 | result_token.append(int(listr[3]))
493 | list_nar.append(listr[3])
494 | elif answertype == ["frequency"]:
495 | # print("frequency")
496 | for listr in self.map_meds_to_frequency[med_annotations["medication"][0]]:
497 | answer += [listr[0]]
498 | answer_line.append(listr[1])
499 | result_num.append(int(listr[2]))
500 | result_token.append(int(listr[3]))
501 | list_nar.append(listr[3])
502 | elif answertype == ["dosage"]:
503 | for med in [med_annotations["medication"][0]]:
504 | for listr in self.map_meds_to_dosages[med]:
505 | answer += [listr[0]]
506 | answer_line.append(listr[1])
507 | result_num.append(int(listr[2]))
508 | result_token.append(int(listr[3]))
509 | list_nar.append(listr[3])
510 | elif answertype == ["medication"]:
511 | for listr in self.map_reasons_to_meds[med_annotations["problem"][0]]:
512 | answer += [listr[0]]
513 | answer_line.append(listr[1])
514 | result_num.append(int(listr[2]))
515 | result_token.append(int(listr[3]))
516 | list_nar.append(listr[3])
517 | elif answertype == ["medication", 'dosage']:
518 | meds = self.map_reasons_to_meds[med_annotations["problem"][0]]
519 | for med in meds:
520 | #dos = ",".join([x[0] for x in self.map_meds_to_dosages[med[0]]])
521 | #answer += ["( " + med[0] + ", " + dos + ")"]
522 |
523 | answer.append([med[0]])
524 | answer_line.append([med[1]])
525 | result_num.append([int(med[2])])
526 | result_token.append([int(med[3])])
527 | list_nar.append([med[3]])
528 |
529 |
530 | for x in self.map_meds_to_dosages[med[0]]:
531 | #if x[1] not in answer_line[-1]:
532 | answer[-1].extend([x[0]])
533 | answer_line[-1].extend([x[1]])
534 | result_num[-1].extend([int(x[2])])
535 | result_token[-1].extend([int(x[3])])
536 | list_nar[-1].extend([x[4]])
537 |
538 | #print("new medicine")
539 | #print(answer[-1])
540 | #print(result_num[-1])
541 | #print(result_token[-1])
542 | #print(answer_line[-1])
543 | #result_num[-1].extend([int(x[2]) for x in self.map_meds_to_dosages[med[0]] if int(x[2]) not in result_num[-1]])
544 | #result_token[-1].extend([int(x[3]) for x in self.map_meds_to_dosages[med[0]]])
545 | #list_nar.extend([x[3] for x in self.map_meds_to_dosages[med[0]]])
546 |
547 | elif answertype == ["duration"]:
548 | for listr in self.map_meds_to_durations[med_annotations["medication"][0]]:
549 | answer += [listr[0]]
550 | answer_line.append(listr[1])
551 | result_num.append(int(listr[2]))
552 | result_token.append(int(listr[3]))
553 | list_nar.append(listr[3])
554 | elif answertype == ["medications_all"]:
555 | for medication_name in self.medications_all:
556 | listr = self.medications_all[medication_name][0]
557 | answer += [listr[0]]
558 | answer_line.append(listr[1])
559 | result_num.append(int(listr[2]))
560 | result_token.append(int(listr[3]))
561 | list_nar.append(listr[3])
562 | elif answertype == ["none"]:
563 | pass
564 | else:
565 | print(answertype)
566 | answer = []
567 |
568 | return [answer,answer_line, result_num, result_token, list_nar]
569 |
570 | ######################## Supporting Utility Functions ######################################
571 |
572 | def CheckForErrors(self, question_list, logical_form_template):
573 |
574 | ## gather all the placeholders in the templates ##
575 |
576 | dup_rwords_list = []
577 | unique_templates = []
578 | qwords_list = []
579 |
580 | ## check if all the questions paraphrases have the same placeholders ##
581 |
582 | for question in question_list:
583 | if question.strip() == "":
584 | continue
585 | question = question.replace("|medication| or |medication|", "|medication|")
586 | question = question.replace("|problem| or |problem|", "|problem|")
587 | question = question.replace("|test| or |test|", "|test|")
588 | question = question.replace("|test| |test| |test|", "|test|")
589 | question = question.strip()
590 |
591 | if question not in unique_templates:
592 | unique_templates.append(question)
593 | else:
594 | continue
595 |
596 | qwords = question.split("|")
597 | dup_rwords = qwords[1:len(qwords):2]
598 | qwords_list.append(qwords)
599 |
600 | if len(dup_rwords_list) == 0:
601 | dup_rwords_list = [set(dup_rwords)]
602 | else:
603 | if set(dup_rwords) not in dup_rwords_list:
604 | print("Error Out Of Context Question:")
605 | print(question, logical_form_template, question_list)
606 | return None
607 |
608 | ## Check if the placeholders in logical forms are same as the placeholders in question ##
609 |
610 | lwords = logical_form_template.split("|")
611 | dup_lrwords = lwords[1:len(lwords):2]
612 | if set(dup_lrwords) not in dup_rwords_list:
613 | print("Error Out Of Context Question-Logical Form Pairs:")
614 | print(question_list, logical_form_template)
615 | return None
616 |
617 | return dup_rwords_list
618 |
619 | if __name__=="__main__":
620 | GenerateQA()
--------------------------------------------------------------------------------
/generation/i2b2_relations/matching_notes.csv:
--------------------------------------------------------------------------------
1 | Relations Coreference
2 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/974381789.txt i2b2/coreference/Partners_Train/docs/clinical-473.txt
3 | i2b2/relations/test_data/0373.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-592.txt
4 | i2b2/relations/test_data/0285.txt i2b2/coreference/Beth_Train/docs/clinical-482.txt
5 | i2b2/relations/test_data/0014.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-283.txt
6 | i2b2/relations/test_data/0310.txt i2b2/coreference/Partners_Train/docs/clinical-321.txt
7 | i2b2/relations/test_data/0005.txt i2b2/coreference/Beth_Train/docs/clinical-132.txt
8 | i2b2/relations/test_data/0174.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-166.txt
9 | i2b2/relations/test_data/0474.txt i2b2/coreference/Partners_Train/docs/clinical-516.txt
10 | i2b2/relations/test_data/0178.txt i2b2/coreference/Partners_Train/docs/clinical-171.txt
11 | i2b2/relations/test_data/0385.txt i2b2/coreference/Beth_Train/docs/clinical-607.txt
12 | i2b2/relations/test_data/0461.txt i2b2/coreference/Partners_Train/docs/clinical-491.txt
13 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/614746156.txt i2b2/coreference/Partners_Train/docs/clinical-781.txt
14 | i2b2/relations/test_data/0097.txt i2b2/coreference/Beth_Train/docs/clinical-247.txt
15 | i2b2/relations/test_data/0357.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-572.txt
16 | i2b2/relations/test_data/0464.txt i2b2/coreference/Beth_Train/docs/clinical-732.txt
17 | i2b2/relations/test_data/0455.txt i2b2/coreference/Partners_Train/docs/clinical-481.txt
18 | i2b2/relations/test_data/0025.txt i2b2/coreference/Beth_Train/docs/clinical-157.txt
19 | i2b2/relations/test_data/0472.txt i2b2/coreference/Partners_Train/docs/clinical-408.txt
20 | i2b2/relations/test_data/0069.txt i2b2/coreference/Beth_Train/docs/clinical-212.txt
21 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-74.txt i2b2/coreference/Beth_Train/docs/clinical-248.txt
22 | i2b2/relations/test_data/0277.txt i2b2/coreference/Beth_Train/docs/clinical-472.txt
23 | i2b2/relations/test_data/0093.txt i2b2/coreference/Beth_Train/docs/clinical-242.txt
24 | i2b2/relations/test_data/0010.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-6.txt
25 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-24.txt i2b2/coreference/Beth_Train/docs/clinical-68.txt
26 | i2b2/relations/test_data/0074.txt i2b2/coreference/Partners_Train/docs/clinical-66.txt
27 | i2b2/relations/test_data/0058.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-293.txt
28 | i2b2/relations/test_data/0457.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-717.txt
29 | i2b2/relations/test_data/0428.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-436.txt
30 | i2b2/relations/test_data/0329.txt i2b2/coreference/Beth_Train/docs/clinical-537.txt
31 | i2b2/relations/test_data/0418.txt i2b2/coreference/Beth_Train/docs/clinical-652.txt
32 | i2b2/relations/test_data/0460.txt i2b2/coreference/Beth_Train/docs/clinical-722.txt
33 | i2b2/relations/test_data/0401.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-627.txt
34 | i2b2/relations/test_data/0381.txt i2b2/coreference/Beth_Train/docs/clinical-602.txt
35 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/622086964.txt i2b2/coreference/Partners_Train/docs/clinical-786.txt
36 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/412141256.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-711.txt
37 | i2b2/relations/test_data/0442.txt i2b2/coreference/Beth_Train/docs/clinical-692.txt
38 | i2b2/relations/test_data/0266.txt i2b2/coreference/Partners_Train/docs/clinical-271.txt
39 | i2b2/relations/test_data/0358.txt i2b2/coreference/Partners_Train/docs/clinical-371.txt
40 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-121.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-767.txt
41 | i2b2/relations/test_data/0365.txt i2b2/coreference/Beth_Train/docs/clinical-582.txt
42 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-177.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-23.txt
43 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/627258104.txt i2b2/coreference/Partners_Train/docs/clinical-791.txt
44 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/959086752.txt i2b2/coreference/Partners_Train/docs/clinical-107.txt
45 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/655358166_WGH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-806.txt
46 | i2b2/relations/test_data/0294.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-301.txt
47 | i2b2/relations/test_data/0346.txt i2b2/coreference/Partners_Train/docs/clinical-361.txt
48 | i2b2/relations/test_data/0253.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-442.txt
49 | i2b2/relations/test_data/0436.txt i2b2/coreference/Beth_Train/docs/clinical-682.txt
50 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/188543380.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-591.txt
51 | i2b2/relations/test_data/0057.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-197.txt
52 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-66.txt i2b2/coreference/Beth_Train/docs/clinical-213.txt
53 | i2b2/relations/test_data/0342.txt i2b2/coreference/Partners_Train/docs/clinical-348.txt
54 | i2b2/relations/test_data/0397.txt i2b2/coreference/Beth_Train/docs/clinical-622.txt
55 | i2b2/relations/test_data/0145.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-307.txt
56 | i2b2/relations/test_data/0081.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-227.txt
57 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-32.txt i2b2/coreference/Beth_Train/docs/clinical-108.txt
58 | i2b2/relations/test_data/0210.txt i2b2/coreference/Partners_Train/docs/clinical-206.txt
59 | i2b2/relations/test_data/0289.txt i2b2/coreference/Beth_Train/docs/clinical-487.txt
60 | i2b2/relations/test_data/0026.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-16.txt
61 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/156406283.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-576.txt
62 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-106.txt i2b2/coreference/Beth_Train/docs/clinical-747.txt
63 | i2b2/relations/test_data/0345.txt i2b2/coreference/Beth_Train/docs/clinical-557.txt
64 | i2b2/relations/test_data/0125.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-282.txt
65 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-176.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-18.txt
66 | i2b2/relations/test_data/0050.txt i2b2/coreference/Partners_Train/docs/clinical-46.txt
67 | i2b2/relations/test_data/0218.txt i2b2/coreference/Partners_Train/docs/clinical-216.txt
68 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/723989226.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-2.txt
69 | i2b2/relations/test_data/0314.txt i2b2/coreference/Partners_Train/docs/clinical-326.txt
70 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/814743340_RWH.txt i2b2/coreference/Partners_Train/docs/clinical-32.txt
71 | i2b2/relations/test_data/0098.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-96.txt
72 | i2b2/relations/test_data/0354.txt i2b2/coreference/Partners_Train/docs/clinical-366.txt
73 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/289811204.txt i2b2/coreference/Partners_Train/docs/clinical-641.txt
74 | i2b2/relations/test_data/0333.txt i2b2/coreference/Beth_Train/docs/clinical-542.txt
75 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-80.txt i2b2/coreference/Beth_Train/docs/clinical-253.txt
76 | i2b2/relations/test_data/0406.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-416.txt
77 | i2b2/relations/test_data/0134.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-318.txt
78 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-29.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-93.txt
79 | i2b2/relations/test_data/0325.txt i2b2/coreference/Beth_Train/docs/clinical-532.txt
80 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-21.txt i2b2/coreference/Beth_Train/docs/clinical-53.txt
81 | i2b2/relations/test_data/0070.txt i2b2/coreference/Partners_Train/docs/clinical-61.txt
82 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-109.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-762.txt
83 | i2b2/relations/test_data/0415.txt i2b2/coreference/Beth_Train/docs/clinical-647.txt
84 | i2b2/relations/test_data/0445.txt i2b2/coreference/Beth_Train/docs/clinical-697.txt
85 | i2b2/relations/test_data/0078.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-71.txt
86 | i2b2/relations/test_data/0065.txt i2b2/coreference/Beth_Train/docs/clinical-207.txt
87 | i2b2/relations/test_data/0390.txt i2b2/coreference/Partners_Train/docs/clinical-358.txt
88 | i2b2/relations/test_data/0309.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-512.txt
89 | i2b2/relations/test_data/0338.txt i2b2/coreference/Partners_Train/docs/clinical-356.txt
90 | i2b2/relations/test_data/0198.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-191.txt
91 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-16.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-3.txt
92 | i2b2/relations/test_data/0101.txt i2b2/coreference/Beth_Train/docs/clinical-252.txt
93 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-52.txt i2b2/coreference/Beth_Train/docs/clinical-173.txt
94 | i2b2/relations/test_data/0374.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-391.txt
95 | i2b2/relations/test_data/0162.txt i2b2/coreference/Partners_Train/docs/clinical-156.txt
96 | i2b2/relations/test_data/0113.txt i2b2/coreference/Beth_Train/docs/clinical-267.txt
97 | i2b2/relations/test_data/0122.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-308.txt
98 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/915093496_RWH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-92.txt
99 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-28.txt i2b2/coreference/Beth_Train/docs/clinical-88.txt
100 | i2b2/relations/test_data/0029.txt i2b2/coreference/Beth_Train/docs/clinical-162.txt
101 | i2b2/relations/test_data/0467.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-393.txt
102 | i2b2/relations/test_data/0038.txt i2b2/coreference/Partners_Train/docs/clinical-31.txt
103 | i2b2/relations/test_data/0061.txt i2b2/coreference/Beth_Train/docs/clinical-202.txt
104 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-37.txt i2b2/coreference/Beth_Train/docs/clinical-128.txt
105 | i2b2/relations/test_data/0062.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-298.txt
106 | i2b2/relations/test_data/0213.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-392.txt
107 | i2b2/relations/test_data/0269.txt i2b2/coreference/Beth_Train/docs/clinical-462.txt
108 | i2b2/relations/test_data/0476.txt i2b2/coreference/Partners_Train/docs/clinical-526.txt
109 | i2b2/relations/test_data/0254.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-333.txt
110 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-178.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-28.txt
111 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/320422564.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-656.txt
112 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-18.txt i2b2/coreference/Beth_Train/docs/clinical-38.txt
113 | i2b2/relations/test_data/0173.txt i2b2/coreference/Beth_Train/docs/clinical-342.txt
114 | i2b2/relations/test_data/0106.txt i2b2/coreference/Partners_Train/docs/clinical-101.txt
115 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/332803550.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-438.txt
116 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-25.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-73.txt
117 | i2b2/relations/test_data/0177.txt i2b2/coreference/Beth_Train/docs/clinical-347.txt
118 | i2b2/relations/test_data/0412.txt i2b2/coreference/Beth_Train/docs/clinical-642.txt
119 | i2b2/relations/test_data/0077.txt i2b2/coreference/Beth_Train/docs/clinical-222.txt
120 | i2b2/relations/test_data/0030.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-21.txt
121 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-53.txt i2b2/coreference/Beth_Train/docs/clinical-178.txt
122 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/337702516_WGH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-666.txt
123 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-51.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-168.txt
124 | i2b2/relations/test_data/0370.txt i2b2/coreference/Partners_Train/docs/clinical-386.txt
125 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/950452368.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-468.txt
126 | i2b2/relations/test_data/0282.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-291.txt
127 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/176318078_a.txt i2b2/coreference/Partners_Train/docs/clinical-581.txt
128 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-46.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-143.txt
129 | i2b2/relations/test_data/0138.txt i2b2/coreference/Partners_Train/docs/clinical-126.txt
130 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-19.txt i2b2/coreference/Beth_Train/docs/clinical-43.txt
131 | i2b2/relations/test_data/0434.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-441.txt
132 | i2b2/relations/test_data/0185.txt i2b2/coreference/Beth_Train/docs/clinical-357.txt
133 | i2b2/relations/test_data/0462.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-727.txt
134 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-81.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-258.txt
135 | i2b2/relations/test_data/0245.txt i2b2/coreference/Beth_Train/docs/clinical-432.txt
136 | i2b2/relations/test_data/0322.txt i2b2/coreference/Partners_Train/docs/clinical-336.txt
137 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-48.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-153.txt
138 | i2b2/relations/test_data/0169.txt i2b2/coreference/Beth_Train/docs/clinical-337.txt
139 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-68.txt i2b2/coreference/Beth_Train/docs/clinical-223.txt
140 | i2b2/relations/test_data/0117.txt i2b2/coreference/Beth_Train/docs/clinical-272.txt
141 | i2b2/relations/test_data/0158.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-151.txt
142 | i2b2/relations/test_data/0393.txt i2b2/coreference/Beth_Train/docs/clinical-617.txt
143 | i2b2/relations/test_data/0410.txt i2b2/coreference/Partners_Train/docs/clinical-373.txt
144 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/348301810.txt i2b2/coreference/Partners_Train/docs/clinical-676.txt
145 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/101407944_PUMC.txt i2b2/coreference/Partners_Train/docs/clinical-541.txt
146 | i2b2/relations/test_data/0229.txt i2b2/coreference/Beth_Train/docs/clinical-412.txt
147 | i2b2/relations/test_data/0261.txt i2b2/coreference/Beth_Train/docs/clinical-452.txt
148 | i2b2/relations/test_data/0301.txt i2b2/coreference/Beth_Train/docs/clinical-502.txt
149 | i2b2/relations/test_data/0427.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-667.txt
150 | i2b2/relations/test_data/0241.txt i2b2/coreference/Beth_Train/docs/clinical-427.txt
151 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-179.txt i2b2/coreference/Beth_Train/docs/clinical-33.txt
152 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/245096078.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-611.txt
153 | i2b2/relations/test_data/0473.txt i2b2/coreference/Partners_Train/docs/clinical-413.txt
154 | i2b2/relations/test_data/0205.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-382.txt
155 | i2b2/relations/test_data/0297.txt i2b2/coreference/Beth_Train/docs/clinical-497.txt
156 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-45.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-138.txt
157 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-71.txt i2b2/coreference/Beth_Train/docs/clinical-238.txt
158 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/910458031.txt i2b2/coreference/Partners_Train/docs/clinical-87.txt
159 | i2b2/relations/test_data/0422.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-383.txt
160 | i2b2/relations/test_data/0137.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-297.txt
161 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/270045381.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-631.txt
162 | i2b2/relations/test_data/0433.txt i2b2/coreference/Beth_Train/docs/clinical-677.txt
163 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/523704694.txt i2b2/coreference/Partners_Train/docs/clinical-756.txt
164 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-70.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-233.txt
165 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/493597270.txt i2b2/coreference/Partners_Train/docs/clinical-736.txt
166 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/424729395_DH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-716.txt
167 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-73.txt i2b2/coreference/Beth_Train/docs/clinical-243.txt
168 | i2b2/relations/test_data/0002.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-1.txt
169 | i2b2/relations/test_data/0419.txt i2b2/coreference/Partners_Train/docs/clinical-378.txt
170 | i2b2/relations/test_data/0141.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-302.txt
171 | i2b2/relations/test_data/0321.txt i2b2/coreference/Beth_Train/docs/clinical-527.txt
172 | i2b2/relations/test_data/0463.txt i2b2/coreference/Partners_Train/docs/clinical-496.txt
173 | i2b2/relations/test_data/0013.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-142.txt
174 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-56.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-193.txt
175 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/699905656_SC.txt i2b2/coreference/Partners_Train/docs/clinical-458.txt
176 | i2b2/relations/test_data/0362.txt i2b2/coreference/Partners_Train/docs/clinical-376.txt
177 | i2b2/relations/test_data/0290.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-296.txt
178 | i2b2/relations/test_data/0475.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-521.txt
179 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-123.txt i2b2/coreference/Beth_Train/docs/clinical-777.txt
180 | i2b2/relations/test_data/0129.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-287.txt
181 | i2b2/relations/test_data/0318.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-331.txt
182 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-84.txt i2b2/coreference/Beth_Train/docs/clinical-273.txt
183 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-33.txt i2b2/coreference/Beth_Train/docs/clinical-113.txt
184 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/641557794_WGH.txt i2b2/coreference/Partners_Train/docs/clinical-801.txt
185 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-140.txt i2b2/coreference/Beth_Train/docs/clinical-797.txt
186 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/351853846_WGH.txt i2b2/coreference/Partners_Train/docs/clinical-681.txt
187 | i2b2/relations/test_data/0237.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-422.txt
188 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/425680098_SC.txt i2b2/coreference/Partners_Train/docs/clinical-721.txt
189 | i2b2/relations/test_data/0378.txt i2b2/coreference/Partners_Train/docs/clinical-396.txt
190 | i2b2/relations/test_data/0105.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-257.txt
191 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/105732749.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-546.txt
192 | i2b2/relations/test_data/0089.txt i2b2/coreference/Beth_Train/docs/clinical-237.txt
193 | i2b2/relations/test_data/0306.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-316.txt
194 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/018636330_DH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-423.txt
195 | i2b2/relations/test_data/0425.txt i2b2/coreference/Partners_Train/docs/clinical-431.txt
196 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-143.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-812.txt
197 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-59.txt i2b2/coreference/Beth_Train/docs/clinical-203.txt
198 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-14.txt i2b2/coreference/Beth_Train/docs/clinical-792.txt
199 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-122.txt i2b2/coreference/Beth_Train/docs/clinical-772.txt
200 | i2b2/relations/test_data/0225.txt i2b2/coreference/Beth_Train/docs/clinical-407.txt
201 | i2b2/relations/test_data/0421.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-657.txt
202 | i2b2/relations/test_data/0018.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-288.txt
203 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/274230067_EH.txt i2b2/coreference/Partners_Train/docs/clinical-433.txt
204 | i2b2/relations/test_data/0045.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-182.txt
205 | i2b2/relations/test_data/0165.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-332.txt
206 | i2b2/relations/test_data/0330.txt i2b2/coreference/Partners_Train/docs/clinical-346.txt
207 | i2b2/relations/test_data/0022.txt i2b2/coreference/Partners_Train/docs/clinical-11.txt
208 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-36.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-123.txt
209 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-31.txt i2b2/coreference/Beth_Train/docs/clinical-103.txt
210 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-30.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-98.txt
211 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-108.txt i2b2/coreference/Beth_Train/docs/clinical-757.txt
212 | i2b2/relations/test_data/0302.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-311.txt
213 | i2b2/relations/test_data/0126.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-121.txt
214 | i2b2/relations/test_data/0118.txt i2b2/coreference/Partners_Train/docs/clinical-116.txt
215 | i2b2/relations/test_data/0334.txt i2b2/coreference/Partners_Train/docs/clinical-351.txt
216 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-26.txt i2b2/coreference/Beth_Train/docs/clinical-78.txt
217 | i2b2/relations/test_data/0317.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-522.txt
218 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/145980160.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-571.txt
219 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-144.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-817.txt
220 | i2b2/relations/test_data/0021.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-152.txt
221 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/879492218_YC.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-72.txt
222 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/837898389.txt i2b2/coreference/Partners_Train/docs/clinical-47.txt
223 | i2b2/relations/test_data/0217.txt i2b2/coreference/Beth_Train/docs/clinical-397.txt
224 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/638157550_SC.txt i2b2/coreference/Partners_Train/docs/clinical-796.txt
225 | i2b2/relations/test_data/0349.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-562.txt
226 | i2b2/relations/test_data/0233.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-417.txt
227 | i2b2/relations/test_data/0469.txt i2b2/coreference/Partners_Train/docs/clinical-398.txt
228 | i2b2/relations/test_data/0054.txt i2b2/coreference/Partners_Train/docs/clinical-51.txt
229 | i2b2/relations/test_data/0451.txt i2b2/coreference/Beth_Train/docs/clinical-707.txt
230 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-82.txt i2b2/coreference/Beth_Train/docs/clinical-263.txt
231 | i2b2/relations/test_data/0037.txt i2b2/coreference/Beth_Train/docs/clinical-172.txt
232 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/433651389.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-726.txt
233 | i2b2/relations/test_data/0209.txt i2b2/coreference/Beth_Train/docs/clinical-387.txt
234 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/517414339.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-751.txt
235 | i2b2/relations/test_data/0270.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-276.txt
236 | i2b2/relations/test_data/0090.txt i2b2/coreference/Partners_Train/docs/clinical-86.txt
237 | i2b2/relations/test_data/0142.txt i2b2/coreference/Partners_Train/docs/clinical-131.txt
238 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/130959255.txt i2b2/coreference/Partners_Train/docs/clinical-556.txt
239 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/194442600_RWH.txt i2b2/coreference/Partners_Train/docs/clinical-596.txt
240 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/134300717.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-561.txt
241 | i2b2/relations/test_data/0042.txt i2b2/coreference/Partners_Train/docs/clinical-36.txt
242 | i2b2/relations/test_data/0102.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-303.txt
243 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-34.txt i2b2/coreference/Beth_Train/docs/clinical-118.txt
244 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-105.txt i2b2/coreference/Beth_Train/docs/clinical-742.txt
245 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-55.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-188.txt
246 | i2b2/relations/test_data/0470.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-511.txt
247 | i2b2/relations/test_data/0130.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-313.txt
248 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/284487129.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-636.txt
249 | i2b2/relations/test_data/0181.txt i2b2/coreference/Beth_Train/docs/clinical-352.txt
250 | i2b2/relations/test_data/0041.txt i2b2/coreference/Beth_Train/docs/clinical-177.txt
251 | i2b2/relations/test_data/0353.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-567.txt
252 | i2b2/relations/test_data/0221.txt i2b2/coreference/Beth_Train/docs/clinical-402.txt
253 | i2b2/relations/test_data/0249.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-437.txt
254 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-17.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-8.txt
255 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-107.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-752.txt
256 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-27.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-83.txt
257 | i2b2/relations/test_data/0449.txt i2b2/coreference/Partners_Train/docs/clinical-471.txt
258 | i2b2/relations/test_data/0454.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-712.txt
259 | i2b2/relations/test_data/0189.txt i2b2/coreference/Beth_Train/docs/clinical-362.txt
260 | i2b2/relations/test_data/0153.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-317.txt
261 | i2b2/relations/test_data/0001.txt i2b2/coreference/Beth_Train/docs/clinical-127.txt
262 | i2b2/relations/test_data/0033.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-167.txt
263 | i2b2/relations/test_data/0161.txt i2b2/coreference/Beth_Train/docs/clinical-327.txt
264 | i2b2/relations/test_data/0416.txt i2b2/coreference/Partners_Train/docs/clinical-426.txt
265 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-13.txt i2b2/coreference/Beth_Train/docs/clinical-787.txt
266 | i2b2/relations/test_data/0193.txt i2b2/coreference/Beth_Train/docs/clinical-367.txt
267 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-142.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-807.txt
268 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/825330116.txt i2b2/coreference/Partners_Train/docs/clinical-42.txt
269 | i2b2/relations/test_data/0286.txt i2b2/coreference/Partners_Train/docs/clinical-343.txt
270 | i2b2/relations/test_data/0350.txt i2b2/coreference/Partners_Train/docs/clinical-353.txt
271 | i2b2/relations/test_data/0437.txt i2b2/coreference/Partners_Train/docs/clinical-446.txt
272 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/405507617.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-701.txt
273 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-67.txt i2b2/coreference/Beth_Train/docs/clinical-218.txt
274 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/989519730_WGH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-122.txt
275 | i2b2/relations/test_data/0109.txt i2b2/coreference/Beth_Train/docs/clinical-262.txt
276 | i2b2/relations/test_data/0431.txt i2b2/coreference/Partners_Train/docs/clinical-388.txt
277 | i2b2/relations/test_data/0257.txt i2b2/coreference/Beth_Train/docs/clinical-447.txt
278 | i2b2/relations/test_data/0293.txt i2b2/coreference/Beth_Train/docs/clinical-492.txt
279 | i2b2/relations/test_data/0402.txt i2b2/coreference/Partners_Train/docs/clinical-368.txt
280 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/044687343_ELMVH.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-466.txt
281 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-50.txt i2b2/coreference/Beth_Train/docs/clinical-163.txt
282 | i2b2/relations/test_data/0150.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-141.txt
283 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/212512774_WGH.txt i2b2/coreference/Partners_Train/docs/clinical-601.txt
284 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/262912613.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-626.txt
285 | i2b2/relations/test_data/0086.txt i2b2/coreference/i2b2_Test/i2b2_Partners_Test/docs/clinical-81.txt
286 | i2b2/relations/test_data/0149.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-312.txt
287 | i2b2/relations/test_data/0389.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-612.txt
288 | i2b2/relations/test_data/0121.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-277.txt
289 | i2b2/relations/test_data/0409.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-637.txt
290 | i2b2/relations/test_data/0369.txt i2b2/coreference/Beth_Train/docs/clinical-587.txt
291 | i2b2/relations/test_data/0341.txt i2b2/coreference/Beth_Train/docs/clinical-552.txt
292 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/596437842.txt i2b2/coreference/Partners_Train/docs/clinical-776.txt
293 | i2b2/relations/concept_assertion_relation_training_data/partners/txt/920798564.txt i2b2/coreference/Partners_Train/docs/clinical-463.txt
294 | i2b2/relations/test_data/0053.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-192.txt
295 | i2b2/relations/concept_assertion_relation_training_data/beth/txt/record-65.txt i2b2/coreference/Beth_Train/docs/clinical-208.txt
296 | i2b2/relations/test_data/0394.txt i2b2/coreference/Partners_Train/docs/clinical-411.txt
297 | i2b2/relations/test_data/0361.txt i2b2/coreference/i2b2_Test/i2b2_Beth_Test/docs/clinical-577.txt
298 |
--------------------------------------------------------------------------------
/generation/i2b2_relations/relations-answers.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from os import listdir
3 | from os.path import isfile, join
4 | import nltk
5 | from nltk.stem import WordNetLemmatizer
6 | from nltk.corpus import wordnet as wn
7 | from problem_classfiers import concept_is_CommonNoun, concept_is_PastTense
8 | import json
9 | import sys
10 | reload(sys)
11 | sys.setdefaultencoding("ISO-8859-1")
12 | import random
13 | import argparse
14 | import os
15 |
16 | ## Resolve the use of medications and treatments
17 |
18 | parser = argparse.ArgumentParser()
19 | parser.add_argument('--i2b2_dir', default='', help='Directory containing i2b2 relations challenge files')
20 | parser.add_argument('--templates_dir', default='', help='Directory containing template files in the given format')
21 | parser.add_argument('--output_dir', default='', help='Directory to store the output')
22 | args = parser.parse_args()
23 |
24 | ###################################################### SET FILE PATHS ##################################################################
25 |
26 | ## i2b2 file paths ##
27 |
28 | relations_folder = args.i2b2_dir
29 |
30 | FilePath = [ "concept_assertion_relation_training_data/partners/rel/", "concept_assertion_relation_training_data/beth/rel/", "test_data/rel/"]
31 |
32 | RelationsFilePath = []
33 |
34 | for file in FilePath:
35 | RelationsFilePath.append(os.path.join(relations_folder,file))
36 |
37 | FilePath = ["concept_assertion_relation_training_data/partners/txt/", "concept_assertion_relation_training_data/beth/txt/","test_data/txt/"]
38 |
39 | NoteFilePath = []
40 |
41 | for file in FilePath:
42 | NoteFilePath.append(os.path.join(relations_folder,file))
43 |
44 | FilePath = [ "concept_assertion_relation_training_data/partners/ast/", "concept_assertion_relation_training_data/beth/ast/", "test_data/ast/"]
45 |
46 | AstFilePath = []
47 | for file in FilePath:
48 | AstFilePath.append(os.path.join(relations_folder,file))
49 |
50 | ## template file path ##
51 |
52 | template_file_path = args.templates_dir
53 |
54 | ## matching notes in temporal, coreference and relations dataset ##
55 |
56 | matching_notes = os.path.join("generation/i2b2_relations/", "matching_notes.csv")
57 |
58 | ## output file paths ##
59 |
60 | qa_output = os.path.join(args.output_dir,"relations-qa.csv")  ## needed when qa_csv_write is True
61 | ql_output = os.path.join(args.output_dir,"relations-ql.csv")
62 | relations_qa_output_json = os.path.join(args.output_dir,"relations-qa.json")
63 |
64 |
65 | ### write to csv file for viz ##
66 |
67 | qa_csv_write = False
68 | ql_csv_write = True
69 |
70 | ######################################################## CODE #########################################################################
71 |
72 | class GenerateRelationsQuestions():
73 |
74 | def __init__(self):
75 |
76 | ## synsets to identify common nouns, will be used in preprocessing to remove generic i2b2 concepts ##
77 |
78 | self.similar = []
79 | val = [wn.synsets('problem'), wn.synsets('test'), wn.synsets('procedure'), wn.synsets('disease'),
80 | wn.synsets('medication'), wn.synsets('treatment'), wn.synsets('surgery')]
81 |
82 | self.count_corefs = 0
83 | self.resolved_corefs = 0
84 | for out in val:
85 | for ss in out:
86 | self.similar.extend(ss.lemma_names())
87 |
88 | ## set paths ##
89 | self.RelationsFilePath = RelationsFilePath
90 | self.NoteFilePath = NoteFilePath
91 | self.AstFilePath = AstFilePath
92 |
93 | self.ReadRelationsData()
94 | self.ReadAssertionsData()
95 | self.ReadTemplates()
96 |
97 | ######################### Read i2b2 file functions ###################################
98 |
99 | def ReadRelationsData(self):
100 |
101 | self.RelationsPerNote = {}
102 |
103 | self.ClinicalNotes = {}
104 |
105 | ## relations as seen in i2b2 relations challenge ###
106 |
107 | type = {"TeRP": ("test", "problem"), "TeCP": ("test", "problem"), "TrIP": ("treatment", "problem"),
108 | "TrWP": ("treatment", "problem"),
109 | "TrCP": ("treatment", "problem"), "TrAP": ("treatment", "problem"), "TrNAP": ("treatment", "problem"),
110 | "PIP": ("problem1", "problem2")}
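    | ## i2b2 2010 relation codes, as interpreted by this script: TeRP = test reveals
    | ## problem, TeCP = test conducted to investigate problem, TrIP / TrWP / TrCP =
    | ## treatment improves / worsens / causes problem, TrAP = treatment administered
    | ## for problem, TrNAP = treatment not administered because of problem, and
    | ## PIP = problem indicates problem.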
111 |
112 | self.tr_status = {"TrIP": "improves", "TrWP": "worsens/not improves", "TrAP": "not known status",
113 | "TrCP": "causes"}
114 |
115 | ## read in all clinical notes ##
116 | for paths in self.NoteFilePath:
117 | files = [f for f in listdir(paths) if isfile(join(paths, f))]
118 | for file in files:
119 | remote_file = open(paths + file)
120 | Noteid = file.split(".")[0]
121 | self.ClinicalNotes[Noteid] = remote_file.readlines()
122 |
123 | ## read the file which shows the common notes in temporal, relations and coreference files from i2b2 challenge ##
124 | ## NOTE: This information is not available as part of i2b2. This file is generated using approximate methods (script provided). ##
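    | ## Example data row (one CSV field holding two tab-separated paths, matching the
    | ## split("\t") below):
    | ##   i2b2/relations/test_data/0381.txt<TAB>i2b2/coreference/Beth_Train/docs/clinical-602.txt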
125 |
126 | match_file = open(matching_notes)
127 | csvreader = csv.reader(match_file)
128 | matching_files = list(csvreader) # relation, coreference
129 |
130 | Coreference_Note = {}
131 | self.CoreferenceCluster_to_Entity_map = {}
132 | self.Entity_to_CoreferenceCluster_map = {}
133 |
134 | ### Create coreference clusters for every type in every note and give each cluster an id. ###
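    | ## Resulting structures (illustrative):
    | ##   CoreferenceCluster_to_Entity_map[note_id][type][cluster_id] -> list of concept tuples
    | ##   Entity_to_CoreferenceCluster_map[note_id][type][concept tuple] -> cluster_id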
135 |
136 | for file in matching_files[1:]:
137 | file = file[0].split("\t")
138 | relation_note_id = file[0].split("/")[-1].split(".")[0]
139 | coreference_path = file[1]
140 | coreferences = self.ReadCoreference(coreference_path, self.ClinicalNotes[relation_note_id])
141 | Coreference_Note[relation_note_id] = coreferences
142 |
143 | ## Create coreference clusters for every note ##
144 | self.CoreferenceCluster_to_Entity_map[relation_note_id] = {}
145 | self.Entity_to_CoreferenceCluster_map[relation_note_id] = {}
146 | for stype in coreferences:
147 | ## Create coreference clusters for every type (problem, test, treatment)##
148 | if stype not in self.CoreferenceCluster_to_Entity_map[relation_note_id]:
149 | self.CoreferenceCluster_to_Entity_map[relation_note_id][stype] = {}
150 | self.Entity_to_CoreferenceCluster_map[relation_note_id][stype] = {}
151 |
152 | cluster_id = 0
153 | for coref_list in coreferences[stype]:
154 |
155 | ## coref_list gets id given by cluster_id
156 | for concept in coref_list:
157 | if cluster_id not in self.CoreferenceCluster_to_Entity_map[relation_note_id][stype]:
158 | self.CoreferenceCluster_to_Entity_map[relation_note_id][stype][cluster_id] = []
159 |
160 | self.CoreferenceCluster_to_Entity_map[relation_note_id][stype][cluster_id].append(concept) ## bug fixed ##
161 | self.Entity_to_CoreferenceCluster_map[relation_note_id][stype][concept] = cluster_id
162 | cluster_id += 1
163 |
164 | #############################################################################################################################
165 |
166 | self.map_problems_to_test_revealed = {}
167 | self.map_tests_to_problem_revealed = {}
168 | self.map_problems_to_test_investigated = {}
169 | self.map_tests_to_problem_investigated = {}
170 | self.map_treatments_to_problem = {}
171 | self.map_problems_to_treatment = {}
172 | self.problems_to_badtreatment = {}
173 | self.allergic_treatments = {}
174 | self.treatments_status_to_problem = {}
175 |
176 | self.badtreatments_to_problem = {}
177 | self.symptoms_to_problem = {}
178 | self.problems_to_symptom = {}
179 |
180 | for paths in self.RelationsFilePath:
181 | files = [f for f in listdir(paths) if isfile(join(paths, f))]
182 | for file in files:
183 | remote_file = open(paths + file)
184 | Noteid = file.split(".")[0]
185 | PatientNote = self.ClinicalNotes[Noteid]
186 |
187 | try:
188 | Coreferences = Coreference_Note[Noteid]
189 | except:
190 | Coreferences = {}
191 |
192 | Relations = {}
193 |
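    | ## Each .rel line has the (approximate) i2b2 form:
    | ##   c="concept text" start_line:start_tok end_line:end_tok||r="TeRP"||c="concept text" ...
    | ## The loop below recovers each quoted annotation and its line/token offsets.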
194 | for line in remote_file:
195 | line = line.replace("|||", "||")
196 | words = line.split("||")
197 |
198 | vals = []
199 | for word in [words[0], words[2]]:
200 | term = word.split("=")
201 | full_annotation = "=".join(term[1:])
202 | index = [pos for pos, char in enumerate(full_annotation) if char == "\""]
203 | pos1 = int(index[0])
204 | pos2 = int(index[-1])
205 |
206 | annotation = full_annotation[pos1 + 1:pos2]
207 | indxs = full_annotation[pos2 + 1:].split(",")
208 |
209 | line_in_note = ""
210 | start_line = None
211 |
212 | for indx in indxs:
213 | indx = indx.strip()
214 | out = indx.split(" ")
215 | start_line = out[0].split(":")[0]
216 | start_token = out[0].split(":")[1]
217 | end_line = out[1].split(":")[0]
218 | end_token = out[1].split(":")[1]
219 |
220 | line_in_note += "".join(PatientNote[int(start_line) - 1:int(end_line)])
221 |
222 | vals.append((annotation, line_in_note, start_line, start_token))
223 |
224 | relate = words[1].split("=")[1].split("\"")[1]
225 |
226 | val1 = vals[0]
227 | val2 = vals[1]
228 | t1 = val1[0]
229 | t2 = val2[0]
230 | # print(relate)
231 | if relate not in Relations:
232 | Relations[relate] = []
233 |
234 | ## preprocessing step done when generating question and logical forms, removed from here ##
235 |
236 | '''
237 | t1 = self.SimplePreProcess(val1[0])
238 | t2 = self.SimplePreProcess(val2[0])
239 |
240 |
241 | #print("yes")
242 | if t1 == None:
243 | self.CheckForCoreferences(val1, type[relate][0],Coreferences)
244 | if t2 == None:
245 | self.CheckForCoreferences(val2, type[relate][0], Coreferences)
246 | continue
247 |
248 | if t1 == None or t2 == None:
249 | ## Just use it because we don't want to miss the answers.
250 | continue
251 |
252 | # If at least one of the concepts is a common noun, ignore the relation
253 | ### Common Noun Check End###
254 | '''
255 | val1 = (t1, type[relate][0], val1[1], val1[2], val1[3])
256 | val2 = (t2, type[relate][1], val2[1], val2[2], val2[3])
257 |
258 | if (val1, val2) not in Relations[relate]:
259 | Relations[relate].append((val1, val2))
260 |
261 | self.MakeRelationMappings(val1, val2, relate, Noteid)
262 |
263 | self.RelationsPerNote[Noteid] = [Relations, PatientNote, Coreferences]
264 |
265 | '''
266 | # for cluster_id in self.map_problems_to_test_investigated:
267 | # try:
268 | # out = self.map_problems_to_test_revealed[cluster_id]
269 | # print(self.map_problems_to_test_investigated[cluster_id])
270 | # print(out)
271 | # print("\n")
272 | # except:
273 | # continue
274 |
275 | print(Relations.keys())
276 | try:
277 | relation_investigated = Relations["TeCP"]
278 | relation_revealed = Relations["TeRP"]
279 | except:
280 |
281 | continue
282 | values = zip(*relation_revealed)
283 | for annotations in relation_investigated:
284 | try:
285 | index_val = list(values[0]).index(annotations[0][0])
286 | except:
287 | continue
288 |
289 | for idx in index_val:
290 | print(annotations)
291 | print(values[2][idx])
292 | '''
293 |
294 | def ReadCoreference(self,coref_path,PatientNote):
295 |
296 | remote_file = open(coref_path.replace("docs","chains") + ".chains")
297 | coref_concepts = {}
298 | for line in remote_file:
299 | line = line.replace("|||", "||")
300 | words = line.split("||")
301 |
302 | vals = []
303 |
304 |
305 | type = words[-1].replace("\"","").split("=")[-1].strip().replace("coref ","")
306 | if type not in coref_concepts and type != "person":
307 | coref_concepts[type] = []
308 | if type == "person":
309 | continue
310 | for word in words[0:-1]:
311 | term = word.split("=")
312 | full_annotation = "=".join(term[1:])
313 | index = [pos for pos, char in enumerate(full_annotation) if char == "\""]
314 | pos1 = int(index[0])
315 | pos2 = int(index[-1])
316 |
317 | annotation = full_annotation[pos1 + 1:pos2]
318 | indxs = full_annotation[pos2 + 1:].split(",")
319 |
320 | line_in_note = ""
321 | start_line = None
322 |
323 |
324 | for indx in indxs:
325 | indx = indx.strip()
326 | out = indx.split(" ")
327 | start_line = out[0].split(":")[0]
328 | start_token = out[0].split(":")[1]
329 | end_line = out[1].split(":")[0]
330 | end_token = out[1].split(":")[1]
331 |
332 |
333 | line_in_note += "".join(PatientNote[int(start_line) - 1:int(end_line)])
334 |
335 | vals.append((annotation,line_in_note,start_line,start_token))
336 |
337 | coref_concepts[type].append(vals)
338 | return coref_concepts
339 |
340 | def ReadAssertionsData(self):
341 |
342 | self.problem_status = {}
343 |
344 | for paths in self.AstFilePath:
345 | files = [f for f in listdir(paths) if isfile(join(paths, f))]
346 | for file in files:
347 | remote_file = open(paths + file)
348 | Noteid = file.split(".")[0]
349 | PatientNote = self.ClinicalNotes[Noteid]
350 |
351 | if Noteid not in self.problem_status:
352 | self.problem_status[Noteid] = {}
353 |
354 | for line in remote_file:
355 | line = line.replace("|||", "||")
356 | words = line.split("||")
357 |
358 | vals = []
359 | type = words[1].split("=")[1].split("\"")[1]
360 | status = words[2].split("=")[1].split("\"")[1]
361 | for word in [words[0]]:
362 | term = word.split("=")
363 | full_annotation = "=".join(term[1:])
364 | index = [pos for pos, char in enumerate(full_annotation) if char == "\""]
365 | pos1 = int(index[0])
366 | pos2 = int(index[-1])
367 |
368 | annotation = full_annotation[pos1 + 1:pos2]
369 | indxs = full_annotation[pos2 + 1:].split(",")
370 |
371 | line_in_note = ""
372 | start_line = None
373 |
374 | annotation = self.SimplePreProcess(annotation)
375 |
376 | for indx in indxs:
377 | indx = indx.strip()
378 | out = indx.split(" ")
379 | start_line = out[0].split(":")[0]
380 | start_token = out[0].split(":")[1]
381 | end_line = out[1].split(":")[0]
382 | end_token = out[1].split(":")[1]
383 |
384 | line_in_note += "".join(PatientNote[int(start_line) - 1:int(end_line)])
385 |
386 | if annotation == None:
387 | continue
388 | if type == "problem":
389 | if annotation not in self.problem_status[Noteid]:
390 | self.problem_status[Noteid][annotation] = []
391 | self.problem_status[Noteid][annotation].append((status,line_in_note,start_line,start_token))
392 |
393 | ######################## Main program functions ##########################################
394 |
395 | def ReadTemplates(self):
396 |
397 | self.relations_out = {"paragraphs": [], "title": "relations"}
398 | self.logical_out = []
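    | ## The output JSON mirrors a SQuAD-style layout (inferred from how it is filled
    | ## in below): {"title": ..., "paragraphs": [{"note_id", "context", "qas"}]},
    | ## where each qa carries "question", "id" and a list of "answers".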
399 |
400 | ########################################## Set File Paths ##############################################
401 |
402 | ### File to write Question-Answers ##
403 |
404 |
405 | if qa_csv_write:
406 | ofile = open(qa_output, "w")
407 | self.filewriter = csv.writer(ofile, delimiter="\t")
408 | self.filewriter.writerow(
409 | ["Question", "Logical Form", "Answer", "Answer line in note", "Note ID", "Difference in QA lines"])
410 |
411 | ### File to write Question-Logical Forms ##
412 |
413 | if ql_csv_write:
414 | ofile = open(ql_output, "w")
415 | self.filewriter_forlform = csv.writer(ofile, delimiter="\t")
416 | self.filewriter_forlform.writerow(["Question", "Logical Form"])
417 |
418 | ### File to read templates ###
419 |
420 | file = open(template_file_path)
421 | filereader = list(csv.reader(file))
422 |
423 | ## read only templates relevant to relations challenge ##
424 |
425 | rel_lines = []
426 | for line in filereader[1:]:
427 | if line[0] != "relations":
428 | continue
429 | rel_lines.append(line)
430 |
431 | ########################################## Main Function Call ##############################################
432 |
433 | total_questions = 0
434 | for Noteid in self.RelationsPerNote:
435 |
436 | [Relations, PatientNote, Coreferences] = self.RelationsPerNote[Noteid]
437 | out_patient = {"note_id": Noteid, "context": PatientNote, "qas": []}
438 | self.unique_questions = []
439 |
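    | ## Template row layout, as consumed below: line[0] = dataset name ("relations"),
    | ## line[2] = question paraphrases joined by "##", line[3] = logical form,
    | ## line[4] = comma-separated relation types (helper), line[5] = answer type.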
440 | for line in rel_lines:
441 |
442 | question = line[2].strip()
443 | logical_form = line[3].strip()
444 | helper = line[4].split(",")
445 | helper = [type.strip() for type in helper]
446 | answertype = line[5].strip()
447 |
448 | question = question.replace("|medication| or |medication|", "|medication|") ## added ##
449 | question = question.replace("|problem| or |problem|", "|problem|") ## added ##
450 | question = question.replace("|test| or |test|", "|test|") ## added ##
451 | question = question.replace("|test| |test| |test|", "|test|") ## added ##
452 | question = question.replace("\t", "")
453 | logical_form = logical_form.replace("\t", "")
454 |
455 |
456 | if question.strip() == "":
457 | continue
458 |
459 | ## check for errors in templates and gather all the placeholders in the templates (placeholders stored in types_to_replace) ##
460 | ## semantic types of placeholders ##
461 |
462 | types_to_replace = self.checking_for_errors(question, logical_form)
463 |
464 | if len(types_to_replace) != 0:
465 | types_to_replace = list(types_to_replace[0])
466 | else:
467 | types_to_replace = []
468 |
469 | answer_out = self.MakeLabTestQA(question, logical_form, types_to_replace, answertype, helper, Relations, Noteid, Coreferences)
470 |
471 | if len(answer_out) != 0:
472 | out_patient["qas"].extend(answer_out)
473 |
474 | total_questions += len(self.unique_questions)
475 | self.relations_out["paragraphs"].append(out_patient)
476 |
477 | print(total_questions)
478 | print(self.count_corefs)
479 | print(self.resolved_corefs)
480 |
481 | with open(relations_qa_output_json, 'w') as outfile:
482 | json.dump(self.relations_out, outfile, ensure_ascii=False)
483 |
484 | def MakeLabTestQA(self, question, logical_form, types_to_replace, answertype, helper, Relations, Noteid, Coreferences):
485 |
486 | original_question = question
487 | logical_form_template = logical_form
488 | answer_out = []
489 |
490 | for relate in helper:
491 |
492 | if relate == "ast":
493 |
494 | questions_list = question.strip().split("##")
495 |
496 | ## fixed a bug: initially the assertions data was not included ##
497 | answer_out = self.HandleAssertionQA(Noteid, types_to_replace, questions_list, logical_form_template, Coreferences, answertype)
498 |
499 | else:
500 |
501 | try:
502 | relevant_relations = Relations[relate] ## Get relations which satisfy the relate criteria
503 | except:
504 | continue
505 |
506 | for val1, val2 in relevant_relations:
507 |
508 | annotations = {val1[1]: (val1[0], val1[2], val1[3], val1[4]),
509 | val2[1]: (val2[0], val2[2], val2[3], val2[4])}
510 |
511 | ## check if there are placeholders in the question, call function to replace the placeholders ##
512 |
513 | if len(types_to_replace) != 0:
514 | questions_list = question.strip().split("##")
515 | out = self.MakeQuestion_new(types_to_replace, annotations, questions_list, logical_form_template, Coreferences, Noteid)
516 | if out == None:
517 | continue
518 | else:
519 | [question_list, logical_form, question_lines, question_start_line, question_start_token] = out
520 | else:
521 | ## if no placeholders directly use the question ##
522 | [question_list, logical_form, question_lines, question_start_line, question_start_token]= [question.split("##"), logical_form_template, "", "", ""]
523 |
524 | ### Writing question - logical form ##
525 |
526 | paraphrase_questions = set(question_list)
527 | question_templates = original_question.split("##")
528 |
529 | if len(question_list) != len(question_templates):
530 | print(question_list)
531 | print(question_templates)
532 |
533 | unique_tup = list(set(zip(question_list, question_templates)))
534 |
535 | if ql_csv_write:
536 |
537 | for qidx in range(len(unique_tup)):
538 | self.filewriter_forlform.writerow([unique_tup[qidx][0]] + [logical_form] + [unique_tup[qidx][1]] + [logical_form_template])
539 |
540 | ##### Make answers for the successful questions ####
541 |
542 | [answer, answer_line, answer_start_line, answer_start_token] = self.AnswerSubFunction(answertype, val1, val2, Noteid, relate, question_lines, question_start_line, question_start_token)
543 |
544 | if len(answer) != 0:
545 |
546 | if paraphrase_questions not in self.unique_questions:
547 |
548 | self.unique_questions.append(paraphrase_questions)
549 |
550 |
551 | ans_list = []
552 | for idx in range(len(answer)):
553 |
554 | start_line = answer_start_line[idx]
555 | start_token = answer_start_token[idx]
556 |
556 | if answertype == "problems,status":  ## answertype is a string, not a list
558 | #entity_type = "complex"
559 | entity_type = "empty"
560 | elif answer[idx] == "":
561 | entity_type = "empty"
562 | else:
563 | entity_type = "single"
564 |
565 |
566 | #if answer[idx] == "" and start_token != "":
567 | # print(paraphrase_questions)
568 | val = {"answer_start": [start_line, start_token], "text": answer[idx], "evidence": answer_line[idx], "evidence_start": start_line, "answer_entity_type": entity_type}
569 | if val not in ans_list:
570 | ans_list.append(val)
571 |
572 |
573 | ## ""evidence"" in the dictionary above is currently just the answer line in the note. You can also consider question line and answer line from note as evidence in that uncomment below code and use it accordingly ##
574 |
575 | '''
576 | ## evidence per answer ##
577 | evidence_answer = []
578 | evidence_start = []
579 | evidence_temp_line = question_line + answer_line
580 | evidence_temp_start = question_start_line + answer_start_line
581 | for pdx in range(len(evidence_temp_line)):
582 | if evidence_temp_line[pdx] not in evidence_answer:
583 | evidence_answer.append(evidence_temp_line[pdx])
584 | evidence_start.append(evidence_temp_start[pdx])
585 |
586 | if answer[idx] == "yes" or answer[idx] == "no":
587 | start_line = ""
588 | start_token = ""
589 | else:
590 | start_line = answer_start_line[idx]
591 | start_token = answer_start_token[idx]
592 |
593 | val = {"answer_start": [start_line, start_token], "text": answer[idx],"evidence": evidence_answer,"evidence_start": evidence_start}
594 | # evidence will have q_line_answer_line
595 |
596 | if qa_csv_write:
597 |
598 | result_num = answer_start_line + question_start_line
599 | perms = list(
600 | itertools.product(result_num, result_num)) ## find different pairs of numbers ##
601 | diffs = [abs(val1 - val2) for (val1, val2) in perms]
602 | difference = max(diffs)
603 |
604 | Note_val = "#".join(list(set(evidence_temp_line)))
605 |
606 | self.filewriter.writerow( ["##".join(paraphrase_questions)] + [logical_form] + [",".join(answer)] + [Note_val] + [Noteid + "_RelationsChallenge"] + [difference])
607 | '''
608 |
609 | answer_temp = {"answers": ans_list,"id": [zip(question_list, question_templates), logical_form_template], "question": list(paraphrase_questions)}
610 | answer_out.append(answer_temp)
611 |
612 | return answer_out
613 |
614 | def HandleAssertionQA(self,Noteid,dup_rwords, question_list_templates, logical_form_template,Coreferences, answertype):
615 | types_to_replace = list(dup_rwords)
616 | answer_out = []
617 | if len(dup_rwords) != 0:
618 | for problem in self.problem_status[Noteid]:
619 | answer = []
620 | result_num = []
621 | answer_line = []
622 | result_token = []
623 |
624 | logical_form = logical_form_template
625 | status = self.problem_status[Noteid][problem]
626 | rwords = list(dup_rwords)
627 | flag = 0
628 | for idx in range(len(rwords)):
629 | #print(problem)
630 | (t1,valid_list) = self.CheckIfConceptValid((problem,status[0][1],status[0][2],status[0][3]),rwords[idx], Coreferences )
631 | if t1 == None:
632 | if valid_list != None:
633 | replace_annotation = random.choice(valid_list)
634 | rwords[idx] = replace_annotation
635 | else:
636 | flag = 1
637 | else:
638 | rwords[idx] = t1
639 |
640 | if flag == 1:
641 | continue
642 |
643 | new_question_list = []
644 |
645 | ### Make Question ###
646 | for question in question_list_templates:
647 | done = []
648 | idx = 0
649 | for types in list(types_to_replace):
650 | index = question.find("|" + types + "|")
651 | if index == -1 and types not in done:
652 | print(question, "|" + types + "|", done)
653 | question = question.replace("|" + types + "|", rwords[idx])
654 | done.append(types)
655 | idx += 1
656 | #if question not in new_question_list:
657 | new_question_list.append(question)
658 |
659 | ### Make Logical Form ###
660 | idx = 0
661 | done = []
662 | for types in list(types_to_replace):
663 | index = logical_form.find("|" + types + "|")
664 | if index == -1 and types not in done:
665 | print(logical_form, "|" + types + "|", done, types)
666 | done.append(types)
667 | logical_form = logical_form.replace("|" + types + "|", rwords[idx])
668 | idx += 1
669 |
670 | for val in status:
671 | #print(val[0])
672 | answer.append(val[0])
673 | answer_line.append(val[1])
674 | result_num.append(int(val[2]))
675 | result_token.append(int(val[3]))
676 |
677 | if answertype == "none":
678 | question_templates = question_list_templates
679 | unique_tup = list(set(zip(new_question_list, question_templates)))
680 | for qidx in range(len(unique_tup)):
681 | self.filewriter_forlform.writerow([unique_tup[qidx][0]] + [logical_form] + [unique_tup[qidx][1]] + [logical_form_template])
682 | else:
683 |
684 | question_templates = question_list_templates
685 | if len(new_question_list) != len(question_templates):
686 | print(new_question_list)
687 | print(question_templates)
688 | unique_tup = list(set(zip(new_question_list, question_templates)))
689 |
690 | for qidx in range(len(unique_tup)):
691 | self.filewriter_forlform.writerow([unique_tup[qidx][0]] + [logical_form] + [unique_tup[qidx][1]] + [logical_form_template])
692 |
693 |
694 | if len(answer) != 0:
695 |
696 | '''
697 | perms = list(itertools.product(result_num, result_num))
698 | diffs = [abs(val1 - val2) for (val1, val2) in perms]
699 | difference = max(diffs)
700 | question_templates = question_list_templates
701 |
702 | Note_val = "#".join(answer_line)
703 | '''
704 | new_question_list = set(new_question_list)
705 | if new_question_list not in self.unique_questions:
706 | '''
707 | if qa_csv_write:
708 | self.filewriter.writerow(["##".join(new_question_list)] + [logical_form] + [",".join(answer)] + [Note_val] + [Noteid + "_RelationsChallenge"] + [ difference])
709 | '''
710 | self.unique_questions.append(set(new_question_list))
711 |
712 | ans_list = []
713 | for idx in range(len(answer)):
714 | #print(answer[idx], result_num[idx], result_token[idx])
715 | #val = {"answer_start": [result_num[idx], result_token[idx]], "text": answer[idx], "evidence": answer_line[idx], "evidence_start": result_num[idx]}
716 | val = {"answer_start": [result_num[idx], ""], "text": "", "evidence": answer_line[idx], "evidence_start": result_num[idx], "answer_entity_type": "empty"}
717 | if val not in ans_list:
718 | ans_list.append(val)
719 |
720 | # evidence will have q_line_answer_line
721 | answer_temp = {"answers": ans_list, "id": [zip(question_templates,new_question_list),logical_form_template], "question": list(set(new_question_list))}
722 | answer_out.append(answer_temp)
723 |
724 |
725 |
726 | return answer_out
727 |
728 | ######################## Main Utility Functions ######################################
729 |
730 | def MakeRelationMappings(self, val1, val2, relate, Noteid):
731 |
732 | # print(self.Entity_to_CoreferenceCluster_map[Noteid]["problem"])
733 | # print((val1[0],val1[2],val1[3],val1[4]))
734 |
735 | ## If val1 belongs to some cluster, map to that cluster; if not, map to the concept directly ##
736 | ## Not resolving coreferences for answers at this point, so some answers may be redundant ##
737 |
738 | try:
739 |
740 | concept_cluster_1 = self.Entity_to_CoreferenceCluster_map[Noteid][val1[1].replace("1", "")][
741 | (val1[0], val1[2], val1[3], val1[4])]
742 | # print(concept_cluster_1)
743 | except:
744 | concept_cluster_1 = val1[0]
745 | try:
746 |
747 | concept_cluster_2 = self.Entity_to_CoreferenceCluster_map[Noteid][val2[1].replace("2", "")][
748 | (val2[0], val2[2], val2[3], val2[4])]
749 |
750 | # print(concept_cluster_2)
751 | except:
752 | concept_cluster_2 = val2[0]
753 | # print(concept_cluster_2)
754 |
755 | if Noteid not in self.map_problems_to_test_revealed:
756 | self.map_problems_to_test_revealed[Noteid] = {}
757 | self.map_tests_to_problem_revealed[Noteid] = {}
758 | self.map_problems_to_test_investigated[Noteid] = {}
759 | self.map_tests_to_problem_investigated[Noteid] = {}
760 | self.allergic_treatments[Noteid] = []
761 | self.problems_to_badtreatment[Noteid] = {}
762 | self.treatments_status_to_problem[Noteid] = {}
763 | self.map_problems_to_treatment[Noteid] = {}
764 | self.badtreatments_to_problem[Noteid] = {}
765 | self.symptoms_to_problem[Noteid] = {}
766 | self.problems_to_symptom[Noteid] = {}
767 |
768 | if relate == "TeRP":
769 |
770 | ## Coreference checking serves as the semantic check here ##
771 |
772 | if concept_cluster_1 not in self.map_problems_to_test_revealed[Noteid]:
773 | self.map_problems_to_test_revealed[Noteid][concept_cluster_1] = []
774 |
775 | if concept_cluster_2 not in self.map_tests_to_problem_revealed[Noteid]:
776 | self.map_tests_to_problem_revealed[Noteid][concept_cluster_2] = []
777 |
778 | self.map_problems_to_test_revealed[Noteid][concept_cluster_1].append(val2)
779 | self.map_tests_to_problem_revealed[Noteid][concept_cluster_2].append(val1)
780 |
781 | if relate == "TeCP":
782 |
783 | ## Simply checking the name; ideally check semantically or normalize with CUIs ##
784 |
785 | if concept_cluster_1 not in self.map_problems_to_test_investigated[Noteid]:
786 | self.map_problems_to_test_investigated[Noteid][concept_cluster_1] = []
787 |
788 | if concept_cluster_2 not in self.map_tests_to_problem_investigated[Noteid]:
789 | self.map_tests_to_problem_investigated[Noteid][concept_cluster_2] = []
790 |
791 | self.map_problems_to_test_investigated[Noteid][concept_cluster_1].append(val2)
792 | self.map_tests_to_problem_investigated[Noteid][concept_cluster_2].append(val1)
793 |
794 | if relate == "TrNAP" or relate == "TrCP":
795 |
796 | if val1 not in self.allergic_treatments[Noteid]:
797 | self.allergic_treatments[Noteid].append(val1)
798 |
799 | if relate == "TrCP":
800 |
801 | if concept_cluster_1 not in self.problems_to_badtreatment[Noteid]:
802 | self.problems_to_badtreatment[Noteid][concept_cluster_1] = []
803 |
804 | if concept_cluster_2 not in self.badtreatments_to_problem[Noteid]:
805 | self.badtreatments_to_problem[Noteid][concept_cluster_2] = []
806 |
807 | self.problems_to_badtreatment[Noteid][concept_cluster_1].append(val2)
808 | self.badtreatments_to_problem[Noteid][concept_cluster_2].append(val1)
809 |
810 | if concept_cluster_1 not in self.map_problems_to_treatment[Noteid]:
811 | self.map_problems_to_treatment[Noteid][concept_cluster_1] = []
812 |
813 | status = self.tr_status[relate]
814 | self.map_problems_to_treatment[Noteid][concept_cluster_1].append((val2, status))
815 |
816 |         if relate in ("TrIP", "TrWP", "TrAP"):
817 |
818 | if concept_cluster_2 not in self.treatments_status_to_problem[Noteid]:
819 | self.treatments_status_to_problem[Noteid][concept_cluster_2] = []
820 |
821 | status = self.tr_status[relate]
822 | self.treatments_status_to_problem[Noteid][concept_cluster_2].append(
823 | (val1, status)) ## val1 is treatment
824 |
825 | if concept_cluster_1 not in self.map_problems_to_treatment[Noteid]:
826 | self.map_problems_to_treatment[Noteid][concept_cluster_1] = []
827 |
828 | status = self.tr_status[relate]
829 | self.map_problems_to_treatment[Noteid][concept_cluster_1].append((val2, status))
830 |
831 | if relate == "PIP":
832 |
833 | if concept_cluster_1 not in self.symptoms_to_problem[Noteid]:
834 | self.symptoms_to_problem[Noteid][concept_cluster_1] = []
835 |
836 | if concept_cluster_2 not in self.problems_to_symptom[Noteid]:
837 | self.problems_to_symptom[Noteid][concept_cluster_2] = []
838 |
839 | self.symptoms_to_problem[Noteid][concept_cluster_1].append(val2)
840 | self.problems_to_symptom[Noteid][concept_cluster_2].append(val1)
841 |
842 | def AnswerSubFunction(self, answertype, val1, val2, Noteid, relate, question_lines, question_start_line, question_start_token):
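     |         ## Collect answers of the given `answertype` from the relation maps built by MakeRelationMappings.
     |         ## Returns [answer, answer_line, result_start_line, result_start_token], aligned by index; yes/no
     |         ## style answers reuse the question's own lines as evidence.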
843 |
844 | try:
845 | concept_cluster_1 = self.Entity_to_CoreferenceCluster_map[Noteid][val1[1].replace("1", "")][
846 | (val1[0], val1[2], val1[3], val1[4])]
847 |         except KeyError:
848 | concept_cluster_1 = val1[0]
849 | try:
850 | concept_cluster_2 = self.Entity_to_CoreferenceCluster_map[Noteid][val2[1].replace("2", "")][
851 | (val2[0], val2[2], val2[3], val2[4])]
852 |         except KeyError:
853 | concept_cluster_2 = val2[0]
854 |
855 | answer = []
856 | result_start_line = []
857 | result_start_token = []
858 | answer_line = []
859 |
860 | ######################## rules for test answers ########################
861 |         if answertype in ("yes/no", "abnormal", "yes"):
862 | #answer = ["yes"]* len(question_lines)
863 | answer = [""] * len(question_lines)
864 | answer_line.extend(question_lines)
865 | result_start_line.extend(question_start_line)
866 | #result_start_token.extend(question_start_token)
867 | result_start_token = [""] * len(question_lines)
868 | elif answertype == "tests_investigated":
869 | tests = self.map_tests_to_problem_investigated[Noteid][concept_cluster_2]
870 | for test in tests:
871 | answer += [test[0]]
872 | answer_line.append(test[2])
873 | result_start_line.append(int(test[3]))
874 | result_start_token.append(int(test[4]))
875 | elif answertype == "tests_revealed":
876 | tests = self.map_tests_to_problem_revealed[Noteid][concept_cluster_2]
877 | for test in tests:
878 | answer += [test[0]]
879 | answer_line.append(test[2])
880 | result_start_line.append(int(test[3]))
881 | result_start_token.append(int(test[4]))
882 | elif answertype == "conducted_problem_revealed_problem":
883 | try:
884 |                 investigated_problems = self.map_problems_to_test_investigated[Noteid][concept_cluster_1]
885 | for problem in investigated_problems:
886 | answer += [problem[0]]
887 | # answer += ["conducted " + problem[0]]
888 | answer_line.append(problem[2])
889 | result_start_line.append(int(problem[3]))
890 | result_start_token.append(int(problem[4]))
891 |             except KeyError:
892 | pass
893 | try:
894 |                 revealed_problems = self.map_problems_to_test_revealed[Noteid][concept_cluster_1]
895 | for problem in revealed_problems:
896 | # answer += ["revealed " + problem[0]]
897 | answer += [problem[0]]
898 | answer_line.append(problem[2])
899 | result_start_line.append(int(problem[3]))
900 | result_start_token.append(int(problem[4]))
901 |             except KeyError:
902 | pass
903 | elif answertype == "revealed_problem":
904 | try:
905 |                 revealed_problems = self.map_problems_to_test_revealed[Noteid][concept_cluster_1]
906 | for problem in revealed_problems:
907 | answer += [problem[0]]
908 | answer_line.append(problem[2])
909 | result_start_line.append(int(problem[3]))
910 | result_start_token.append(int(problem[4]))
911 |             except KeyError:
912 | #answer = ["no"]*len(question_lines)
913 | answer = [""] * len(question_lines)
914 | answer_line.extend(question_lines)
915 | result_start_line.extend(question_start_line)
916 | #result_start_token.extend(question_start_token)
917 | result_start_token = [""] * len(question_lines)
918 |
919 | elif answertype == "problems_investigated":
920 | problems = self.map_problems_to_test_investigated[Noteid][concept_cluster_1]
921 | # print(problems)
922 | for problem in problems:
923 | answer += [problem[0]]
924 | answer_line.append(problem[2])
925 | result_start_line.append(int(problem[3]))
926 | result_start_token.append(int(problem[4]))
927 | ##########################################################################################################################################
928 | elif answertype == "allergic_treatments":
929 | events = self.allergic_treatments[Noteid]
930 |
931 | for event in events:
932 | answer += [event[0]]
933 | answer_line.append(event[2])
934 | result_start_line.append(int(event[3]))
935 | result_start_token.append(int(event[4]))
936 | elif answertype == "treatments, status":
937 | events = self.treatments_status_to_problem[Noteid][concept_cluster_2]
938 |
939 | for temp in events:
940 | (event, status) = temp
941 | '''
942 | stemp = ""
943 | status = status.strip()
944 | if val2[0] in self.problem_status[Noteid]:
945 | out = self.problem_status[Noteid][val2[0]]
946 | if out[1] == question_line and out[2] == line_num:
947 | stemp = out[0]
948 | status += ", "+stemp
949 | '''
950 | # answer += [event[0] + " (" + status + ")"]
951 | answer += [event[0]]
952 | answer_line.append(event[2])
953 | result_start_line.append(int(event[3]))
954 | result_start_token.append(int(event[4]))
955 | elif answertype == "problems,status":
956 | try:
957 | events = self.map_problems_to_treatment[Noteid][concept_cluster_1]
958 | # print(events)
959 |                 if "causes" in list(zip(*events))[1] and "improves" in list(zip(*events))[1]:
960 | print(Noteid)
961 | for temp in events:
962 | (event, status) = temp
963 | #answer += [event[0] + " (" + status + ")"]
964 | #answer.append([event[0], status])
965 | answer.append("")
966 | # answer += [event[0]]
967 | answer_line.append(event[2])
968 | result_start_line.append(int(event[3]))
969 | result_start_token.append(int(event[4]))
970 | except:
971 | caused_problems = self.problems_to_badtreatment[Noteid][concept_cluster_1]
972 |
973 | for event in caused_problems:
974 | #answer += [event[0] + " (" + "caused" + ")"]
975 | #answer.append([event[0] , "caused"])
976 | # answer += [event[0]]
977 | answer.append("")
978 | answer_line.append(event[2])
979 | result_start_line.append(int(event[3]))
980 | result_start_token.append(int(event[4]))
981 | elif answertype == "no":
982 | #answer = ["no"]*len(question_lines)
983 | answer = [""] * len(question_lines)
984 | answer_line.extend(question_lines)
985 | result_start_line.extend(question_start_line)
986 | #result_start_token.extend(question_start_token)
987 | result_start_token = [""] * len(question_lines)
988 | elif answertype == "problems_check_conducted":
989 | events = self.map_problems_to_treatment[Noteid][concept_cluster_1]
990 |
991 | for temp in events:
992 | (event, status) = temp
993 | # answer += ["treatment:" + event[0]]
994 | answer += [event[0]]
995 | answer_line.append(event[2])
996 | result_start_line.append(int(event[3]))
997 | result_start_token.append(int(event[4]))
998 | try:
999 | treatment_entities_list = self.CoreferenceCluster_to_Entity_map["treatment"][concept_cluster_1]
1000 | tests = self.map_problems_to_test_investigated[Noteid]
1001 | for test in tests:
1002 | test_entities_list = self.CoreferenceCluster_to_Entity_map["test"][test]
1003 | new_set = set(test_entities_list).intersection(set(treatment_entities_list))
1004 | if len(new_set) != 0:
1005 | events = self.map_problems_to_test_investigated[Noteid][test]
1006 |                         for event in events:
1007 |                             # entries here are plain concept tuples, not (event, status) pairs
1008 | # answer += ["tests:" + event[0]]
1009 | answer += [event[0]]
1010 | answer_line.append(event[2])
1011 | result_start_line.append(int(event[3]))
1012 | result_start_token.append(int(event[4]))
1013 | break
1014 | except:
1015 | pass
1016 | elif answertype == "problems":
1017 |
1018 | if relate == "TrCP":
1019 | pass
1020 | # events = self.problems_to_badtreatment[Noteid][concept_cluster_1]
1021 |
1022 | # for event in events:
1023 | # answer += [event[0]]
1024 | # answer_line.append(event[2])
1025 | # result_start_line.append(int(event[3]))
1026 | # result_start_token.append(int(event[4]))
1027 | else:
1028 |
1029 | events = self.map_problems_to_treatment[Noteid][concept_cluster_1]
1030 |
1031 | for temp in events:
1032 | (event, status) = temp
1033 | answer += [event[0]]
1034 | answer_line.append(event[2])
1035 | result_start_line.append(int(event[3]))
1036 | result_start_token.append(int(event[4]))
1037 |
1038 | elif answertype == "treatments":
1039 | events = self.treatments_status_to_problem[Noteid][concept_cluster_2]
1040 |
1041 | for temp in events:
1042 | (event, status) = temp
1043 | answer += [event[0]]
1044 | answer_line.append(event[2])
1045 | result_start_line.append(int(event[3]))
1046 | result_start_token.append(int(event[4]))
1047 | elif answertype == "problem1, treatment":
1048 |
1049 | try:
1050 | events = self.badtreatments_to_problem[Noteid][concept_cluster_2]
1051 |
1052 | for event in events:
1053 | answer += [event[0]]
1054 | answer_line.append(event[2])
1055 | result_start_line.append(int(event[3]))
1056 | result_start_token.append(int(event[4]))
1057 |             except KeyError:
1058 | pass
1059 | '''
1060 | try:
1061 | events = self.problems_to_symptom[Noteid][concept_cluster_2]
1062 |
1063 | for event in events:
1064 | answer += [event[0]]
1065 | answer_line.append(event[2])
1066 | result_start_line.append(int(event[3]))
1067 | result_start_token.append(int(event[4]))
1068 | except:
1069 | print(relate,answertype)
1070 | pass
1071 | '''
1072 | elif answertype == "problem1":
1073 |
1074 | events = self.problems_to_symptom[Noteid][concept_cluster_2]
1075 |
1076 | for event in events:
1077 | answer += [event[0]]
1078 | answer_line.append(event[2])
1079 | result_start_line.append(int(event[3]))
1080 | result_start_token.append(int(event[4]))
1081 |
1082 | elif answertype == "symptoms":
1083 |
1084 | events = self.symptoms_to_problem[Noteid][concept_cluster_1]
1085 |
1086 | for event in events:
1087 | answer += [event[0]]
1088 | answer_line.append(event[2])
1089 | result_start_line.append(int(event[3]))
1090 | result_start_token.append(int(event[4]))
1091 | elif answertype == "none":
1092 | answer = []
1093 |         else:
1094 |             print("Unhandled answer type:", answertype)
1095 |             answer = []
1096 |
1097 | return [answer, answer_line, result_start_line, result_start_token]
1098 |
1099 | def MakeQuestion_new(self, types_to_replace, annotations, question_list, logical_form_template, Coreferences, Noteid):
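     |         ## Substitute each |type| placeholder in the question templates and the logical form with concept
     |         ## text from the note's annotations. If the raw text fails SimplePreProcess, fall back to a randomly
     |         ## chosen coreferent mention; if nothing usable exists, return None and drop this question set.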
1100 |
1101 | new_question_list = []
1102 | question_start_line = []
1103 | question_start_token = []
1104 | question_line = []
1105 |
1106 | rwords = list(types_to_replace)
1107 | for idx in range(len(rwords)):
1108 | question_start_line.append(int(annotations[rwords[idx]][2]))
1109 | question_start_token.append(int(annotations[rwords[idx]][3]))
1110 | question_line.append(annotations[rwords[idx]][1])
1111 |
1112 | (t1, valid_list) = self.CheckIfConceptValid(annotations[rwords[idx]], rwords[idx], Coreferences)
1113 |             if t1 is None:
1114 |                 if valid_list is not None:
1115 |                     replace_annotation = random.choice(valid_list) ### all of them can be used for QL forms (more training data)
1116 |                     # print(annotations[rwords[idx]])
1117 |                     rwords[idx] = replace_annotation
1118 | else:
1119 | return None
1120 | else:
1121 | rwords[idx] = t1
1122 |
1123 | for question in question_list:
1124 | done = []
1125 | idx = 0
1126 | for types in list(types_to_replace):
1127 | # temp = qwords
1128 | index = question.find("|" + types + "|")
1129 | if index == -1 and types not in done:
1130 |                     print("Placeholder missing from question:", question, "|" + types + "|", done)
1131 | question = question.replace("|" + types + "|", rwords[idx])
1132 | done.append(types)
1133 | idx += 1
1134 |
1135 | new_question_list.append(question)
1136 |
1137 | idx = 0
1138 | done = []
1139 | for types in list(types_to_replace):
1140 | index = logical_form_template.find("|" + types + "|")
1141 | if index == -1 and types not in done:
1142 |                 print("Placeholder missing from logical form:", logical_form_template, "|" + types + "|", done)
1143 | done.append(types)
1144 | logical_form_template = logical_form_template.replace("|" + types + "|", rwords[idx])
1145 | idx += 1
1146 |
1147 | return [new_question_list, logical_form_template, question_line, question_start_line, question_start_token]
1148 |
1149 | ######################## Supporting Utility Functions ######################################
1150 |
1151 |     # examples of raw concept strings this preprocessing must handle: "the tremendous tumor burden", "the cord compression", "gait weakness", "stress incontinence", "copd flare", "a wide based gait", "shuffling short steps", "head computerized tomography scan"
1152 |
1153 | def SimplePreProcess(self, word):
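     |         ## Normalize a concept string before it is substituted into a question: drop a leading determiner,
     |         ## pronouns and the word "patient"; reject common nouns, past-tense phrases, and anything left
     |         ## without a content word (noun/adjective/verb). Returns the cleaned string, or None if unusable.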
1154 |
1155 | if word == "":
1156 | return None
1157 |
1158 | lemmatizer = WordNetLemmatizer()
1159 |
1160 | if concept_is_CommonNoun(word) == 1 or concept_is_PastTense(word) == 1:
1161 | return None
1162 |
1163 | tag = nltk.pos_tag(nltk.word_tokenize(word))
1164 |         temp = list(zip(*tag))
1165 | words = list(temp[0])
1166 | tags = list(temp[1])
1167 |
1168 | if tags[0] == "DT":
1169 | words[0] = ""
1170 | else:
1171 | pass
1172 |
1173 | for idx in range(len(tags)):
1174 | if lemmatizer.lemmatize(words[idx].lower()) in ["patient"]:
1175 | words[idx] = ""
1176 | if tags[idx] in ["PRP","PRP$"]:
1177 | if idx != 0 or " ".join(words[0:idx]).strip() != "":
1178 | words[idx] = "the"
1179 | if idx == 0:
1180 | words[idx] = ""
1181 |             if " ".join(words[0:idx]).strip() != "" and tags[idx] in ["IN", "WDT"]:
1182 | words[idx] = ""
1183 |
1184 |         words = [word for word in words if word != "" and lemmatizer.lemmatize(word) not in self.similar] ## check if it's okay to start with "further"
1185 | if len(words) == 0:
1186 | return None
1187 |
1188 |
1189 |         filtered = " ".join(words) ## To verify the result is grammatical, a parser could be used here ##
1190 |         tag = nltk.pos_tag(nltk.word_tokenize(filtered))
1191 |         temp = list(zip(*tag))
1192 |         words = list(temp[0])
1193 |         tags = list(temp[1])
1194 |
1195 |         if len(set(["NN", "NNS", "JJR", "JJS", "JJ", "NNP", "NNPS", "VB", "VBG", "VBP", "VBZ"]).intersection(set(tags))) == 0:
1196 | return None
1197 | #events = word
1198 | #fevent = []
1199 | #out = events.split(" ")
1200 | #for val in out:
1201 | # if (val.lower().find("patient") == -1):
1202 | # fevent.append(val)
1203 |
1204 | #if len(fevent) == 0:
1205 | # return None
1206 |
1207 | #events = " ".join(fevent) # Remove Patient or any other common words
1208 |
1209 | #exclude = set(string.punctuation)
1210 |         #s = ''.join(ch for ch in filtered if ch not in exclude)
1211 |         #print(filtered)
1212 |         return filtered
1213 |
1214 |     def CheckForCoreferences(self, concept, concept_type, Coreferences):
     |         ## Return the preprocessed surface forms of all mentions coreferent with this concept,
     |         ## or None if no coreference cluster of the matching type contains it.
1215 |
1216 |         self.count_corefs += 1
1217 |         valid_list = []
1218 |         if concept_type == "problem1" or concept_type == "problem2":
1219 |             concept_type = "problem"
1220 |         try:
1221 |             coref_lists = Coreferences[concept_type]
1222 |         except KeyError:
1223 |             #print(concept_type, Coreferences.keys())
1224 | return None
1225 |
1226 | for coref_list in coref_lists:
1227 | if concept in coref_list:
1228 |
1229 |                 #print(concept[0], list(zip(*coref_list))[0])
1230 |                 coref_words = list(zip(*coref_list))[0]
1231 |                 for coref_concept in coref_words:
1232 |                     sout = self.SimplePreProcess(coref_concept)
1233 |                     #out_list = list(coref_list)
1234 |                     #out_list.append(sout) ############################ correct grammar or not #############
1235 |                     if sout is not None and sout not in valid_list:
1236 | valid_list.append(sout)
1237 | #print(concept[0],valid_list,set(zip(*coref_list)[0]).symmetric_difference(set(valid_list)))
1238 |
1239 | if len(valid_list) != 0:
1240 | self.resolved_corefs += 1
1241 | return valid_list
1242 | else:
1243 |
1244 | return None
1245 |
1246 |     def CheckIfConceptValid(self, val, concept_type, Coreferences):
1247 |
1248 | t1 = self.SimplePreProcess(val[0])
1249 | valid_list = None
1250 |
1251 |         ## currently only looking for a coreference if the original word is not valid; could also be used to change original concepts ###
1252 |
1253 |         if t1 is None:
1254 |             valid_list = self.CheckForCoreferences(val, concept_type, Coreferences)
1255 |             #print(val[0], valid_list, Coreferences[concept_type])
1256 | else:
1257 | pass
1258 |
1259 | return (t1,valid_list)
1260 |
1261 |     # If at least one of the concepts is a common noun, ignore the relation
1262 | ### Common Noun Check End###
1263 |
1264 | def checking_for_errors(self, question_list,logical_form_template):
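     |         ## Sanity-check a "##"-separated list of question paraphrases against their logical form: every
     |         ## paraphrase and the logical form must use exactly the same set of |type| placeholders.
     |         ## Returns a single-element list holding that placeholder set, or [] on any mismatch.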
1265 |
1266 | question_list = question_list.split("##")
1267 | qwords_list = []
1268 | dup_rwords_list = []
1269 | unique_templates = []
1270 |
1271 | #logical_form_template = logical_form_template.replace("|treatment|", "|medication|").strip()
1272 |
1273 | for question in question_list:
1274 | if question.strip() == "":
1275 | continue
1276 | #question = question.replace("|medication| or |medication|", "|medication|")
1277 | #question = question.replace("|treatment|", "|medication|").strip()
1278 | if question not in unique_templates:
1279 | unique_templates.append(question)
1280 | else:
1281 | continue
1282 |
1283 |             # odd-indexed pieces of the "|" split are the placeholder type names
1284 | qwords = question.split("|")
1285 | dup_rwords = qwords[1:len(qwords):2]
1286 |
1287 | qwords_list.append(qwords)
1288 |
1289 | if len(dup_rwords_list) == 0:
1290 | dup_rwords_list = [set(dup_rwords)]
1291 | else:
1292 | if set(dup_rwords) not in dup_rwords_list:
1293 | print("Error Out Of Context Question:")
1294 | print(question, logical_form_template, question_list)
1295 | return []
1296 |
1297 | lwords = logical_form_template.split("|")
1298 | dup_lrwords = lwords[1:len(lwords):2]
1299 | if set(dup_lrwords) not in dup_rwords_list:
1300 | print("Error Out Of Context Question-Logical Form Pairs:")
1301 | print(question_list, logical_form_template)
1302 | return []
1303 |
1304 |
1305 | if len(dup_rwords_list) != 1:
1306 | print("Check Question_Logical Form Mapping")
1307 | print(dup_rwords_list, question_list)
1308 | print(logical_form_template)
1309 | return []
1310 |
1311 | return dup_rwords_list
1312 |
1313 |
1314 |
1315 | if __name__ == "__main__":
1316 | GenerateRelationsQuestions()
--------------------------------------------------------------------------------