├── .idea ├── .gitignore ├── misc.xml ├── modules.xml ├── nlp2phenome.iml ├── other.xml └── vcs.xml ├── EDI_ann_doc.py ├── LabelModel.py ├── README.md ├── ann_converter.py ├── ann_utils.py ├── annotation_docs.py ├── data ├── entity_types.txt ├── entity_types_modifiers.txt ├── entity_types_no_context.txt ├── entity_types_phenotypes.txt └── entity_types_times.txt ├── doc_inference.py ├── learners.py ├── mention_pattern.py ├── neg-tumour-dt-learnt.png ├── nlp_to_phenome.py ├── predict_helper.py ├── pretrained_models ├── stroke_settings.zip ├── stroke_subtype_models.zip └── stroke_supplemental-gazetteer.zip ├── reportreader.py ├── requirements.txt ├── run_learning.py ├── settings ├── concept_mapping_stroke_sample.json ├── entity_types_phenotypes_stroke_sample.txt ├── ignore_mappings_stroke_sample.json ├── sample_setting.json ├── sample_setting_kfold_learning.json ├── stroke-subtype-rules-full.json └── stroke-subtype-rules.json └── utils.py /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/nlp2phenome.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 19 | -------------------------------------------------------------------------------- /.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /EDI_ann_doc.py: -------------------------------------------------------------------------------- 1 | from annotation_docs import EDIRAnn, relocate_annotation_pos 2 | import logging 3 | from os.path import basename, isfile, join, split 4 | import xml.etree.ElementTree as ET 5 | import re 6 | 7 | 8 | class EDIRDoc(object): 9 | """ 10 | a class for reading EDIR annotation doc (XML) 11 | """ 12 | 13 | def __init__(self, file_path): 14 | self._path = file_path 15 | self._root = None 16 | self._full_text = None 17 | self._word_offset_start = -1 18 | self._entities = None 19 | self.load() 20 | 21 | @property 22 | def file_path(self): 23 | return self._path 24 | 25 | def load(self): 26 | if not isfile(self.file_path): 27 | logging.debug('%s is NOT a file' % self.file_path) 28 | return 29 | tree = ET.parse(self.file_path) 30 | self._root = tree.getroot() 31 | self.get_word_offset_start() 32 | 33 | @property 34 | def get_full_text(self): 35 | if self._full_text is not None: 36 | return self._full_text 37 | if self._root is None: 38 | self.load() 39 | root = self._root 40 | d = '' 41 | start_offset = -1 42 | for p in root.findall('.//p'): 43 | for s in p: 44 | if 'proc' in s.attrib: # and s.attrib['proc'] == 'yes': 45 | for w in s: 46 | id_val = int(w.attrib['id'][1:]) 47 | if 
start_offset == -1: 48 | start_offset = id_val 49 | offset = id_val - start_offset 50 | d += ' ' * (offset - len(d)) + w.text 51 | self._full_text = d 52 | return d 53 | 54 | def get_word_offset_start(self): 55 | if self._word_offset_start >= 0: 56 | return self._word_offset_start 57 | root = self._root 58 | offset_start = -1 59 | for e in root.findall('.//p/s[@proc]/w'): 60 | if 'id' not in e.attrib: 61 | continue 62 | else: 63 | offset_start = int(e.attrib['id'][1:]) 64 | break 65 | # if offset_start == -1: 66 | # logging.debug('%s offset start could not be found' % self.file_path) 67 | self._word_offset_start = offset_start 68 | 69 | def get_ess_entities(self): 70 | if self._entities is not None: 71 | return self._entities 72 | root = self._root 73 | offset_start = self.get_word_offset_start() 74 | entities = [] 75 | for e in root.findall('.//standoff/ents/ent'): 76 | if 'type' not in e.attrib: 77 | continue 78 | ent_type = e.attrib['type'] 79 | if ent_type.startswith('label:'): 80 | continue 81 | negated = False 82 | if 'neg_' in ent_type: 83 | negated = True 84 | ent_type = ent_type.replace(r'neg_', '') 85 | str = ' '.join([part.text for part in e.findall('./parts/part')]) 86 | ent_start = -1 87 | ent_end = -1 88 | for part in e.findall('./parts/part'): 89 | ent_start = int(part.attrib['sw'][1:]) - offset_start 90 | ent_end = ent_start + len(part.text) 91 | ann = EDIRAnn(str=str, start=ent_start, end=ent_end, type=ent_type) 92 | ann.negated = negated 93 | ann.id = len(entities) 94 | entities.append(ann) 95 | self._entities = entities 96 | return self._entities 97 | 98 | def relocate_anns(self, t): 99 | if self._entities is None: 100 | return 101 | for a in self._entities: 102 | s, e = relocate_annotation_pos(t, a.start, a.end, a.str) 103 | a.start = s 104 | a.end = e 105 | 106 | 107 | class eHostGenedDoc(EDIRDoc): 108 | def __init__(self, file_path): 109 | super(eHostGenedDoc, self).__init__(file_path) 110 | 111 | def get_ess_entities(self): 112 | if self._entities is not None: 113 | return self._entities 114 | root = self._root 115 | entities = [] 116 | s_e_ids = [] 117 | for e in root.findall('.//classMention'): 118 | mcs = e.findall('./mentionClass') 119 | mention_id = e.attrib['id'] 120 | if len(mcs) > 0: 121 | mc = mcs[0] 122 | cls = mc.attrib['id'] 123 | cls = cls.replace('Negated_', '').replace('hypothetical_', '').replace('Other_', '').replace( 124 | 'historical_', '') 125 | mentions = root.findall('.//mention[@id="' + mention_id + '"]/..') 126 | if len(mentions) > 0: 127 | span = mentions[0].findall('./span') 128 | ent_start = span[0].attrib['start'] 129 | ent_end = span[0].attrib['end'] 130 | 131 | s_e_id = '%s-%s' % (ent_start, ent_end) 132 | if s_e_id in s_e_ids: 133 | continue 134 | s_e_ids.append(s_e_id) 135 | 136 | spannedText = mentions[0].findall('./spannedText') 137 | str = spannedText[0].text 138 | ann = EDIRAnn(str=str, start=int(ent_start), end=int(ent_end), type=cls) 139 | ann.id = len(entities) 140 | entities.append(ann) 141 | self._entities = entities 142 | return self._entities 143 | 144 | 145 | class eHostDoc(EDIRDoc): 146 | def __init__(self, file_path): 147 | super(eHostDoc, self).__init__(file_path) 148 | 149 | def get_ess_entities(self): 150 | if self._entities is not None: 151 | return self._entities 152 | root = self._root 153 | entities = [] 154 | for e in root.findall('.//classMention'): 155 | mcs = e.findall('./mentionClass') 156 | mention_id = e.attrib['id'] 157 | if len(mcs) > 0: 158 | mc = mcs[0] 159 | m = 
re.match(r'Verified\_([^\(]+)(\(.*\)){0,1}', mc.attrib['id']) 160 | if m is not None: 161 | cls = m.group(1) 162 | mentions = root.findall('.//mention[@id="' + mention_id + '"]/..') 163 | if len(mentions) > 0: 164 | span = mentions[0].findall('./span') 165 | ent_start = span[0].attrib['start'] 166 | ent_end = span[0].attrib['end'] 167 | spannedText = mentions[0].findall('./spannedText') 168 | str = spannedText[0].text 169 | ann = EDIRAnn(str=str, start=int(ent_start), end=int(ent_end), type=cls) 170 | ann.id = len(entities) 171 | entities.append(ann) 172 | self._entities = entities 173 | return self._entities 174 | 175 | 176 | class ConllDoc(EDIRDoc): 177 | """ 178 | for Conll output from classification results 179 | """ 180 | 181 | def __init__(self, file_path): 182 | super(ConllDoc, self).__init__(file_path) 183 | self._tokens = None 184 | self._label_white_list = None 185 | 186 | def set_label_white_list(self, labels): 187 | self._label_white_list = labels 188 | 189 | @property 190 | def conll_output(self): 191 | try: 192 | return '\n'.join([' '.join([t['t'], str(len(t['predicted_label'])), t['gold_label'], 193 | (('B-' if t['predicted_label'][-1]['ann'].start == t['offset'] else 'I-') + 194 | t['predicted_label'][-1]['label']) 195 | if len(t['predicted_label']) > 0 else 'O']) 196 | for t in self.get_token_list()]) 197 | except: 198 | logging.error('processing [%s] failed' % self.file_path) 199 | return '' 200 | 201 | def get_token_list(self): 202 | if self._tokens is not None: 203 | return self._tokens 204 | self._tokens = [] 205 | start_offset = -1 206 | root = self._root 207 | work_ess = list(self.get_ess_entities()) 208 | matched_ess = set() 209 | for p in root.findall('.//p'): 210 | for s in p: 211 | if 'proc' in s.attrib: # and s.attrib['proc'] == 'yes': 212 | for w in s: 213 | id_val = int(w.attrib['id'][1:]) 214 | if start_offset == -1: 215 | start_offset = id_val 216 | offset = id_val - start_offset 217 | token = {'t': w.text, 'id': w.attrib['id'], 'offset': offset, 218 | 'gold_label': 'O', 'predicted_label': []} 219 | for e in work_ess: 220 | label = e.type.replace('neg_', '').lower().strip() 221 | if self._label_white_list is not None and label not in self._label_white_list: 222 | continue 223 | if token['offset'] == e.start: 224 | token['gold_label'] = 'B-' + label 225 | matched_ess.add(e) 226 | elif e.start < token['offset'] < e.end: 227 | token['gold_label'] = 'I-' + label 228 | matched_ess.add(e) 229 | self._tokens.append(token) 230 | left_ess = [e for e in work_ess if e not in matched_ess 231 | and e.type.replace('neg_', '') in self._label_white_list] 232 | if len(left_ess) > 0: 233 | logging.error('leftovers: [%s] at %s' % ( 234 | '\n'.join(['%s (%s,%s)' % (a.type, a.start, a.end) for a in left_ess]), self.file_path)) 235 | return self._tokens 236 | 237 | def add_predicted_labels(self, predicted_label): 238 | """ 239 | append prediction result to the doc, one annotation a time 240 | :param predicted_label: labelled ann {'label': ..., 'ann': ann object} 241 | :return: 242 | """ 243 | if self._label_white_list is not None and predicted_label['label'] not in self._label_white_list: 244 | return 245 | for token in self.get_token_list(): 246 | if predicted_label['ann'].start <= token['offset'] < predicted_label['ann'].end: 247 | token['predicted_label'].append(predicted_label) 248 | -------------------------------------------------------------------------------- /LabelModel.py: -------------------------------------------------------------------------------- 1 | import 
logging 2 | from os.path import isfile, join 3 | from os import listdir 4 | from annotation_docs import Concept2Mapping, CustomisedRecoginiser 5 | from EDI_ann_doc import EDIRDoc, eHostGenedDoc 6 | import joblib as jl 7 | 8 | 9 | class LabelModel(object): 10 | """ 11 | a machine learning based class for inferring phenotypes from NLP results 12 | features: 13 | - feature weighing 14 | - transparent models 15 | """ 16 | def __init__(self, label, concept_mapping, max_dimensions=None): 17 | self._label = label 18 | self._concept_mapping = concept_mapping 19 | self._lbl_data = {} 20 | self._cui2label = {} 21 | self._selected_dims = None 22 | self._max_dimensions = 2000 if max_dimensions is None else max_dimensions 23 | self._tps = 0 24 | self._fps = 0 25 | self._lbl_one_dimension = True 26 | self._lbl2tfidf_dims = {} 27 | self._label_dimensions = [] 28 | self._rare_labels = {} 29 | self._lbl2classifiers = {} 30 | 31 | @property 32 | def concept_mapping(self): 33 | return self._concept_mapping 34 | 35 | @concept_mapping.setter 36 | def concept_mapping(self, concept_mapping): 37 | self._concept_mapping = concept_mapping 38 | 39 | def get_binary_cluster_classifier(self, label): 40 | if label in self._lbl2classifiers: 41 | return self._lbl2classifiers[label] 42 | else: 43 | return None 44 | 45 | @property 46 | def cluster_classifier_dict(self): 47 | return self._lbl2classifiers 48 | 49 | def put_binary_cluster_classifier(self, label, classifier): 50 | self._lbl2classifiers[label] = classifier 51 | 52 | @property 53 | def rare_labels(self): 54 | return self._rare_labels 55 | 56 | def add_rare_label(self, label, tp_ratio): 57 | self._rare_labels[label] = tp_ratio 58 | 59 | @property 60 | def use_one_dimension_for_label(self): 61 | return self._lbl_one_dimension 62 | 63 | @use_one_dimension_for_label.setter 64 | def use_one_dimension_for_label(self, value): 65 | self._lbl_one_dimension = value 66 | 67 | @property 68 | def cui2label(self): 69 | return self._cui2label 70 | 71 | @property 72 | def label(self): 73 | return self._label 74 | 75 | def add_label_dimension(self, value): 76 | if value.lower() not in self._label_dimensions: 77 | self._label_dimensions.append(value.lower()) 78 | # if tp is not None: 79 | # self._tp_labels.add(value.lower()) 80 | # if fp is not None: 81 | # self._fp_labels.add(value.lower()) 82 | 83 | def add_label_dimension_by_annotation(self, ann): 84 | self.add_label_dimension(LabelModel.get_ann_dim_label(ann, no_negation=True)) 85 | 86 | def add_context_dimension(self, value, tp=None, fp=None, lbl='united'): 87 | if lbl not in self._lbl_data: 88 | self._lbl_data[lbl] = {'dims': [], 't2f': {}, 'tps': set(), 'fps': set()} 89 | d = self._lbl_data[lbl] 90 | if value.lower() not in d['dims']: 91 | d['dims'].append(value.lower()) 92 | if value.lower() not in d['t2f']: 93 | d['t2f'][value.lower()] = 1 94 | else: 95 | d['t2f'][value.lower()] = d['t2f'][value.lower()] + 1 96 | tps = d['tps'] 97 | fps = d['fps'] 98 | if tp is not None: 99 | tps.add(value.lower()) 100 | if fp is not None: 101 | fps.add(value.lower()) 102 | 103 | def add_context_dimension_by_annotation(self, ann, tp=None, fp=None, lbl=None): 104 | self.add_context_dimension(LabelModel.get_ann_dim_label(ann, generalise=True, no_negation=True), tp=tp, fp=fp, 105 | lbl=lbl) 106 | 107 | def get_top_freq_dimensions(self, k, lbl='united'): 108 | if self._selected_dims is not None: 109 | return self._selected_dims 110 | if lbl not in self._lbl_data: 111 | return [] 112 | l2f = self._lbl_data[lbl]['t2f'] 113 | df = [(l, l2f[l]) 
for l in l2f] 114 | df = sorted(df, key=lambda x: -x[1]) 115 | self._selected_dims = [d[0] for d in df[:k]] 116 | return self._selected_dims 117 | 118 | def get_top_tfidf_dimensions(self, k, lbl='united'): 119 | if lbl in self._lbl2tfidf_dims: 120 | return self._lbl2tfidf_dims[lbl] 121 | self._lbl2tfidf_dims[lbl] = {} 122 | if lbl not in self._lbl_data: 123 | logging.info('label [%s] has no contextual info' % lbl) 124 | return [] 125 | d = self._lbl_data[lbl] 126 | tps = d['tps'] 127 | fps = d['fps'] 128 | idf_weight = 1.0 129 | if len(tps) > 0 and len(fps) > 0: 130 | idf_weight = 1.0 * len(tps) / len(fps) 131 | df = [] 132 | max_score = 0 133 | for l in d['t2f']: 134 | idf = 1.0 / ((1 if l in d['tps'] else 0) + (1 if l in d['fps'] else 0)) 135 | score = 1.0 * d['t2f'][l] / (len(tps) + len(fps)) 136 | if idf_weight == 1 or (l in d['tps'] and l in d['fps']): 137 | score = score * idf 138 | # if l in d['tps'] and l in d['fps']: 139 | # score *= 0.5 140 | elif l in d['fps']: 141 | score *= idf_weight * idf 142 | max_score = max(score, max_score) 143 | df.append((l, score)) 144 | df = sorted(df, key=lambda x: -x[1]) 145 | # logging.debug(df) 146 | self._lbl2tfidf_dims[lbl] = [(t[0], t[1] * 1.0 / max_score) for t in df[:k]] 147 | logging.debug('%s ==> [%s]' % (lbl, self._lbl2tfidf_dims[lbl])) 148 | return self._lbl2tfidf_dims[lbl] 149 | 150 | @property 151 | def max_dimensions(self): 152 | return self._max_dimensions 153 | 154 | @max_dimensions.setter 155 | def max_dimensions(self, value): 156 | if value is None: 157 | self._max_dimensions = 2000 158 | self._max_dimensions = value 159 | 160 | @property 161 | def label_dimensions(self): 162 | return self._label_dimensions 163 | 164 | def context_dimensions(self, lbl): 165 | if lbl not in self._lbl_data: 166 | return [] 167 | # logging.info('%s`s dims: %s' % (lbl, self._lbl_data[lbl]['dims'])) 168 | return self._lbl_data[lbl]['dims'] 169 | 170 | def encode_ann(self, ann, context_anns, lbl='united', extra_dims=None): 171 | ann_label = LabelModel.get_ann_dim_label(ann) 172 | encoded = [] 173 | # if self.use_one_dimension_for_label: 174 | # if ann_label in self.label_dimensions: 175 | # encoded.append(self.label_dimensions.index(ann_label)) 176 | # else: 177 | # encoded.append(-1) 178 | # else: 179 | # for l in self.label_dimensions: 180 | # if l == ann_label: 181 | # encoded.append(1) 182 | # else: 183 | # encoded.append(0) 184 | context_labels = [LabelModel.get_ann_dim_label(ann, generalise=True, no_negation=True) for ann in context_anns] 185 | for l, score in self.get_top_tfidf_dimensions(self.max_dimensions, lbl=lbl): # self.context_dimensions: 186 | # freq = 0 187 | # for cl in context_labels: 188 | # if cl.lower() == l.lower(): 189 | # freq += 1 190 | if l in context_labels: 191 | encoded.append(1) 192 | else: 193 | encoded.append(0) 194 | # encoded.append(freq * score) 195 | return encoded + ([] if extra_dims is None else extra_dims) 196 | 197 | def collect_dimensions(self, ann_dir): 198 | cm = self.concept_mapping 199 | file_keys = [f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))] 200 | # collect dimension labels 201 | for fk in file_keys: 202 | cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm) 203 | t = self.label.replace('neg_', '') 204 | anns = cr.get_anns_by_label(t) 205 | neg_anns = cr.get_anns_by_label('neg_' + t) 206 | for a in anns + neg_anns: 207 | self.add_label_dimension_by_annotation(a) 208 | # self.add_context_dimension_by_annotation(a) 209 | if (a.negation != 'Negated' and 
self.label.startswith('neg_')) or \ 210 | (a.negation == 'Negated' and not self.label.startswith('neg_')): 211 | continue 212 | sanns = cr.get_same_sentence_anns(a) 213 | context_anns = [] + sanns['umls'] + sanns['phenotype'] 214 | # collect cui labels 215 | for u in sanns['umls']: 216 | self._cui2label[u.cui] = u.pref 217 | for c in context_anns: 218 | self.add_context_dimension_by_annotation(c) 219 | 220 | def collect_tfidf_dimensions(self, ann_dir, gold_dir, ignore_context=False, separate_by_label=False, 221 | full_text_dir=None, eHostGD=False): 222 | cm = self.concept_mapping 223 | file_keys = [f[:f.rfind('.')] for f in listdir(ann_dir) if isfile(join(ann_dir, f))] 224 | # collect dimension labels 225 | tp_freq = 0 226 | fp_freq = 0 227 | label_type = self.label.replace('neg_', '') 228 | fn_freq = 0 229 | for fk in file_keys: 230 | cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm) 231 | fk = fk.replace('se_ann_', '') 232 | if full_text_dir is not None: 233 | cr.full_text_folder = full_text_dir 234 | if eHostGD: 235 | if not isfile(join(gold_dir, '%s.txt.knowtator.xml' % fk)): 236 | continue 237 | gd = eHostGenedDoc(join(gold_dir, '%s.txt.knowtator.xml' % fk)) 238 | else: 239 | if not isfile(join(gold_dir, '%s-ann.xml' % fk)): 240 | continue 241 | gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk)) 242 | t = self.label.replace('neg_', '') 243 | anns = cr.get_anns_by_label(t) 244 | neg_anns = cr.get_anns_by_label('neg_' + t) 245 | 246 | # re-segement sentences 247 | # cr.re_segment_sentences(fk) 248 | # cr.relocate_all_anns(fk) 249 | # gd.relocate_anns(cr.get_full_text(fk)) 250 | 251 | not_matched_gds = [] 252 | for e in gd.get_ess_entities(): 253 | if (ignore_context and e.label.replace('neg_', '') == label_type) \ 254 | or (not ignore_context and e.label == self.label): 255 | not_matched_gds.append(e.id) 256 | for a in anns + neg_anns: 257 | # self.add_context_dimension_by_annotation(a) 258 | self.add_label_dimension_by_annotation(a) 259 | # if (not ignore_context) and ((a.negation != 'Negated' and self.label.startswith('neg_')) or \ 260 | # (a.negation == 'Negated' and not self.label.startswith('neg_'))): 261 | # logging.info('skipped because context') 262 | # continue 263 | 264 | matched = False 265 | for g in gd.get_ess_entities(): 266 | if g.id in not_matched_gds: 267 | gt = g.label.replace('neg_', '') 268 | if g.overlap(a) and ((g.label == self.label and not ignore_context) or 269 | (ignore_context and gt == label_type)): 270 | matched = True 271 | tp_freq += 1 272 | not_matched_gds.remove(g.id) 273 | if not matched: 274 | fp_freq += 1 275 | 276 | sanns = cr.get_prior_anns(a, contenxt_depth=-1) 277 | context_anns = [] + sanns['umls'] + sanns['phenotype'] + cr.get_context_words(a, fk) 278 | # context_anns = cr.get_context_words(a, fk) 279 | # collect cui labels 280 | for u in sanns['umls']: 281 | self._cui2label[u.cui] = u.pref 282 | for c in context_anns: 283 | self.add_context_dimension_by_annotation(c, tp=True if matched else None, 284 | fp=True if not matched else None, 285 | lbl='united' if not separate_by_label else 286 | LabelModel.get_ann_query_label(a)) 287 | fn_freq += len(not_matched_gds) 288 | self._tps = tp_freq 289 | self._fps = fp_freq 290 | logging.debug('tp: %s, fp: %s, fn: %s' % (tp_freq, fp_freq, fn_freq)) 291 | 292 | def get_low_quality_labels(self, ann_dir, gold_dir, accurate_threshold=0.05, min_sample_size=20): 293 | return [t[0] for t in self.assess_label_quality(ann_dir, gold_dir) 294 | if t[1] <= accurate_threshold and t[2] + t[3] >= 
min_sample_size] 295 | 296 | def assess_label_quality(self, ann_dir, gold_dir, separate_by_label=True, ignore_context=True): 297 | if ignore_context: 298 | logging.info('doing learning without considering contextual info') 299 | # print self.get_top_tfidf_dimensions(self.max_dimensions) 300 | cm = self.concept_mapping 301 | file_keys = [f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))] 302 | label_type = self.label.replace('neg_', '') 303 | query_label_perform = {} 304 | for fk in file_keys: 305 | cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm) 306 | if not isfile(join(gold_dir, '%s-ann.xml' % fk)): 307 | continue 308 | gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk)) 309 | 310 | not_matched_gds = [] 311 | for e in gd.get_ess_entities(): 312 | if (ignore_context and e.label.replace('neg_', '') == label_type) \ 313 | or (not ignore_context and e.label == self.label): 314 | not_matched_gds.append(e.id) 315 | anns = cr.get_anns_by_label(self.label, no_context=ignore_context) 316 | for a in anns: 317 | multiple_true_positives = 0 318 | matched = False 319 | for g in gd.get_ess_entities(): 320 | if g.id in not_matched_gds: 321 | gt = g.label.replace('neg_', '') 322 | if g.overlap(a) and ((g.label == self.label and not ignore_context) or 323 | (ignore_context and gt == label_type)): 324 | if matched: 325 | multiple_true_positives += 1 326 | matched = True 327 | not_matched_gds.remove(g.id) 328 | 329 | if separate_by_label: 330 | lbl = LabelModel.get_ann_query_label(a) 331 | else: 332 | lbl = 'united' 333 | ql = lbl 334 | if ql not in query_label_perform: 335 | query_label_perform[ql] = {'c': 0, 'w': 0} 336 | if matched: 337 | query_label_perform[ql]['c'] += 1 338 | else: 339 | query_label_perform[ql]['w'] += 1 340 | lbls = [(l, 341 | 1.0 * query_label_perform[l]['c'] / (query_label_perform[l]['c'] + query_label_perform[l]['w']), 342 | query_label_perform[l]['c'], 343 | query_label_perform[l]['w']) for l in query_label_perform] 344 | return sorted(lbls, key=lambda x: x[1]) 345 | 346 | def load_data(self, ann_dir, gold_dir, verbose=True, ignore_mappings=[], ignore_context=False, 347 | separate_by_label=False, ful_text_dir=None, eHostGD=False, annotated_anns={}): 348 | """ 349 | 350 | :param ann_dir: 351 | :param gold_dir: 352 | :param verbose: 353 | :param ignore_mappings: 354 | :param ignore_context: 355 | :param separate_by_label: 356 | :param ful_text_dir: 357 | :param eHostGD: 358 | :param annotated_anns: NB: this is for labelling settings where only partial data is annotated on 359 | the documents. 
Therefore, we need to filter out those not assessed by the annotators to avoid kill some 360 | true positives (those are correct but not assessed by annotators) 361 | :return: 362 | """ 363 | if ignore_context: 364 | logging.info('doing learning without considering contextual info') 365 | # print self.get_top_tfidf_dimensions(self.max_dimensions) 366 | cm = self.concept_mapping 367 | file_keys = [f[:f.rfind('.')] for f in listdir(ann_dir) if isfile(join(ann_dir, f))] 368 | lbl2data = {} 369 | false_negatives = 0 370 | lbl2tps = {} 371 | label_type = self.label.replace('neg_', '') 372 | query_label_perform = {} 373 | for fk in file_keys: 374 | cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm) 375 | fk = fk.replace('se_ann_', '') 376 | if ful_text_dir is not None: 377 | cr.full_text_folder = ful_text_dir 378 | if eHostGD: 379 | if not isfile(join(gold_dir, '%s.txt.knowtator.xml' % fk)): 380 | continue 381 | # logging.debug('using GD file %s' % join(gold_dir, '%s.txt.knowtator.xml' % fk)) 382 | gd = eHostGenedDoc(join(gold_dir, '%s.txt.knowtator.xml' % fk)) 383 | else: 384 | if not isfile(join(gold_dir, '%s-ann.xml' % fk)): 385 | continue 386 | logging.debug('using GD file %s' % join(gold_dir, '%s-ann.xml' % fk)) 387 | gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk)) 388 | 389 | # re-segement sentences 390 | # cr.re_segment_sentences(fk) 391 | # cr.relocate_all_anns(fk) 392 | # gd.relocate_anns(cr.get_full_text(fk)) 393 | 394 | not_matched_gds = [] 395 | for e in gd.get_ess_entities(): 396 | if (ignore_context and e.label.replace('neg_', '') == label_type) \ 397 | or (not ignore_context and e.label == self.label): 398 | not_matched_gds.append(e.id) 399 | 400 | anns = cr.get_anns_by_label(self.label, ignore_mappings=ignore_mappings, no_context=ignore_context) 401 | if len(annotated_anns) > 0: 402 | if '%s.txt' % fk not in annotated_anns: 403 | continue 404 | kept_anns = [] 405 | for a in anns: 406 | for aa in annotated_anns['%s.txt' % fk]: 407 | if int(aa['s']) == a.start and int(aa['e']) == a.end: 408 | kept_anns.append(a) 409 | anns = kept_anns 410 | for a in anns: 411 | logging.debug('%s, %s, %s' % (a.str, a.start, a.end)) 412 | multiple_true_positives = 0 413 | t2anns = cr.get_prior_anns(a) 414 | # if len(t2anns['umls']) + len(t2anns['phenotype']) == 0: 415 | # t2anns = cr.get_prior_anns(a, contenxt_depth=-2) 416 | context_anns = [] + t2anns['umls'] + t2anns['phenotype'] + \ 417 | cr.get_context_words(a, fk) 418 | # context_anns = cr.get_context_words(a, fk) 419 | matched = False 420 | for g in gd.get_ess_entities(): 421 | if g.id in not_matched_gds: 422 | gt = g.label.replace('neg_', '') 423 | if g.overlap(a) and ((g.label == self.label and not ignore_context) or 424 | (ignore_context and gt == label_type)): 425 | if matched: 426 | multiple_true_positives += 1 427 | matched = True 428 | not_matched_gds.remove(g.id) 429 | if verbose: 430 | if not matched: 431 | logging.debug('%s %s %s' % ('!', 432 | self.get_ann_dim_label(a) + 433 | ' // ' + ' | '.join(self.get_ann_dim_label(a, generalise=True) 434 | for a in context_anns), fk)) 435 | else: 436 | logging.debug('%s %s %s' % ('R', 437 | self.get_ann_dim_label(a) + ' // ' + ' | '.join( 438 | self.get_ann_dim_label(a, generalise=True) 439 | for a in context_anns), fk)) 440 | 441 | lbl = LabelModel.get_label_specific_data(self, lbl2data, a, context_anns, fk, cr, 442 | separate_by_label=separate_by_label) 443 | 444 | lbl2data[lbl]['multiple_tps'] += multiple_true_positives 445 | Y = lbl2data[lbl]['Y'] 446 | Y.append([1 if matched else 
0]) 447 | ql = lbl 448 | if ql not in query_label_perform: 449 | query_label_perform[ql] = {'c': 0, 'w': 0} 450 | if matched: 451 | query_label_perform[ql]['c'] += 1 452 | else: 453 | query_label_perform[ql]['w'] += 1 454 | false_negatives += len(not_matched_gds) 455 | 456 | missed = None 457 | for g in gd.get_ess_entities(): 458 | if g.id in not_matched_gds: 459 | missed = g 460 | logging.debug('\t'.join( 461 | ['M', g.str, str(g.negated), str(g.start), str(g.end), join(gold_dir, '%s-ann.xml' % fk)])) 462 | # if len(not_matched_gds) > 0: 463 | # print not_matched_gds 464 | # for a in anns: 465 | # logging.debug(a.str, a.start, a.end, missed.overlap(a)) 466 | bad_labels = [] 467 | for ql in query_label_perform: 468 | p = query_label_perform[ql] 469 | if p['c'] == 0 or (1.0 * p['w'] / p['c'] < 0.05): 470 | bad_labels.append(ql) 471 | return {'lbl2data': lbl2data, 472 | 'fns': false_negatives, 'bad_labels': bad_labels, 'files': file_keys} 473 | 474 | @staticmethod 475 | def get_label_specific_data(label_model, lbl2data, annotation, context_anns, fk, cr, 476 | separate_by_label=False): 477 | a = annotation 478 | extra_dims = [1] if len(cr.get_containing_anns(a)) > 0 else [0] 479 | if separate_by_label: 480 | lbl = LabelModel.get_ann_query_label(a) 481 | else: 482 | lbl = 'united' 483 | if lbl not in lbl2data: 484 | lbl2data[lbl] = {'X': [], 'Y': [], 'multiple_tps': 0, 'doc_anns': []} 485 | X = lbl2data[lbl]['X'] 486 | lbl2data[lbl]['doc_anns'].append({'d': fk, 'ann': a, 'label': label_model.label}) 487 | X.append(label_model.encode_ann(a, context_anns, lbl=lbl, extra_dims=extra_dims)) 488 | return lbl 489 | 490 | @staticmethod 491 | def read_one_ann_doc(label_model, cr, fk, lbl2data=None, 492 | ignore_mappings=[], ignore_context=False, separate_by_label=False): 493 | if lbl2data is None: 494 | lbl2data = {} 495 | anns = cr.get_anns_by_label(label_model.label, ignore_mappings=ignore_mappings, no_context=ignore_context) 496 | for a in anns: 497 | t2anns = cr.get_prior_anns(a) 498 | context_anns = [] + t2anns['umls'] + t2anns['phenotype'] + cr.get_context_words(a, fk) 499 | # context_anns = cr.get_context_words(a, fk) 500 | LabelModel.get_label_specific_data(label_model, lbl2data, a, context_anns, fk, cr, 501 | separate_by_label=separate_by_label) 502 | return lbl2data 503 | 504 | def load_data_for_predict(self, ann_dir, ignore_mappings=[], ignore_context=False, 505 | separate_by_label=False, full_text_dir=None): 506 | """ 507 | load data for prediction - no ground truth exists 508 | :param ann_dir: 509 | :param ignore_mappings: 510 | :param ignore_context: 511 | :param separate_by_label: 512 | :param full_text_dir: 513 | :return: 514 | """ 515 | if ignore_context: 516 | logging.info('doing learning without considering contextual info') 517 | 518 | cm = self.concept_mapping 519 | file_keys = [f[:f.rfind('.')] for f in listdir(ann_dir) if isfile(join(ann_dir, f))] 520 | lbl2data = {} 521 | for fk in file_keys: 522 | cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm) 523 | fk = fk.replace('se_ann_', '') 524 | if full_text_dir is not None: 525 | cr.full_text_folder = full_text_dir 526 | LabelModel.read_one_ann_doc(self, cr, fk, lbl2data=lbl2data, 527 | ignore_mappings=ignore_mappings, ignore_context=ignore_context, 528 | separate_by_label=separate_by_label) 529 | return {'lbl2data': lbl2data, 'files': file_keys} 530 | 531 | def serialise(self, output_file): 532 | jl.dump(self, output_file) 533 | 534 | @staticmethod 535 | def type_related_ann_filter(ann, cm_obj): 536 | if hasattr(ann, 
'cui'): 537 | return not ann.cui.lower() in cm_obj.all_entities 538 | # return not ann.cui in _cm_obj.type2cocnepts(type) 539 | else: 540 | return not ann.str.lower() in cm_obj.all_entities 541 | # return not ann.str in _cm_obj.type2gaz[type] 542 | 543 | @staticmethod 544 | def get_ann_query_label(ann): 545 | # return ann.str.lower() 546 | neg = '' 547 | # if hasattr(ann, 'negation'): 548 | # neg = 'neg_' if ann.negation == 'Negated' else '' 549 | # else: 550 | # neg = 'neg_' if ann.negated else '' 551 | # if hasattr(ann, 'cui'): 552 | # return neg + ann.cui + ' ' + str(ann.pref) 553 | # else: 554 | # return neg + ann.str.lower() 555 | return neg + ann.str.lower() 556 | 557 | @staticmethod 558 | def deserialise(serialised_file): 559 | return jl.load(serialised_file) 560 | 561 | @staticmethod 562 | def get_ann_dim_label(ann, generalise=False, no_negation=False): 563 | if isinstance(ann, str): 564 | return 'WORD_%s' % ann 565 | negated = '' 566 | label = ann.str 567 | if (hasattr(ann, 'negation') and ann.negation == 'Negated') or (hasattr(ann, 'negated') and ann.negated): 568 | negated = 'neg_' 569 | if no_negation: 570 | negated = '' 571 | # if hasattr(ann, 'cui'): 572 | # label = ann.cui + ' ' + str(ann.pref) 573 | # ann.str 574 | if hasattr(ann, 'minor_type'): 575 | label = ann.str 576 | # if generalise and hasattr(ann, 'sty'): 577 | # label = ann.sty 578 | # if ann.sty.lower() == 'body part, organ, or organ component': 579 | negated = '' 580 | return negated + label.lower() 581 | # return ann.str.lower() if not isinstance(ann, SemEHRAnn) else ann.cui.lower() 582 | 583 | 584 | 585 | 586 | 587 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Archived 2 | 3 | This repo is no longer maintained. For questions on this repo please email Honghan directly (honghan.wu@gmail.com), or for broader CogStack enquiries please reach out to contact@cogstack.org. 4 | 5 | # nlp2phenome 6 | using AI models to infer patient phenotypes from identified named entities (instances of biomedical concepts) 7 | 8 | ## why 9 | Using natural language processing (NLP) to identify mentions of biomedical concepts from free-text medical records is just the *first* step. There is often a gap between NLP results and what the clinical study is after. For example, a radiology report does not contain the term `ischemic stroke`. Instead, it reports that the patient had `blocked arteries` and `stroke`. To infer the "unspoken" `ischemic stroke`, a mechanism is needed to make such inferences from NLP-identifiable mentions of `blocked arteries` and `stroke`. nlp2phenome is designed to do this extra step from NLP to patient phenome. 10 | 11 | ## what 12 | nlp2phenome was developed for a stroke subtyping study using NLP on radiology reports at the University of Edinburgh led by [Dr William Whiteley](https://www.ed.ac.uk/profile/dr-william-whiteley). It is built on top of [SemEHR](https://github.com/CogStack/CogStack-SemEHR) results. It identified 2,922 mentions of 32 types of phenotypes from 266 radiology reports and achieved an average F1 of 0.929 (precision: 0.925; recall: 0.939). 13 | 14 | It uses various transparent machine learning models (e.g. decision trees, random forests) to learn the inference from NLP results to more insightful clinical phenotypes (such as subtypes of stroke). The image below is a decision tree learnt for negated tumour from radiology reports. 
Surprisingly, with specific feature selection methods, decision tree models outperform the popular neural-network-based method. Another advantage is that the visualised decision trees can be verified by clinical experts, or even compared with clinical guidelines. A working paper is in progress; a link will be added here soon. 15 | 16 | ![alt text](https://raw.githubusercontent.com/CogStack/nlp2phenome/master/neg-tumour-dt-learnt.png "decision tree learnt for negated tumour") 17 | 18 | ## data 19 | two datasets (radiology reports) collected in Scotland: 20 | - [Edinburgh Stroke Study](http://www.dcn.ed.ac.uk/ess/) training data (364 reports), testing data (266 reports) 21 | - Tayside radiology reports (300 reports) 22 | 23 | ## run learning 24 | 1. Create your configuration file. Please see `./settings/sample_setting_kfold_learning.json` for reference. 25 | ```javascript 26 | { 27 | "kfold": 10, // the number of folds for k-fold learning 28 | "corpus_folder": "/data/annotated_data/corpus", // the folder containing full text documents 29 | "gold_folder": "/data/annotated_data/gold", // the folder containing the labelled/annotated data 30 | "semehr_folder": "/data/semehr_results", // the folder containing baseline SemEHR results 31 | "working_folder": "/data/learning", // the working folder to store intermediate data files 32 | "concept_mapping_file": "./settings/empty_concept_mapping.json", // the mapping file that maps UMLS CUIs to phenotypes 33 | "learning_model_dir": "./models", // where the machine learning models are stored 34 | "entity_types_file": "./settings/better_worse_entity_types.txt", // the list of phenotypes to be worked on 35 | "ignore_mapping_file": "./settings/ignore_mapping.json", // a JSON mapping file for ignoring certain CUI mappings 36 | "min_sample_size": 25, // minimum number of samples needed to train a model; if the sample size is smaller, counting-based statistics are used to assess the correctness of baseline results instead of a machine learning model 37 | "gold_file_pattern": "%s.txt.knowtator.xml", // the annotation file pattern; %s is the unique id used to find the SemEHR result file and full-text file in their respective folders 38 | "eHostGD": true // whether to use eHOST annotations; the only other supported format is EDiR from Edinburgh Informatics 39 | } 40 | ``` 41 | - `entity_types_file` - each study aims to identify a set of phenotypes (e.g., diseases, symptoms or other biomedical mentions). This is a plain text file listing the names of all `phenotypes`, one per line. Check [entity_types_phenotypes_stroke_sample.txt](./settings/entity_types_phenotypes_stroke_sample.txt) as an example. 42 | - `concept_mapping_file` - each phenotype defined above needs to be mapped to one or several ontology concepts (e.g., UMLS CUIs). This is a JSON dictionary, where the key is the `phenotype` name and the value is an array. Each element in the array is a tab-separated tuple of the form `CONCEPT_ID\tLabel\tSemantic Type`. The first component is the most important; the last two are for display purposes only. Check [concept_mapping_stroke_sample.json](./settings/concept_mapping_stroke_sample.json) as an example. 43 | - `ignore_mapping_file` - this is a JSON dictionary for removing particular concepts (and customised dictionary terms) from the mappings of phenotypes as defined in `concept_mapping_file`. 
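For illustration only, a minimal hypothetical entry might look like the sketch below (the phenotype name, concept ID and term are placeholders, not taken from this repository):
```javascript
{
  "stroke": ["C0000000", "old infarct"] // placeholder concept ID and customised dictionary term to be removed from the "stroke" mappings
}
```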
The key is `phenotype` name and the value is an array containing either concept IDs from the ontology used (e.g., UMLS) or the customised dictionary term. This file is only needed when the `concept_mapping_file` is automatically generated from some learning data and it requires some fine-tuning. 44 | 2. run it by 45 | ```bash 46 | python run_learning.py YOUR_LEARNING_CONFIG_FILE 47 | ``` 48 | 49 | ## contact 50 | Dr Honghan Wu (honghan.wu@gmail.com) 51 | -------------------------------------------------------------------------------- /ann_converter.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import datetime 3 | from os import listdir 4 | from os.path import isfile, join 5 | import utils 6 | import csv 7 | 8 | 9 | class AnnConverter(object): 10 | 11 | @staticmethod 12 | def get_semehr_ann_label(ann): 13 | str_context = '' 14 | if ann.negation != 'Affirmed': 15 | str_context += ann.negation + '_' 16 | if ann.temporality != 'Recent': 17 | str_context += ann.temporality + '_' 18 | if ann.experiencer != 'Patient': 19 | str_context += ann.experiencer + '_' 20 | return '%s%s' % (str_context, ann.minor_type) 21 | 22 | @staticmethod 23 | def to_eHOST(doc_key, anns, file_pattern='%s.txt', id_pattern='smehr-%s-%s'): 24 | elem_annotations = ET.Element("annotations") 25 | elem_annotations.set('textSource', file_pattern % doc_key) 26 | idx = 0 27 | for d in anns: 28 | ann = d['ann'] 29 | idx += 1 30 | mention_id = id_pattern % (doc_key, idx) 31 | AnnConverter.create_elem_ann(elem_annotations, mention_id, ann.start, ann.end, ann.str, 32 | AnnConverter.get_semehr_ann_label(ann)) 33 | tree = ET.ElementTree(elem_annotations) 34 | return ET.tostring(elem_annotations, encoding='utf8', method='xml') 35 | 36 | @staticmethod 37 | def create_elem_ann(elem_annotations, mention_id, start, end, str, class_label): 38 | elem_ann = ET.SubElement(elem_annotations, "annotation") 39 | elem_mention = ET.SubElement(elem_ann, "mention") 40 | elem_mention.set('id', mention_id) 41 | elem_annotator = ET.SubElement(elem_ann, "annotator") 42 | elem_annotator.set('id', 'semehr') 43 | elem_annotator.text = 'semehr' 44 | elem_span = ET.SubElement(elem_ann, "span") 45 | elem_span.set('start', '%s' % start) 46 | elem_span.set('end', '%s' % end) 47 | elem_spanText = ET.SubElement(elem_ann, "spannedText") 48 | elem_spanText.text = str 49 | elem_date = ET.SubElement(elem_ann, "creationDate") 50 | elem_date.text = datetime.datetime.now().strftime("%a %B %d %X %Z %Y") 51 | # 52 | elem_class = ET.SubElement(elem_annotations, "classMention") 53 | elem_class.set('id', mention_id) 54 | elem_mention_class = ET.SubElement(elem_class, "mentionClass") 55 | elem_mention_class.set('id', class_label) 56 | elem_mention_class.text = str 57 | return elem_ann 58 | 59 | @staticmethod 60 | def load_ann_file(f, do_multi=True): 61 | tree = ET.parse(f) 62 | doc = tree.getroot() 63 | ann2label = {} 64 | ann2freq = {} 65 | for ann in doc.findall("annotation"): 66 | m_id = ann.find("mention").attrib["id"] 67 | cm = doc.find('.//classMention[@id="' + m_id + '"]') 68 | cls =cm.find('mentionClass').attrib["id"] 69 | m_span = ann.find("span").attrib 70 | annid = 'm-%s-%s' % (m_span['start'], m_span['end']) 71 | m_text = ann.find("spannedText").text 72 | freq = 0 73 | if annid not in ann2freq: 74 | ann2freq[annid] = 1 75 | else: 76 | if do_multi: 77 | ann2freq[annid] += 1 78 | annid_freq = '%s:%s' % (annid, ann2freq[annid]) 79 | ann2label[annid_freq] = {"text": m_text, "class": cls} 
80 | return ann2label 81 | 82 | @staticmethod 83 | def convert_csv_annotations(csv_file, text_folder, ann_folder, mapping_file, annotated_anns_file, 84 | id_pattern='%s-%s', ann_file_pattern='%s.txt.knowtator.xml'): 85 | with open(csv_file, newline='') as cf: 86 | reader = csv.DictReader(cf) 87 | label2concepts = {} 88 | d2annotated_anns = {} 89 | for r in reader: 90 | d2annotated_anns[r['doc_id'] + ".txt"] = [{'s': r['start'], 'e': r['end']}] 91 | if r['Skip Document'] != 'Yes': 92 | utils.save_string(r['text'], join(text_folder, r['doc_id'] + ".txt")) 93 | elem_annotations = ET.Element("annotations") 94 | elem_annotations.set('textSource', r['doc_id']) 95 | mention_id = id_pattern % (r['doc_id'], 0) 96 | if r['Correct'] == 'Yes' and r['Negation'] == 'NOT Negated': 97 | AnnConverter.create_elem_ann(elem_annotations, mention_id, 98 | r['start'], r['end'], r['string_orig'], r['icd10-ch']) 99 | xml = ET.tostring(elem_annotations, encoding='unicode', method='xml') 100 | utils.save_string(xml, join(ann_folder, ann_file_pattern % r['doc_id'])) 101 | if r['icd10-ch'] not in label2concepts: 102 | label2concepts[r['icd10-ch']] = [] 103 | if r['cui'] not in label2concepts[r['icd10-ch']]: 104 | label2concepts[r['icd10-ch']].append(r['cui']) 105 | utils.save_json_array(label2concepts, mapping_file) 106 | utils.save_json_array(d2annotated_anns, annotated_anns_file) 107 | 108 | @staticmethod 109 | def populate_inter_annotator_results(ann_folder_1, ann_folder_2, output_file, missing_file, 110 | correct_labels = ["VERIFIED_CORRECT"]): 111 | ann_files = [f for f in listdir(ann_folder_1) if isfile(join(ann_folder_1, f))] 112 | all_mentions = 0 113 | missed = [] 114 | mismatched = [] 115 | for f in ann_files: 116 | ann1 = AnnConverter.load_ann_file(join(ann_folder_1, f)) 117 | ann2 = AnnConverter.load_ann_file(join(ann_folder_2, f)) 118 | all_mentions += len(ann1) 119 | for ann in ann1: 120 | if ann not in ann2: 121 | missed.append('%s\t%s\t%s' % (ann, ann1[ann]['text'], ann1[ann]['class'])) 122 | elif ann2[ann]['class'] != ann1[ann]['class'] and ann1[ann]['class'] not in correct_labels: 123 | mismatched.append('%s\t%s\t%s\t%s\t%s' % (f, ann, ann1[ann]['text'], ann1[ann]['class'], ann2[ann]['class'])) 124 | print('\n'.join(mismatched)) 125 | print(len(missed), all_mentions) 126 | utils.save_string('\n'.join(mismatched), output_file) 127 | utils.save_string('\n'.join(missed), missing_file) 128 | 129 | @staticmethod 130 | def calculate_IAA(ann_folder_1, ann_folder_2, output_file): 131 | from sklearn.metrics import cohen_kappa_score 132 | ann_files = [f for f in listdir(ann_folder_1) if isfile(join(ann_folder_1, f))] 133 | ann1_annotations = {} 134 | ann2_annotations = {} 135 | for f in ann_files: 136 | ann1 = AnnConverter.load_ann_file(join(ann_folder_1, f), do_multi=False) 137 | ann2 = AnnConverter.load_ann_file(join(ann_folder_2, f), do_multi=False) 138 | for ann in ann1: 139 | ann1_annotations['%s_%s' % (f, ann)] = ann1[ann]['class'] 140 | for ann in ann2: 141 | ann2_annotations['%s_%s' % (f, ann)] = ann2[ann]['class'] 142 | merged_anns = list(set(list(ann1_annotations.keys()) + list(ann2_annotations.keys()))) 143 | ann1_merged = [] 144 | ann2_merged = [] 145 | label_missed = 'missed' 146 | cat2pares = {'subject': {'ann1': [], 'ann2': []}, 147 | 'irrelevant': {'ann1': [], 'ann2': []}, 148 | 'trajectory': {'ann1': [], 'ann2': []}, 149 | } 150 | output = [] 151 | for ann in merged_anns: 152 | ann1_label = label_missed if ann not in ann1_annotations else ann1_annotations[ann] 153 | ann2_label = label_missed 
if ann not in ann2_annotations else ann2_annotations[ann] 154 | ann1_merged.append(ann1_label) 155 | ann2_merged.append(ann2_label) 156 | if ann1_label == 'Irrelevant_label' or ann2_label == 'Irrelevant_label': 157 | cat2pares['irrelevant']['ann1'].append(ann1_label) 158 | cat2pares['irrelevant']['ann2'].append(ann2_label) 159 | elif ann1_label in ['Trajectory_Subject', 'General_Trajectory'] or ann2_label in ['Trajectory_Subject', 'General_Trajectory']: 160 | cat2pares['subject']['ann1'].append(ann1_label) 161 | cat2pares['subject']['ann2'].append(ann2_label) 162 | elif ann1_label in ['better(Trajetory)', 'worse(Trajectory)'] or ann2_label in ['better(Trajetory)', 'worse(Trajectory)']: 163 | cat2pares['trajectory']['ann1'].append(ann1_label) 164 | cat2pares['trajectory']['ann2'].append(ann2_label) 165 | output.append('%s\t%s\t%s' % (ann, ann1_label, ann2_label)) 166 | 167 | print('kappa score: [%s]', cohen_kappa_score(ann1_merged, ann2_merged)) 168 | for cat in cat2pares: 169 | print('%s kappa score: [%s]' % (cat, cohen_kappa_score(cat2pares[cat]['ann1'], cat2pares[cat]['ann2']))) 170 | utils.save_string('\n'.join(output), output_file) 171 | 172 | if __name__ == "__main__": 173 | # AnnConverter.load_ann_file('S:/NLP/annotation_Steven/stroke_nlp/saved/Stroke_id_105.txt.knowtator.xml') 174 | # AnnConverter.populate_inter_annotator_results('S:/NLP/annotation_Kristiina/stroke_nlp/saved', 175 | # 'S:/NLP/annotation_Steven/stroke_nlp/saved', 'mismatched.tsv') 176 | # AnnConverter.populate_inter_annotator_results('S:/NLP/annotation_Steven/stroke_nlp/saved', 177 | # 'P:/wuh/SemEHR-working/outputs/nlp2phenome', 178 | # 'kristiina_corrections.tsv', 'steven_added.tsv') 179 | ann_folder = '/data/annotated_data/' 180 | ann_files = [f for f in listdir(ann_folder) if isfile(join(ann_folder, f))] 181 | for f in ann_files: 182 | print('processing %s...' 
% f) 183 | AnnConverter.convert_csv_annotations(join(ann_folder, f), join(ann_folder, 'corpus'), join(ann_folder, 'gold'), join(ann_folder, 'concept_mapping.json'), join(ann_folder, 'annotated_anns.json')) 184 | -------------------------------------------------------------------------------- /ann_utils.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | import datetime 3 | from os import listdir 4 | from os.path import isfile, join 5 | from nlp_to_phenome import EDIRDoc 6 | from annotation_docs import EDIRAnn 7 | import reportreader as rr 8 | import re 9 | import utils 10 | import logging 11 | from operator import itemgetter 12 | import xml.etree.ElementTree as ET 13 | 14 | 15 | class eHostGenedDoc(EDIRDoc): 16 | def __init__(self, file_path): 17 | super(eHostGenedDoc, self).__init__(file_path) 18 | 19 | def get_ess_entities(self): 20 | if self._entities is not None: 21 | return self._entities 22 | root = self._root 23 | entities = [] 24 | for e in root.findall('.//classMention'): 25 | mcs = e.findall('./mentionClass') 26 | mention_id = e.attrib['id'] 27 | if len(mcs) > 0: 28 | mc = mcs[0] 29 | cls = mc.attrib['id'] 30 | mentions = root.findall('.//mention[@id="' + mention_id + '"]/..') 31 | if len(mentions) > 0: 32 | span = mentions[0].findall('./span') 33 | ent_start = span[0].attrib['start'] 34 | ent_end = span[0].attrib['end'] 35 | spannedText = mentions[0].findall('./spannedText') 36 | str = spannedText[0].text 37 | ann = EDIRAnn(str=str, start=int(ent_start), end=int(ent_end), type=cls) 38 | ann.id = len(entities) 39 | entities.append(ann) 40 | self._entities = entities 41 | return self._entities 42 | 43 | 44 | class eHostAnnDoc(EDIRDoc): 45 | """ 46 | a document class for ehost annotation file 47 | """ 48 | def __init__(self, file_path): 49 | super(eHostAnnDoc, self).__init__(file_path) 50 | 51 | def get_ess_entities(self, no_context=False): 52 | if self._entities is not None: 53 | return self._entities 54 | root = self._root 55 | entities = [] 56 | for e in root.findall('.//classMention'): 57 | mcs = e.findall('./mentionClass') 58 | mention_id = e.attrib['id'] 59 | if len(mcs) > 0: 60 | mc = mcs[0] 61 | m = re.match(r'VERIFIED\_([^\(]+)', mc.attrib['id']) 62 | if m is None: 63 | m = re.match(r'(IRRELEVANT_LABELS)', mc.attrib['id']) 64 | if m is None: 65 | m = re.match(r'(ADDED)\_([^\(]+)', mc.attrib['id']) 66 | if m is not None: 67 | cls = m.group(1) 68 | if no_context and cls != 'IRRELEVANT_LABELS': 69 | if cls.find('_') >= 0: 70 | cls = cls[cls.find('_')+1:] 71 | mentions = root.findall('.//mention[@id="' + mention_id + '"]/..') 72 | if len(mentions) > 0: 73 | span = mentions[0].findall('./span') 74 | ent_start = span[0].attrib['start'] 75 | ent_end = span[0].attrib['end'] 76 | spannedText = mentions[0].findall('./spannedText') 77 | str = spannedText[0].text 78 | ann = EDIRAnn(str=str, start=int(ent_start), end=int(ent_end), type=cls) 79 | ann.id = len(entities) 80 | entities.append(ann) 81 | self._entities = entities 82 | return self._entities 83 | 84 | 85 | def ehost_iaa_compute(folder1, folder2, no_context=False): 86 | """ 87 | compute inter annotator agreement 88 | :param folder1: 89 | :param folder2: 90 | :param no_context: 91 | :return: 92 | """ 93 | annotator1 = read_ehost_annotated_result(folder1, no_context=no_context) 94 | annotator2 = read_ehost_annotated_result(folder2, no_context=no_context) 95 | merged_keys = list(set(annotator1.keys()) | set(annotator2.keys())) 96 | y1 = [] 97 | y2 = [] 98 | for key in 
merged_keys: 99 | if key in annotator1 and key in annotator2: 100 | y1.append(annotator1[key]) 101 | y2.append(annotator2[key]) 102 | else: 103 | print('%s not matched in all' % key) 104 | iaa = sklearn.metrics.cohen_kappa_score(y1, y2) 105 | print('IAA is %s on %s' % (iaa, len(annotator1))) 106 | return iaa 107 | 108 | 109 | def read_ehost_annotated_result(folder, no_context=False): 110 | """ 111 | read ehost annotated documents as a dictionary object: id -> entity label 112 | :param folder: 113 | :param no_context: 114 | :return: 115 | """ 116 | id2label = {} 117 | files = [f for f in listdir(folder) if isfile(join(folder, f))] 118 | for f in files: 119 | d = eHostAnnDoc(join(folder, f)) 120 | for e in d.get_ess_entities(no_context=no_context): 121 | id = '%s-%s-%s' % (f, e.start, e.end) 122 | id2label[id] = e.label 123 | print(id2label) 124 | return id2label 125 | 126 | 127 | def get_what_is_changing(ann_folder, text_folder, output_file, eHostAnnFile=True): 128 | """ 129 | get what is getting better/worse 130 | :param ann_folder: 131 | :param text_folder: 132 | :param output_file: 133 | :return: 134 | """ 135 | nlp = rr.get_nlp_instance() 136 | files = [f for f in listdir(ann_folder) if isfile(join(ann_folder, f))] 137 | type2abstractions = {} 138 | for f in files: 139 | anns = [] 140 | text_file = join(text_folder, f[0:-14]) 141 | if eHostAnnFile: 142 | d = eHostAnnDoc(join(ann_folder, f)) 143 | anns = d.get_ess_entities(no_context=True) 144 | else: 145 | d = eHostGenedDoc(join(ann_folder, f)) 146 | anns = d.get_ess_entities() 147 | if len(anns) == 0: 148 | logging.info('anns is empty for [{:s}]'.format(f)) 149 | text = utils.read_text_file_as_string(join(text_folder, f[0:-14]), encoding='cp1252') 150 | sents = rr.get_sentences_as_anns(nlp, text) 151 | for ann in anns: 152 | for s in sents: 153 | if ann.overlap(s): 154 | abss = rr.AbstractedSentence(1) 155 | abss.text = s.str 156 | result = abss.get_abstaction_by_pos(abss.locate_pos(ann.str), nlp) 157 | if result is None: 158 | logging.info('%s not found in %s' % (ann.str, f)) 159 | continue 160 | type = ann.label 161 | if type not in type2abstractions: 162 | type2abstractions[type] = [] 163 | type2abstractions[type].append(result.to_dict()) 164 | logging.debug(type2abstractions) 165 | utils.save_json_array(type2abstractions, output_file) 166 | 167 | 168 | def compute_iaa(): 169 | folder_lia = "S:/NLP/annotation_it02/overlaps/k" 170 | folder_rob = "S:/NLP/annotation_it02/overlaps/s" 171 | folder_nadia = "nadia" 172 | ehost_iaa_compute(folder_lia, folder_rob, no_context=True) 173 | 174 | 175 | def analysing_label_performance(folder, output_file): 176 | s2t = {} 177 | files = [f for f in listdir(folder) if isfile(join(folder, f))] 178 | for f in files: 179 | d = eHostAnnDoc(join(folder, f)) 180 | for ann in d.get_ess_entities(): 181 | s = ann.str 182 | if not (s in s2t): 183 | s2t[s] = {} 184 | if ann.type in s2t[s]: 185 | s2t[s][ann.type] = s2t[s][ann.type] + 1 186 | else: 187 | s2t[s][ann.type] = 1 188 | sts = sorted([(s, s2t[s]['CORRECT'] if 'CORRECT' in s2t[s] else 0, s2t[s]['IRRELEVANT_LABELS'] if 'IRRELEVANT_LABELS' in s2t[s] else 0, s2t[s]['ADDED'] if 'ADDED' in s2t[s] else 0) for s in s2t], key=itemgetter(2), reverse=True) 189 | s = ('\n'.join(['%s\t%s\t%s\t%s' % (t[0], t[1], t[2], t[3]) for t in sts])) 190 | utils.save_string(s, output_file) 191 | 192 | 193 | def generate_gold_stand_from_validation(generated_ann_folder, validated_ann_folder, gold_standard_folder): 194 | 195 | files = [f for f in listdir(generated_ann_folder) if 
isfile(join(generated_ann_folder, f))] 196 | for f in files: 197 | logging.debug('processing: %s / %s' % (generated_ann_folder, f)) 198 | # ignore added annotations for now 199 | gd_anns = [] 200 | gen_doc = eHostGenedDoc(join(generated_ann_folder, f)) 201 | logging.debug('ann number: %s' % len(gen_doc.get_ess_entities())) 202 | val_doc = eHostAnnDoc(join(validated_ann_folder, f)) 203 | for g in gen_doc.get_ess_entities(): 204 | logging.debug('validation label: %s' % g.type) 205 | for v in val_doc.get_ess_entities(): 206 | if g.start == v.start and g.end == v.end: 207 | logging.debug('validation label: %s' % v.type) 208 | if v.type == 'CORRECT': 209 | gd_anns.append(g) 210 | 211 | elem_annotations = ET.Element("annotations") 212 | elem_annotations.set('textSource', f) 213 | idx = 0 214 | for ann in gd_anns: 215 | if ann.str.lower() == 'haematoma': 216 | continue 217 | idx += 1 218 | mention_id = '%s-%s' % (f, idx) 219 | elem_ann = ET.SubElement(elem_annotations, "annotation") 220 | elem_mention = ET.SubElement(elem_ann, "mention") 221 | elem_mention.set('id', mention_id) 222 | elem_annotator = ET.SubElement(elem_ann, "annotator") 223 | elem_annotator.set('id', 'semehr') 224 | elem_annotator.text = 'semehr' 225 | elem_span = ET.SubElement(elem_ann, "span") 226 | elem_span.set('start', '%s' % ann.start) 227 | elem_span.set('end', '%s' % ann.end) 228 | elem_spanText = ET.SubElement(elem_ann, "spannedText") 229 | elem_spanText.text = ann.str 230 | elem_date = ET.SubElement(elem_ann, "creationDate") 231 | elem_date.text = datetime.datetime.now().strftime("%a %B %d %X %Z %Y") 232 | # 233 | elem_class = ET.SubElement(elem_annotations, "classMention") 234 | elem_class.set('id', mention_id) 235 | elem_mention_class = ET.SubElement(elem_class, "mentionClass") 236 | if ann.str.lower() == 'haemorrhage' or ann.str.lower() == 'blood' or ann.str.lower() == 'bleed' or ann.str.lower().startswith('collection'): 237 | ann.type = 'bleeding' 238 | elem_mention_class.set('id', ann.type) 239 | elem_mention_class.text = ann.str 240 | tree = ET.ElementTree(elem_annotations) 241 | logging.info('gd file saved to %s - %s' % (gold_standard_folder, f)) 242 | utils.save_string(ET.tostring(elem_annotations, encoding='utf8', method='xml'), join(gold_standard_folder, f)) 243 | 244 | 245 | def analyse_trajectory_subjects(file, output_file): 246 | t2subs = utils.load_json_data(file) 247 | t2freq = {} 248 | for t in t2subs: 249 | if t not in t2freq: 250 | t2freq[t] = {'subject': {}, 'root': {}} 251 | for sub in t2subs[t]: 252 | add_key_freq(t2freq[t]['subject'], ','.join(sub['subject'])) 253 | add_key_freq(t2freq[t]['root'], sub['root']) 254 | 255 | s = '' 256 | for t in t2freq: 257 | freqs = t2freq[t] 258 | subs = sorted([(k, freqs['subject'][k]) for k in freqs['subject']], key=itemgetter(1), reverse=True) 259 | s += '***%s [subjects]***\n%s\n\n' % (t, freq_to_str(subs)) 260 | roots = sorted([(k, freqs['root'][k]) for k in freqs['root']], key=itemgetter(1), reverse=True) 261 | s += '***%s [roots]***\n%s\n\n' % (t, freq_to_str(roots)) 262 | logging.info(s) 263 | utils.save_string(s, output_file) 264 | 265 | 266 | def freq_to_str(freq): 267 | return '\n'.join(['%s\t%s' % (t[0], t[1]) for t in freq]) 268 | 269 | 270 | def add_key_freq(d, key): 271 | if key in d: 272 | d[key] += 1 273 | else: 274 | d[key] = 1 275 | 276 | 277 | def summarise_validation_results(folder): 278 | files = [f for f in listdir(folder) if isfile(join(folder, f))] 279 | t2freq = {} 280 | for f in files: 281 | gen_doc = eHostGenedDoc(join(folder, f)) 282 | 
logging.debug('processing: %s / %s' % (folder, f)) 283 | for g in gen_doc.get_ess_entities(): 284 | logging.debug('validation label: %s' % g.type) 285 | if g.type not in t2freq: 286 | t2freq[g.type] = 0 287 | t2freq[g.type] += 1 288 | s = '\n'.join(['%s\t%s' % (t, t2freq[t]) for t in t2freq]) 289 | logging.info(s) 290 | return s 291 | 292 | 293 | 294 | if __name__ == "__main__": 295 | log_level = 'DEBUG' 296 | log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s' 297 | logging.basicConfig(level='DEBUG', format=log_format) 298 | # compute_iaa() 299 | # analysing_label_performance('S:/NLP/annotation_it02/annotation_Steven/iteration_02/saved', 300 | # 'P:/wuh/label2performce_steve.tsv') 301 | # generate_gold_stand_from_validation('P:/wuh/SemEHR-working/outputs_it2/nlp2phenome', 302 | # 'S:/NLP/annotation_it02/annotation_Steven/iteration_02/saved', 303 | # 'P:/wuh/SemEHR-working/outputs_it2/gold_stand_results') 304 | sub_json_file = './diabetes_subs.json' 305 | analyse_trajectory_subjects(sub_json_file, './traject_sub_analysis_result.txt') 306 | # if len(sys.argv) != 4: 307 | # print('the syntax is [python ann_utils.py ann_folder, text_folder, result_file]') 308 | # else: 309 | # logging.info('working...') 310 | # get_what_is_changing(sys.argv[1], sys.argv[2], sys.argv[3], eHostAnnFile=False) 311 | # summarise_validation_results('/data/val/it2') -------------------------------------------------------------------------------- /annotation_docs.py: -------------------------------------------------------------------------------- 1 | import utils 2 | from os import listdir 3 | from os.path import basename, isfile, join 4 | import logging 5 | import re 6 | from learners import LabelPerformance 7 | # import reportreader as rr 8 | 9 | 10 | class BasicAnn(object): 11 | """ 12 | a simple NLP (Named Entity) annotation class 13 | """ 14 | 15 | def __init__(self, str, start, end): 16 | self._str = str 17 | self._start = start 18 | self._end = end 19 | self._id = -1 20 | 21 | @property 22 | def id(self): 23 | return self._id 24 | 25 | @id.setter 26 | def id(self, value): 27 | self._id = value 28 | 29 | @property 30 | def str(self): 31 | return self._str 32 | 33 | @str.setter 34 | def str(self, value): 35 | self._str = value 36 | 37 | @property 38 | def start(self): 39 | return self._start 40 | 41 | @start.setter 42 | def start(self, value): 43 | self._start = value 44 | 45 | @property 46 | def end(self): 47 | return self._end 48 | 49 | @end.setter 50 | def end(self, value): 51 | self._end = value 52 | 53 | def overlap(self, other_ann): 54 | if (other_ann.start <= self.start <= other_ann.end or other_ann.start <= self.end <= other_ann.end) or \ 55 | (self.start <= other_ann.start <= self.end or self.start <= other_ann.end <= self.end): 56 | return True 57 | else: 58 | return False 59 | 60 | def is_larger(self, other_ann): 61 | return self.start <= other_ann.start and self.end >= other_ann.end \ 62 | and not (self.start == other_ann.start and self.end == other_ann.end) 63 | 64 | def serialise_json(self): 65 | return {'start': self.start, 'end': self.end, 'str': self.str, 'id': self.id} 66 | 67 | @staticmethod 68 | def deserialise(jo): 69 | ann = BasicAnn(jo['start'], jo['start'], jo['end']) 70 | ann.id = jo['id'] 71 | return ann 72 | 73 | 74 | class EDIRAnn(BasicAnn): 75 | """ 76 | EDIR annotation class 77 | """ 78 | 79 | def __init__(self, str, start, end, type): 80 | self._type = type 81 | super(EDIRAnn, self).__init__(str, start, end) 82 | self._negated = False 83 | 84 | @property 85 | 
def type(self): 86 | return self._type 87 | 88 | @type.setter 89 | def type(self, value): 90 | self._type = value 91 | 92 | @property 93 | def negated(self): 94 | return self._negated 95 | 96 | @negated.setter 97 | def negated(self, value): 98 | self._negated = value 99 | 100 | @property 101 | def label(self): 102 | t = self.type 103 | if self.negated: 104 | t = 'neg_' + t 105 | return t 106 | 107 | 108 | class ContextedAnn(BasicAnn): 109 | """ 110 | a contextulised annotation class (negation/tempolarity/experiencer) 111 | """ 112 | 113 | def __init__(self, str, start, end, negation, temporality, experiencer): 114 | self._neg = negation 115 | self._temp = temporality 116 | self._exp = experiencer 117 | super(ContextedAnn, self).__init__(str, start, end) 118 | 119 | @property 120 | def negation(self): 121 | return self._neg 122 | 123 | @negation.setter 124 | def negation(self, value): 125 | self._neg = value 126 | 127 | @property 128 | def temporality(self): 129 | return self._temp 130 | 131 | @temporality.setter 132 | def temporality(self, value): 133 | self._temp = value 134 | 135 | @property 136 | def experiencer(self): 137 | return self._exp 138 | 139 | @experiencer.setter 140 | def experiencer(self, value): 141 | self._exp = value 142 | 143 | 144 | class PhenotypeAnn(ContextedAnn): 145 | """ 146 | a simple customisable phenotype annotation (two attributes for customised attributes) 147 | """ 148 | 149 | def __init__(self, str, start, end, 150 | negation, temporality, experiencer, 151 | major_type, minor_type): 152 | super(PhenotypeAnn, self).__init__(str, start, end, negation, temporality, experiencer) 153 | self._major_type = major_type 154 | self._minor_type = minor_type 155 | 156 | @property 157 | def major_type(self): 158 | return self._major_type 159 | 160 | @major_type.setter 161 | def major_type(self, value): 162 | self._major_type = value 163 | 164 | @property 165 | def minor_type(self): 166 | return self._minor_type 167 | 168 | @minor_type.setter 169 | def minor_type(self, value): 170 | self._minor_type = value 171 | 172 | def to_dict(self): 173 | return { 174 | 'str': self.str, 175 | 'start': self.start, 176 | 'end': self.end, 177 | 'negation': self.negation, 178 | 'temporality': self.temporality, 179 | 'experiencer': self.experiencer, 180 | 'majorType': self.major_type, 181 | 'minorType': self.minor_type 182 | } 183 | 184 | def serialise_json(self): 185 | dict = super(PhenotypeAnn, self).serialise_json() 186 | dict['major_type'] = self.major_type 187 | dict['minor_type'] = self.minor_type 188 | return dict 189 | 190 | @staticmethod 191 | def deserialise(jo): 192 | ann = PhenotypeAnn(jo['str'], jo['start'], jo['end'], jo['negation'], jo['temporality'], 193 | jo['experiencer'], jo['major_type'], jo['minor_type']) 194 | ann.id = jo['id'] 195 | return ann 196 | 197 | 198 | class SemEHRAnn(ContextedAnn): 199 | """ 200 | SemEHR Annotation Class 201 | """ 202 | 203 | def __init__(self, str, start, end, 204 | negation, temporality, experiencer, 205 | cui, sty, pref, ann_type): 206 | super(SemEHRAnn, self).__init__(str, start, end, negation, temporality, experiencer) 207 | self._cui = cui 208 | self._sty = sty 209 | self._pref = pref 210 | self._ann_type = ann_type 211 | self._ruled_by = [] 212 | 213 | @property 214 | def ruled_by(self): 215 | return self._ruled_by 216 | 217 | @property 218 | def cui(self): 219 | return self._cui 220 | 221 | @cui.setter 222 | def cui(self, value): 223 | self._cui = value 224 | 225 | @property 226 | def sty(self): 227 | return self._sty 228 | 229 | 
@sty.setter 230 | def sty(self, value): 231 | self._sty = value 232 | 233 | @property 234 | def ann_type(self): 235 | return self._ann_type 236 | 237 | @ann_type.setter 238 | def ann_type(self, value): 239 | self._ann_type = value 240 | 241 | @property 242 | def pref(self): 243 | return self._pref 244 | 245 | @pref.setter 246 | def pref(self, value): 247 | self._pref = value 248 | 249 | @staticmethod 250 | def deserialise(jo): 251 | ann = SemEHRAnn(jo['str'], jo['start'], jo['end'], jo['negation'], jo['temporality'], 252 | jo['experiencer'], jo['cui'], jo['sty'], jo['pref'], 'mention') 253 | ann.id = jo['id'] 254 | if 'ruled_by' in jo: 255 | ann._ruled_by = jo['ruled_by'] 256 | if 'study_concepts' in jo: 257 | ann._study_concepts = jo['study_concepts'] 258 | return ann 259 | 260 | 261 | class SemEHRAnnDoc(object): 262 | """ 263 | SemEHR annotation Doc 264 | """ 265 | 266 | def __init__(self, file_path, ann_doc=None): 267 | if ann_doc is not None: 268 | self._doc = ann_doc 269 | else: 270 | self._doc = utils.load_json_data(file_path) 271 | self._anns = [] 272 | self._phenotype_anns = [] 273 | self._sentences = [] 274 | self._others = [] 275 | self.load_anns() 276 | 277 | def load_anns(self): 278 | all_anns = self._anns 279 | panns = self._phenotype_anns 280 | if 'sentences' in self._doc: 281 | # is a SemEHRAnnDoc serialisation 282 | self._anns = [SemEHRAnn.deserialise(a) for a in self._doc['annotations']] 283 | if 'phenotypes' in self._doc: 284 | self._phenotype_anns = [PhenotypeAnn.deserialise(a) for a in self._doc['phenotypes']] 285 | self._sentences = [BasicAnn.deserialise(a) for a in self._doc['sentences']] 286 | else: 287 | for anns in self._doc['annotations']: 288 | for ann in anns: 289 | t = ann['type'] 290 | if t == 'Mention': 291 | a = SemEHRAnn(ann['features']['string_orig'], 292 | int(ann['startNode']['offset']), 293 | int(ann['endNode']['offset']), 294 | 295 | ann['features']['Negation'], 296 | ann['features']['Temporality'], 297 | ann['features']['Experiencer'], 298 | 299 | ann['features']['inst'], 300 | ann['features']['STY'], 301 | ann['features']['PREF'], 302 | t) 303 | all_anns.append(a) 304 | a.id = 'cui-%s' % len(all_anns) 305 | elif t == 'Phenotype': 306 | a = PhenotypeAnn(ann['features']['string_orig'], 307 | int(ann['startNode']['offset']), 308 | int(ann['endNode']['offset']), 309 | 310 | ann['features']['Negation'], 311 | ann['features']['Temporality'], 312 | ann['features']['Experiencer'], 313 | 314 | ann['features']['majorType'], 315 | ann['features']['minorType']) 316 | panns.append(a) 317 | a.id = 'phe-%s' % len(panns) 318 | elif t == 'Sentence': 319 | a = BasicAnn('Sentence', 320 | int(ann['startNode']['offset']), 321 | int(ann['endNode']['offset'])) 322 | self._sentences.append(a) 323 | self._sentences = sorted(self._sentences, key=lambda x: x.start) 324 | a.id = 'sent-%s' % len(self._sentences) 325 | else: 326 | self._others.append(ann) 327 | 328 | sorted(all_anns, key=lambda x: x.start) 329 | 330 | @property 331 | def annotations(self): 332 | return self._anns 333 | 334 | @property 335 | def sentences(self): 336 | return self._sentences 337 | 338 | @sentences.setter 339 | def sentences(self, value): 340 | self._sentences = value 341 | 342 | @property 343 | def phenotypes(self): 344 | return self._phenotype_anns 345 | 346 | def learn_mappings_from_labelled(self, labelled_doc, lbl2insts, lbl2missed): 347 | ed = labelled_doc 348 | sd = self 349 | for e in ed.get_ess_entities(): 350 | matched = False 351 | for a in sd.annotations: 352 | if a.overlap(e) and not 
e.is_larger(a): 353 | matched = True 354 | if e.type not in lbl2insts: 355 | lbl2insts[e.type] = set() 356 | lbl2insts[e.type].add('\t'.join([a.cui, a.pref, a.sty])) 357 | continue 358 | # if not matched: 359 | if True: 360 | if e.type not in lbl2missed: 361 | lbl2missed[e.type] = [] 362 | lbl2missed[e.type].append(e.str.lower()) 363 | 364 | @staticmethod 365 | def keep_max_len_anns(anns): 366 | ann2remove = set() 367 | for idx in range(len(anns)): 368 | a = anns[idx] 369 | for ni in range(idx + 1, len(anns)): 370 | b = anns[ni] 371 | if a.overlap(b): 372 | if a.is_larger(b): 373 | ann2remove.add(b) 374 | elif b.is_larger(a): 375 | ann2remove.add(a) 376 | for a in ann2remove: 377 | anns.remove(a) 378 | 379 | 380 | class Concept2Mapping(object): 381 | """ 382 | a mapping from annotations to phenotypes 383 | """ 384 | 385 | def __init__(self, concept_map_file): 386 | self._concept_map_file = concept_map_file 387 | self._cui2label = {} 388 | self._concept2label = None 389 | self._type2concept = {} 390 | self._type2gaz = {} 391 | self._all_entities = [] 392 | self.load_concept_mappings() 393 | 394 | def load_concept_mappings(self): 395 | concept_mapping = utils.load_json_data(self._concept_map_file) 396 | concept2types = {} 397 | for t in concept_mapping: 398 | self._type2concept[t] = [] 399 | for text in concept_mapping[t]: 400 | c = text[:8] # only to get the CUI 401 | self._type2concept[t].append(c) 402 | arr = text.split('\t') 403 | self._cui2label[c] = arr[1] 404 | if c not in concept2types: 405 | concept2types[c] = [] 406 | concept2types[c].append(t) 407 | self._all_entities.append(c.lower()) 408 | self._concept2label = concept2types 409 | 410 | def load_gaz_dir(self, gaz_dir): 411 | files = [f for f in listdir(gaz_dir) if isfile(join(gaz_dir, f))] 412 | for f in files: 413 | if f.endswith('.lst'): 414 | t = f.split('.')[0] 415 | self._type2gaz[t] = utils.read_text_file(join(gaz_dir, f)) 416 | self._all_entities += [t.lower() for t in self._type2gaz[t]] 417 | 418 | @property 419 | def cui2label(self): 420 | return self._cui2label 421 | 422 | @property 423 | def concept2label(self): 424 | return self._concept2label 425 | 426 | @concept2label.setter 427 | def concept2label(self, value): 428 | self._concept2label = value 429 | 430 | def type2cocnepts(self, type): 431 | return self._type2concept[type] 432 | 433 | @property 434 | def type2gaz(self): 435 | return self._type2gaz 436 | 437 | @property 438 | def all_entities(self): 439 | return self._all_entities 440 | 441 | 442 | class CustomisedRecoginiser(SemEHRAnnDoc): 443 | """ 444 | recognise target labels based on identified UMLS entities and 445 | customised labels 446 | """ 447 | 448 | def __init__(self, file_path, concept_mapping, ann_doc=None): 449 | super(CustomisedRecoginiser, self).__init__(file_path=file_path, ann_doc=ann_doc) 450 | self._concept_mapping = concept_mapping 451 | self._mapped = None 452 | self._phenotypes = None 453 | self._combined = None 454 | self._full_text_folder = None 455 | self._full_text_file_pattern = '%s.txt' 456 | self._full_text = None 457 | 458 | @property 459 | def full_text_folder(self): 460 | return self._full_text_folder 461 | 462 | @full_text_folder.setter 463 | def full_text_folder(self, value): 464 | self._full_text_folder = value 465 | 466 | @property 467 | def full_text_file_pattern(self): 468 | return self._full_text_file_pattern 469 | 470 | @full_text_file_pattern.setter 471 | def full_text_file_pattern(self, value): 472 | self._full_text_file_pattern = value 473 | 474 | @property 475 | def 
concept2label(self): 476 | return self._concept_mapping.concept2label 477 | 478 | def get_mapped_labels(self): 479 | if self._mapped is not None: 480 | return self._mapped 481 | mapped = [] 482 | for ann in self.annotations: 483 | if ann.cui in self.concept2label: 484 | for t in self.concept2label[ann.cui]: 485 | ea = EDIRAnn(ann.str, ann.start, ann.end, t) 486 | ea.negated = ann.negation == 'Negated' 487 | ea.id = ann.id 488 | mapped.append(ea) 489 | self._mapped = mapped 490 | return mapped 491 | 492 | def get_customised_phenotypes(self): 493 | if self._phenotypes is not None: 494 | return self._phenotypes 495 | self._phenotypes = [] 496 | for ann in self.phenotypes: 497 | ea = EDIRAnn(ann.str, ann.start, ann.end, ann.minor_type) 498 | ea.negated = ann.negation == 'Negated' 499 | ea.id = ann.id 500 | self._phenotypes.append(ea) 501 | return self._phenotypes 502 | 503 | def get_ann_sentence(self, ann): 504 | sent = None 505 | for s in self.sentences: 506 | if ann.overlap(s): 507 | sent = s 508 | break 509 | if sent is None: 510 | print('sentence not found for %s' % ann.__dict__) 511 | return None 512 | return sent 513 | 514 | def get_previous_sentences(self, ann, include_self=True): 515 | sent = self.get_ann_sentence(ann) 516 | if sent is None: 517 | return None 518 | sents = [] 519 | for s in self.sentences: 520 | if s.start < sent.start: 521 | sents.append(s) 522 | return sorted(sents + ([] if not include_self else [sent]), key=lambda s: s.start) 523 | 524 | def get_sent_anns(self, sent, ann_ignore=None, filter_fun=None, filter_param=None): 525 | ret = {'umls': [], 'phenotype': []} 526 | for a in self.annotations: 527 | if a.overlap(sent): 528 | if ann_ignore is not None and ann_ignore.overlap(a): 529 | continue 530 | if filter_fun is not None and filter_fun(a, filter_param): 531 | continue 532 | ret['umls'].append(a) 533 | for a in self.phenotypes: 534 | if a.overlap(sent): 535 | if ann_ignore is not None and ann_ignore.overlap(a): 536 | continue 537 | if filter_fun is not None and filter_fun(a, filter_param): 538 | continue 539 | ret['phenotype'].append(a) 540 | return ret 541 | 542 | def get_same_sentence_anns(self, ann): 543 | sent = self.get_ann_sentence(ann) 544 | if sent is None: 545 | return None 546 | return self.get_sent_anns(sent, ann) 547 | 548 | def get_prior_anns(self, ann, filter_fun=None, filter_param=None, contenxt_depth=-1): 549 | sents = self.get_previous_sentences(ann) 550 | ret = {'umls': [], 'phenotype': []} 551 | for s in sents[contenxt_depth:]: 552 | r = self.get_sent_anns(s, ann_ignore=ann, filter_fun=filter_fun, filter_param=filter_param) 553 | ret['umls'] += r['umls'] 554 | ret['phenotype'] += r['phenotype'] 555 | return ret 556 | 557 | def get_containing_anns(self, ann): 558 | c_anns = [] 559 | for a in self.phenotypes: 560 | if ann != a and ann.str.lower() in a.str.lower() and len(a.str) > len(ann.str): 561 | c_anns.append(a) 562 | return c_anns 563 | 564 | @property 565 | def full_text(self): 566 | return self._full_text 567 | 568 | @full_text.setter 569 | def full_text(self, value): 570 | self._full_text = value 571 | 572 | def get_full_text(self, fk): 573 | if self._full_text is None and self._full_text_folder is not None and self._full_text_file_pattern is not None: 574 | self._full_text = utils.read_text_file_as_string( 575 | join(self._full_text_folder, 576 | self._full_text_file_pattern % fk), encoding='utf-8') 577 | return self._full_text 578 | 579 | def relocate_all_anns(self, fk): 580 | t = self.get_full_text(fk) 581 | for a in self.phenotypes + 
self.annotations: 582 | s, e = relocate_annotation_pos(t, a.start, a.end, a.str) 583 | a.start = s 584 | a.end = e 585 | 586 | def re_segment_sentences(self, fk): 587 | text = self.get_full_text(fk) 588 | if text is not None: 589 | self.sentences = rr.get_sentences_as_anns(rr.get_nlp_instance(), text) 590 | 591 | def get_context_words(self, ann, file_key, n_words=2): 592 | sent = self.get_ann_sentence(ann) 593 | t = self.get_full_text(file_key) 594 | words = [] 595 | if t is not None: 596 | s = t[sent.start:sent.end] 597 | context_start = ann.start - sent.start + len(ann.str) 598 | str = s[context_start:] 599 | p = re.compile(r'\[A-Za-z]{0,2}\b(\w+)\b') 600 | idx = 0 601 | for m in p.finditer(str): 602 | if idx <= n_words - 1: 603 | words.append(str[m.span(1)[0]:m.span(1)[1]]) 604 | else: 605 | break 606 | idx += 1 607 | 608 | # use dependency tree to get context words 609 | # abss = rr.AbstractedSentence(1) 610 | # abss.text = s 611 | # result = abss.get_abstaction_by_pos(abss.locate_pos(ann.str), rr.get_nlp_instance()) 612 | # dep_words = [] 613 | # if result is not None: 614 | # # subject 615 | # dep_words.append(result.subject[0].text if len(result.subject) > 0 else 'empty') 616 | 617 | # # first verb other than root verb 618 | # dep_words.append(result.verbs[0].text if len(result.verbs) > 0 else 'empty') 619 | 620 | # # root verb 621 | # dep_words.append(result.root.text if result.root is not None else 'empty') 622 | 623 | # # first child 624 | # dep_words.append(result.children[0].text if len(result.children) > 0 else 'empty') 625 | # else: 626 | # dep_words += ['empty'] *4 627 | # logging.debug('not found [%s]' % s) 628 | # words += dep_words 629 | if len(words) == 0: 630 | words = ['empty'] 631 | return words 632 | 633 | def get_anns_by_label(self, label, ignore_mappings=[], no_context=False): 634 | anns = [] 635 | t = label.replace('neg_', '') 636 | for a in self.annotations: 637 | if a.cui not in self.concept2label: 638 | continue 639 | if a.cui in ignore_mappings: 640 | continue 641 | if len(a.ruled_by) > 0: 642 | continue 643 | if t in self.concept2label[a.cui]: 644 | if no_context: 645 | anns.append(a) 646 | elif label.startswith('neg_') and a.negation == 'Negated': 647 | anns.append(a) 648 | elif not label.startswith('neg_') and a.negation != 'Negated': 649 | anns.append(a) 650 | # anns = [] 651 | phenotypes = [] 652 | smaller_to_remove = [] 653 | for a in self.phenotypes: 654 | if a.minor_type == t: 655 | if a.str.lower() in [s.lower() for s in ignore_mappings]: 656 | continue 657 | if no_context or (label.startswith('neg_') and a.negation == 'Negated') or \ 658 | (not label.startswith('neg_') and a.negation != 'Negated'): 659 | overlaped = False 660 | for ann in anns + phenotypes: 661 | if ann.overlap(a): 662 | if a.is_larger(ann): 663 | smaller_to_remove.append(ann) 664 | else: 665 | overlaped = True 666 | break 667 | if not overlaped: 668 | phenotypes.append(a) 669 | for o in smaller_to_remove: 670 | if o in anns: 671 | anns.remove(o) 672 | if o in phenotypes: 673 | phenotypes.remove(o) 674 | return anns + phenotypes 675 | 676 | def get_combined_anns(self): 677 | if self._combined is not None: 678 | return self._combined 679 | anns = [] + self.get_mapped_labels() 680 | for ann in self.get_customised_phenotypes(): 681 | overlaped = False 682 | for m in self.get_mapped_labels(): 683 | if ann.overlap(m): 684 | overlaped = True 685 | break 686 | if not overlaped: 687 | anns.append(ann) 688 | self._combined = anns 689 | return anns 690 | 691 | def 
validate_mapped_performance(self, gold_anns, label2performance): 692 | CustomisedRecoginiser.validate(gold_anns, self.get_mapped_labels(), label2performance) 693 | 694 | def validate_combined_performance(self, gold_anns, label2performance): 695 | CustomisedRecoginiser.validate(gold_anns, 696 | self.get_combined_anns(), 697 | label2performance) 698 | 699 | @staticmethod 700 | def validate(gold_anns, learnt_anns, label2performance): 701 | matched_ann_ids = [] 702 | for ga in gold_anns: 703 | l = ga.label 704 | if l not in label2performance: 705 | label2performance[l] = LabelPerformance(l) 706 | performance = label2performance[l] 707 | matched = False 708 | for la in learnt_anns: 709 | if la.label == l and la.overlap(ga): 710 | matched = True 711 | performance.increase_true_positive() 712 | matched_ann_ids.append(la.id) 713 | break 714 | if not matched: 715 | performance.increase_false_negative() 716 | for la in learnt_anns: 717 | if la.id not in matched_ann_ids: 718 | l = la.label 719 | if l not in label2performance: 720 | label2performance[l] = LabelPerformance(l) 721 | performance = label2performance[l] 722 | performance.increase_false_positive() 723 | 724 | @staticmethod 725 | def print_performances(label2performances): 726 | s = ''.join(['*' * 10, 'performance', '*' * 10]) 727 | s += '\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('label', 'precision', 'recall', 'f1', '#insts', 'false positive', 728 | 'false negative', 'true positive') 729 | for t in label2performances: 730 | p = label2performances[t] 731 | s += '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (t, p.precision, p.recall, p.f1, 732 | p.true_positive + p.false_negative, 733 | p.false_positive, p.false_negative, p.true_positive) 734 | logging.getLogger('performance').info(s) 735 | return s 736 | 737 | 738 | def relocate_annotation_pos(t, s, e, string_orig): 739 | if t[s:e] == string_orig: 740 | return [s, e] 741 | candidates = [] 742 | ito = re.finditer(r'[\s\.;\,\?\!\:\/$^](' + string_orig + r')[\s\.;\,\?\!\:\/$^]', 743 | t, re.IGNORECASE) 744 | for mo in ito: 745 | # print mo.start(1), mo.end(1), mo.group(1) 746 | candidates.append({'dis': abs(s - mo.start(1)), 's': mo.start(1), 'e': mo.end(1), 'matched': mo.group(1)}) 747 | if len(candidates) == 0: 748 | return [s, e] 749 | candidates.sort(cmp=lambda x1, x2: x1['dis'] - x2['dis']) 750 | # print candidates[0] 751 | return [candidates[0]['s'], candidates[0]['e']] -------------------------------------------------------------------------------- /data/entity_types.txt: -------------------------------------------------------------------------------- 1 | haemorrhagic_stroke 2 | haemorrhagic_transformation 3 | ischaemic_stroke 4 | loc_cortical 5 | loc_deep 6 | mening_tumour 7 | metast_tumour 8 | microhaemorrhage 9 | neg_atrophy 10 | neg_haemorrhagic_stroke 11 | neg_haemorrhagic_transformation 12 | neg_ischaemic_stroke 13 | neg_loc_cortical 14 | neg_loc_deep 15 | neg_mening_tumour 16 | neg_metast_tumour 17 | neg_microhaemorrhage 18 | neg_small_vessel_disease 19 | neg_stroke 20 | neg_subarachnoid_haemorrhage 21 | neg_subdural_haematoma 22 | neg_time_old 23 | neg_time_recent 24 | neg_tumour 25 | small_vessel_disease 26 | stroke 27 | subarachnoid_haemorrhage 28 | subdural_haematoma 29 | time_old 30 | time_recent 31 | tumour 32 | atrophy -------------------------------------------------------------------------------- /data/entity_types_modifiers.txt: -------------------------------------------------------------------------------- 1 | loc_cortical 2 | loc_deep 3 | time_old 4 | time_recent 
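The following is a minimal, self-contained sketch (not part of the repository; the mention strings, offsets and the second predicted label are invented for illustration) showing how the static CustomisedRecoginiser.validate(...) and print_performances(...) helpers defined in annotation_docs.py above can be exercised in isolation: gold and predicted mentions are plain EDIRAnn objects, and per-label true positive / false positive / false negative counts are accumulated into LabelPerformance objects keyed by label.

from annotation_docs import EDIRAnn, CustomisedRecoginiser

# one gold-standard mention and two predicted mentions (spans and strings are made up)
gold = [EDIRAnn(str='infarct', start=37, end=44, type='ischaemic_stroke')]
predicted = [EDIRAnn(str='infarct', start=37, end=44, type='ischaemic_stroke'),
             EDIRAnn(str='meningioma', start=60, end=70, type='mening_tumour')]
for idx, ann in enumerate(predicted):
    ann.id = idx  # validate() tracks matched predictions by their id

label2performance = {}
CustomisedRecoginiser.validate(gold, predicted, label2performance)
# expected: one true positive for ischaemic_stroke, one false positive for mening_tumour
print(CustomisedRecoginiser.print_performances(label2performance))

In the pipeline these counts are normally filled in per document via validate_mapped_performance / validate_combined_performance rather than by calling validate directly.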
-------------------------------------------------------------------------------- /data/entity_types_no_context.txt: -------------------------------------------------------------------------------- 1 | atrophy 2 | glioma_tumour 3 | haemorrhagic_stroke 4 | haemorrhagic_transformation 5 | ischaemic_stroke 6 | loc_cortical 7 | loc_deep 8 | mening_tumour 9 | metast_tumour 10 | microhaemorrhage 11 | small_vessel_disease 12 | stroke 13 | subarachnoid_haemorrhage 14 | subdural_haematoma 15 | time_old 16 | time_recent 17 | tumour -------------------------------------------------------------------------------- /data/entity_types_phenotypes.txt: -------------------------------------------------------------------------------- 1 | atrophy 2 | glioma_tumour 3 | haemorrhagic_stroke 4 | haemorrhagic_transformation 5 | ischaemic_stroke 6 | mening_tumour 7 | metast_tumour 8 | microhaemorrhage 9 | small_vessel_disease 10 | stroke 11 | subarachnoid_haemorrhage 12 | subdural_haematoma 13 | tumour -------------------------------------------------------------------------------- /data/entity_types_times.txt: -------------------------------------------------------------------------------- 1 | time_old 2 | time_recent -------------------------------------------------------------------------------- /doc_inference.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import re 3 | import json 4 | import sys 5 | 6 | 7 | class RuleConstruct(object): 8 | def __init__(self, phenotype): 9 | self._phenotype = phenotype 10 | self._negation = 'Affirmed' 11 | self._temporality = 'Recent' 12 | self._experiencer = 'Patient' 13 | 14 | @property 15 | def phenotype(self): 16 | return self._phenotype 17 | 18 | @phenotype.setter 19 | def phenotype(self, value): 20 | self._phenotype = value 21 | 22 | @property 23 | def negation(self): 24 | return self._negation 25 | 26 | @negation.setter 27 | def negation(self, value): 28 | self._negation = value 29 | 30 | @property 31 | def temporality(self): 32 | return self._temporality 33 | 34 | @temporality.setter 35 | def temporality(self, value): 36 | self._temporality = value 37 | 38 | @property 39 | def experiencer(self): 40 | return self._experiencer 41 | 42 | @experiencer.setter 43 | def experiencer(self, value): 44 | self._experiencer = value 45 | 46 | 47 | class PhenotypeRule(object): 48 | def __init__(self): 49 | self._inclusion = [] 50 | self._exclusion = [] 51 | self._rule_label = None 52 | 53 | def inclusion_constructs(self): 54 | return self._inclusion 55 | 56 | def exclusion_units(self): 57 | return self._exclusion 58 | 59 | @property 60 | def rule_label(self): 61 | return self._rule_label 62 | 63 | @rule_label.setter 64 | def rule_label(self, value): 65 | self._rule_label = value 66 | 67 | @staticmethod 68 | def load_rules(rule_file): 69 | rules = utils.load_json_data(rule_file) 70 | prs = [] 71 | for r in rules: 72 | pr = PhenotypeRule() 73 | pr.rule_label = r['label'] 74 | prs.append(pr) 75 | pr.inclusion_constructs = [PhenotypeRule.get_rule_construct(c) for c in r['inclusions']] 76 | pr.exclusion_units = [] 77 | for u in r['exclusion_units']: 78 | pr.exclusion_units.append([PhenotypeRule.get_rule_construct(c) for c in u]) 79 | return prs 80 | 81 | @staticmethod 82 | def get_rule_construct(c): 83 | rc = RuleConstruct(c['phenotype']) 84 | if 'negation' in c: 85 | rc.negation = c['negation'] 86 | if 'temporality' in c: 87 | rc.temporality = c['temporality'] 88 | if 'experiencer' in c: 89 | rc.experiencer = c['experiencer'] 90 | 
return rc 91 | 92 | 93 | class PhenotypeRuleExecutor(object): 94 | def __init__(self): 95 | pass 96 | 97 | @staticmethod 98 | def apply_rules(doc_anns, rules): 99 | label_prov = [] 100 | anns = [t['ann'] for t in doc_anns] 101 | for r in rules: 102 | prov = {"exclusion": [], "inclusion": None} 103 | label = '' 104 | inclusion_matched = PhenotypeRuleExecutor.match_rule_construct(r.inclusion_constructs, anns) 105 | if len(inclusion_matched) > 0: 106 | prov['inclusion'] = inclusion_matched 107 | for ec in r.exclusion_units: 108 | exclusion_matched = PhenotypeRuleExecutor.match_rule_construct(ec, anns) 109 | if len(exclusion_matched) > 0: 110 | prov['exclusion'].append({'ec': ec, 'matched': exclusion_matched}) 111 | if len(prov['exclusion']) == 0: 112 | label = r.rule_label 113 | if label != '': # or len(prov['exclusion']) > 0: 114 | label_prov.append({'label': label, 'prov': prov}) 115 | return label_prov 116 | 117 | @staticmethod 118 | def match_ann_rule(rc, ann): 119 | return ann['minorType'] == rc.phenotype and ann['negation'] == rc.negation and ann[ 120 | 'temporality'] == rc.temporality and ann['experiencer'] == rc.experiencer 121 | 122 | @staticmethod 123 | def match_rule_construct(rc_list, anns): 124 | matched = [] 125 | for ann in anns: 126 | m = True 127 | for rc in rc_list: 128 | if not PhenotypeRuleExecutor.match_ann_rule(rc, ann): 129 | m = False 130 | break 131 | if m: 132 | matched.append(ann) 133 | return matched 134 | 135 | 136 | def load_patient_truth(truth_file): 137 | all_pids = [] 138 | lines = utils.read_text_file(truth_file) 139 | type2ids = {} 140 | for l in lines: 141 | arr = l.split('\t') 142 | if arr[2] not in type2ids: 143 | type2ids[arr[2]] = [] 144 | type2ids[arr[2]].append(arr[0]) 145 | all_pids.append(arr[0]) 146 | return type2ids, all_pids 147 | 148 | 149 | def cal_performance(no_reports_pids, type2ids, doc_type2id, gd_labels, pred_label): 150 | gt_list = [] 151 | for lbl in gd_labels: 152 | gt_list += type2ids[lbl] 153 | gt_ids = set(gt_list) 154 | pr_ids = set(doc_type2id[pred_label]) 155 | print('\n*****%s******' % pred_label) 156 | 157 | false_negative = gt_ids - no_reports_pids - pr_ids 158 | false_positive = pr_ids - gt_ids 159 | print('total reported patients: %s, total truth: %s, predicted: %s, false negative:%s, false positive:%s' 160 | % (len(pids), len(gt_ids - no_reports_pids), len(pr_ids), len(false_negative), len(false_positive))) 161 | print('false negative: %s' % (false_negative)) 162 | print('false positive: %s' % false_positive) 163 | 164 | 165 | def doc_infer_with_ground_truth(patient_level_tsv, pids, doc_type2id): 166 | type2ids, all_pids = load_patient_truth(patient_level_tsv) 167 | no_reports_pids = set(all_pids) - set(pids) 168 | cal_performance(no_reports_pids, type2ids, doc_type2id, ['SAH', 'ICH'], 'primary haemorrhagic stroke') 169 | cal_performance(no_reports_pids, type2ids, doc_type2id, ['SAH'], 'subarachnoid haemorrhage') 170 | cal_performance(no_reports_pids, type2ids, doc_type2id, ['ICH'], 'intracerebra haemorrhage') 171 | cal_performance(no_reports_pids, type2ids, doc_type2id, ['Ischaemic'], 'ischaemic stroke') 172 | 173 | 174 | def doc_infer(settings): 175 | rules = PhenotypeRule.load_rules(settings['rule_file']) 176 | d2predicted = utils.load_json_data(settings['doc_nlp_results']) 177 | doc_labels_output = settings['doc_inference_output'] 178 | s = '' 179 | doc_type2id = {} 180 | pids = [] 181 | for d in d2predicted: 182 | m = re.match(r'Stroke\_id\_(\d+)(\.\d+){0,1}', d) 183 | pid = d 184 | if m is not None: 185 | pid = 
m.group(1) 186 | pids.append(pid) 187 | label_provs = PhenotypeRuleExecutor.apply_rules(d2predicted[d], rules) 188 | print(pid, d, label_provs) 189 | for lp in label_provs: 190 | if lp['label'] != '': 191 | s += '%s\t%s\n' % (pid, lp['label']) 192 | if lp['label'] not in doc_type2id: 193 | doc_type2id[lp['label']] = [] 194 | doc_type2id[lp['label']].append(pid) 195 | 196 | pids = list(set(pids)) 197 | print(json.dumps(pids)) 198 | utils.save_string(s, doc_labels_output) 199 | if 'patient_level_truth_tsv' in settings: 200 | doc_infer_with_ground_truth(settings['patient_level_truth_tsv'], pids, doc_type2id) 201 | 202 | 203 | if __name__ == "__main__": 204 | if len(sys.argv) != 2: 205 | print('the syntax is [python doc_inference.py PROCESS_SETTINGS_FILE_PATH]') 206 | else: 207 | infer_settings = utils.load_json_data(sys.argv[1]) 208 | doc_infer(infer_settings) 209 | -------------------------------------------------------------------------------- /learners.py: -------------------------------------------------------------------------------- 1 | import joblib as jl 2 | from sklearn import tree 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.gaussian_process import GaussianProcessClassifier 5 | from sklearn.naive_bayes import GaussianNB 6 | from sklearn import svm 7 | from sklearn.decomposition import PCA 8 | from sklearn.cluster import DBSCAN 9 | from sklearn.neighbors import KNeighborsClassifier, KDTree 10 | from sklearn.metrics.pairwise import cosine_similarity 11 | import logging 12 | from os.path import basename, isfile, join, split 13 | from os import listdir, remove 14 | import graphviz 15 | import numpy 16 | 17 | 18 | class PhenomeLearners(object): 19 | def __init__(self, setting): 20 | self._setting = setting 21 | 22 | @property 23 | def min_sample_size(self): 24 | return self._setting['min_sample_size'] 25 | 26 | @staticmethod 27 | def decision_tree_learning(self, X, Y, lm, output_file=None, pca_dim=None, pca_file=None, tree_viz_file=None, 28 | lbl='united', min_sample_size=25): 29 | if len(X) <= min_sample_size: 30 | logging.warning('not enough data found for prediction: %s' % lm.label) 31 | if isfile(output_file): 32 | remove(output_file) 33 | return 34 | pca = None 35 | if pca_dim is not None: 36 | pca = PCA(n_components=pca_dim) 37 | X_new = pca.fit_transform(X) 38 | else: 39 | X_new = X 40 | clf = tree.DecisionTreeClassifier() 41 | clf = clf.fit(X_new, Y) 42 | if output_file is not None: 43 | jl.dump(clf, output_file) 44 | logging.info('model file saved to %s' % output_file) 45 | if pca is not None and pca_file is not None: 46 | jl.dump(pca, pca_file) 47 | if tree_viz_file is not None: 48 | label_feature_names = [] 49 | if lm.use_one_dimension_for_label: 50 | label_feature_names.append('label') 51 | else: 52 | for l in lm.label_dimensions: 53 | if l.upper() in lm.cui2label: 54 | label_feature_names.append('lbl: ' + lm.cui2label[l.upper()]) 55 | else: 56 | label_feature_names.append('lbl: ' + l.upper()) 57 | dot_data = tree.export_graphviz(clf, out_file=None, 58 | filled=True, rounded=True, 59 | feature_names=label_feature_names + 60 | [(str(lm.cui2label[ 61 | l.upper()]) + '(' + l.upper() + ')') if l.upper() in lm.cui2label else l 62 | for l in lm.context_dimensions(lbl)], 63 | class_names=['Yes', 'No'], 64 | special_characters=True) 65 | graph = graphviz.Source(dot_data) 66 | graph.render(tree_viz_file) 67 | 68 | @staticmethod 69 | def random_forest_learning(X, Y, output_file=None): 70 | if len(X) == 0: 71 | logging.warning('no data found for prediction') 
72 | return 73 | clf = RandomForestClassifier() 74 | clf = clf.fit(X, Y) 75 | if output_file is not None: 76 | jl.dump(clf, output_file) 77 | logging.info('model file saved to %s' % output_file) 78 | 79 | @staticmethod 80 | def svm_learning(X, Y, output_file=None): 81 | if len(X) == 0: 82 | logging.info('no data found for prediction') 83 | return 84 | v = -1 85 | all_same = True 86 | for y in Y: 87 | if v == -1: 88 | v = y[0] 89 | if v != y[0]: 90 | all_same = False 91 | break 92 | if all_same: 93 | logging.warning('all same labels %s' % Y) 94 | return 95 | clf = svm.SVC(kernel='sigmoid') 96 | clf = clf.fit(X, Y) 97 | if output_file is not None: 98 | jl.dump(clf, output_file) 99 | logging.info('model file saved to %s' % output_file) 100 | 101 | @staticmethod 102 | def gpc_learning(X, Y, output_file=None): 103 | gpc = GaussianProcessClassifier().fit(X, Y) 104 | if output_file is not None: 105 | jl.dump(gpc, output_file) 106 | logging.info('model file saved to %s' % output_file) 107 | 108 | @staticmethod 109 | def gaussian_nb(X, Y, output_file=None): 110 | gnb = GaussianNB().fit(X, Y) 111 | if output_file is not None: 112 | jl.dump(gnb, output_file) 113 | logging.info('model file saved to %s' % output_file) 114 | 115 | @staticmethod 116 | def cluster(X, Y, output_file=None): 117 | dbm = DBSCAN(eps=.50).fit(X) 118 | cls2label = {} 119 | for idx in range(len(dbm.labels_)): 120 | c = dbm.labels_[idx] 121 | cls = 'cls%s' % c 122 | if cls not in cls2label: 123 | cls2label[cls] = {'t': 0, 'f': 0} 124 | if Y[idx] == [0]: 125 | cls2label[cls]['f'] += 1 126 | else: 127 | cls2label[cls]['t'] += 1 128 | logging.info(cls2label) 129 | kdt = KDTree(X) 130 | if output_file is not None: 131 | jl.dump({'dbm': dbm, 'X': X, 'Y': Y, 'kdt': kdt, 'cls2label': cls2label}, output_file) 132 | logging.info('complex model file saved to %s' % output_file) 133 | 134 | @staticmethod 135 | def cluster_predict(X, Y, fns, multiple_tps, model_file, performance, 136 | separate_performance=None, min_sample_size=25): 137 | all_true = False 138 | if not isfile(model_file): 139 | logging.info('model file NOT FOUND: %s' % model_file) 140 | all_true = True 141 | else: 142 | m = jl.load(model_file) 143 | dbm = m['dbm'] 144 | kdt = m['kdt'] 145 | P = m.predict(X) 146 | if fns > 0: 147 | logging.debug('missed instances: %s' % fns) 148 | performance.increase_false_negative(fns) 149 | if multiple_tps > 0: 150 | performance.increase_true_positive(multiple_tps) 151 | if all_true or len(X) <= min_sample_size: 152 | logging.warn('using querying instead of predicting') 153 | P = numpy.ones(len(X)) 154 | else: 155 | logging.info('instance size %s' % len(P)) 156 | for idx in range(len(P)): 157 | LabelPerformance.evaluate_to_performance(P[idx], Y[idx], [performance, separate_performance]) 158 | 159 | @staticmethod 160 | def knn_classify(X, Y, output_file=None): 161 | knn = KNeighborsClassifier(n_neighbors=2).fit(X, Y) 162 | if output_file is not None: 163 | jl.dump(knn, output_file) 164 | logging.info('model file saved to %s' % output_file) 165 | 166 | @staticmethod 167 | def predict_use_simple_stats(tp_ratio, Y, multiple_tps, performance, ratio_cut_off=0.15, separate_performance=None, 168 | id2conll=None, doc_anns=None, file_pattern=None, doc_folder=None, 169 | label_whitelist=None, mp_predicted=None): 170 | P = numpy.ones(len(Y)) if tp_ratio >= ratio_cut_off else numpy.zeros(len(Y)) 171 | P = PhenomeLearners.merge_with_pattern_prediction(P, mp_predicted) 172 | if multiple_tps > 0: 173 | performance.increase_true_positive(multiple_tps) 174 | 
if separate_performance is not None: 175 | separate_performance.increase_true_positive(multiple_tps) 176 | PhenomeLearners.cal_performance(P, Y, performance, separate_performance, 177 | id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern, 178 | doc_folder=doc_folder, 179 | label_whitelist=label_whitelist) 180 | 181 | @staticmethod 182 | def merge_with_pattern_prediction(y_pred, mp_predict): 183 | if mp_predict is None: 184 | return y_pred 185 | y_merged = [] 186 | print('>>>', y_pred, mp_predict) 187 | for idx in range(len(y_pred)): 188 | y_merged.append(y_pred[idx]) 189 | if y_pred[idx] == 1 and mp_predict[idx] == 0: 190 | y_merged[idx] = 0 191 | return y_merged 192 | 193 | @staticmethod 194 | def predict_use_simple_stats_in_action(tp_ratio, item_size, ratio_cut_off=0.15, 195 | doc2predicted=None, doc_anns=None, mp_predicted=None): 196 | P = numpy.ones(item_size) if tp_ratio >= ratio_cut_off else numpy.zeros(item_size) 197 | P = PhenomeLearners.merge_with_pattern_prediction(P, mp_predicted) 198 | PhenomeLearners.collect_prediction(P, doc2predicted=doc2predicted, doc_anns=doc_anns) 199 | 200 | @staticmethod 201 | def cal_performance(P, Y, performance, separate_performance=None, 202 | id2conll=None, doc_anns=None, file_pattern=None, doc_folder=None, label_whitelist=None): 203 | 204 | P = numpy.asarray(P).flatten().tolist() 205 | Y = numpy.asarray(Y).flatten().tolist() 206 | doc2predicted = {} 207 | for idx in range(len(P)): 208 | LabelPerformance.evaluate_to_performance(P[idx], Y[idx], [performance, separate_performance]) 209 | if P[idx] == 1.0 and id2conll is not None and doc_anns is not None and doc_folder is not None: 210 | PhenomeLearners.collect_prediction(P, doc_anns, doc2predicted) 211 | # comment the following out to skip conll outputs 212 | # for d in doc2predicted: 213 | # if d not in id2conll: 214 | # id2conll[d] = ConllDoc(join(doc_folder, file_pattern % d)) 215 | # if label_whitelist is not None: 216 | # id2conll[d].set_label_white_list(label_whitelist) 217 | # cnll = id2conll[d] 218 | # for anns in doc2predicted[d]: 219 | # cnll.add_predicted_labels(anns) 220 | 221 | @staticmethod 222 | def predict_use_model(X, Y, fns, multiple_tps, model_file, performance, 223 | pca_model_file=None, separate_performance=None, 224 | id2conll=None, doc_anns=None, file_pattern=None, doc_folder=None, 225 | label_whitelist=None, mp_predicted=None): 226 | all_true = False 227 | if not isfile(model_file): 228 | logging.info('model file NOT FOUND: %s' % model_file) 229 | all_true = True 230 | else: 231 | if pca_model_file is not None: 232 | pca = jl.load(pca_model_file) 233 | X_new = pca.transform(X) 234 | else: 235 | X_new = X 236 | m = jl.load(model_file) 237 | P = m.predict(X_new) 238 | if fns > 0: 239 | logging.debug('missed instances: %s' % fns) 240 | performance.increase_false_negative(fns) 241 | if multiple_tps > 0: 242 | performance.increase_true_positive(multiple_tps) 243 | if separate_performance is not None: 244 | separate_performance.increase_true_positive(multiple_tps) 245 | if all_true: # or len(X) <= _min_sample_size: 246 | logging.warning('using querying instead of predicting') 247 | P = numpy.ones(len(X)) 248 | else: 249 | logging.info('instance size %s' % len(P)) 250 | P = PhenomeLearners.merge_with_pattern_prediction(P, mp_predicted) 251 | PhenomeLearners.cal_performance(P, Y, performance, separate_performance, 252 | id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern, 253 | doc_folder=doc_folder, label_whitelist=label_whitelist) 254 | 255 | @staticmethod 
256 | def predict_use_model_in_action(X, model_file, pca_model_file=None, 257 | doc2predicted=None, doc_anns=None, mp_predicted=None): 258 | all_true = False 259 | if not isfile(model_file): 260 | logging.info('model file NOT FOUND: %s' % model_file) 261 | all_true = True 262 | else: 263 | if pca_model_file is not None: 264 | pca = jl.load(pca_model_file) 265 | X_new = pca.transform(X) 266 | else: 267 | X_new = X 268 | m = jl.load(model_file) 269 | P = m.predict(X_new) 270 | 271 | if all_true: # or len(X) <= _min_sample_size: 272 | logging.warning('using querying instead of predicting') 273 | P = numpy.ones(len(X)) 274 | else: 275 | logging.info('instance size %s' % len(P)) 276 | P = PhenomeLearners.merge_with_pattern_prediction(P, mp_predicted) 277 | PhenomeLearners.collect_prediction(P, doc2predicted=doc2predicted, doc_anns=doc_anns) 278 | 279 | @staticmethod 280 | def collect_prediction(P, doc_anns, doc2predicted): 281 | for idx in range(len(P)): 282 | if P[idx] == 1.0 and doc_anns is not None: 283 | d = doc_anns[idx]['d'] 284 | labeled_ann = {'label': doc_anns[idx]['label'], 285 | 'ann': doc_anns[idx]['ann']} 286 | if d not in doc2predicted: 287 | doc2predicted[d] = [labeled_ann] 288 | else: 289 | doc2predicted[d].append(labeled_ann) 290 | 291 | 292 | class LabelPerformance(object): 293 | """ 294 | precision/recall/f1 calculation on TP/FN/FP values 295 | """ 296 | 297 | def __init__(self, label): 298 | self._label = label 299 | self._tp = 0 300 | self._fn = 0 301 | self._fp = 0 302 | 303 | def increase_true_positive(self, k=1): 304 | self._tp += k 305 | 306 | def increase_false_negative(self, k=1): 307 | self._fn += k 308 | 309 | def increase_false_positive(self, k=1): 310 | self._fp += k 311 | 312 | @property 313 | def true_positive(self): 314 | return self._tp 315 | 316 | @property 317 | def false_negative(self): 318 | return self._fn 319 | 320 | @property 321 | def false_positive(self): 322 | return self._fp 323 | 324 | @property 325 | def precision(self): 326 | if self._tp + self._fp == 0: 327 | return -1 328 | else: 329 | return 1.0 * self._tp / (self._tp + self._fp) 330 | 331 | @property 332 | def recall(self): 333 | if self._tp + self._fn == 0: 334 | return -1 335 | else: 336 | return 1.0 * self._tp / (self._tp + self._fn) 337 | 338 | @property 339 | def f1(self): 340 | if self.precision == -1 or self.recall == -1 or self.precision == 0 or self.recall == 0: 341 | return -1 342 | else: 343 | return 2 / (1 / self.precision + 1 / self.recall) 344 | 345 | @staticmethod 346 | def evaluate_to_performance(predicted, labelled, performance_objects): 347 | if predicted == labelled: 348 | if predicted == 1.0: 349 | for pf in performance_objects: 350 | if pf is not None: 351 | pf.increase_true_positive() 352 | elif predicted == 1.0: 353 | for pf in performance_objects: 354 | if pf is not None: 355 | pf.increase_false_positive() 356 | else: 357 | for pf in performance_objects: 358 | if pf is not None: 359 | pf.increase_false_negative() 360 | 361 | 362 | class BinaryClusterClassifier(object): 363 | def __init__(self, label): 364 | self._name = label 365 | self._class1reps = None 366 | self._class2reps = None 367 | 368 | @property 369 | def class1reps(self): 370 | return self._class1reps 371 | 372 | @property 373 | def class2reps(self): 374 | return self._class2reps 375 | 376 | def cluster(self, class1_data, class2_data): 377 | self._class1reps = BinaryClusterClassifier.do_clustering(class1_data, class_prefix='cls1:') 378 | self._class2reps = BinaryClusterClassifier.do_clustering(class2_data, 
class_prefix='cls2:') 379 | 380 | def classify(self, x, threshold=0.5, complementary_classifiers=None): 381 | p = BinaryClusterClassifier.calculate_most_similar(self, x) 382 | mp = p 383 | if p[1] < threshold and complementary_classifiers is not None: 384 | for classifer in complementary_classifiers: 385 | logging.debug('do extra classifying when the similarity is too low ...') 386 | p = BinaryClusterClassifier.calculate_most_similar(classifer, x) 387 | logging.debug('extra result @ %s' % p[1]) 388 | mp = p if p[1] > mp[1] else mp 389 | if p[1] > threshold: 390 | # stop when once exceeding the threshold 391 | break 392 | return mp, 0 if mp[0].startswith('cls2:') else 1 393 | 394 | @staticmethod 395 | def calculate_most_similar(classifier, x): 396 | results = [] 397 | xa = numpy.array(x).reshape(1, -1) 398 | for cls in classifier.class1reps: 399 | results.append((cls, cosine_similarity(xa, classifier.class1reps[cls]))) 400 | for cls in classifier.class2reps: 401 | results.append((cls, cosine_similarity(xa, classifier.class2reps[cls]))) 402 | return sorted(results, key=lambda x: -x[1])[0] 403 | 404 | @staticmethod 405 | def do_clustering(X, class_prefix='cls:'): 406 | dbm = DBSCAN(eps=1.0).fit(X) 407 | cls2insts = {} 408 | for idx in range(len(dbm.labels_)): 409 | c = dbm.labels_[idx] 410 | cls = '%s%s' % (class_prefix, c) 411 | if cls not in cls2insts: 412 | cls2insts[cls] = [X[idx]] 413 | else: 414 | cls2insts[cls].append(X[idx]) 415 | cls2mean = {} 416 | for cls in cls2insts: 417 | cls2mean[cls] = numpy.mean(cls2insts[cls], axis=0).reshape(1, -1) 418 | return cls2mean -------------------------------------------------------------------------------- /mention_pattern.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import utils 3 | import pandas as pd 4 | from os import listdir 5 | from os.path import isfile, join, split 6 | 7 | 8 | class AbstractedSentence(object): 9 | def __init__(self, seq): 10 | self._seq = 0 11 | self._abstracted_tokens = [] 12 | self._text = None 13 | self._parsed = None 14 | 15 | @property 16 | def seq(self): 17 | return self._seq 18 | 19 | @seq.setter 20 | def seq(self, value): 21 | self._seq = value 22 | 23 | def add_token(self, t): 24 | self._abstracted_tokens.append(t) 25 | 26 | @property 27 | def tokens(self): 28 | return self._abstracted_tokens 29 | 30 | @property 31 | def text(self): 32 | return self._text 33 | 34 | @text.setter 35 | def text(self, value): 36 | self._text = value 37 | 38 | def get_parsed_tree(self, nlp): 39 | """ 40 | use spacy instance to parse the sentence 41 | :param nlp: a spacy instance 42 | :return: dependency tree 43 | """ 44 | if self._parsed is not None: 45 | return self._parsed 46 | if self.text is None: 47 | return None 48 | self._parsed = nlp(self.text) 49 | return self._parsed 50 | 51 | def locate_pos(self, str): 52 | return self._text.find(str) 53 | 54 | def get_abstaction_by_pos(self, pos, nlp): 55 | doc = self.get_parsed_tree(nlp) 56 | token = None 57 | if doc is not None: 58 | for t in doc: 59 | if t.idx + len(t.text) == pos: 60 | token = t 61 | if token is not None: 62 | ta = TokenAbstraction(token, doc) 63 | else: 64 | return None 65 | return ta 66 | 67 | def get_related_tokens(self, t): 68 | ret = [] 69 | for tk in self._parsed: 70 | if tk.head == t: 71 | ret.append(tk) 72 | print(tk.text, tk.dep_, tk.head) 73 | return ret 74 | 75 | 76 | class TokenAbstraction(object): 77 | def __init__(self, token, doc): 78 | self._t = token 79 | self._d = doc 80 | self._children = [] 
81 | self._root = None 82 | self._subject = None 83 | self._verbs = None 84 | self._vcontext = [] 85 | self.do_abstract() 86 | 87 | @property 88 | def vcontext(self): 89 | return self._vcontext 90 | 91 | @property 92 | def children(self): 93 | return self._children 94 | 95 | @property 96 | def root(self): 97 | return self._root 98 | 99 | @property 100 | def subject(self): 101 | return self._subject 102 | 103 | @property 104 | def verbs(self): 105 | return self._verbs 106 | 107 | @property 108 | def token(self): 109 | return self._t 110 | 111 | def do_abstract(self): 112 | self._children = [t for t in self._t.children] 113 | t = self._t 114 | r = t 115 | while (t.head != t) and t.dep_ not in ['ROOT', 'relcl', 'acl', 'advcl']: 116 | t = t.head 117 | if t.dep_ in ['ccomp']: 118 | self._subject = [s for s in t.children if s.dep_ in [u"nsubj", 'nsubjpass', 'ROOT', 'pobj']] 119 | if t.pos_ in ['VERB']: 120 | self._vcontext += [s for s in t.children if s.dep_ in ["neg", 'advmod']] 121 | r = t 122 | if t is not None: 123 | self._verbs = [v for v in t.children if v.pos_ == u"VERB"] 124 | if t.dep_ in ['relcl', 'acl']: 125 | self._subject = [t.head] 126 | else: 127 | if len(self._vcontext) == 0: 128 | self._vcontext += [s for s in t.children if s.dep_ in ["neg", 'advmod']] 129 | if self._subject is None: 130 | self._subject = [s for s in t.children if s.dep_ in [u"nsubj", 'nsubjpass', 'ROOT']] 131 | self._root = r 132 | 133 | def do_abstract_waterfall(self, entity_start, entity_end): 134 | t = self._t 135 | seq = [] 136 | while (t.head != t) and t.dep_ not in ['ROOT', 'relcl', 'acl', 'advcl']: 137 | t = t.head 138 | if t.idx > entity_end or (t.idx + len(t.text) < entity_start): 139 | seq.append((t.text, t.dep_, t.pos_)) 140 | seq.reverse() 141 | return seq 142 | 143 | def do_abstract_descendent(self): 144 | return [c for c in self._t.children] 145 | 146 | def to_dict(self): 147 | return {'children': [t.text for t in self.children], 'root': self.root.text, 148 | 'subject': [s.text for s in self.subject], 'verbs': [v.text for v in self.verbs]} 149 | 150 | 151 | class MentionPattern(object): 152 | def __init__(self, pattern_folder, cui2icd, csv_file=None, ann_folder=None, in_action=False): 153 | self._ptn_folder = pattern_folder 154 | self._ref_good_ptns = None 155 | self._ref_bad_ptns = None 156 | self._csv_file = csv_file 157 | self._cui2icd = cui2icd 158 | self._df = None 159 | self._nlp = get_nlp_lg() 160 | self._ann_folder = ann_folder 161 | self._good_ptns = None 162 | self._bad_ptns = None 163 | self._in_action = in_action 164 | self.load() 165 | 166 | def load(self): 167 | if self._csv_file is not None: 168 | self._df = pd.read_csv(self._csv_file) 169 | if self._in_action: 170 | g, b = MentionPattern.load_ref_patterns(self._ptn_folder, 'zzzz') 171 | self._good_ptns = g 172 | self._bad_ptns = b 173 | 174 | @staticmethod 175 | def load_ref_patterns(ptn_folder, ignore_chapter): 176 | good_p = MentionPattern.load_patterns(ptn_folder, to_load=lambda f: f.find('good') > 0 and f.find( 177 | '%s_' % ignore_chapter) != 0) 178 | bad_p = MentionPattern.load_patterns(ptn_folder, to_load=lambda f: f.find('bad') > 0 and f.find( 179 | '%s_' % ignore_chapter) != 0) 180 | return good_p, bad_p 181 | 182 | @staticmethod 183 | def get_sent_by_pos(sents, s, e): 184 | for sent in sents: 185 | if sent['start'] <= s and sent['end'] >= e: 186 | return sent 187 | return None 188 | 189 | def read_semehr_anns(self, doc_anns, container): 190 | """ 191 | doc_anns - [{'d': fk, 'ann': a, 'label': self.label}] 192 | """ 193 | 
self.read_semehr_anns_by_functions(doc_anns, 194 | get_sent_func=lambda dd: utils.load_json_data(dd)['sentences'], 195 | get_text_func=lambda dd: self._df[self._df['doc_id'] == dd]['text'].iloc[0], 196 | container=container) 197 | 198 | def read_semehr_anns_by_functions(self, doc_anns, get_sent_func, get_text_func, container): 199 | cur_d = None 200 | cur_sents = None 201 | for da in doc_anns: 202 | d = 'se_ann_%s.json' % da['d'] 203 | if d != cur_d: 204 | cur_sents = get_sent_func(join(self._ann_folder, d)) 205 | cur_d = d 206 | a = da['ann'] 207 | ch = self._cui2icd[a.cui] 208 | sent = MentionPattern.get_sent_by_pos(cur_sents, a.start, a.end) 209 | win = get_text_func(da['d'])[sent['start']:sent['end']] 210 | container.append( 211 | {'ch': ch, 'd': da['d'], 's': a.start, 'e': a.end, 's_s': sent['start'], 's_e': sent['end'], 212 | 'win': win}) 213 | 214 | def abstract_ann_pattern(self, ann): 215 | abss = AbstractedSentence(2) 216 | abss.text = ann['win'] 217 | result = abss.get_abstaction_by_pos(ann['e'] - ann['s_s'], self._nlp) 218 | if result is not None: 219 | # abss.get_related_tokens(result.token) 220 | ptn = result.do_abstract_waterfall(ann['s'] - ann['s_s'], ann['e'] - ann['s_s']) 221 | return {'pattern': ptn, "subject": result.subject, "vcontect": result.vcontext} 222 | else: 223 | return None 224 | 225 | def classify_anns(self, anns): 226 | preds = [] 227 | for ann in anns: 228 | ret = self.abstract_ann_pattern(ann) 229 | if ret is not None: 230 | good_ref = self._good_ptns 231 | bad_ref = self._bad_ptns 232 | if not self._in_action: 233 | good_ref, bad_ref = MentionPattern.load_ref_patterns(self._ptn_folder, ann['ch']) 234 | good_match = MentionPattern.compute_similar_from_ref(ret, good_ref, self._nlp) 235 | bad_match = MentionPattern.compute_similar_from_ref(ret, bad_ref, self._nlp) 236 | # ctx = '|'.join([e[0] for e in ret['pattern']]) 237 | cls = MentionPattern.classify_by_pattern_matches(good_match, bad_match, self._nlp) 238 | preds.append(cls) 239 | else: 240 | preds.append(-1) 241 | return preds 242 | 243 | def predict(self, doc_anns, cr=None): 244 | anns = [] 245 | if cr is None: 246 | self.read_semehr_anns(doc_anns, anns) 247 | else: 248 | # single document anns to be read by CustomisedRecoginiser 249 | self.read_semehr_anns_by_functions(doc_anns, get_sent_func=lambda dd: cr.sentences, 250 | get_text_func=lambda dd:cr.full_text, container=anns) 251 | return self.classify_anns(anns) 252 | 253 | @staticmethod 254 | def load_patterns(ptn_folder, to_load=lambda f: True): 255 | return [utils.load_json_data(join(ptn_folder, f)) for f in listdir(ptn_folder) if 256 | to_load(f) and isfile(join(ptn_folder, f))] 257 | 258 | @staticmethod 259 | def sim_seqs(s1, s2, nlp, last_k=2): 260 | scores = 0.0 261 | k = min(last_k, len(s1), len(s2)) 262 | for i in range(1, k + 1): 263 | t1, t2 = nlp(' '.join([s1[-1 * i], s2[-1 * i]])) 264 | if t1.vector_norm > 0 and t2.vector_norm > 0: 265 | scores += t1.similarity(t2) 266 | return scores / k 267 | 268 | @staticmethod 269 | def get_pattern_group(p): 270 | mp = p if len(p) <= 2 else p[-2:] 271 | return '-'.join([e[2] for e in mp]) 272 | 273 | @staticmethod 274 | def compute_similar_from_ref(ret, ref_good_ptns, nlp, threshold=0.7): 275 | p = ret['pattern'] 276 | ctxt = '|'.join([e[0] for e in p]) 277 | # print('>>>working on %s' % ctxt) 278 | if len(ctxt) == 0: 279 | return None 280 | grp = MentionPattern.get_pattern_group(p) 281 | entried_scores = [] 282 | for ref_ptn in ref_good_ptns: 283 | if grp in ref_ptn: 284 | for inst in ref_ptn[grp]: 
285 | score = MentionPattern.sim_seqs([e[0] for e in p], ref_ptn[grp][inst]['list'], nlp) 286 | if score > threshold: 287 | entried_scores.append((score, ref_ptn[grp][inst]['freq'])) 288 | # print('\tvs %s: score %s, %s' % (inst, score, ref_good_ptns[grp][inst]['freq'])) 289 | if len(entried_scores) > 0: 290 | total = sum([s[0] * s[1] for s in entried_scores]) 291 | supports = sum([s[1] for s in entried_scores]) 292 | avg_score = total / supports 293 | # print('\tscore %s, support %s, %s|%s' % (avg_score, supports, ret['subject'], ret['vcontect'])) 294 | return {'score': avg_score, 'supports': supports, 'subject': [t.text for t in ret['subject']], 295 | 'context': [t.text for t in ret['vcontect']]} 296 | else: 297 | return None 298 | 299 | @staticmethod 300 | def classify_by_pattern_matches(good_match, bad_match, nlp, 301 | bad_subjs=None, 302 | bad_context=None): 303 | if bad_context is None: 304 | bad_context = ['not', 'mistakenly', 'likely', 'ie'] 305 | if bad_subjs is None: 306 | bad_subjs = ['son', 'daughter', 'manager', 'wife', 'I', 'one', 'anyone', "questions", 307 | "someone", "child", "neighbour", "invesitigation", "screening", 308 | "assessment"] 309 | if good_match is None and bad_match is None: 310 | return -1 311 | if good_match is None: 312 | return 0 313 | # elif bad_match is None: 314 | # return 1 315 | else: 316 | sub = good_match['subject'] 317 | ctx = good_match['context'] 318 | if MentionPattern.lists_sim_enough(sub, bad_subjs, nlp) == 1: 319 | return 0 320 | if MentionPattern.lists_sim_enough(ctx, bad_context, nlp) == 1: 321 | return 0 322 | # return -1 323 | if bad_match is None: 324 | return 1 325 | else: 326 | return 1 if good_match['score'] * good_match['supports'] >= bad_match['score'] * bad_match[ 327 | 'supports'] else 0 328 | 329 | @staticmethod 330 | def lists_sim_enough(l1, l2, nlp, threshold=0.8): 331 | if len(l1) == 0 or len(l2) == 0: 332 | return -1 333 | d1 = nlp(' '.join(l1)) 334 | d2 = nlp(' '.join(l2)) 335 | for t1 in d1: 336 | for t2 in d2: 337 | if t1.similarity(t2) > threshold: 338 | return 1 339 | return 0 340 | 341 | 342 | _nlp_lg = None 343 | 344 | 345 | def get_nlp_lg(): 346 | global _nlp_lg 347 | if _nlp_lg is None: 348 | _nlp_lg = spacy.load('en_core_web_lg') 349 | return _nlp_lg 350 | -------------------------------------------------------------------------------- /neg-tumour-dt-learnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CogStack/nlp2phenome/795fb4e14c4d19e02f7352351ab34b679aeb2432/neg-tumour-dt-learnt.png -------------------------------------------------------------------------------- /nlp_to_phenome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | nlp2phenome 6 | using AI models to infer patient phenotypes from identified named entities (instances of biomedical concepts) 7 | """ 8 | import utils 9 | from os.path import basename, isfile, join 10 | from os import listdir 11 | import json 12 | import logging 13 | from LabelModel import LabelModel 14 | import mention_pattern as mp 15 | from annotation_docs import SemEHRAnnDoc, CustomisedRecoginiser, Concept2Mapping 16 | from EDI_ann_doc import EDIRDoc, ConllDoc, eHostDoc 17 | from learners import LabelPerformance, PhenomeLearners 18 | 19 | 20 | class StrokeSettings(object): 21 | """ 22 | json based configuration setting 23 | """ 24 | 25 | def __init__(self, setting_file): 26 | self._file = setting_file 27 | 
self._setting = {} 28 | self.load() 29 | 30 | def load(self): 31 | self._setting = utils.load_json_data(self._file) 32 | 33 | @property 34 | def settings(self): 35 | return self._setting 36 | 37 | 38 | def extract_doc_level_ann(ann_dump, output_folder): 39 | """ 40 | 41 | extract doc level annotations and save to separate files 42 | :param ann_dump: 43 | :param output_folder: 44 | :return: 45 | """ 46 | lines = utils.read_text_file(ann_dump) 47 | for l in lines: 48 | doc_ann = json.loads(l) 49 | utils.save_string(l, join(output_folder, doc_ann['docId'].split('.')[0] + '.json')) 50 | 51 | 52 | def extract_all_doc_anns(dump_folder, output_folder): 53 | dumps = [f for f in listdir(dump_folder) if isfile(join(dump_folder, f))] 54 | for d in dumps: 55 | extract_doc_level_ann(join(dump_folder, d), output_folder) 56 | 57 | 58 | def save_full_text(xml_file, output_dir): 59 | """ 60 | recover full text from Informatics' xml format 61 | :param xml_file: 62 | :param output_dir: 63 | :return: 64 | """ 65 | if not isfile(xml_file): 66 | return 67 | ed = EDIRDoc(xml_file) 68 | fn = basename(xml_file) 69 | name = fn.replace(r'-ann.xml', '.txt') 70 | logging.info('%s processed to be %s' % (fn, name)) 71 | utils.save_string(ed.get_full_text, join(output_dir, name)) 72 | 73 | 74 | def process_files(read_dir, write_dir): 75 | utils.multi_thread_process_files(read_dir, file_extension='xml', num_threads=10, 76 | process_func=save_full_text, args=[write_dir]) 77 | 78 | 79 | def get_doc_level_inference(label_dir, ann_dir, file_key, type2insts, type2inst_2, t2missed): 80 | """ 81 | learn concept to label inference from gold standard - i.e. querying SemEHR annotations to 82 | draw conclusions 83 | :param label_dir: 84 | :param ann_dir: 85 | :param file_key: 86 | :param type2insts: 87 | :param type2inst_2: 88 | :return: 89 | """ 90 | label_file = '%s-ann.xml' % file_key 91 | ann_file = '%s.json' % file_key 92 | logging.info('working on %s' % join(label_dir, label_file)) 93 | ed = EDIRDoc(join(label_dir, label_file)) 94 | if not isfile(join(label_dir, label_file)): 95 | print('not a file: %s' % join(label_dir, label_file)) 96 | return 97 | sd = SemEHRAnnDoc(join(ann_dir, ann_file)) 98 | sd.learn_mappings_from_labelled(ed, type2insts, t2missed) 99 | 100 | 101 | def learn_concept_mappings(output_lst_folder): 102 | type2insts = {} 103 | type2insts_2 = {} 104 | label_dir = _gold_dir 105 | ann_dir = _ann_dir 106 | file_keys = [f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))] 107 | t2missed = {} 108 | for fk in file_keys: 109 | get_doc_level_inference(label_dir, 110 | ann_dir, 111 | fk, 112 | type2insts, 113 | type2insts_2, 114 | t2missed) 115 | for t in type2insts: 116 | type2insts[t] = list(type2insts[t]) 117 | logging.info(json.dumps(type2insts)) 118 | 119 | s = '\n' * 2 120 | for t in type2insts_2: 121 | type2insts_2[t] = list(type2insts_2[t]) 122 | s += json.dumps(type2insts_2) 123 | 124 | s += '\n' * 2 125 | labels = [] 126 | defs = [] 127 | for t in t2missed: 128 | t2missed[t] = list(set(t2missed[t])) 129 | utils.save_string('\n'.join(t2missed[t]) + '\n', join(output_lst_folder, t + '.lst')) 130 | labels += [l.lower() for l in t2missed[t]] 131 | defs.append(t + '.lst' + ':StrokeStudy:' + t) 132 | s += '\n' * 2 133 | s += '\n'.join(defs) 134 | s += json.dumps(t2missed) 135 | logging.info(s) 136 | 137 | 138 | def learn_prediction_model(label, ann_dir=None, gold_dir=None, model_file=None, model_dir=None, 139 | ml_model_file_ptn=None, 140 | pca_dim=None, 141 | pca_model_file=None, 142 | 
max_dimension=None, 143 | ignore_mappings=[], 144 | viz_file=None, ignore_context=False, separate_by_label=False, full_text_dir=None, 145 | eHostGD=False): 146 | model_changed = False 147 | if model_file is not None: 148 | lm = LabelModel.deserialise(model_file) 149 | else: 150 | model_changed = True 151 | lm = LabelModel(label, _cm_obj) 152 | lm.collect_tfidf_dimensions(ann_dir=ann_dir, gold_dir=gold_dir, ignore_context=ignore_context, 153 | separate_by_label=separate_by_label, full_text_dir=full_text_dir, eHostGD=eHostGD) 154 | lm.use_one_dimension_for_label = False 155 | lm.max_dimensions = max_dimension 156 | if ann_dir is not None: 157 | # bad_lables = lm.get_low_quality_labels(ann_dir, gold_dir) 158 | # logging.info(bad_lables) 159 | bad_lables = [] 160 | data = lm.load_data(ann_dir, gold_dir, ignore_mappings=bad_lables, ignore_context=ignore_context, 161 | separate_by_label=separate_by_label, ful_text_dir=full_text_dir, eHostGD=eHostGD, 162 | annotated_anns=_annotated_anns) 163 | # if separate_by_label: 164 | for lbl in data['lbl2data']: 165 | X = data['lbl2data'][lbl]['X'] 166 | Y = data['lbl2data'][lbl]['Y'] 167 | n_true = 0 168 | for y in Y: 169 | if y == [1]: 170 | n_true += 1 171 | logging.debug('training data: %s, dimensions %s, insts %s' % (lbl, len(X[0]), len(X))) 172 | if len(X) <= _min_sample_size: 173 | lm.add_rare_label(lbl, n_true * 1.0 / len(X)) 174 | continue 175 | # ignore_mappings += data['bad_labels'] 176 | PhenomeLearners.random_forest_learning(X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl)) 177 | # lm.svm_learning(X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl)) 178 | # lm.gaussian_nb(X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl)) 179 | logging.debug('%s, #insts: %s, #tps: %s' % (lbl, len(X), n_true)) 180 | 181 | if model_dir is not None and model_changed: 182 | lm.serialise(join(model_dir, '%s.lm' % label)) 183 | logging.debug('%s.lm saved' % label) 184 | 185 | 186 | def predict_label(model_file, test_ann_dir, test_gold_dir, ml_model_file_ptn, performance, 187 | pca_model_file=None, 188 | max_dimension=None, 189 | ignore_mappings=[], 190 | ignore_context=False, 191 | separate_by_label=False, 192 | full_text_dir=None, 193 | file_pattern='%s-ann.xml', 194 | id2conll=None, 195 | label_whitelist=None, 196 | eHostGD=False, mention_pattern=None): 197 | lm = LabelModel.deserialise(model_file) 198 | lm.max_dimensions = max_dimension 199 | data = lm.load_data(test_ann_dir, test_gold_dir, ignore_mappings=ignore_mappings, ignore_context=ignore_context, 200 | separate_by_label=separate_by_label, verbose=False, ful_text_dir=full_text_dir, eHostGD=eHostGD, 201 | annotated_anns=_annotated_anns) 202 | 203 | files = data['files'] 204 | for d in files: 205 | d = d.replace('se_ann_', '') 206 | if d not in id2conll: 207 | id2conll[d] = ConllDoc(join(test_gold_dir, file_pattern % d)) 208 | if label_whitelist is not None: 209 | id2conll[d].set_label_white_list(label_whitelist) 210 | lbl2performances = {} 211 | for lbl in data['lbl2data']: 212 | this_performance = LabelPerformance(lbl) 213 | X = data['lbl2data'][lbl]['X'] 214 | Y = data['lbl2data'][lbl]['Y'] 215 | mtp = data['lbl2data'][lbl]['multiple_tps'] 216 | doc_anns = data['lbl2data'][lbl]['doc_anns'] 217 | mp_predicted = None 218 | if mention_pattern is not None: 219 | mp_predicted = mention_pattern.predict(doc_anns) 220 | if lbl in lm.rare_labels: 221 | logging.info('%s to be predicted using %s' % (lbl, lm.rare_labels[lbl])) 222 | 
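            # labels whose training sets had no more than _min_sample_size instances were registered
            # as rare labels during training (lm.add_rare_label stores the positive rate seen there);
            # for these, prediction uses PhenomeLearners.predict_use_simple_stats instead of the
            # per-label model learnt in learn_prediction_model().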
PhenomeLearners.predict_use_simple_stats( 223 | lm.rare_labels[lbl], Y, mtp, 224 | performance, separate_performance=this_performance, 225 | id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern, 226 | doc_folder=test_gold_dir, 227 | label_whitelist=label_whitelist, mp_predicted=mp_predicted 228 | ) 229 | else: 230 | if len(X) > 0: 231 | logging.debug('predict data: %s, dimensions %s, insts %s' % (lbl, len(X[0]), len(X))) 232 | bc = lm.get_binary_cluster_classifier(lbl) 233 | if bc is not None: 234 | complementary_classifiers = [] 235 | for l in lm.cluster_classifier_dict: 236 | if l != lbl: 237 | complementary_classifiers.append(lm.cluster_classifier_dict[l]) 238 | for idx in range(len(X)): 239 | logging.debug( 240 | '%s => %s' % (bc.classify(X[idx], complementary_classifiers=complementary_classifiers), Y[idx])) 241 | PhenomeLearners.predict_use_model(X, Y, 0, mtp, ml_model_file_ptn % escape_lable_to_filename(lbl), 242 | performance, 243 | pca_model_file=pca_model_file, 244 | separate_performance=this_performance, 245 | id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern, 246 | doc_folder=test_gold_dir, 247 | label_whitelist=label_whitelist, mp_predicted=mp_predicted) 248 | lbl2performances[lbl] = this_performance 249 | perform_str = CustomisedRecoginiser.print_performances(lbl2performances) 250 | logging.debug('missed instances: %s' % data['fns']) 251 | performance.increase_false_negative(data['fns']) 252 | return perform_str 253 | 254 | 255 | def escape_lable_to_filename(s): 256 | return s.replace('\\', '_').replace('/', '_') 257 | 258 | 259 | def populate_semehr_results(label_dir, ann_dir, file_key, 260 | label2performances, using_combined=False): 261 | label_file = '%s-ann.xml' % file_key 262 | ann_file = '%s.json' % file_key 263 | print(join(label_dir, label_file)) 264 | if not isfile(join(label_dir, label_file)): 265 | return 266 | 267 | ed = EDIRDoc(join(label_dir, label_file)) 268 | cm = Concept2Mapping(_concept_mapping) 269 | cr = CustomisedRecoginiser(join(ann_dir, ann_file), cm) 270 | if using_combined: 271 | cr.validate_combined_performance(ed.get_ess_entities(), label2performances) 272 | else: 273 | cr.validate_mapped_performance(ed.get_ess_entities(), label2performances) 274 | 275 | 276 | def populate_validation_results(): 277 | label_dir = _gold_dir 278 | ann_dir = _ann_dir 279 | 280 | label2performances = {} 281 | file_keys = [f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))] 282 | for fk in file_keys: 283 | populate_semehr_results(label_dir, ann_dir, fk, label2performances, using_combined=False) 284 | CustomisedRecoginiser.print_performances(label2performances) 285 | 286 | 287 | def do_learn_exp(viz_file, num_dimensions=[20], ignore_context=False, separate_by_label=False, conll_output_file=None, 288 | eHostGD=False, mention_pattern=None): 289 | results = {} 290 | id2conll = {} 291 | result_str = '' 292 | for lbl in _labels: 293 | logging.info('working on [%s]' % lbl) 294 | _learning_model_file = _learning_model_dir + '/%s.lm' % lbl 295 | _ml_model_file_ptn = _learning_model_dir + '/' + lbl + '_%s_DT.model' 296 | _pca_model_file = None 297 | pca_dim = None 298 | max_dimensions = num_dimensions 299 | 300 | t = lbl.replace('neg_', '') 301 | ignore_mappings = _ignore_mappings[t] if t in _ignore_mappings else [] 302 | # remove previous model files logging.debug('removing previously learnt models...') for f in [f for f in 303 | # listdir(_learning_model_dir) if isfile(join(_learning_model_dir, f)) and f.endswith('.model')]: remove( 304 
| # join(_learning_model_dir, f)) 305 | for dim in max_dimensions: 306 | logging.info('dimension setting: %s' % dim) 307 | learn_prediction_model(lbl, 308 | ann_dir=_ann_dir, 309 | gold_dir=_gold_dir, 310 | ml_model_file_ptn=_ml_model_file_ptn, 311 | model_dir=_learning_model_dir, 312 | pca_dim=pca_dim, 313 | pca_model_file=_pca_model_file, 314 | max_dimension=dim, 315 | ignore_mappings=ignore_mappings, 316 | viz_file=viz_file, 317 | ignore_context=ignore_context, 318 | separate_by_label=separate_by_label, 319 | full_text_dir=_gold_text_dir, 320 | eHostGD=eHostGD) 321 | logging.debug('bad labels: %s' % ignore_mappings) 322 | pl = '%s dim[%s]' % (lbl, dim) 323 | performance = LabelPerformance(pl) 324 | results[pl] = performance 325 | predict_label(_learning_model_file, 326 | _test_ann_dir, 327 | _test_gold_dir, 328 | _ml_model_file_ptn, 329 | performance, 330 | pca_model_file=_pca_model_file, 331 | max_dimension=dim, 332 | ignore_mappings=ignore_mappings, 333 | ignore_context=ignore_context, 334 | separate_by_label=separate_by_label, 335 | full_text_dir=_test_text_dir, 336 | file_pattern=_gold_file_pattern, 337 | id2conll=id2conll, 338 | label_whitelist=_labels, 339 | eHostGD=eHostGD, mention_pattern=mention_pattern) 340 | result_str = CustomisedRecoginiser.print_performances(results) 341 | return result_str 342 | 343 | 344 | def save_text_files(xml_dir, text_dr): 345 | process_files(xml_dir, text_dr) 346 | 347 | 348 | def extact_doc_anns(semoutput_dir, doc_ann_dir): 349 | extract_all_doc_anns(semoutput_dir, 350 | doc_ann_dir) 351 | 352 | 353 | def merge_mappings_dictionary(map_files, dict_dirs, new_map_file, new_dict_folder): 354 | maps = [utils.load_json_data(mf) for mf in map_files] 355 | new_m = {} 356 | for m in maps: 357 | new_m.update(m) 358 | t2list = {} 359 | for dd in dict_dirs: 360 | lst_files = [f for f in listdir(dd) if isfile(join(dd, f)) and f.endswith('.lst')] 361 | for f in lst_files: 362 | t = f[:f.index('.')] 363 | labels = utils.read_text_file(join(dd, f)) 364 | if t not in t2list: 365 | t2list[t] = set() 366 | for l in labels: 367 | if len(l) > 0: 368 | t2list[t].add(l) 369 | utils.save_json_array(new_m, new_map_file) 370 | logging.info('mapping saved to %s' % new_map_file) 371 | for t in t2list: 372 | utils.save_string('\n'.join(list(t2list[t])) + '\n', join(new_dict_folder, t + '.lst')) 373 | logging.info('%s.lst saved' % t) 374 | logging.info('all done') 375 | 376 | 377 | def test_eHost_doc(): 378 | d = eHostDoc('/Users/honghan.wu/Desktop/ehost_sample.xml') 379 | print([(e.label, e.start, e.end, e.str) for e in d.get_ess_entities()]) 380 | 381 | 382 | def run_learning_v0(): 383 | log_level = 'DEBUG' 384 | log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s' 385 | logging.basicConfig(level='DEBUG', format=log_format) 386 | log_file = './settings/processing.log' 387 | logging.basicConfig(level=log_level, format=log_format) 388 | ss = StrokeSettings('./settings/settings.json') 389 | settings = ss.settings 390 | global _min_sample_size, _ann_dir, _gold_dir, _test_ann_dir, _test_gold_dir, _gold_text_dir, _test_text_dir, _concept_mapping, _learning_model_dir 391 | global _labels, _gold_file_pattern, _ignore_mappings, _eHostGD, _cm_obj 392 | global _annotated_anns 393 | _annotated_anns = {} 394 | if 'annotated_anns' in settings['annotated_anns_file']: 395 | _annotated_anns = utils.load_json_data(settings['annotated_anns_file']) 396 | _min_sample_size = settings['min_sample_size'] 397 | _ann_dir = settings['ann_dir'] 398 | _gold_dir = 
settings['gold_dir'] 399 | _test_ann_dir = settings['test_ann_dir'] 400 | _test_gold_dir = settings['test_gold_dir'] 401 | _gold_text_dir = settings['dev_full_text_dir'] 402 | _test_text_dir = settings['test_fulltext_dir'] 403 | _concept_mapping = settings['concept_mapping_file'] 404 | _learning_model_dir = settings['learning_model_dir'] 405 | _labels = utils.read_text_file(settings['entity_types_file']) 406 | _gold_file_pattern = "%s_ann.xml" if 'gold_file_pattern' not in settings else settings['gold_file_pattern'] 407 | _ignore_mappings = utils.load_json_data(settings['ignore_mapping_file']) 408 | _eHostGD = settings['eHostGD'] if 'eHostGD' in settings else False 409 | _cm_obj = Concept2Mapping(_concept_mapping) 410 | 411 | mp_inst = mp.MentionPattern(settings['pattern_folder'], _cm_obj.cui2label, 412 | csv_file=settings['csv_file'], ann_folder=_test_ann_dir) 413 | return do_learn_exp(settings['viz_file'], 414 | num_dimensions=[50], 415 | ignore_context=settings['ignore_context'] if 'ignore_context' in settings else False, 416 | separate_by_label=True, 417 | conll_output_file=settings['conll_output_file'], eHostGD=_eHostGD, mention_pattern=mp_inst) 418 | 419 | 420 | def run_learning( 421 | train_ann_dir, train_gold_dir, train_text_dir, 422 | test_ann_dir, test_gold_dir, test_text_dir, 423 | settings): 424 | log_level = 'DEBUG' 425 | log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s' 426 | logging.basicConfig(level='DEBUG', format=log_format) 427 | log_file = './settings/processing.log' 428 | logging.basicConfig(level=log_level, format=log_format) 429 | global _min_sample_size, _ann_dir, _gold_dir, _test_ann_dir, _test_gold_dir, _gold_text_dir, _test_text_dir, _concept_mapping, _learning_model_dir 430 | global _labels, _gold_file_pattern, _ignore_mappings, _eHostGD, _cm_obj 431 | global _annotated_anns 432 | _annotated_anns = {} 433 | _min_sample_size = settings['min_sample_size'] 434 | _ann_dir = train_ann_dir 435 | _gold_dir = train_gold_dir 436 | _test_ann_dir = test_ann_dir 437 | _test_gold_dir = test_gold_dir 438 | _gold_text_dir = train_text_dir 439 | _test_text_dir = test_text_dir 440 | _concept_mapping = settings['concept_mapping_file'] 441 | _learning_model_dir = settings['learning_model_dir'] 442 | _labels = utils.read_text_file(settings['entity_types_file']) 443 | _gold_file_pattern = "%s_ann.xml" if 'gold_file_pattern' not in settings else settings['gold_file_pattern'] 444 | _ignore_mappings = utils.load_json_data(settings['ignore_mapping_file']) 445 | _eHostGD = settings['eHostGD'] if 'eHostGD' in settings else False 446 | _cm_obj = Concept2Mapping(_concept_mapping) 447 | 448 | # not using mention patterns for prediction as this is only a in-development feature 449 | mp_inst = None 450 | return do_learn_exp(settings['viz_file'], 451 | num_dimensions=[50], 452 | ignore_context=settings['ignore_context'] if 'ignore_context' in settings else False, 453 | separate_by_label=True, 454 | conll_output_file=settings['conll_output_file'], eHostGD=_eHostGD, mention_pattern=mp_inst) 455 | 456 | 457 | if __name__ == "__main__": 458 | log_level = 'DEBUG' 459 | log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s' 460 | logging.basicConfig(level='DEBUG', format=log_format) 461 | log_file = './settings/processing.log' 462 | logging.basicConfig(level=log_level, format=log_format) 463 | # _cm_obj.load_gaz_dir(settings['concept_gaz_dir']) 464 | 465 | # 0. 
merging mapping & dictionaries 466 | # merge_mappings_dictionary(['/afs/inf.ed.ac.uk/group/project/biomedTM/users/hwu/tayside_concept_mapping.json', 467 | # '/afs/inf.ed.ac.uk/group/project/biomedTM/users/hwu/concept_mapping.json'], 468 | # ['/Users/honghan.wu/Documents/working/SemEHR-Working/toolkits/bio-yodie-1-2-1/finalize/tayside_gazetteer', 469 | # '/Users/honghan.wu/Documents/working/SemEHR-Working/toolkits/bio-yodie-1-2-1/finalize/ess_gazetteer'], 470 | # '/afs/inf.ed.ac.uk/group/project/biomedTM/users/hwu/merged_concept_mapping.json', 471 | # '/Users/honghan.wu/Documents/working/SemEHR-Working/toolkits/bio-yodie-1-2-1/finalize/merged_gzetteer') 472 | 473 | # 1. extract text files for annotation 474 | # save_text_files(settings['gold_dir'], settings['dev_full_text_dir']) 475 | # 2. run SemEHR on the text files 476 | # 3. extract doc anns into separate files from dumped JSON files 477 | # extact_doc_anns(settings['test_semehr_output_dir'], 478 | # settings['test_ann_dir']) 479 | # 4. learn umls concept to phenotype mappping 480 | # learn_concept_mappings(settings['gazetteer_dir']) 481 | # 5. learn phenotype inference 482 | -------------------------------------------------------------------------------- /predict_helper.py: -------------------------------------------------------------------------------- 1 | from nlp_to_phenome import StrokeSettings, Concept2Mapping, escape_lable_to_filename 2 | from LabelModel import LabelModel, CustomisedRecoginiser 3 | from annotation_docs import PhenotypeAnn 4 | from learners import PhenomeLearners 5 | import utils 6 | import logging 7 | from os.path import join 8 | from ann_converter import AnnConverter 9 | from os import listdir 10 | from os.path import isfile, exists 11 | import sys 12 | 13 | 14 | def predict(settings): 15 | ann_dir = settings['test_ann_dir'] 16 | test_text_dir = settings['test_fulltext_dir'] 17 | _concept_mapping = settings['concept_mapping_file'] 18 | _learning_model_dir = settings['learning_model_dir'] 19 | _labels = utils.read_text_file(settings['entity_types_file']) 20 | ignore_mappings = utils.load_json_data(settings['ignore_mapping_file']) 21 | _cm_obj = Concept2Mapping(_concept_mapping) 22 | 23 | doc2predicted = {} 24 | no_models_labels = [] 25 | for phenotype in _labels: 26 | logging.info('working on [%s]' % phenotype) 27 | _learning_model_file = _learning_model_dir + '/%s.lm' % phenotype 28 | 29 | if not exists(_learning_model_file): 30 | # if previous learnt model not exists, skip 31 | no_models_labels.append(phenotype) 32 | continue 33 | 34 | _ml_model_file_ptn = _learning_model_dir + '/' + phenotype + '_%s_DT.model' 35 | 36 | lm = LabelModel.deserialise(_learning_model_file) 37 | # pass the concept2mapping object to the label model instance 38 | lm.concept_mapping = _cm_obj 39 | lm.max_dimensions = 30 40 | data = lm.load_data_for_predict( 41 | ann_dir=ann_dir, 42 | ignore_mappings=ignore_mappings, ignore_context=True, 43 | separate_by_label=True, 44 | full_text_dir=test_text_dir) 45 | for lbl in data['lbl2data']: 46 | X = data['lbl2data'][lbl]['X'] 47 | logging.debug(X) 48 | doc_anns = data['lbl2data'][lbl]['doc_anns'] 49 | label_model_predict(lm, _ml_model_file_ptn, data['lbl2data'], doc2predicted) 50 | return doc2predicted, no_models_labels 51 | 52 | 53 | def label_model_predict(lm, model_file_pattern, lbl2data, doc2predicted, 54 | mention_pattern=None, mention_prediction_param=None): 55 | for lbl in lbl2data: 56 | mp_predicted = None 57 | if mention_pattern is not None: 58 | mp_predicted = 
mention_pattern.predict(lbl2data[lbl]['doc_anns'], cr=mention_prediction_param) 59 | X = lbl2data[lbl]['X'] 60 | doc_anns = lbl2data[lbl]['doc_anns'] 61 | if lbl in lm.rare_labels: 62 | logging.info('%s to be predicted using %s' % (lbl, lm.rare_labels[lbl])) 63 | PhenomeLearners.predict_use_simple_stats_in_action(lm.rare_labels[lbl], 64 | item_size=len(X), 65 | doc2predicted=doc2predicted, 66 | doc_anns=doc_anns, 67 | mp_predicted=mp_predicted) 68 | else: 69 | if len(X) > 0: 70 | logging.debug('%s, dimensions %s' % (lbl, len(X[0]))) 71 | PhenomeLearners.predict_use_model_in_action(X, model_file=model_file_pattern % escape_lable_to_filename(lbl), 72 | pca_model_file=None, 73 | doc2predicted=doc2predicted, 74 | doc_anns=doc_anns, 75 | mp_predicted=mp_predicted) 76 | 77 | 78 | def hybrid_prediciton(settings): 79 | d2p, labels2work = predict(settings) 80 | ann_dir = settings['test_ann_dir'] 81 | test_text_dir = settings['test_fulltext_dir'] 82 | _concept_mapping = settings['concept_mapping_file'] 83 | _learning_model_dir = settings['learning_model_dir'] 84 | _labels = utils.read_text_file(settings['entity_types_file']) 85 | ignore_mappings = utils.load_json_data(settings['ignore_mapping_file']) 86 | _cm_obj = Concept2Mapping(_concept_mapping) 87 | file_keys = [f[:f.rfind('.')].replace('se_ann_', '') for f in listdir(ann_dir) if isfile(join(ann_dir, f))] 88 | logging.info('labels to use direct nlp prediction: [%s]' % labels2work) 89 | 90 | # convert SemEHRAnn to PhenotypeAnn 91 | doc2predicted = {} 92 | for d in d2p: 93 | for t in d2p[d]: 94 | ann = t['ann'] 95 | if hasattr(ann, 'cui'): 96 | lbl = _cm_obj.concept2label[ann.cui][0] 97 | pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end, ann.negation, ann.temporality, ann.experiencer, 98 | 'StudyName', lbl) 99 | put_ann_label(lbl, pheAnn, doc2predicted, d) 100 | else: 101 | put_ann_label(ann.minor_type, ann, doc2predicted, d) 102 | for fk in file_keys: 103 | cr = CustomisedRecoginiser(join(ann_dir, 'se_ann_%s.json' % fk), _concept_mapping) 104 | d = fk 105 | for ann in cr.annotations: 106 | if ann.cui in _cm_obj.concept2label: 107 | lbl = _cm_obj.concept2label[ann.cui][0] 108 | if lbl in labels2work: 109 | pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end, ann.negation, ann.temporality, ann.experiencer, 110 | 'StudyName', lbl) 111 | put_ann_label(lbl, pheAnn, doc2predicted, d) 112 | for ann in cr.phenotypes: 113 | if ann.minor_type in labels2work: 114 | put_ann_label(ann.minor_type, ann, doc2predicted, d) 115 | return doc2predicted 116 | 117 | 118 | def direct_nlp_prediction(settings): 119 | ann_dir = settings['test_ann_dir'] 120 | test_text_dir = settings['test_fulltext_dir'] 121 | _concept_mapping = settings['concept_mapping_file'] 122 | _learning_model_dir = settings['learning_model_dir'] 123 | _labels = utils.read_text_file(settings['entity_types_file']) 124 | ignore_mappings = utils.load_json_data(settings['ignore_mapping_file']) 125 | _cm_obj = Concept2Mapping(_concept_mapping) 126 | file_keys = [f[:f.rfind('.')].replace('se_ann_', '') for f in listdir(ann_dir) if isfile(join(ann_dir, f))] 127 | doc2predicted = {} 128 | for fk in file_keys: 129 | cr = CustomisedRecoginiser(join(ann_dir, 'se_ann_%s.json' % fk), _concept_mapping) 130 | d = fk 131 | for ann in cr.annotations: 132 | if ann.cui in _cm_obj.concept2label: 133 | lbl = _cm_obj.concept2label[ann.cui][0] 134 | pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end, ann.negation, ann.temporality, ann.experiencer, 135 | 'StudyName', lbl) 136 | if ann.negation != 'Affirmed' or 
len(ann.ruled_by) > 0:
137 |                     continue
138 |                 put_ann_label(lbl, pheAnn, doc2predicted, d)
139 |         for ann in cr.phenotypes:
140 |             put_ann_label(ann.minor_type, ann, doc2predicted, d)
141 |     return doc2predicted
142 | 
143 | 
144 | def put_ann_label(lbl, pheAnn, doc2predicted, d):
145 |     labeled_ann = {'label': lbl,
146 |                    'ann': pheAnn}
147 |     if d not in doc2predicted:
148 |         doc2predicted[d] = [labeled_ann]
149 |     else:
150 |         doc2predicted[d].append(labeled_ann)
151 | 
152 | 
153 | def output_eHOST_format(doc2predicted, output_folder):
154 |     for d in doc2predicted:
155 |         xml = AnnConverter.to_eHOST(d, doc2predicted[d])
156 |         utils.save_string(str(xml), join(output_folder, '%s.txt.knowtator.xml' % d))
157 | 
158 | 
159 | def predict_to_eHOST_results(predict_setting):
160 |     ss = StrokeSettings(predict_setting)
161 |     if 'predict_mode' in ss.settings and ss.settings['predict_mode'] == 'direct_nlp':
162 |         logging.info('predicting with direct nlp...')
163 |         predicted_results = direct_nlp_prediction(ss.settings)
164 |     elif 'predict_mode' in ss.settings and ss.settings['predict_mode'] == 'hybrid':
165 |         predicted_results = hybrid_prediciton(ss.settings)
166 |     else:
167 |         logging.info('predicting...')
168 |         predicted_results, _ = predict(ss.settings)  # predict() also returns the labels that have no learnt model
169 |     output_eHOST_format(predicted_results, ss.settings['output_folder'])
170 |     logging.info('results saved to %s' % ss.settings['output_folder'])
171 |     if 'output_file' in ss.settings:
172 |         d2ann = {}
173 |         for d in predicted_results:
174 |             d2ann[d] = [{'label': t['label'], 'ann': t['ann'].to_dict()} for t in predicted_results[d]]
175 |         utils.save_json_array(d2ann, ss.settings['output_file'])
176 | 
177 | 
178 | if __name__ == "__main__":
179 |     logging.basicConfig(level='DEBUG', format='[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s')
180 |     # predict_to_eHOST_results('./settings/prediction_task_direct.json')
181 |     if len(sys.argv) != 2:
182 |         print('the syntax is [python predict_helper.py PROCESS_SETTINGS_FILE_PATH]')
183 |     else:
184 |         predict_to_eHOST_results(sys.argv[1])
--------------------------------------------------------------------------------
/pretrained_models/stroke_settings.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/nlp2phenome/795fb4e14c4d19e02f7352351ab34b679aeb2432/pretrained_models/stroke_settings.zip
--------------------------------------------------------------------------------
/pretrained_models/stroke_subtype_models.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/nlp2phenome/795fb4e14c4d19e02f7352351ab34b679aeb2432/pretrained_models/stroke_subtype_models.zip
--------------------------------------------------------------------------------
/pretrained_models/stroke_supplemental-gazetteer.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/nlp2phenome/795fb4e14c4d19e02f7352351ab34b679aeb2432/pretrained_models/stroke_supplemental-gazetteer.zip
--------------------------------------------------------------------------------
/reportreader.py:
--------------------------------------------------------------------------------
1 | from annotation_docs import SemEHRAnnDoc, BasicAnn
2 | import logging
3 | from os.path import isfile, join
4 | from os import listdir
5 | import spacy
6 | 
7 | _spacy_nlp = None
8 | 
9 | 
10 | def get_nlp_instance():
11 |     global _spacy_nlp
12 |     if _spacy_nlp is None:
13 | 
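        # the spaCy model is not listed in requirements.txt and needs to be installed separately,
        # e.g. python -m spacy download en_core_web_sm
        # (mention_pattern.py similarly expects en_core_web_lg)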
_spacy_nlp = spacy.load("en_core_web_sm") 14 | return _spacy_nlp 15 | 16 | 17 | def get_sentences_as_anns(nlp, text): 18 | doc = nlp(text) 19 | anns = [] 20 | for s in doc.sents: 21 | anns.append(BasicAnn(s.text, s.start_char, s.end_char)) 22 | return anns 23 | 24 | 25 | class AbstractedSentence(object): 26 | def __init__(self, seq): 27 | self._seq = 0 28 | self._abstracted_tokens = [] 29 | self._text = None 30 | self._parsed = None 31 | 32 | @property 33 | def seq(self): 34 | return self._seq 35 | 36 | @seq.setter 37 | def seq(self, value): 38 | self._seq = value 39 | 40 | def add_token(self, t): 41 | self._abstracted_tokens.append(t) 42 | 43 | @property 44 | def tokens(self): 45 | return self._abstracted_tokens 46 | 47 | @property 48 | def text(self): 49 | return self._text 50 | 51 | @text.setter 52 | def text(self, value): 53 | self._text = value 54 | 55 | def get_parsed_tree(self, nlp): 56 | """ 57 | use spacy instance to parse the sentence 58 | :param nlp: a spacy instance 59 | :return: dependency tree 60 | """ 61 | if self._parsed is not None: 62 | return self._parsed 63 | if self.text is None: 64 | return None 65 | self._parsed = nlp(self.text) 66 | return self._parsed 67 | 68 | def locate_pos(self, str): 69 | return self._text.find(str) 70 | 71 | def get_abstaction_by_pos(self, pos, nlp): 72 | doc = self.get_parsed_tree(nlp) 73 | token = None 74 | if doc is not None: 75 | for t in doc: 76 | if t.idx == pos: 77 | token = t 78 | if token is not None: 79 | ta = TokenAbstraction(token, doc) 80 | else: 81 | return None 82 | return ta 83 | 84 | 85 | class TokenAbstraction(object): 86 | def __init__(self, token, doc): 87 | self._t = token 88 | self._d = doc 89 | self._children = [] 90 | self._root = None 91 | self._subject = None 92 | self._verbs = None 93 | self.do_abstract() 94 | 95 | @property 96 | def children(self): 97 | return self._children 98 | 99 | @property 100 | def root(self): 101 | return self._root 102 | 103 | @property 104 | def subject(self): 105 | return self._subject 106 | 107 | @property 108 | def verbs(self): 109 | return self._verbs 110 | 111 | def do_abstract(self): 112 | self._children = [t for t in self._t.children] 113 | t = self._t 114 | r = t 115 | while (t.head != t) and t.pos_ != u"VERB": 116 | t = t.head 117 | r = t 118 | if t is not None: 119 | self._verbs = [v for v in t.children if v.pos_ == u"VERB"] 120 | self._subject = [s for s in t.children if s.dep_ == u"nsubj"] 121 | self._root = r 122 | 123 | def to_dict(self): 124 | return {'children': [t.text for t in self.children], 'root': self.root.text, 'subject': [s.text for s in self.subject], 'verbs': [v.text for v in self.verbs]} 125 | 126 | 127 | class ReportAbstractor(SemEHRAnnDoc): 128 | def __init__(self, ann_file): 129 | super(ReportAbstractor, self).__init__(ann_file) 130 | self._abstracted_sents = [] 131 | 132 | def get_abstracted_sents(self): 133 | seq = 0 134 | for s in self.sentences: 135 | a_sent = AbstractedSentence(seq) 136 | seq += 1 137 | anns = sorted(self.annotations, key=lambda x: x.start) 138 | for a in anns: 139 | if a.overlap(s): 140 | a_sent.add_token('%s%s[%s]' % ("%s: " % a.negation if a.negation == "Negated" else "", a.str, a.sty)) 141 | self._abstracted_sents.append(a_sent) 142 | logging.debug(a_sent.tokens) 143 | 144 | 145 | def test(): 146 | ann_dir = 'C:/Users/hwu33/Downloads/working/semehr-runtime/radiology-reports/semehr_results/' 147 | files = [f for f in listdir(ann_dir) if isfile(join(ann_dir, f))] 148 | for f in files: 149 | logging.debug('%s' % f) 150 | ra = 
ReportAbstractor(join(ann_dir, f)) 151 | ra.get_abstracted_sents() 152 | logging.debug('\n') 153 | 154 | 155 | def test_spacy(): 156 | nlp = spacy.load("en_core_web_sm") 157 | doc = nlp(u"She said he might be getting better soon.") 158 | for token in doc: 159 | print(token.text, token.pos_, token.dep_, token.head.text, token.head.pos_, 160 | [child for child in token.children], token.idx, token.shape_) 161 | 162 | 163 | def test_abstract_sentence(): 164 | nlp = get_nlp_instance() 165 | abss = AbstractedSentence(1) 166 | abss.text = u"She said he might be getting better soon" 167 | result = abss.get_abstaction_by_pos(29, nlp) 168 | if result is not None: 169 | print(result.root, result.children, result.verbs, result.subject) 170 | 171 | 172 | def test_sentences(): 173 | nlp = get_nlp_instance() 174 | sents = get_sentences_as_anns(nlp, u""" 175 | Circumstances leading to assessment. 176 | Over the past week ZZZZZ. 177 | """) 178 | print([s.serialise_json() for s in sents]) 179 | 180 | 181 | if __name__ == "__main__": 182 | logging.basicConfig(level='DEBUG', format='[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s') 183 | # test_spacy() 184 | # test_abstract_sentence() 185 | test_sentences() 186 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | blis==0.7.3 2 | catalogue==1.0.0 3 | certifi==2020.11.8 4 | chardet==3.0.4 5 | cymem==2.0.4 6 | graphviz==0.15 7 | idna==2.10 8 | importlib-metadata==3.1.0 9 | joblib==0.17.0 10 | murmurhash==1.0.4 11 | numpy==1.19.4 12 | pandas==1.1.4 13 | plac==1.1.3 14 | preshed==3.0.4 15 | python-dateutil==2.8.1 16 | pytz==2020.4 17 | requests==2.25.0 18 | scikit-learn==0.23.2 19 | scipy==1.5.4 20 | six==1.15.0 21 | sklearn==0.0 22 | spacy==2.3.4 23 | srsly==1.0.4 24 | thinc==7.4.3 25 | threadpoolctl==2.1.0 26 | tqdm==4.54.0 27 | urllib3==1.26.2 28 | wasabi==0.8.0 29 | zipp==3.4.0 30 | -------------------------------------------------------------------------------- /run_learning.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from sklearn.model_selection import KFold 3 | from os import listdir, makedirs 4 | from os.path import isfile, join, isdir 5 | import shutil 6 | from nlp_to_phenome import run_learning 7 | import utils 8 | import logging 9 | 10 | 11 | def run_kfold_learning(settings): 12 | corpus_folder = settings['corpus_folder'] 13 | semehr_folder = settings['semehr_folder'] 14 | gold_folder = settings['gold_folder'] 15 | working_folder = settings['working_folder'] 16 | kf = KFold(n_splits=settings["kfold"]) 17 | files = [f for f in listdir(corpus_folder) if isfile(join(corpus_folder, f))] 18 | k = 0 19 | for train_idx, test_idx in kf.split(files): 20 | reset_folder(working_folder) 21 | # copy files 22 | train_ann_dir = join(working_folder, 'ann') 23 | train_gold_dir = join(working_folder, 'gold') 24 | train_text_dir = join(working_folder, 'train_corpus') 25 | test_ann_dir = join(working_folder, 'test_ann') 26 | test_gold_dir = join(working_folder, 'test_gold') 27 | test_text_dir = join(working_folder, 'test_corpus') 28 | 29 | for idx in train_idx: 30 | shutil.copy(join(corpus_folder, files[idx]), join(train_text_dir, files[idx])) 31 | ann_file = 'se_ann_%s.json' % files[idx].replace('.txt', '') 32 | gold_file = '%s.knowtator.xml' % files[idx] 33 | shutil.copy(join(semehr_folder, ann_file), join(train_ann_dir, ann_file)) 34 | shutil.copy(join(gold_folder, 
gold_file), join(train_gold_dir, gold_file)) 35 | 36 | for idx in test_idx: 37 | shutil.copy(join(corpus_folder, files[idx]), join(test_text_dir, files[idx])) 38 | ann_file = 'se_ann_%s.json' % files[idx].replace('.txt', '') 39 | gold_file = '%s.knowtator.xml' % files[idx] 40 | shutil.copy(join(semehr_folder, ann_file), join(test_ann_dir, ann_file)) 41 | shutil.copy(join(gold_folder, gold_file), join(test_gold_dir, gold_file)) 42 | performance = run_learning(train_ann_dir, train_gold_dir, train_text_dir, 43 | test_ann_dir, test_gold_dir, test_text_dir, 44 | settings) 45 | utils.save_string(performance, join(working_folder, 'folder_%s_perf.tsv' % k)) 46 | k += 1 47 | logging.info('round %s done' % k) 48 | 49 | 50 | def reset_folder(working_folder): 51 | # clear working folder 52 | for d in listdir(working_folder): 53 | if isdir(join(working_folder, d)): 54 | shutil.rmtree(join(working_folder, d)) 55 | 56 | train_ann_dir = join(working_folder, 'ann') 57 | train_gold_dir = join(working_folder, 'gold') 58 | train_text_dir = join(working_folder, 'train_corpus') 59 | test_ann_dir = join(working_folder, 'test_ann') 60 | test_gold_dir = join(working_folder, 'test_gold') 61 | test_text_dir = join(working_folder, 'test_corpus') 62 | learning_model_dir = join(working_folder, 'models') 63 | makedirs(train_ann_dir) 64 | makedirs(train_gold_dir) 65 | makedirs(train_text_dir) 66 | makedirs(test_ann_dir) 67 | makedirs(test_gold_dir) 68 | makedirs(test_text_dir) 69 | makedirs(learning_model_dir) 70 | 71 | 72 | def run_it(learnging_config_file): 73 | settings = utils.load_json_data(learnging_config_file) 74 | run_kfold_learning(settings) 75 | 76 | 77 | if __name__ == "__main__": 78 | run_it() 79 | if len(sys.argv) != 2: 80 | print('the syntax is [python run_it.py LEARNING_SETTINGS_FILE_PATH]') 81 | else: 82 | run_it(sys.argv[1]) -------------------------------------------------------------------------------- /settings/concept_mapping_stroke_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "loc_deep": [ 3 | "C2949882\tRight lentiform nucleus\tBody Part, Organ, or Organ Component", 4 | "C1548801\tExternal\tBody Location or Region", 5 | "C0305578\tcentrum\tPharmacologic Substance", 6 | "C0740279\tCerebellar atrophy\tDisease or Syndrome", 7 | "C0007759\tCortex Cerebelli\tBody Part, Organ, or Organ Component", 8 | "C0007765\tCerebellar\tBody Part, Organ, or Organ Component", 9 | "C0039729\tThalamus\tBody Part, Organ, or Organ Component", 10 | "C2334778\tRight internal capsule\tBody Part, Organ, or Organ Component", 11 | "C0546019\tLeft basal ganglia\tBody Part, Organ, or Organ Component", 12 | "C0241970\tLACUNE\tAcquired Abnormality", 13 | "C0737244\tCorona radiata\tBody Part, Organ, or Organ Component", 14 | "C2328150\tRight thalamus\tBody Part, Organ, or Organ Component", 15 | "C3178801\tLacunar Stroke\tDisease or Syndrome", 16 | "C0164707\tEPI\tPharmacologic Substance", 17 | "C0149854\tCerebellar hemorrhage\tPathologic Function", 18 | "C0152341\tCapsula Interna\tBody Part, Organ, or Organ Component", 19 | "C2330009\tAnterior limb of left internal capsule\tBody Part, Organ, or Organ Component", 20 | "C0007776\tCortex\tBody Part, Organ, or Organ Component", 21 | "C1389280\tBasal ganglia calcification\tPathologic Function", 22 | "C2951935\tRight side of pons\tBody Part, Organ, or Organ Component", 23 | "C0228465\tCerebellar hemisphere\tBody Part, Organ, or Organ Component", 24 | "C0004781\tBasal Nuclei\tBody Part, Organ, or Organ Component", 25 | 
"C0032639\tPontes\tBody Part, Organ, or Organ Component", 26 | "C0871456\tSubcortical lesions\tDisease or Syndrome", 27 | "C0162342\tLentiform Nuclei\tBody Part, Organ, or Organ Component", 28 | "C0017067\tGanglia\tBody Part, Organ, or Organ Component", 29 | "C2339807\tLeft internal capsule\tBody Part, Organ, or Organ Component", 30 | "C0815275\tsubcortical\tBody Location or Region", 31 | "C0228515\tCerebellar Peduncle\tBody Part, Organ, or Organ Component", 32 | "C0545733\tvertebrobasilar\tBody Part, Organ, or Organ Component", 33 | "C0333559\tLacunar Infarct\tDisease or Syndrome", 34 | "C0006121\tBrainstem\tBody Part, Organ, or Organ Component", 35 | "C2340044\tLeft thalamus\tBody Part, Organ, or Organ Component", 36 | "C0228291\tBasal ganglia and capsules\tBody Part, Organ, or Organ Component", 37 | "C2330887\tRight external capsule\tBody Part, Organ, or Organ Component", 38 | "C0006104\tBrains\tBody Part, Organ, or Organ Component", 39 | "C2950746\tLeft lentiform nucleus\tBody Part, Organ, or Organ Component", 40 | "C0010090\tCorpus Callosum\tBody Part, Organ, or Organ Component", 41 | "C2338227\tLeft external capsule\tBody Part, Organ, or Organ Component", 42 | "C0018827\tVentricle\tBody Part, Organ, or Organ Component", 43 | "C2337761\tLobe of cerebellum\tBody Part, Organ, or Organ Component", 44 | "C0228181\tCentrum ovale\tBody Part, Organ, or Organ Component", 45 | "C0152321\tGenu corpus callosi\tBody Part, Organ, or Organ Component", 46 | "C1446220\tBasal ganglion stroke\tPathologic Function", 47 | "C1116439\tPosterior fossa\tBody Part, Organ, or Organ Component", 48 | "C0025462\tMidbrain\tBody Part, Organ, or Organ Component", 49 | "C0152345\tCapsula Externa\tBody Part, Organ, or Organ Component", 50 | "C0546018\tRight basal ganglia\tBody Part, Organ, or Organ Component" 51 | ], 52 | "atrophy": [ 53 | "C1114365\tAge\tClinical Attribute", 54 | "C0740279\tCerebellar atrophy\tDisease or Syndrome", 55 | "C1265891\tFocal atrophy\tPathologic Function", 56 | "C0333641\tAtrophy\tPathologic Function", 57 | "C2700258\tVolume\tLaboratory Procedure", 58 | "C3151195\tCerebral and cerebellar atrophy\tFinding", 59 | "C0598275\tDiffuse cerebral atrophy\tFinding", 60 | "C3273211\tAge-Related Atrophy\tPathologic Function", 61 | "C0006104\tBrains\tBody Part, Organ, or Organ Component", 62 | "C0235946\tBrain atrophy\tDisease or Syndrome" 63 | ], 64 | "stroke": [ 65 | "C0018944\tHematoma\tPathologic Function", 66 | "C0005847\tVessel\tBody Part, Organ, or Organ Component", 67 | "C0487602\tStaining\tLaboratory Procedure", 68 | "C4019010\tEvent\tClinical Attribute", 69 | "C0751956\tAcute Stroke\tDisease or Syndrome", 70 | "C0038454\tStroke\tDisease or Syndrome" 71 | ], 72 | "metast_tumour": [ 73 | "C0555278\tCerebral metastases\tNeoplastic Process", 74 | "C0221198\tLesion\tFinding", 75 | "C0027627\tMetastases\tNeoplastic Process", 76 | "C0233520\tDestructive\tIndividual Behavior" 77 | ], 78 | "time_recent": [ 79 | "C1279919\tEarlier\tTemporal Concept", 80 | "C0333276\tacute bleed\tPathologic Function", 81 | "C0332185\tRecent\tTemporal Concept", 82 | "C1578513\tnew\tFinding", 83 | "C1513491\tMost Recent\tTemporal Concept", 84 | "C0439588\tacute chronic\tTemporal Concept", 85 | "C0333548\tAcute infarct\tPathologic Function", 86 | "C0333277\tSubacute bleeding\tPathologic Function", 87 | "C0751956\tAcute Stroke\tDisease or Syndrome", 88 | "C0205365\tSubacute\tTemporal Concept", 89 | "C0205178\tAcuteness\tTemporal Concept", 90 | "C0333560\tEvolving infarct\tPathologic Function" 91 | ], 92 | 
"subdural_haematoma": [ 93 | "C0018944\tHematoma\tPathologic Function", 94 | "C0038541\tSubdural\tBody Space or Junction", 95 | "C0018946\tSubdural Hematoma\tPathologic Function", 96 | "C0749095\tChronic Subdural Hematoma\tPathologic Function", 97 | "C0019080\tBleeding\tPathologic Function" 98 | ], 99 | "ischaemic_stroke": [ 100 | "C0031001\tPerfusion\tTherapeutic or Preventive Procedure", 101 | "C0149566\tSylvian artery\tBody Part, Organ, or Organ Component", 102 | "C0585229\tMultiple lacunar infarcts\tDisease or Syndrome", 103 | "C3178801\tLacunar Stroke\tDisease or Syndrome", 104 | "C0017639\tGlioses\tPathologic Function", 105 | "C0164707\tEPI\tPharmacologic Substance", 106 | "C0001365\tCva\tDisease or Syndrome", 107 | "C1165245\tPacis\tPharmacologic Substance", 108 | "C0948008\tIschemic stroke\tDisease or Syndrome", 109 | "C0740392\tINFARCT MCA\tDisease or Syndrome", 110 | "C0038454\tStroke\tDisease or Syndrome", 111 | "C1446220\tBasal ganglion stroke\tPathologic Function", 112 | "C0333560\tEvolving infarct\tPathologic Function", 113 | "C4019010\tEvent\tClinical Attribute", 114 | "C0022116\tIschemia\tPathologic Function", 115 | "C0752132\tINFARCT PCA\tDisease or Syndrome", 116 | "C0333559\tLacunar Infarct\tDisease or Syndrome", 117 | "C0007785\tCerebral infarct\tDisease or Syndrome", 118 | "C1511938\tDifferentiation\tClinical Attribute", 119 | "C0021308\tInfarct\tPathologic Function", 120 | "C0333551\tOld infarct\tPathologic Function", 121 | "C0241970\tLACUNE\tAcquired Abnormality", 122 | "C0585629\tLeft sided cerebral infarction\tPathologic Function", 123 | "C0333548\tAcute infarct\tPathologic Function", 124 | "C0333542\tFocal infarct\tPathologic Function", 125 | "C0751587\tCADASIL\tDisease or Syndrome" 126 | ], 127 | "haemorrhagic_transformation": [ 128 | "C1510411\tTransformed\tPathologic Function" 129 | ], 130 | "microhaemorrhage": [ 131 | "C0019080\tBleeding\tPathologic Function", 132 | "C0859253\tMicrohaemorrhage\tPathologic Function", 133 | "C2750293\tMicrobleeds\tFinding" 134 | ], 135 | "subarachnoid_haemorrhage": [ 136 | "C0005767\tSanguis\tTissue", 137 | "C0038525\tSubarachnoid Hemorrhage\tDisease or Syndrome" 138 | ], 139 | "tumour": [ 140 | "C1333071\tClival Chordoma\tNeoplastic Process", 141 | "C0270614\tIntracranial lump\tFinding", 142 | "C2931822\tCancer of Nasopharynx\tNeoplastic Process", 143 | "C0027651\tTumors\tNeoplastic Process", 144 | "C0577559\tA mass\tFinding", 145 | "C0001430\tAdenoma\tNeoplastic Process", 146 | "C0346308\tPituitary macroadenoma\tNeoplastic Process", 147 | "C0342419\tPituitary mass\tAnatomical Abnormality", 148 | "C0032000\tPituitary Adenoma\tNeoplastic Process", 149 | "C0746408\tMass/lesion\tNeoplastic Process", 150 | "C0221198\tLesion\tFinding", 151 | "C0871456\tSubcortical lesions\tDisease or Syndrome", 152 | "C0457193\tSoft tissue mass\tAnatomical Abnormality", 153 | "C0024299\tLymphoma\tNeoplastic Process", 154 | "C2752009\tWhite matter lesions\tFinding", 155 | "C0746405\tcystic mass\tDisease or Syndrome" 156 | ], 157 | "small_vessel_disease": [ 158 | "C0228157\tPeriventricular white matter\tBody Part, Organ, or Organ Component", 159 | "C1114365\tAge\tClinical Attribute", 160 | "C0010957\tDamage\tInjury or Poisoning", 161 | "C1843516\tDilated perivascular spaces\tFinding", 162 | "C0815275\tsubcortical\tBody Location or Region", 163 | "C0022116\tIschemia\tPathologic Function", 164 | "C0042373\tAngiopathy\tDisease or Syndrome", 165 | "C1282841\tSmall vessels\tBody Part, Organ, or Organ Component", 166 | "C0152295\tCerebral White 
Matter\tTissue", 167 | "C2733158\tCerebral Microangiopathy\tDisease or Syndrome", 168 | "C1833300\tWhite matter changes\tFinding", 169 | "C0221198\tLesion\tFinding", 170 | "C0225988\tSmall vessel\tBody Part, Organ, or Organ Component", 171 | "C0012634\tDisease\tDisease or Syndrome", 172 | "C0682708\tWhite Matter\tTissue", 173 | "C0270612\tLeukoencephalopathy\tDisease or Syndrome", 174 | "C0006104\tBrains\tBody Part, Organ, or Organ Component", 175 | "C1853380\tPeriventricular white matter changes\tFinding" 176 | ], 177 | "glioma_tumour": [ 178 | "C0017636\tGlioblastoma\tNeoplastic Process", 179 | "C1997217\tLow grade glioma\tNeoplastic Process", 180 | "C0028945\tOligodendroglioma\tNeoplastic Process", 181 | "C0017638\tGlioma\tNeoplastic Process", 182 | "C0334583\tPiloid astrocytoma\tNeoplastic Process", 183 | "C0004114\tAstrocytoma\tNeoplastic Process" 184 | ], 185 | "loc_cortical": [ 186 | "C0030560\tParietal Lobe\tBody Part, Organ, or Organ Component", 187 | "C0228218\tRight occipital lobe\tBody Part, Organ, or Organ Component", 188 | "C0003842\tArtery\tBody Part, Organ, or Organ Component", 189 | "C0230010\tTemporal Fossa\tBody Space or Junction", 190 | "C0228193\tRight frontal lobe\tBody Part, Organ, or Organ Component", 191 | "C0751438\tPosterior\tDisease or Syndrome", 192 | "C0039484\tOs temporale\tBody Part, Organ, or Organ Component", 193 | "C0016733\tFrontal Lobe\tBody Part, Organ, or Organ Component", 194 | "C0078944\tIV PCA\tTherapeutic or Preventive Procedure", 195 | "C0226213\tRight middle cerebral artery\tBody Part, Organ, or Organ Component", 196 | "C0149566\tSylvian artery\tBody Part, Organ, or Organ Component", 197 | "C3495441\tMotor Strip\tBody Location or Region", 198 | "C0742901\tcraniotomy temporal\tTherapeutic or Preventive Procedure", 199 | "C0039452\tCerebrum\tBody Part, Organ, or Organ Component", 200 | "C2331118\tRight insula\tBody Part, Organ, or Organ Component", 201 | "C0226214\tLeft middle cerebral artery\tBody Part, Organ, or Organ Component", 202 | "C1165245\tPacis\tPharmacologic Substance", 203 | "C0751437\tAnterior\tDisease or Syndrome", 204 | "C0152299\tPrecentral Gyrus\tBody Part, Organ, or Organ Component", 205 | "C0016732\tOs frontale\tBody Part, Organ, or Organ Component", 206 | "C0007776\tCortex\tBody Part, Organ, or Organ Component", 207 | "C0031873\tPica Disease\tMental or Behavioral Dysfunction", 208 | "C0228207\tRight parietal lobe\tBody Part, Organ, or Organ Component", 209 | "C0228208\tLeft parietal lobe\tBody Part, Organ, or Organ Component", 210 | "C0459388\tFrontal Sulcus\tBody Part, Organ, or Organ Component", 211 | "C0028784\tOs occipitale\tBody Part, Organ, or Organ Component", 212 | "C0740392\tINFARCT MCA\tDisease or Syndrome", 213 | "C0149561\tAnterior Cerebral Artery\tBody Part, Organ, or Organ Component", 214 | "C0228219\tLeft occipital lobe\tBody Part, Organ, or Organ Component", 215 | "C0152302\tPostcentral Gyrus\tBody Part, Organ, or Organ Component", 216 | "C1184145\tOccipital\tBody Location or Region", 217 | "C0226247\tRight posterior cerebral artery\tBody Part, Organ, or Organ Component", 218 | "C0152283\tTemporal Horn\tBody Part, Organ, or Organ Component", 219 | "C0028785\tOccipital Lobe\tBody Part, Organ, or Organ Component", 220 | "C2339924\tRight insular cortex\tBody Part, Organ, or Organ Component", 221 | "C0796494\tLobe\tBody Part, Organ, or Organ Component", 222 | "C2362314\tTemporal\tTemporal Concept", 223 | "C0228194\tLeft frontal lobe\tBody Part, Organ, or Organ Component", 224 | "C0152296\tMarginal Gyrus\tBody Part, 
Organ, or Organ Component", 225 | "C0748512\tOccipital Scalp\tBody Location or Region", 226 | "C0030625\tPCA\tLaboratory Procedure", 227 | "C0228233\tLeft temporal lobe\tBody Part, Organ, or Organ Component", 228 | "C0752132\tINFARCT PCA\tDisease or Syndrome", 229 | "C0149554\tFRONTAL HORN\tBody Part, Organ, or Organ Component", 230 | "C0149576\tArteria cerebri posterior\tBody Part, Organ, or Organ Component", 231 | "C0597434\tsensory cortex\tBody Part, Organ, or Organ Component", 232 | "C0228202\tPremotor Area\tBody Part, Organ, or Organ Component", 233 | "C0021640\tInsula\tBody Part, Organ, or Organ Component", 234 | "C0228232\tRight temporal lobe\tBody Part, Organ, or Organ Component", 235 | "C3496562\tcortical white matter\tBody Part, Organ, or Organ Component", 236 | "C0039485\tTemporal Lobe\tBody Part, Organ, or Organ Component", 237 | "C0235946\tBrain atrophy\tDisease or Syndrome", 238 | "C3496378\tparietal white matter\tBody Part, Organ, or Organ Component", 239 | "C0272451\tParietal fracture\tInjury or Poisoning" 240 | ], 241 | "haemorrhagic_stroke": [ 242 | "C0018944\tHematoma\tPathologic Function", 243 | "C0342406\tPituitary Hemorrhage\tPathologic Function", 244 | "C0333629\tHemosiderin Deposition\tPathologic Function", 245 | "C0333276\tacute bleed\tPathologic Function", 246 | "C3665429\tRecurrent hemorrhage\tPathologic Function", 247 | "C0333277\tSubacute bleeding\tPathologic Function", 248 | "C2937358\tBrain bleeding\tPathologic Function", 249 | "C0019080\tBleeding\tPathologic Function", 250 | "C0456388\tBlood Product\tPharmacologic Substance", 251 | "C0151699\tIntracranial bleed\tPathologic Function", 252 | "C1861265\tNo hemorrhage\tFinding", 253 | "C0149854\tCerebellar hemorrhage\tPathologic Function" 254 | ], 255 | "time_old": [ 256 | "C0333629\tHemosiderin Deposition\tPathologic Function", 257 | "C0749095\tChronic Subdural Hematoma\tPathologic Function", 258 | "C0205156\tFormer\tTemporal Concept", 259 | "C0332152\tBefore\tTemporal Concept", 260 | "C0439588\tacute chronic\tTemporal Concept", 261 | "C0580836\tOld\tTemporal Concept", 262 | "C0205191\tChronic\tTemporal Concept", 263 | "C0333551\tOld infarct\tPathologic Function", 264 | "C3714811\tResolved\tFinding" 265 | ], 266 | "mening_tumour": [ 267 | "C0025286\tMengioma\tNeoplastic Process" 268 | ] 269 | } -------------------------------------------------------------------------------- /settings/entity_types_phenotypes_stroke_sample.txt: -------------------------------------------------------------------------------- 1 | atrophy 2 | glioma_tumour 3 | haemorrhagic_stroke 4 | haemorrhagic_transformation 5 | ischaemic_stroke 6 | mening_tumour 7 | metast_tumour 8 | microhaemorrhage 9 | small_vessel_disease 10 | stroke 11 | subarachnoid_haemorrhage 12 | subdural_haematoma 13 | tumour 14 | intracerebral_haemorrhage 15 | intracranial_haemorrhage 16 | bleeding 17 | aneurysm -------------------------------------------------------------------------------- /settings/ignore_mappings_stroke_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "stroke": [ 3 | "C0018944", 4 | "haematoma", 5 | "C4019010", 6 | "C0005847" 7 | ], 8 | "microhaemorrhage": [ 9 | "C0019080" 10 | ], 11 | "atrophy": [ 12 | "C1114365", 13 | "C0006104", 14 | "C2700258" 15 | ], 16 | "subarachnoid_haemorrhage": [ 17 | "C0005767" 18 | ], 19 | "subdural_haematoma": [ 20 | "C0019080", 21 | "C0038541", 22 | "C0018944", 23 | "haematoma", 24 | "collections" 25 | ], 26 | "small_vessel_disease": [ 27 | "C1114365", 28 | 
"C0006104", 29 | "C0022116", 30 | "C0682708", 31 | "C0221198", 32 | "C0228157", 33 | "white matter" 34 | ], 35 | "tumour": [ 36 | "C0577559", 37 | "C0221198", 38 | "lesions", 39 | "lesion", 40 | "mass" 41 | ], 42 | "loc_deep":[ 43 | "brain", 44 | "C0006104" 45 | ], 46 | "time_old":[ 47 | "C0332152" 48 | ] 49 | } -------------------------------------------------------------------------------- /settings/sample_setting.json: -------------------------------------------------------------------------------- 1 | { 2 | "ann_dir": "U:/semehr-usecases/trajectories/train_anns", 3 | "gold_dir": "U:/semehr-usecases/trajectories/train_gold_anns", 4 | "dev_full_text_dir": "U:/semehr-usecases/trajectories/train_corpus", 5 | "test_semehr_output_dir": "U:/semehr-usecases/trajectories/lia_anns", 6 | "test_ann_dir": "U:/semehr-usecases/trajectories/nadia_anns", 7 | "test_fulltext_dir": "U:/semehr-usecases/trajectories/annotator_Nadia/ehost-mk5.1/2nd_iteration/2nd_iteration_Nadia/corpus", 8 | "test_gold_dir": "U:/semehr-usecases/trajectories/annotator_Nadia/ehost-mk5.1/2nd_iteration/2nd_iteration_Nadia/saved", 9 | "concept_mapping_file": "./settings/empty_concept_mapping.json", 10 | "learning_model_dir": "./models", 11 | "entity_types_file": "./settings/better_worse_entity_types.txt", 12 | "ignore_mapping_file": "./settings/ignore_mapping.json", 13 | "min_sample_size": 25, 14 | "viz_file": "./settings/viz_%s.pdf", 15 | "conll_output_file": "./settings/conll_output_file.txt", 16 | "gold_file_pattern": "%s.txt.knowtator.xml", 17 | "eHostGD": true 18 | } -------------------------------------------------------------------------------- /settings/sample_setting_kfold_learning.json: -------------------------------------------------------------------------------- 1 | { 2 | "kfold": 10, 3 | "corpus_folder": "/data/annotated_data/corpus", 4 | "gold_folder": "/data/annotated_data/gold", 5 | "semehr_folder": "/data/semehr_results", 6 | "working_folder": "/data/learning", 7 | "concept_mapping_file": "./settings/concept_mapping_stroke_sample.json", 8 | "learning_model_dir": "./models", 9 | "entity_types_file": "./settings/entity_types_phenotypes_stroke_sample.txt", 10 | "ignore_mapping_file": "./settings/ignore_mappings_stroke_sample.json", 11 | "min_sample_size": 25, 12 | "viz_file": "./settings/viz_%s.pdf", 13 | "conll_output_file": "./settings/conll_output_file.txt", 14 | "gold_file_pattern": "%s.txt.knowtator.xml", 15 | "eHostGD": true 16 | } -------------------------------------------------------------------------------- /settings/stroke-subtype-rules-full.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "primary haemorrhagic stroke", 4 | "inclusions": [{"phenotype": "subarachnoid_haemorrhage"}], 5 | "exclusion_units": [ 6 | [{"phenotype": "mening_tumour"}], 7 | [{"phenotype": "metast_tumour"}], 8 | [{"phenotype": "tumour"}], 9 | [{"phenotype": "ischaemic_stroke"}], 10 | [{"phenotype": "contusion"}], 11 | [{"phenotype": "trauma"}], 12 | [{"phenotype": "subdural_haematoma"}], 13 | [{"phenotype": "transformation"}] 14 | ] 15 | }, 16 | 17 | { 18 | "label": "primary haemorrhagic stroke", 19 | "inclusions": [{"phenotype": "intracerebral_haemorrhage"}], 20 | "exclusion_units": [ 21 | [{"phenotype": "mening_tumour"}], 22 | [{"phenotype": "metast_tumour"}], 23 | [{"phenotype": "tumour"}], 24 | [ {"phenotype": "ischaemic_stroke"}], 25 | [{"phenotype": "contusion"}], 26 | [{"phenotype": "trauma"}], 27 | [{"phenotype": "subdural_haematoma"}], 28 | [{"phenotype": 
"transformation"}] 29 | ] 30 | }, 31 | 32 | { 33 | "label": "subarachnoid haemorrhage", 34 | "inclusions": [{"phenotype": "subarachnoid_haemorrhage"}], 35 | "exclusion_units": [ 36 | [{"phenotype": "mening_tumour"}], 37 | [{"phenotype": "metast_tumour"}], 38 | [{"phenotype": "tumour"}], 39 | [{"phenotype": "contusion"}], 40 | [{"phenotype": "trauma"}], 41 | [{"phenotype": "subdural_haematoma"}], 42 | [{"phenotype": "transformation"}], 43 | [{"phenotype": "intracerebral_haemorrhage"}] 44 | ] 45 | }, 46 | 47 | { 48 | "label": "intracerebra haemorrhage", 49 | "inclusions": [{"phenotype": "intracerebral_haemorrhage"}], 50 | "exclusion_units": [ 51 | [{"phenotype": "mening_tumour"}], 52 | [{"phenotype": "metast_tumour"}], 53 | [{"phenotype": "tumour"}], 54 | [ {"phenotype": "ischaemic_stroke"}], 55 | [{"phenotype": "contusion"}], 56 | [{"phenotype": "trauma"}], 57 | [{"phenotype": "subdural_haematoma"}], 58 | [{"phenotype": "transformation"}], 59 | [{"phenotype": "subarachnoid_haemorrhage"}, {"phenotype": "aneurysm"}] 60 | ] 61 | }, 62 | 63 | { 64 | "label": "ischaemic stroke", 65 | "inclusions": [{"phenotype": "ischaemic_stroke"}], 66 | "exclusion_units": [ 67 | ] 68 | }, 69 | { 70 | "label": "small_vessel_disease", 71 | "inclusions": [{"phenotype": "small_vessel_disease"}], 72 | "exclusion_units": [ 73 | ] 74 | }, 75 | { 76 | "label": "atrophy", 77 | "inclusions": [{"phenotype": "atrophy"}], 78 | "exclusion_units": [ 79 | ] 80 | }, 81 | { 82 | "label": "atrophy", 83 | "inclusions": [{"phenotype": "atrophy"}], 84 | "exclusion_units": [ 85 | ] 86 | }, 87 | { 88 | "label": "tumour", 89 | "inclusions": [ 90 | {"phenotype": "tumour"}, 91 | {"phenotype": "glioma_tumour"}, 92 | {"phenotype": "mening_tumour"}, 93 | {"phenotype": "metast_tumour"} 94 | ], 95 | "exclusion_units": [ 96 | ] 97 | } 98 | ] 99 | -------------------------------------------------------------------------------- /settings/stroke-subtype-rules.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "label": "primary haemorrhagic stroke", 4 | "inclusions": [{"phenotype": "subarachnoid_haemorrhage"}], 5 | "exclusion_units": [ 6 | [{"phenotype": "mening_tumour"}], 7 | [{"phenotype": "metast_tumour"}], 8 | [{"phenotype": "tumour"}], 9 | [{"phenotype": "ischaemic_stroke"}], 10 | [{"phenotype": "contusion"}], 11 | [{"phenotype": "trauma"}], 12 | [{"phenotype": "subdural_haematoma"}], 13 | [{"phenotype": "transformation"}] 14 | ] 15 | }, 16 | 17 | { 18 | "label": "primary haemorrhagic stroke", 19 | "inclusions": [{"phenotype": "intracerebral_haemorrhage"}], 20 | "exclusion_units": [ 21 | [{"phenotype": "mening_tumour"}], 22 | [{"phenotype": "metast_tumour"}], 23 | [{"phenotype": "tumour"}], 24 | [ {"phenotype": "ischaemic_stroke"}], 25 | [{"phenotype": "contusion"}], 26 | [{"phenotype": "trauma"}], 27 | [{"phenotype": "subdural_haematoma"}], 28 | [{"phenotype": "transformation"}] 29 | ] 30 | }, 31 | 32 | { 33 | "label": "subarachnoid haemorrhage", 34 | "inclusions": [{"phenotype": "subarachnoid_haemorrhage"}], 35 | "exclusion_units": [ 36 | [{"phenotype": "mening_tumour"}], 37 | [{"phenotype": "metast_tumour"}], 38 | [{"phenotype": "tumour"}], 39 | [{"phenotype": "contusion"}], 40 | [{"phenotype": "trauma"}], 41 | [{"phenotype": "subdural_haematoma"}], 42 | [{"phenotype": "transformation"}], 43 | [{"phenotype": "intracerebral_haemorrhage"}] 44 | ] 45 | }, 46 | 47 | { 48 | "label": "intracerebra haemorrhage", 49 | "inclusions": [{"phenotype": "intracerebral_haemorrhage"}], 50 | 
"exclusion_units": [ 51 | [{"phenotype": "mening_tumour"}], 52 | [{"phenotype": "metast_tumour"}], 53 | [{"phenotype": "tumour"}], 54 | [ {"phenotype": "ischaemic_stroke"}], 55 | [{"phenotype": "contusion"}], 56 | [{"phenotype": "trauma"}], 57 | [{"phenotype": "subdural_haematoma"}], 58 | [{"phenotype": "transformation"}], 59 | [{"phenotype": "subarachnoid_haemorrhage"}, {"phenotype": "aneurysm"}] 60 | ] 61 | }, 62 | 63 | { 64 | "label": "ischaemic stroke", 65 | "inclusions": [{"phenotype": "ischaemic_stroke"}], 66 | "exclusion_units": [ 67 | ] 68 | } 69 | ] 70 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from os import listdir, rename 2 | from os.path import isfile, join, split 3 | import queue as Queue 4 | import threading 5 | import json 6 | import codecs 7 | import requests 8 | 9 | 10 | # list files in a folder and put them in to a queue for multi-threading processing 11 | def multi_thread_process_files(dir_path, file_extension, num_threads, process_func, 12 | proc_desc='processed', args=None, multi=None, 13 | file_filter_func=None, callback_func=None, 14 | thread_wise_objs=None): 15 | onlyfiles = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] 16 | num_pdfs = 0 17 | files = None if multi is None else [] 18 | lst = [] 19 | for f in onlyfiles: 20 | if f.endswith('.' + file_extension) if file_filter_func is None \ 21 | else file_filter_func(f): 22 | if multi is None: 23 | lst.append(join(dir_path, f)) 24 | else: 25 | files.append(join(dir_path, f)) 26 | if len(files) >= multi: 27 | lst.append(files) 28 | files = [] 29 | num_pdfs += 1 30 | if files is not None and len(files) > 0: 31 | lst.append(files) 32 | multi_thread_tasking(lst, num_threads, process_func, proc_desc, args, multi, file_filter_func, 33 | callback_func, 34 | thread_wise_objs=thread_wise_objs) 35 | 36 | 37 | def multi_thread_tasking(lst, num_threads, process_func, 38 | proc_desc='processed', args=None, multi=None, 39 | file_filter_func=None, callback_func=None, thread_wise_objs=None, 40 | thread_init_func=None, thread_end_func=None,): 41 | num_pdfs = len(lst) 42 | pdf_queque = Queue.Queue(num_pdfs) 43 | # print('putting list into queue...') 44 | for item in lst: 45 | pdf_queque.put_nowait(item) 46 | thread_num = min(num_pdfs, num_threads) 47 | arr = [process_func] if args is None else [process_func] + args 48 | arr.insert(0, pdf_queque) 49 | # print('queue filled, threading...') 50 | thread_objs = [] 51 | for i in range(thread_num): 52 | tarr = arr[:] 53 | thread_obj = None 54 | if thread_wise_objs is not None and isinstance(thread_wise_objs, list): 55 | thread_obj = thread_wise_objs[i] 56 | if thread_obj is None and thread_init_func is not None: 57 | thread_obj = thread_init_func() 58 | thread_objs.append(thread_obj) 59 | tarr.insert(0, thread_obj) 60 | t = threading.Thread(target=multi_thread_do, args=tuple(tarr)) 61 | t.daemon = True 62 | t.start() 63 | 64 | # print('waiting jobs to finish') 65 | pdf_queque.join() 66 | if thread_end_func is not None: 67 | for to in thread_objs: 68 | if to is not None: 69 | thread_end_func(to) 70 | # print('{0} files {1}'.format(num_pdfs, proc_desc)) 71 | if callback_func is not None: 72 | callback_func(*tuple(args)) 73 | 74 | 75 | def multi_thread_tasking_it(it_lst, num_threads, process_func, 76 | proc_desc='processed', args=None, multi=None, 77 | file_filter_func=None, callback_func=None, thread_wise_objs=None): 78 | pdf_queque = 
Queue.Queue(1000) 79 | thread_num = num_threads 80 | arr = [process_func] if args is None else [process_func] + args 81 | arr.insert(0, pdf_queque) 82 | # print('queue filled, threading...') 83 | for i in range(thread_num): 84 | tarr = arr[:] 85 | thread_obj = None 86 | if thread_wise_objs is not None and isinstance(thread_wise_objs, list): 87 | thread_obj = thread_wise_objs[i] 88 | tarr.insert(0, thread_obj) 89 | t = threading.Thread(target=multi_thread_do, args=tuple(tarr)) 90 | t.daemon = True 91 | t.start() 92 | 93 | # print('waiting jobs to finish') 94 | # print('putting list into queue...') 95 | for item in it_lst: 96 | pdf_queque.put(item) 97 | pdf_queque.join() 98 | # print('{0} files {1}'.format(num_pdfs, proc_desc)) 99 | if callback_func is not None: 100 | callback_func(*tuple(args)) 101 | 102 | 103 | def multi_thread_do(thread_obj, q, func, *args): 104 | while True: 105 | p = q.get() 106 | try: 107 | if thread_obj is not None: 108 | func(thread_obj, p, *args) 109 | else: 110 | func(p, *args) 111 | except Exception as e: 112 | print(u'error doing {0} on {1} \n{2}'.format(func, p, str(e))) 113 | q.task_done() 114 | 115 | 116 | def save_json_array(lst, file_path, encoding='utf-8'): 117 | with codecs.open(file_path, 'w', encoding=encoding) as wf: 118 | json.dump(lst, wf) 119 | 120 | 121 | def save_string(txt, file_path, encoding='utf-8'): 122 | with codecs.open(file_path, 'w', encoding=encoding) as wf: 123 | wf.write(txt) 124 | 125 | 126 | def load_json_data(file_path): 127 | data = None 128 | with codecs.open(file_path, encoding='utf-8') as rf: 129 | data = json.load(rf) 130 | return data 131 | 132 | 133 | def http_post_result(url, payload, headers=None, auth=None): 134 | req = requests.post( 135 | url, headers=headers, 136 | data=payload, auth=auth) 137 | return str(req.content) # req.content.decode("utf-8") 138 | 139 | 140 | def multi_thread_large_file_tasking(large_file, num_threads, process_func, 141 | proc_desc='processed', args=None, multi=None, 142 | file_filter_func=None, callback_func=None, 143 | thread_init_func=None, thread_end_func=None, 144 | file_encoding='utf-8'): 145 | num_queue_size = 1000 146 | pdf_queque = Queue.Queue(num_queue_size) 147 | print('queue filled, threading...') 148 | thread_objs = [] 149 | for i in range(num_threads): 150 | arr = [process_func] if args is None else [process_func] + args 151 | to = None 152 | if thread_init_func is not None: 153 | to = thread_init_func() 154 | thread_objs.append(to) 155 | arr.insert(0, to) 156 | arr.insert(1, pdf_queque) 157 | t = threading.Thread(target=multi_thread_do, args=tuple(arr)) 158 | t.daemon = True 159 | t.start() 160 | 161 | print('putting list into queue...') 162 | num_lines = 0 163 | with codecs.open(large_file, encoding=file_encoding) as lf: 164 | for line in lf: 165 | num_lines += 1 166 | pdf_queque.put(line) 167 | 168 | print('waiting jobs to finish') 169 | pdf_queque.join() 170 | if thread_end_func is not None: 171 | for to in thread_objs: 172 | if to is not None: 173 | thread_end_func(to) 174 | print('{0} lines {1}'.format(num_lines, proc_desc)) 175 | if callback_func is not None: 176 | callback_func(*tuple(args)) 177 | 178 | 179 | def read_text_file(file_path, encoding='utf-8'): 180 | lines = [] 181 | with codecs.open(file_path, encoding=encoding) as rf: 182 | lines += rf.readlines() 183 | return [l.strip() for l in lines] 184 | 185 | 186 | def read_text_file_as_string(file_path, encoding='utf-8'): 187 | s = None 188 | with codecs.open(file_path, encoding=encoding) as rf: 189 | s
= rf.read() 190 | return s 191 | 192 | 193 | def main(): 194 | ann_dir = '/data/annotated_data/gold/' 195 | files = [f for f in listdir(ann_dir) if isfile(join(ann_dir, f))] 196 | for f in files: 197 | rename(join(ann_dir, f), join(ann_dir, f[:-14] + '.txt.knowtator.xml')) 198 | 199 | if __name__ == "__main__": 200 | main() 201 | --------------------------------------------------------------------------------
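Usage note: a minimal sketch of how a caller might drive the multi-threaded helpers in utils.py shown above. The './corpus' path, the 'txt' extension, the thread count of 4 and the count_chars worker are illustrative assumptions for this example, not values taken from the repository.

# usage_sketch.py -- illustrative only; depends on utils.py above
from utils import multi_thread_process_files, read_text_file_as_string

def count_chars(file_path, results):
    # worker function: each thread pulls one file path from the queue and calls this
    results.append((file_path, len(read_text_file_as_string(file_path))))

if __name__ == "__main__":
    results = []
    # scan ./corpus for *.txt files and process them with 4 threads;
    # 'results' is passed through as the extra positional argument to count_chars
    multi_thread_process_files('./corpus', 'txt', 4, count_chars,
                               proc_desc='files counted', args=[results])
    print('{0} files counted'.format(len(results)))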