├── .idea
│   ├── .gitignore
│   ├── misc.xml
│   ├── modules.xml
│   ├── nlp2phenome.iml
│   ├── other.xml
│   └── vcs.xml
├── EDI_ann_doc.py
├── LabelModel.py
├── README.md
├── ann_converter.py
├── ann_utils.py
├── annotation_docs.py
├── data
│   ├── entity_types.txt
│   ├── entity_types_modifiers.txt
│   ├── entity_types_no_context.txt
│   ├── entity_types_phenotypes.txt
│   └── entity_types_times.txt
├── doc_inference.py
├── learners.py
├── mention_pattern.py
├── neg-tumour-dt-learnt.png
├── nlp_to_phenome.py
├── predict_helper.py
├── pretrained_models
│   ├── stroke_settings.zip
│   ├── stroke_subtype_models.zip
│   └── stroke_supplemental-gazetteer.zip
├── reportreader.py
├── requirements.txt
├── run_learning.py
├── settings
│   ├── concept_mapping_stroke_sample.json
│   ├── entity_types_phenotypes_stroke_sample.txt
│   ├── ignore_mappings_stroke_sample.json
│   ├── sample_setting.json
│   ├── sample_setting_kfold_learning.json
│   ├── stroke-subtype-rules-full.json
│   └── stroke-subtype-rules.json
└── utils.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /workspace.xml
--------------------------------------------------------------------------------
/EDI_ann_doc.py:
--------------------------------------------------------------------------------
1 | from annotation_docs import EDIRAnn, relocate_annotation_pos
2 | import logging
3 | from os.path import basename, isfile, join, split
4 | import xml.etree.ElementTree as ET
5 | import re
6 |
7 |
8 | class EDIRDoc(object):
9 | """
10 | a class for reading EDIR annotation doc (XML)
11 | """
12 |
13 | def __init__(self, file_path):
14 | self._path = file_path
15 | self._root = None
16 | self._full_text = None
17 | self._word_offset_start = -1
18 | self._entities = None
19 | self.load()
20 |
21 | @property
22 | def file_path(self):
23 | return self._path
24 |
25 | def load(self):
26 | if not isfile(self.file_path):
27 | logging.debug('%s is NOT a file' % self.file_path)
28 | return
29 | tree = ET.parse(self.file_path)
30 | self._root = tree.getroot()
31 | self.get_word_offset_start()
32 |
33 | @property
34 | def get_full_text(self):
35 | if self._full_text is not None:
36 | return self._full_text
37 | if self._root is None:
38 | self.load()
39 | root = self._root
40 | d = ''
41 | start_offset = -1
42 | for p in root.findall('.//p'):
43 | for s in p:
44 | if 'proc' in s.attrib: # and s.attrib['proc'] == 'yes':
45 | for w in s:
46 | id_val = int(w.attrib['id'][1:])
47 | if start_offset == -1:
48 | start_offset = id_val
49 | offset = id_val - start_offset
50 | d += ' ' * (offset - len(d)) + w.text
51 | self._full_text = d
52 | return d
53 |
54 | def get_word_offset_start(self):
55 | if self._word_offset_start >= 0:
56 | return self._word_offset_start
57 | root = self._root
58 | offset_start = -1
59 | for e in root.findall('.//p/s[@proc]/w'):
60 | if 'id' not in e.attrib:
61 | continue
62 | else:
63 | offset_start = int(e.attrib['id'][1:])
64 | break
65 | # if offset_start == -1:
66 | # logging.debug('%s offset start could not be found' % self.file_path)
67 |         self._word_offset_start = offset_start
68 |         return self._word_offset_start
69 | def get_ess_entities(self):
70 | if self._entities is not None:
71 | return self._entities
72 | root = self._root
73 | offset_start = self.get_word_offset_start()
74 | entities = []
75 | for e in root.findall('.//standoff/ents/ent'):
76 | if 'type' not in e.attrib:
77 | continue
78 | ent_type = e.attrib['type']
79 | if ent_type.startswith('label:'):
80 | continue
81 | negated = False
82 | if 'neg_' in ent_type:
83 | negated = True
84 | ent_type = ent_type.replace(r'neg_', '')
85 | str = ' '.join([part.text for part in e.findall('./parts/part')])
86 | ent_start = -1
87 | ent_end = -1
88 | for part in e.findall('./parts/part'):
89 | ent_start = int(part.attrib['sw'][1:]) - offset_start
90 | ent_end = ent_start + len(part.text)
91 | ann = EDIRAnn(str=str, start=ent_start, end=ent_end, type=ent_type)
92 | ann.negated = negated
93 | ann.id = len(entities)
94 | entities.append(ann)
95 | self._entities = entities
96 | return self._entities
97 |
98 | def relocate_anns(self, t):
99 | if self._entities is None:
100 | return
101 | for a in self._entities:
102 | s, e = relocate_annotation_pos(t, a.start, a.end, a.str)
103 | a.start = s
104 | a.end = e
105 |
106 |
107 | class eHostGenedDoc(EDIRDoc):
108 | def __init__(self, file_path):
109 | super(eHostGenedDoc, self).__init__(file_path)
110 |
111 | def get_ess_entities(self):
112 | if self._entities is not None:
113 | return self._entities
114 | root = self._root
115 | entities = []
116 | s_e_ids = []
117 | for e in root.findall('.//classMention'):
118 | mcs = e.findall('./mentionClass')
119 | mention_id = e.attrib['id']
120 | if len(mcs) > 0:
121 | mc = mcs[0]
122 | cls = mc.attrib['id']
123 | cls = cls.replace('Negated_', '').replace('hypothetical_', '').replace('Other_', '').replace(
124 | 'historical_', '')
125 | mentions = root.findall('.//mention[@id="' + mention_id + '"]/..')
126 | if len(mentions) > 0:
127 | span = mentions[0].findall('./span')
128 | ent_start = span[0].attrib['start']
129 | ent_end = span[0].attrib['end']
130 |
131 | s_e_id = '%s-%s' % (ent_start, ent_end)
132 | if s_e_id in s_e_ids:
133 | continue
134 | s_e_ids.append(s_e_id)
135 |
136 | spannedText = mentions[0].findall('./spannedText')
137 | str = spannedText[0].text
138 | ann = EDIRAnn(str=str, start=int(ent_start), end=int(ent_end), type=cls)
139 | ann.id = len(entities)
140 | entities.append(ann)
141 | self._entities = entities
142 | return self._entities
143 |
144 |
145 | class eHostDoc(EDIRDoc):
146 | def __init__(self, file_path):
147 | super(eHostDoc, self).__init__(file_path)
148 |
149 | def get_ess_entities(self):
150 | if self._entities is not None:
151 | return self._entities
152 | root = self._root
153 | entities = []
154 | for e in root.findall('.//classMention'):
155 | mcs = e.findall('./mentionClass')
156 | mention_id = e.attrib['id']
157 | if len(mcs) > 0:
158 | mc = mcs[0]
159 | m = re.match(r'Verified\_([^\(]+)(\(.*\)){0,1}', mc.attrib['id'])
160 | if m is not None:
161 | cls = m.group(1)
162 | mentions = root.findall('.//mention[@id="' + mention_id + '"]/..')
163 | if len(mentions) > 0:
164 | span = mentions[0].findall('./span')
165 | ent_start = span[0].attrib['start']
166 | ent_end = span[0].attrib['end']
167 | spannedText = mentions[0].findall('./spannedText')
168 | str = spannedText[0].text
169 | ann = EDIRAnn(str=str, start=int(ent_start), end=int(ent_end), type=cls)
170 | ann.id = len(entities)
171 | entities.append(ann)
172 | self._entities = entities
173 | return self._entities
174 |
175 |
176 | class ConllDoc(EDIRDoc):
177 | """
178 |     for CoNLL output from classification results
179 | """
180 |
181 | def __init__(self, file_path):
182 | super(ConllDoc, self).__init__(file_path)
183 | self._tokens = None
184 | self._label_white_list = None
185 |
186 | def set_label_white_list(self, labels):
187 | self._label_white_list = labels
188 |
189 | @property
190 | def conll_output(self):
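        # Produces one space-separated line per token: token, number of predicted labels,
        # gold BIO tag, predicted BIO tag; an illustrative output line: "stroke 1 B-tumour B-tumour".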
191 | try:
192 | return '\n'.join([' '.join([t['t'], str(len(t['predicted_label'])), t['gold_label'],
193 | (('B-' if t['predicted_label'][-1]['ann'].start == t['offset'] else 'I-') +
194 | t['predicted_label'][-1]['label'])
195 | if len(t['predicted_label']) > 0 else 'O'])
196 | for t in self.get_token_list()])
197 | except:
198 | logging.error('processing [%s] failed' % self.file_path)
199 | return ''
200 |
201 | def get_token_list(self):
202 | if self._tokens is not None:
203 | return self._tokens
204 | self._tokens = []
205 | start_offset = -1
206 | root = self._root
207 | work_ess = list(self.get_ess_entities())
208 | matched_ess = set()
209 | for p in root.findall('.//p'):
210 | for s in p:
211 | if 'proc' in s.attrib: # and s.attrib['proc'] == 'yes':
212 | for w in s:
213 | id_val = int(w.attrib['id'][1:])
214 | if start_offset == -1:
215 | start_offset = id_val
216 | offset = id_val - start_offset
217 | token = {'t': w.text, 'id': w.attrib['id'], 'offset': offset,
218 | 'gold_label': 'O', 'predicted_label': []}
219 | for e in work_ess:
220 | label = e.type.replace('neg_', '').lower().strip()
221 | if self._label_white_list is not None and label not in self._label_white_list:
222 | continue
223 | if token['offset'] == e.start:
224 | token['gold_label'] = 'B-' + label
225 | matched_ess.add(e)
226 | elif e.start < token['offset'] < e.end:
227 | token['gold_label'] = 'I-' + label
228 | matched_ess.add(e)
229 | self._tokens.append(token)
230 | left_ess = [e for e in work_ess if e not in matched_ess
231 | and e.type.replace('neg_', '') in self._label_white_list]
232 | if len(left_ess) > 0:
233 | logging.error('leftovers: [%s] at %s' % (
234 | '\n'.join(['%s (%s,%s)' % (a.type, a.start, a.end) for a in left_ess]), self.file_path))
235 | return self._tokens
236 |
237 | def add_predicted_labels(self, predicted_label):
238 | """
239 |         append prediction result to the doc, one annotation at a time
240 | :param predicted_label: labelled ann {'label': ..., 'ann': ann object}
241 | :return:
242 | """
243 | if self._label_white_list is not None and predicted_label['label'] not in self._label_white_list:
244 | return
245 | for token in self.get_token_list():
246 | if predicted_label['ann'].start <= token['offset'] < predicted_label['ann'].end:
247 | token['predicted_label'].append(predicted_label)
248 |
--------------------------------------------------------------------------------
/LabelModel.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from os.path import isfile, join
3 | from os import listdir
4 | from annotation_docs import Concept2Mapping, CustomisedRecoginiser
5 | from EDI_ann_doc import EDIRDoc, eHostGenedDoc
6 | import joblib as jl
7 |
8 |
9 | class LabelModel(object):
10 | """
11 | a machine learning based class for inferring phenotypes from NLP results
12 | features:
13 |     - feature weighting
14 | - transparent models
15 | """
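    # Typical usage sketch (assumed; uses only methods defined in this class,
    # semehr_ann_dir / gold_dir are illustrative paths):
    #   lm = LabelModel('ischaemic stroke', concept_mapping)
    #   lm.collect_tfidf_dimensions(semehr_ann_dir, gold_dir)
    #   data = lm.load_data(semehr_ann_dir, gold_dir)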
16 | def __init__(self, label, concept_mapping, max_dimensions=None):
17 | self._label = label
18 | self._concept_mapping = concept_mapping
19 | self._lbl_data = {}
20 | self._cui2label = {}
21 | self._selected_dims = None
22 | self._max_dimensions = 2000 if max_dimensions is None else max_dimensions
23 | self._tps = 0
24 | self._fps = 0
25 | self._lbl_one_dimension = True
26 | self._lbl2tfidf_dims = {}
27 | self._label_dimensions = []
28 | self._rare_labels = {}
29 | self._lbl2classifiers = {}
30 |
31 | @property
32 | def concept_mapping(self):
33 | return self._concept_mapping
34 |
35 | @concept_mapping.setter
36 | def concept_mapping(self, concept_mapping):
37 | self._concept_mapping = concept_mapping
38 |
39 | def get_binary_cluster_classifier(self, label):
40 | if label in self._lbl2classifiers:
41 | return self._lbl2classifiers[label]
42 | else:
43 | return None
44 |
45 | @property
46 | def cluster_classifier_dict(self):
47 | return self._lbl2classifiers
48 |
49 | def put_binary_cluster_classifier(self, label, classifier):
50 | self._lbl2classifiers[label] = classifier
51 |
52 | @property
53 | def rare_labels(self):
54 | return self._rare_labels
55 |
56 | def add_rare_label(self, label, tp_ratio):
57 | self._rare_labels[label] = tp_ratio
58 |
59 | @property
60 | def use_one_dimension_for_label(self):
61 | return self._lbl_one_dimension
62 |
63 | @use_one_dimension_for_label.setter
64 | def use_one_dimension_for_label(self, value):
65 | self._lbl_one_dimension = value
66 |
67 | @property
68 | def cui2label(self):
69 | return self._cui2label
70 |
71 | @property
72 | def label(self):
73 | return self._label
74 |
75 | def add_label_dimension(self, value):
76 | if value.lower() not in self._label_dimensions:
77 | self._label_dimensions.append(value.lower())
78 | # if tp is not None:
79 | # self._tp_labels.add(value.lower())
80 | # if fp is not None:
81 | # self._fp_labels.add(value.lower())
82 |
83 | def add_label_dimension_by_annotation(self, ann):
84 | self.add_label_dimension(LabelModel.get_ann_dim_label(ann, no_negation=True))
85 |
86 | def add_context_dimension(self, value, tp=None, fp=None, lbl='united'):
87 | if lbl not in self._lbl_data:
88 | self._lbl_data[lbl] = {'dims': [], 't2f': {}, 'tps': set(), 'fps': set()}
89 | d = self._lbl_data[lbl]
90 | if value.lower() not in d['dims']:
91 | d['dims'].append(value.lower())
92 | if value.lower() not in d['t2f']:
93 | d['t2f'][value.lower()] = 1
94 | else:
95 | d['t2f'][value.lower()] = d['t2f'][value.lower()] + 1
96 | tps = d['tps']
97 | fps = d['fps']
98 | if tp is not None:
99 | tps.add(value.lower())
100 | if fp is not None:
101 | fps.add(value.lower())
102 |
103 | def add_context_dimension_by_annotation(self, ann, tp=None, fp=None, lbl=None):
104 | self.add_context_dimension(LabelModel.get_ann_dim_label(ann, generalise=True, no_negation=True), tp=tp, fp=fp,
105 | lbl=lbl)
106 |
107 | def get_top_freq_dimensions(self, k, lbl='united'):
108 | if self._selected_dims is not None:
109 | return self._selected_dims
110 | if lbl not in self._lbl_data:
111 | return []
112 | l2f = self._lbl_data[lbl]['t2f']
113 | df = [(l, l2f[l]) for l in l2f]
114 | df = sorted(df, key=lambda x: -x[1])
115 | self._selected_dims = [d[0] for d in df[:k]]
116 | return self._selected_dims
117 |
118 | def get_top_tfidf_dimensions(self, k, lbl='united'):
119 | if lbl in self._lbl2tfidf_dims:
120 | return self._lbl2tfidf_dims[lbl]
121 | self._lbl2tfidf_dims[lbl] = {}
122 | if lbl not in self._lbl_data:
123 | logging.info('label [%s] has no contextual info' % lbl)
124 | return []
125 | d = self._lbl_data[lbl]
126 | tps = d['tps']
127 | fps = d['fps']
128 | idf_weight = 1.0
129 | if len(tps) > 0 and len(fps) > 0:
130 | idf_weight = 1.0 * len(tps) / len(fps)
131 | df = []
132 | max_score = 0
133 | for l in d['t2f']:
134 | idf = 1.0 / ((1 if l in d['tps'] else 0) + (1 if l in d['fps'] else 0))
135 | score = 1.0 * d['t2f'][l] / (len(tps) + len(fps))
136 | if idf_weight == 1 or (l in d['tps'] and l in d['fps']):
137 | score = score * idf
138 | # if l in d['tps'] and l in d['fps']:
139 | # score *= 0.5
140 | elif l in d['fps']:
141 | score *= idf_weight * idf
142 | max_score = max(score, max_score)
143 | df.append((l, score))
144 | df = sorted(df, key=lambda x: -x[1])
145 | # logging.debug(df)
146 | self._lbl2tfidf_dims[lbl] = [(t[0], t[1] * 1.0 / max_score) for t in df[:k]]
147 | logging.debug('%s ==> [%s]' % (lbl, self._lbl2tfidf_dims[lbl]))
148 | return self._lbl2tfidf_dims[lbl]
149 |
150 | @property
151 | def max_dimensions(self):
152 | return self._max_dimensions
153 |
154 | @max_dimensions.setter
155 | def max_dimensions(self, value):
156 |         if value is None:
157 |             self._max_dimensions = 2000
158 |         else:
159 |             self._max_dimensions = value
160 | @property
161 | def label_dimensions(self):
162 | return self._label_dimensions
163 |
164 | def context_dimensions(self, lbl):
165 | if lbl not in self._lbl_data:
166 | return []
167 | # logging.info('%s`s dims: %s' % (lbl, self._lbl_data[lbl]['dims']))
168 | return self._lbl_data[lbl]['dims']
169 |
170 | def encode_ann(self, ann, context_anns, lbl='united', extra_dims=None):
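        # Encodes one annotation as a binary bag-of-context vector over the top TF-IDF
        # dimensions (1 if a dimension label appears among context_anns, else 0),
        # e.g. an illustrative result: [1, 0, 0, 1] + extra_dims.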
171 | ann_label = LabelModel.get_ann_dim_label(ann)
172 | encoded = []
173 | # if self.use_one_dimension_for_label:
174 | # if ann_label in self.label_dimensions:
175 | # encoded.append(self.label_dimensions.index(ann_label))
176 | # else:
177 | # encoded.append(-1)
178 | # else:
179 | # for l in self.label_dimensions:
180 | # if l == ann_label:
181 | # encoded.append(1)
182 | # else:
183 | # encoded.append(0)
184 | context_labels = [LabelModel.get_ann_dim_label(ann, generalise=True, no_negation=True) for ann in context_anns]
185 | for l, score in self.get_top_tfidf_dimensions(self.max_dimensions, lbl=lbl): # self.context_dimensions:
186 | # freq = 0
187 | # for cl in context_labels:
188 | # if cl.lower() == l.lower():
189 | # freq += 1
190 | if l in context_labels:
191 | encoded.append(1)
192 | else:
193 | encoded.append(0)
194 | # encoded.append(freq * score)
195 | return encoded + ([] if extra_dims is None else extra_dims)
196 |
197 | def collect_dimensions(self, ann_dir):
198 | cm = self.concept_mapping
199 | file_keys = [f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
200 | # collect dimension labels
201 | for fk in file_keys:
202 | cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
203 | t = self.label.replace('neg_', '')
204 | anns = cr.get_anns_by_label(t)
205 | neg_anns = cr.get_anns_by_label('neg_' + t)
206 | for a in anns + neg_anns:
207 | self.add_label_dimension_by_annotation(a)
208 | # self.add_context_dimension_by_annotation(a)
209 | if (a.negation != 'Negated' and self.label.startswith('neg_')) or \
210 | (a.negation == 'Negated' and not self.label.startswith('neg_')):
211 | continue
212 | sanns = cr.get_same_sentence_anns(a)
213 | context_anns = [] + sanns['umls'] + sanns['phenotype']
214 | # collect cui labels
215 | for u in sanns['umls']:
216 | self._cui2label[u.cui] = u.pref
217 | for c in context_anns:
218 | self.add_context_dimension_by_annotation(c)
219 |
220 | def collect_tfidf_dimensions(self, ann_dir, gold_dir, ignore_context=False, separate_by_label=False,
221 | full_text_dir=None, eHostGD=False):
222 | cm = self.concept_mapping
223 | file_keys = [f[:f.rfind('.')] for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
224 | # collect dimension labels
225 | tp_freq = 0
226 | fp_freq = 0
227 | label_type = self.label.replace('neg_', '')
228 | fn_freq = 0
229 | for fk in file_keys:
230 | cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
231 | fk = fk.replace('se_ann_', '')
232 | if full_text_dir is not None:
233 | cr.full_text_folder = full_text_dir
234 | if eHostGD:
235 | if not isfile(join(gold_dir, '%s.txt.knowtator.xml' % fk)):
236 | continue
237 | gd = eHostGenedDoc(join(gold_dir, '%s.txt.knowtator.xml' % fk))
238 | else:
239 | if not isfile(join(gold_dir, '%s-ann.xml' % fk)):
240 | continue
241 | gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk))
242 | t = self.label.replace('neg_', '')
243 | anns = cr.get_anns_by_label(t)
244 | neg_anns = cr.get_anns_by_label('neg_' + t)
245 |
246 | # re-segement sentences
247 | # cr.re_segment_sentences(fk)
248 | # cr.relocate_all_anns(fk)
249 | # gd.relocate_anns(cr.get_full_text(fk))
250 |
251 | not_matched_gds = []
252 | for e in gd.get_ess_entities():
253 | if (ignore_context and e.label.replace('neg_', '') == label_type) \
254 | or (not ignore_context and e.label == self.label):
255 | not_matched_gds.append(e.id)
256 | for a in anns + neg_anns:
257 | # self.add_context_dimension_by_annotation(a)
258 | self.add_label_dimension_by_annotation(a)
259 | # if (not ignore_context) and ((a.negation != 'Negated' and self.label.startswith('neg_')) or \
260 | # (a.negation == 'Negated' and not self.label.startswith('neg_'))):
261 | # logging.info('skipped because context')
262 | # continue
263 |
264 | matched = False
265 | for g in gd.get_ess_entities():
266 | if g.id in not_matched_gds:
267 | gt = g.label.replace('neg_', '')
268 | if g.overlap(a) and ((g.label == self.label and not ignore_context) or
269 | (ignore_context and gt == label_type)):
270 | matched = True
271 | tp_freq += 1
272 | not_matched_gds.remove(g.id)
273 | if not matched:
274 | fp_freq += 1
275 |
276 | sanns = cr.get_prior_anns(a, contenxt_depth=-1)
277 | context_anns = [] + sanns['umls'] + sanns['phenotype'] + cr.get_context_words(a, fk)
278 | # context_anns = cr.get_context_words(a, fk)
279 | # collect cui labels
280 | for u in sanns['umls']:
281 | self._cui2label[u.cui] = u.pref
282 | for c in context_anns:
283 | self.add_context_dimension_by_annotation(c, tp=True if matched else None,
284 | fp=True if not matched else None,
285 | lbl='united' if not separate_by_label else
286 | LabelModel.get_ann_query_label(a))
287 | fn_freq += len(not_matched_gds)
288 | self._tps = tp_freq
289 | self._fps = fp_freq
290 | logging.debug('tp: %s, fp: %s, fn: %s' % (tp_freq, fp_freq, fn_freq))
291 |
292 | def get_low_quality_labels(self, ann_dir, gold_dir, accurate_threshold=0.05, min_sample_size=20):
293 | return [t[0] for t in self.assess_label_quality(ann_dir, gold_dir)
294 | if t[1] <= accurate_threshold and t[2] + t[3] >= min_sample_size]
295 |
296 | def assess_label_quality(self, ann_dir, gold_dir, separate_by_label=True, ignore_context=True):
297 | if ignore_context:
298 | logging.info('doing learning without considering contextual info')
299 | # print self.get_top_tfidf_dimensions(self.max_dimensions)
300 | cm = self.concept_mapping
301 | file_keys = [f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
302 | label_type = self.label.replace('neg_', '')
303 | query_label_perform = {}
304 | for fk in file_keys:
305 | cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
306 | if not isfile(join(gold_dir, '%s-ann.xml' % fk)):
307 | continue
308 | gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk))
309 |
310 | not_matched_gds = []
311 | for e in gd.get_ess_entities():
312 | if (ignore_context and e.label.replace('neg_', '') == label_type) \
313 | or (not ignore_context and e.label == self.label):
314 | not_matched_gds.append(e.id)
315 | anns = cr.get_anns_by_label(self.label, no_context=ignore_context)
316 | for a in anns:
317 | multiple_true_positives = 0
318 | matched = False
319 | for g in gd.get_ess_entities():
320 | if g.id in not_matched_gds:
321 | gt = g.label.replace('neg_', '')
322 | if g.overlap(a) and ((g.label == self.label and not ignore_context) or
323 | (ignore_context and gt == label_type)):
324 | if matched:
325 | multiple_true_positives += 1
326 | matched = True
327 | not_matched_gds.remove(g.id)
328 |
329 | if separate_by_label:
330 | lbl = LabelModel.get_ann_query_label(a)
331 | else:
332 | lbl = 'united'
333 | ql = lbl
334 | if ql not in query_label_perform:
335 | query_label_perform[ql] = {'c': 0, 'w': 0}
336 | if matched:
337 | query_label_perform[ql]['c'] += 1
338 | else:
339 | query_label_perform[ql]['w'] += 1
340 | lbls = [(l,
341 | 1.0 * query_label_perform[l]['c'] / (query_label_perform[l]['c'] + query_label_perform[l]['w']),
342 | query_label_perform[l]['c'],
343 | query_label_perform[l]['w']) for l in query_label_perform]
344 | return sorted(lbls, key=lambda x: x[1])
345 |
346 | def load_data(self, ann_dir, gold_dir, verbose=True, ignore_mappings=[], ignore_context=False,
347 | separate_by_label=False, ful_text_dir=None, eHostGD=False, annotated_anns={}):
348 | """
349 |
350 | :param ann_dir:
351 | :param gold_dir:
352 | :param verbose:
353 | :param ignore_mappings:
354 | :param ignore_context:
355 | :param separate_by_label:
356 | :param ful_text_dir:
357 | :param eHostGD:
358 |         :param annotated_anns: NB: this is for labelling settings where only part of each document is
359 |         annotated. We therefore filter out mentions not assessed by the annotators, to avoid discarding
360 |         true positives (results that are correct but were never assessed by the annotators)
361 | :return:
362 | """
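        # Illustrative (assumed) shape of annotated_anns, inferred from its use below:
        #   {'report_001.txt': [{'s': '12', 'e': '25'}, ...], ...}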
363 | if ignore_context:
364 | logging.info('doing learning without considering contextual info')
365 | # print self.get_top_tfidf_dimensions(self.max_dimensions)
366 | cm = self.concept_mapping
367 | file_keys = [f[:f.rfind('.')] for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
368 | lbl2data = {}
369 | false_negatives = 0
370 | lbl2tps = {}
371 | label_type = self.label.replace('neg_', '')
372 | query_label_perform = {}
373 | for fk in file_keys:
374 | cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
375 | fk = fk.replace('se_ann_', '')
376 | if ful_text_dir is not None:
377 | cr.full_text_folder = ful_text_dir
378 | if eHostGD:
379 | if not isfile(join(gold_dir, '%s.txt.knowtator.xml' % fk)):
380 | continue
381 | # logging.debug('using GD file %s' % join(gold_dir, '%s.txt.knowtator.xml' % fk))
382 | gd = eHostGenedDoc(join(gold_dir, '%s.txt.knowtator.xml' % fk))
383 | else:
384 | if not isfile(join(gold_dir, '%s-ann.xml' % fk)):
385 | continue
386 | logging.debug('using GD file %s' % join(gold_dir, '%s-ann.xml' % fk))
387 | gd = EDIRDoc(join(gold_dir, '%s-ann.xml' % fk))
388 |
389 | # re-segement sentences
390 | # cr.re_segment_sentences(fk)
391 | # cr.relocate_all_anns(fk)
392 | # gd.relocate_anns(cr.get_full_text(fk))
393 |
394 | not_matched_gds = []
395 | for e in gd.get_ess_entities():
396 | if (ignore_context and e.label.replace('neg_', '') == label_type) \
397 | or (not ignore_context and e.label == self.label):
398 | not_matched_gds.append(e.id)
399 |
400 | anns = cr.get_anns_by_label(self.label, ignore_mappings=ignore_mappings, no_context=ignore_context)
401 | if len(annotated_anns) > 0:
402 | if '%s.txt' % fk not in annotated_anns:
403 | continue
404 | kept_anns = []
405 | for a in anns:
406 | for aa in annotated_anns['%s.txt' % fk]:
407 | if int(aa['s']) == a.start and int(aa['e']) == a.end:
408 | kept_anns.append(a)
409 | anns = kept_anns
410 | for a in anns:
411 | logging.debug('%s, %s, %s' % (a.str, a.start, a.end))
412 | multiple_true_positives = 0
413 | t2anns = cr.get_prior_anns(a)
414 | # if len(t2anns['umls']) + len(t2anns['phenotype']) == 0:
415 | # t2anns = cr.get_prior_anns(a, contenxt_depth=-2)
416 | context_anns = [] + t2anns['umls'] + t2anns['phenotype'] + \
417 | cr.get_context_words(a, fk)
418 | # context_anns = cr.get_context_words(a, fk)
419 | matched = False
420 | for g in gd.get_ess_entities():
421 | if g.id in not_matched_gds:
422 | gt = g.label.replace('neg_', '')
423 | if g.overlap(a) and ((g.label == self.label and not ignore_context) or
424 | (ignore_context and gt == label_type)):
425 | if matched:
426 | multiple_true_positives += 1
427 | matched = True
428 | not_matched_gds.remove(g.id)
429 | if verbose:
430 | if not matched:
431 | logging.debug('%s %s %s' % ('!',
432 | self.get_ann_dim_label(a) +
433 | ' // ' + ' | '.join(self.get_ann_dim_label(a, generalise=True)
434 | for a in context_anns), fk))
435 | else:
436 | logging.debug('%s %s %s' % ('R',
437 | self.get_ann_dim_label(a) + ' // ' + ' | '.join(
438 | self.get_ann_dim_label(a, generalise=True)
439 | for a in context_anns), fk))
440 |
441 | lbl = LabelModel.get_label_specific_data(self, lbl2data, a, context_anns, fk, cr,
442 | separate_by_label=separate_by_label)
443 |
444 | lbl2data[lbl]['multiple_tps'] += multiple_true_positives
445 | Y = lbl2data[lbl]['Y']
446 | Y.append([1 if matched else 0])
447 | ql = lbl
448 | if ql not in query_label_perform:
449 | query_label_perform[ql] = {'c': 0, 'w': 0}
450 | if matched:
451 | query_label_perform[ql]['c'] += 1
452 | else:
453 | query_label_perform[ql]['w'] += 1
454 | false_negatives += len(not_matched_gds)
455 |
456 | missed = None
457 | for g in gd.get_ess_entities():
458 | if g.id in not_matched_gds:
459 | missed = g
460 | logging.debug('\t'.join(
461 | ['M', g.str, str(g.negated), str(g.start), str(g.end), join(gold_dir, '%s-ann.xml' % fk)]))
462 | # if len(not_matched_gds) > 0:
463 | # print not_matched_gds
464 | # for a in anns:
465 | # logging.debug(a.str, a.start, a.end, missed.overlap(a))
466 | bad_labels = []
467 | for ql in query_label_perform:
468 | p = query_label_perform[ql]
469 | if p['c'] == 0 or (1.0 * p['w'] / p['c'] < 0.05):
470 | bad_labels.append(ql)
471 | return {'lbl2data': lbl2data,
472 | 'fns': false_negatives, 'bad_labels': bad_labels, 'files': file_keys}
473 |
474 | @staticmethod
475 | def get_label_specific_data(label_model, lbl2data, annotation, context_anns, fk, cr,
476 | separate_by_label=False):
477 | a = annotation
478 | extra_dims = [1] if len(cr.get_containing_anns(a)) > 0 else [0]
479 | if separate_by_label:
480 | lbl = LabelModel.get_ann_query_label(a)
481 | else:
482 | lbl = 'united'
483 | if lbl not in lbl2data:
484 | lbl2data[lbl] = {'X': [], 'Y': [], 'multiple_tps': 0, 'doc_anns': []}
485 | X = lbl2data[lbl]['X']
486 | lbl2data[lbl]['doc_anns'].append({'d': fk, 'ann': a, 'label': label_model.label})
487 | X.append(label_model.encode_ann(a, context_anns, lbl=lbl, extra_dims=extra_dims))
488 | return lbl
489 |
490 | @staticmethod
491 | def read_one_ann_doc(label_model, cr, fk, lbl2data=None,
492 | ignore_mappings=[], ignore_context=False, separate_by_label=False):
493 | if lbl2data is None:
494 | lbl2data = {}
495 | anns = cr.get_anns_by_label(label_model.label, ignore_mappings=ignore_mappings, no_context=ignore_context)
496 | for a in anns:
497 | t2anns = cr.get_prior_anns(a)
498 | context_anns = [] + t2anns['umls'] + t2anns['phenotype'] + cr.get_context_words(a, fk)
499 | # context_anns = cr.get_context_words(a, fk)
500 | LabelModel.get_label_specific_data(label_model, lbl2data, a, context_anns, fk, cr,
501 | separate_by_label=separate_by_label)
502 | return lbl2data
503 |
504 | def load_data_for_predict(self, ann_dir, ignore_mappings=[], ignore_context=False,
505 | separate_by_label=False, full_text_dir=None):
506 | """
507 | load data for prediction - no ground truth exists
508 | :param ann_dir:
509 | :param ignore_mappings:
510 | :param ignore_context:
511 | :param separate_by_label:
512 | :param full_text_dir:
513 | :return:
514 | """
515 | if ignore_context:
516 | logging.info('doing learning without considering contextual info')
517 |
518 | cm = self.concept_mapping
519 | file_keys = [f[:f.rfind('.')] for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
520 | lbl2data = {}
521 | for fk in file_keys:
522 | cr = CustomisedRecoginiser(join(ann_dir, '%s.json' % fk), cm)
523 | fk = fk.replace('se_ann_', '')
524 | if full_text_dir is not None:
525 | cr.full_text_folder = full_text_dir
526 | LabelModel.read_one_ann_doc(self, cr, fk, lbl2data=lbl2data,
527 | ignore_mappings=ignore_mappings, ignore_context=ignore_context,
528 | separate_by_label=separate_by_label)
529 | return {'lbl2data': lbl2data, 'files': file_keys}
530 |
531 | def serialise(self, output_file):
532 | jl.dump(self, output_file)
533 |
534 | @staticmethod
535 | def type_related_ann_filter(ann, cm_obj):
536 | if hasattr(ann, 'cui'):
537 | return not ann.cui.lower() in cm_obj.all_entities
538 | # return not ann.cui in _cm_obj.type2cocnepts(type)
539 | else:
540 | return not ann.str.lower() in cm_obj.all_entities
541 | # return not ann.str in _cm_obj.type2gaz[type]
542 |
543 | @staticmethod
544 | def get_ann_query_label(ann):
545 | # return ann.str.lower()
546 | neg = ''
547 | # if hasattr(ann, 'negation'):
548 | # neg = 'neg_' if ann.negation == 'Negated' else ''
549 | # else:
550 | # neg = 'neg_' if ann.negated else ''
551 | # if hasattr(ann, 'cui'):
552 | # return neg + ann.cui + ' ' + str(ann.pref)
553 | # else:
554 | # return neg + ann.str.lower()
555 | return neg + ann.str.lower()
556 |
557 | @staticmethod
558 | def deserialise(serialised_file):
559 | return jl.load(serialised_file)
560 |
561 | @staticmethod
562 | def get_ann_dim_label(ann, generalise=False, no_negation=False):
563 | if isinstance(ann, str):
564 | return 'WORD_%s' % ann
565 | negated = ''
566 | label = ann.str
567 | if (hasattr(ann, 'negation') and ann.negation == 'Negated') or (hasattr(ann, 'negated') and ann.negated):
568 | negated = 'neg_'
569 | if no_negation:
570 | negated = ''
571 | # if hasattr(ann, 'cui'):
572 | # label = ann.cui + ' ' + str(ann.pref)
573 | # ann.str
574 | if hasattr(ann, 'minor_type'):
575 | label = ann.str
576 | # if generalise and hasattr(ann, 'sty'):
577 | # label = ann.sty
578 | # if ann.sty.lower() == 'body part, organ, or organ component':
579 | negated = ''
580 | return negated + label.lower()
581 | # return ann.str.lower() if not isinstance(ann, SemEHRAnn) else ann.cui.lower()
582 |
583 |
584 |
585 |
586 |
587 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Archived
2 |
3 | This repo is no longer maintained. For questions on this repo please email Honghan directly (honghan.wu@gmail.com), or for broader CogStack enquiries please reach out to contact@cogstack.org
4 |
5 | # nlp2phenome
6 | using AI models to infer patient phenotypes from identified named entities (instances of biomedical concepts)
7 |
8 | ## why
9 | Using natural language processing (NLP) to identify mentions of biomedical concepts in free-text medical records is just the *first* step. There is often a gap between NLP results and what a clinical study is after. For example, a radiology report may not contain the term `ischemic stroke`; instead, it reports that the patient had `blocked arteries` and a `stroke`. To infer the "unspoken" `ischemic stroke`, a mechanism is needed to make such inferences from the NLP-identifiable mentions of `blocked arteries` and `stroke`. nlp2phenome is designed to do this extra step from NLP output to the patient phenome.
10 |
11 | ## what
12 | nlp2phenome was developed for a stroke subtyping study using NLP on radiology reports at the University of Edinburgh, led by [Dr Will Whiteley](https://www.ed.ac.uk/profile/dr-william-whiteley). It is built on top of [SemEHR](https://github.com/CogStack/CogStack-SemEHR) results. It identified 2,922 mentions of 32 types of phenotypes in 266 radiology reports and achieved an average F1 of 0.929 (precision 0.925, recall 0.939).
13 |
14 | It uses various transparent machine learning models (e.g. decision tree, random forest) to learn the inference from NLP results to more insightful clinical phenotypes (such as subtypes of stroke). The image below is a decision tree learnt for negated tumour from radiology reports. Surprisingly, with specific feature selection methods, decision tree models outperform the popular neural-network-based methods. The other advantage is that the visualised decision trees can be verified by clinical experts, or even compared against clinical guidelines. A paper is in progress; a link will be added here when available.
15 |
16 | 
17 |
18 | ## data
19 | Two datasets (radiology reports) collected in Scotland:
20 | - [Edinburgh Stroke Study](http://www.dcn.ed.ac.uk/ess/): training data (364 reports), testing data (266 reports)
21 | - Tayside radiology reports (300 reports)
22 |
23 | ## run learning
24 | 1. Prepare your configuration file. See `./settings/sample_setting_kfold_learning.json` for reference.
25 | ```javascript
26 | {
27 |   "kfold": 10, // the number of folds for k-fold learning
28 | "corpus_folder": "/data/annotated_data/corpus", // the folder containing full text documents
29 | "gold_folder": "/data/annotated_data/gold", // the folder containing the labelled/annotated data
30 | "semehr_folder": "/data/semehr_results", // the folder containing baseline SemEHR results
31 |   "working_folder": "/data/learning", // the working folder to store intermediate data files
32 | "concept_mapping_file": "./settings/empty_concept_mapping.json", // the mapping file to map UMLS CUI to phenotypes
33 | "learning_model_dir": "./models", // where the machine learning models are stored
34 | "entity_types_file": "./settings/better_worse_entity_types.txt", // the list of phenotypes to be worked on
35 | "ignore_mapping_file": "./settings/ignore_mapping.json", // a json based mapping file to ignore certain CUI mappings
36 |   "min_sample_size": 25, // the minimum number of samples needed to train a model; if the sample size is smaller, counting-based statistics are used to assess the correctness of baseline results instead of a machine learning model
37 |   "gold_file_pattern": "%s.txt.knowtator.xml", // the annotation file name pattern; %s is the unique id used to find the SemEHR result file and the full-text file in their respective folders
38 |   "eHostGD": true // whether to use eHOST annotations; the only other supported format is EDiR from Edinburgh Informatics
39 | }
40 | ```
41 | - `entity_types_file` - each study identifies a set of phenotypes (e.g., diseases, symptoms or other biomedical mentions). This is a plain text file listing the names of the phenotypes, one phenotype per line. See [entity_types_phenotypes_stroke_sample.txt](./settings/entity_types_phenotypes_stroke_sample.txt) for an example.
42 | - `concept_mapping_file` - each phenotype defined above needs to be mapped to one or more ontology concepts (e.g., UMLS CUIs). This is a JSON dictionary where the key is the phenotype name and the value is an array. Each element of the array is a tab-separated tuple of the form `CONCEPT_ID\tLabel\tSemantic Type`; only the first component matters, the last two are for display purposes only. See [concept_mapping_stroke_sample.json](./settings/concept_mapping_stroke_sample.json) for an example, and the illustrative snippet after step 2 below.
43 | - `ignore_mapping_file` - a JSON dictionary for removing particular concepts (and customised dictionary terms) from the phenotype mappings defined in `concept_mapping_file`. The key is the phenotype name and the value is an array of concept IDs from the ontology used (e.g., UMLS) or customised dictionary terms. This file is only needed when `concept_mapping_file` is generated automatically from learning data and requires fine-tuning.
44 | 2. Run it with:
45 | ```bash
46 | python run_learning.py YOUR_LEARNING_CONFIG_FILE
47 | ```
48 |
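For illustration only, a minimal `concept_mapping_file` entry and a matching `ignore_mapping_file` entry for a hypothetical `ischaemic stroke` phenotype could look like the snippets below; the CUIs and terms are placeholders, see the sample files under `./settings/` for real mappings.

```javascript
// concept_mapping_file: phenotype -> ["CONCEPT_ID\tLabel\tSemantic Type", ...]
{
  "ischaemic stroke": [
    "C0948008\tIschemic stroke\tDisease or Syndrome",
    "C0038454\tStroke\tDisease or Syndrome"
  ]
}
```

```javascript
// ignore_mapping_file: phenotype -> concept IDs (or customised dictionary terms) to drop
{
  "ischaemic stroke": ["C0038454"]
}
```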
49 | ## contact
50 | Dr Honghan Wu (honghan.wu@gmail.com)
51 |
--------------------------------------------------------------------------------
/ann_converter.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 | import datetime
3 | from os import listdir
4 | from os.path import isfile, join
5 | import utils
6 | import csv
7 |
8 |
9 | class AnnConverter(object):
10 |
11 | @staticmethod
12 | def get_semehr_ann_label(ann):
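        # Builds the eHOST class label by prefixing any non-default context values, e.g.
        # (illustrative) negation='Negated', temporality='Recent', experiencer='Patient',
        # minor_type='stroke' -> 'Negated_stroke'.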
13 | str_context = ''
14 | if ann.negation != 'Affirmed':
15 | str_context += ann.negation + '_'
16 | if ann.temporality != 'Recent':
17 | str_context += ann.temporality + '_'
18 | if ann.experiencer != 'Patient':
19 | str_context += ann.experiencer + '_'
20 | return '%s%s' % (str_context, ann.minor_type)
21 |
22 | @staticmethod
23 | def to_eHOST(doc_key, anns, file_pattern='%s.txt', id_pattern='smehr-%s-%s'):
24 | elem_annotations = ET.Element("annotations")
25 | elem_annotations.set('textSource', file_pattern % doc_key)
26 | idx = 0
27 | for d in anns:
28 | ann = d['ann']
29 | idx += 1
30 | mention_id = id_pattern % (doc_key, idx)
31 | AnnConverter.create_elem_ann(elem_annotations, mention_id, ann.start, ann.end, ann.str,
32 | AnnConverter.get_semehr_ann_label(ann))
33 | tree = ET.ElementTree(elem_annotations)
34 | return ET.tostring(elem_annotations, encoding='utf8', method='xml')
35 |
36 | @staticmethod
37 | def create_elem_ann(elem_annotations, mention_id, start, end, str, class_label):
38 | elem_ann = ET.SubElement(elem_annotations, "annotation")
39 | elem_mention = ET.SubElement(elem_ann, "mention")
40 | elem_mention.set('id', mention_id)
41 | elem_annotator = ET.SubElement(elem_ann, "annotator")
42 | elem_annotator.set('id', 'semehr')
43 | elem_annotator.text = 'semehr'
44 | elem_span = ET.SubElement(elem_ann, "span")
45 | elem_span.set('start', '%s' % start)
46 | elem_span.set('end', '%s' % end)
47 | elem_spanText = ET.SubElement(elem_ann, "spannedText")
48 | elem_spanText.text = str
49 | elem_date = ET.SubElement(elem_ann, "creationDate")
50 | elem_date.text = datetime.datetime.now().strftime("%a %B %d %X %Z %Y")
51 | #
52 | elem_class = ET.SubElement(elem_annotations, "classMention")
53 | elem_class.set('id', mention_id)
54 | elem_mention_class = ET.SubElement(elem_class, "mentionClass")
55 | elem_mention_class.set('id', class_label)
56 | elem_mention_class.text = str
57 | return elem_ann
58 |
59 | @staticmethod
60 | def load_ann_file(f, do_multi=True):
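        # Parses one eHOST .knowtator.xml file into a {span_id: annotation} dict.
        # Illustrative (assumed) return value:
        #   {'m-10-16:1': {'text': 'stroke', 'class': 'ischaemic stroke'}}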
61 | tree = ET.parse(f)
62 | doc = tree.getroot()
63 | ann2label = {}
64 | ann2freq = {}
65 | for ann in doc.findall("annotation"):
66 | m_id = ann.find("mention").attrib["id"]
67 | cm = doc.find('.//classMention[@id="' + m_id + '"]')
68 |             cls = cm.find('mentionClass').attrib["id"]
69 | m_span = ann.find("span").attrib
70 | annid = 'm-%s-%s' % (m_span['start'], m_span['end'])
71 | m_text = ann.find("spannedText").text
72 | freq = 0
73 | if annid not in ann2freq:
74 | ann2freq[annid] = 1
75 | else:
76 | if do_multi:
77 | ann2freq[annid] += 1
78 | annid_freq = '%s:%s' % (annid, ann2freq[annid])
79 | ann2label[annid_freq] = {"text": m_text, "class": cls}
80 | return ann2label
81 |
82 | @staticmethod
83 | def convert_csv_annotations(csv_file, text_folder, ann_folder, mapping_file, annotated_anns_file,
84 | id_pattern='%s-%s', ann_file_pattern='%s.txt.knowtator.xml'):
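        # Expects the CSV to provide the columns used below (column order is illustrative):
        #   doc_id, text, start, end, string_orig, cui, icd10-ch, Correct, Negation, Skip Document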
85 | with open(csv_file, newline='') as cf:
86 | reader = csv.DictReader(cf)
87 | label2concepts = {}
88 | d2annotated_anns = {}
89 | for r in reader:
90 | d2annotated_anns[r['doc_id'] + ".txt"] = [{'s': r['start'], 'e': r['end']}]
91 | if r['Skip Document'] != 'Yes':
92 | utils.save_string(r['text'], join(text_folder, r['doc_id'] + ".txt"))
93 | elem_annotations = ET.Element("annotations")
94 | elem_annotations.set('textSource', r['doc_id'])
95 | mention_id = id_pattern % (r['doc_id'], 0)
96 | if r['Correct'] == 'Yes' and r['Negation'] == 'NOT Negated':
97 | AnnConverter.create_elem_ann(elem_annotations, mention_id,
98 | r['start'], r['end'], r['string_orig'], r['icd10-ch'])
99 | xml = ET.tostring(elem_annotations, encoding='unicode', method='xml')
100 | utils.save_string(xml, join(ann_folder, ann_file_pattern % r['doc_id']))
101 | if r['icd10-ch'] not in label2concepts:
102 | label2concepts[r['icd10-ch']] = []
103 | if r['cui'] not in label2concepts[r['icd10-ch']]:
104 | label2concepts[r['icd10-ch']].append(r['cui'])
105 | utils.save_json_array(label2concepts, mapping_file)
106 | utils.save_json_array(d2annotated_anns, annotated_anns_file)
107 |
108 | @staticmethod
109 | def populate_inter_annotator_results(ann_folder_1, ann_folder_2, output_file, missing_file,
110 | correct_labels = ["VERIFIED_CORRECT"]):
111 | ann_files = [f for f in listdir(ann_folder_1) if isfile(join(ann_folder_1, f))]
112 | all_mentions = 0
113 | missed = []
114 | mismatched = []
115 | for f in ann_files:
116 | ann1 = AnnConverter.load_ann_file(join(ann_folder_1, f))
117 | ann2 = AnnConverter.load_ann_file(join(ann_folder_2, f))
118 | all_mentions += len(ann1)
119 | for ann in ann1:
120 | if ann not in ann2:
121 | missed.append('%s\t%s\t%s' % (ann, ann1[ann]['text'], ann1[ann]['class']))
122 | elif ann2[ann]['class'] != ann1[ann]['class'] and ann1[ann]['class'] not in correct_labels:
123 | mismatched.append('%s\t%s\t%s\t%s\t%s' % (f, ann, ann1[ann]['text'], ann1[ann]['class'], ann2[ann]['class']))
124 | print('\n'.join(mismatched))
125 | print(len(missed), all_mentions)
126 | utils.save_string('\n'.join(mismatched), output_file)
127 | utils.save_string('\n'.join(missed), missing_file)
128 |
129 | @staticmethod
130 | def calculate_IAA(ann_folder_1, ann_folder_2, output_file):
131 | from sklearn.metrics import cohen_kappa_score
132 | ann_files = [f for f in listdir(ann_folder_1) if isfile(join(ann_folder_1, f))]
133 | ann1_annotations = {}
134 | ann2_annotations = {}
135 | for f in ann_files:
136 | ann1 = AnnConverter.load_ann_file(join(ann_folder_1, f), do_multi=False)
137 | ann2 = AnnConverter.load_ann_file(join(ann_folder_2, f), do_multi=False)
138 | for ann in ann1:
139 | ann1_annotations['%s_%s' % (f, ann)] = ann1[ann]['class']
140 | for ann in ann2:
141 | ann2_annotations['%s_%s' % (f, ann)] = ann2[ann]['class']
142 | merged_anns = list(set(list(ann1_annotations.keys()) + list(ann2_annotations.keys())))
143 | ann1_merged = []
144 | ann2_merged = []
145 | label_missed = 'missed'
146 | cat2pares = {'subject': {'ann1': [], 'ann2': []},
147 | 'irrelevant': {'ann1': [], 'ann2': []},
148 | 'trajectory': {'ann1': [], 'ann2': []},
149 | }
150 | output = []
151 | for ann in merged_anns:
152 | ann1_label = label_missed if ann not in ann1_annotations else ann1_annotations[ann]
153 | ann2_label = label_missed if ann not in ann2_annotations else ann2_annotations[ann]
154 | ann1_merged.append(ann1_label)
155 | ann2_merged.append(ann2_label)
156 | if ann1_label == 'Irrelevant_label' or ann2_label == 'Irrelevant_label':
157 | cat2pares['irrelevant']['ann1'].append(ann1_label)
158 | cat2pares['irrelevant']['ann2'].append(ann2_label)
159 | elif ann1_label in ['Trajectory_Subject', 'General_Trajectory'] or ann2_label in ['Trajectory_Subject', 'General_Trajectory']:
160 | cat2pares['subject']['ann1'].append(ann1_label)
161 | cat2pares['subject']['ann2'].append(ann2_label)
162 | elif ann1_label in ['better(Trajetory)', 'worse(Trajectory)'] or ann2_label in ['better(Trajetory)', 'worse(Trajectory)']:
163 | cat2pares['trajectory']['ann1'].append(ann1_label)
164 | cat2pares['trajectory']['ann2'].append(ann2_label)
165 | output.append('%s\t%s\t%s' % (ann, ann1_label, ann2_label))
166 |
167 |         print('kappa score: [%s]' % cohen_kappa_score(ann1_merged, ann2_merged))
168 | for cat in cat2pares:
169 | print('%s kappa score: [%s]' % (cat, cohen_kappa_score(cat2pares[cat]['ann1'], cat2pares[cat]['ann2'])))
170 | utils.save_string('\n'.join(output), output_file)
171 |
172 | if __name__ == "__main__":
173 | # AnnConverter.load_ann_file('S:/NLP/annotation_Steven/stroke_nlp/saved/Stroke_id_105.txt.knowtator.xml')
174 | # AnnConverter.populate_inter_annotator_results('S:/NLP/annotation_Kristiina/stroke_nlp/saved',
175 | # 'S:/NLP/annotation_Steven/stroke_nlp/saved', 'mismatched.tsv')
176 | # AnnConverter.populate_inter_annotator_results('S:/NLP/annotation_Steven/stroke_nlp/saved',
177 | # 'P:/wuh/SemEHR-working/outputs/nlp2phenome',
178 | # 'kristiina_corrections.tsv', 'steven_added.tsv')
179 | ann_folder = '/data/annotated_data/'
180 | ann_files = [f for f in listdir(ann_folder) if isfile(join(ann_folder, f))]
181 | for f in ann_files:
182 | print('processing %s...' % f)
183 | AnnConverter.convert_csv_annotations(join(ann_folder, f), join(ann_folder, 'corpus'), join(ann_folder, 'gold'), join(ann_folder, 'concept_mapping.json'), join(ann_folder, 'annotated_anns.json'))
184 |
--------------------------------------------------------------------------------
/ann_utils.py:
--------------------------------------------------------------------------------
1 | import sklearn.metrics
2 | import datetime
3 | from os import listdir
4 | from os.path import isfile, join
5 | from nlp_to_phenome import EDIRDoc
6 | from annotation_docs import EDIRAnn
7 | import reportreader as rr
8 | import re
9 | import utils
10 | import logging
11 | from operator import itemgetter
12 | import xml.etree.ElementTree as ET
13 |
14 |
15 | class eHostGenedDoc(EDIRDoc):
16 | def __init__(self, file_path):
17 | super(eHostGenedDoc, self).__init__(file_path)
18 |
19 | def get_ess_entities(self):
20 | if self._entities is not None:
21 | return self._entities
22 | root = self._root
23 | entities = []
24 | for e in root.findall('.//classMention'):
25 | mcs = e.findall('./mentionClass')
26 | mention_id = e.attrib['id']
27 | if len(mcs) > 0:
28 | mc = mcs[0]
29 | cls = mc.attrib['id']
30 | mentions = root.findall('.//mention[@id="' + mention_id + '"]/..')
31 | if len(mentions) > 0:
32 | span = mentions[0].findall('./span')
33 | ent_start = span[0].attrib['start']
34 | ent_end = span[0].attrib['end']
35 | spannedText = mentions[0].findall('./spannedText')
36 | str = spannedText[0].text
37 | ann = EDIRAnn(str=str, start=int(ent_start), end=int(ent_end), type=cls)
38 | ann.id = len(entities)
39 | entities.append(ann)
40 | self._entities = entities
41 | return self._entities
42 |
43 |
44 | class eHostAnnDoc(EDIRDoc):
45 | """
46 | a document class for ehost annotation file
47 | """
48 | def __init__(self, file_path):
49 | super(eHostAnnDoc, self).__init__(file_path)
50 |
51 | def get_ess_entities(self, no_context=False):
52 | if self._entities is not None:
53 | return self._entities
54 | root = self._root
55 | entities = []
56 | for e in root.findall('.//classMention'):
57 | mcs = e.findall('./mentionClass')
58 | mention_id = e.attrib['id']
59 | if len(mcs) > 0:
60 | mc = mcs[0]
61 | m = re.match(r'VERIFIED\_([^\(]+)', mc.attrib['id'])
62 | if m is None:
63 | m = re.match(r'(IRRELEVANT_LABELS)', mc.attrib['id'])
64 | if m is None:
65 | m = re.match(r'(ADDED)\_([^\(]+)', mc.attrib['id'])
66 | if m is not None:
67 | cls = m.group(1)
68 | if no_context and cls != 'IRRELEVANT_LABELS':
69 | if cls.find('_') >= 0:
70 | cls = cls[cls.find('_')+1:]
71 | mentions = root.findall('.//mention[@id="' + mention_id + '"]/..')
72 | if len(mentions) > 0:
73 | span = mentions[0].findall('./span')
74 | ent_start = span[0].attrib['start']
75 | ent_end = span[0].attrib['end']
76 | spannedText = mentions[0].findall('./spannedText')
77 | str = spannedText[0].text
78 | ann = EDIRAnn(str=str, start=int(ent_start), end=int(ent_end), type=cls)
79 | ann.id = len(entities)
80 | entities.append(ann)
81 | self._entities = entities
82 | return self._entities
83 |
84 |
85 | def ehost_iaa_compute(folder1, folder2, no_context=False):
86 | """
87 | compute inter annotator agreement
88 | :param folder1:
89 | :param folder2:
90 | :param no_context:
91 | :return:
92 | """
93 | annotator1 = read_ehost_annotated_result(folder1, no_context=no_context)
94 | annotator2 = read_ehost_annotated_result(folder2, no_context=no_context)
95 | merged_keys = list(set(annotator1.keys()) | set(annotator2.keys()))
96 | y1 = []
97 | y2 = []
98 | for key in merged_keys:
99 | if key in annotator1 and key in annotator2:
100 | y1.append(annotator1[key])
101 | y2.append(annotator2[key])
102 | else:
103 | print('%s not matched in all' % key)
104 | iaa = sklearn.metrics.cohen_kappa_score(y1, y2)
105 | print('IAA is %s on %s' % (iaa, len(annotator1)))
106 | return iaa
107 |
108 |
109 | def read_ehost_annotated_result(folder, no_context=False):
110 | """
111 | read ehost annotated documents as a dictionary object: id -> entity label
112 | :param folder:
113 | :param no_context:
114 | :return:
115 | """
116 | id2label = {}
117 | files = [f for f in listdir(folder) if isfile(join(folder, f))]
118 | for f in files:
119 | d = eHostAnnDoc(join(folder, f))
120 | for e in d.get_ess_entities(no_context=no_context):
121 | id = '%s-%s-%s' % (f, e.start, e.end)
122 | id2label[id] = e.label
123 | print(id2label)
124 | return id2label
125 |
126 |
127 | def get_what_is_changing(ann_folder, text_folder, output_file, eHostAnnFile=True):
128 | """
129 | get what is getting better/worse
130 | :param ann_folder:
131 | :param text_folder:
132 | :param output_file:
133 | :return:
134 | """
135 | nlp = rr.get_nlp_instance()
136 | files = [f for f in listdir(ann_folder) if isfile(join(ann_folder, f))]
137 | type2abstractions = {}
138 | for f in files:
139 | anns = []
140 | text_file = join(text_folder, f[0:-14])
141 | if eHostAnnFile:
142 | d = eHostAnnDoc(join(ann_folder, f))
143 | anns = d.get_ess_entities(no_context=True)
144 | else:
145 | d = eHostGenedDoc(join(ann_folder, f))
146 | anns = d.get_ess_entities()
147 | if len(anns) == 0:
148 | logging.info('anns is empty for [{:s}]'.format(f))
149 | text = utils.read_text_file_as_string(join(text_folder, f[0:-14]), encoding='cp1252')
150 | sents = rr.get_sentences_as_anns(nlp, text)
151 | for ann in anns:
152 | for s in sents:
153 | if ann.overlap(s):
154 | abss = rr.AbstractedSentence(1)
155 | abss.text = s.str
156 | result = abss.get_abstaction_by_pos(abss.locate_pos(ann.str), nlp)
157 | if result is None:
158 | logging.info('%s not found in %s' % (ann.str, f))
159 | continue
160 | type = ann.label
161 | if type not in type2abstractions:
162 | type2abstractions[type] = []
163 | type2abstractions[type].append(result.to_dict())
164 | logging.debug(type2abstractions)
165 | utils.save_json_array(type2abstractions, output_file)
166 |
167 |
168 | def compute_iaa():
169 | folder_lia = "S:/NLP/annotation_it02/overlaps/k"
170 | folder_rob = "S:/NLP/annotation_it02/overlaps/s"
171 | folder_nadia = "nadia"
172 | ehost_iaa_compute(folder_lia, folder_rob, no_context=True)
173 |
174 |
175 | def analysing_label_performance(folder, output_file):
176 | s2t = {}
177 | files = [f for f in listdir(folder) if isfile(join(folder, f))]
178 | for f in files:
179 | d = eHostAnnDoc(join(folder, f))
180 | for ann in d.get_ess_entities():
181 | s = ann.str
182 | if not (s in s2t):
183 | s2t[s] = {}
184 | if ann.type in s2t[s]:
185 | s2t[s][ann.type] = s2t[s][ann.type] + 1
186 | else:
187 | s2t[s][ann.type] = 1
188 | sts = sorted([(s, s2t[s]['CORRECT'] if 'CORRECT' in s2t[s] else 0, s2t[s]['IRRELEVANT_LABELS'] if 'IRRELEVANT_LABELS' in s2t[s] else 0, s2t[s]['ADDED'] if 'ADDED' in s2t[s] else 0) for s in s2t], key=itemgetter(2), reverse=True)
189 | s = ('\n'.join(['%s\t%s\t%s\t%s' % (t[0], t[1], t[2], t[3]) for t in sts]))
190 | utils.save_string(s, output_file)
191 |
192 |
193 | def generate_gold_stand_from_validation(generated_ann_folder, validated_ann_folder, gold_standard_folder):
194 |
195 | files = [f for f in listdir(generated_ann_folder) if isfile(join(generated_ann_folder, f))]
196 | for f in files:
197 | logging.debug('processing: %s / %s' % (generated_ann_folder, f))
198 | # ignore added annotations for now
199 | gd_anns = []
200 | gen_doc = eHostGenedDoc(join(generated_ann_folder, f))
201 | logging.debug('ann number: %s' % len(gen_doc.get_ess_entities()))
202 | val_doc = eHostAnnDoc(join(validated_ann_folder, f))
203 | for g in gen_doc.get_ess_entities():
204 | logging.debug('validation label: %s' % g.type)
205 | for v in val_doc.get_ess_entities():
206 | if g.start == v.start and g.end == v.end:
207 | logging.debug('validation label: %s' % v.type)
208 | if v.type == 'CORRECT':
209 | gd_anns.append(g)
210 |
211 | elem_annotations = ET.Element("annotations")
212 | elem_annotations.set('textSource', f)
213 | idx = 0
214 | for ann in gd_anns:
215 | if ann.str.lower() == 'haematoma':
216 | continue
217 | idx += 1
218 | mention_id = '%s-%s' % (f, idx)
219 | elem_ann = ET.SubElement(elem_annotations, "annotation")
220 | elem_mention = ET.SubElement(elem_ann, "mention")
221 | elem_mention.set('id', mention_id)
222 | elem_annotator = ET.SubElement(elem_ann, "annotator")
223 | elem_annotator.set('id', 'semehr')
224 | elem_annotator.text = 'semehr'
225 | elem_span = ET.SubElement(elem_ann, "span")
226 | elem_span.set('start', '%s' % ann.start)
227 | elem_span.set('end', '%s' % ann.end)
228 | elem_spanText = ET.SubElement(elem_ann, "spannedText")
229 | elem_spanText.text = ann.str
230 | elem_date = ET.SubElement(elem_ann, "creationDate")
231 | elem_date.text = datetime.datetime.now().strftime("%a %B %d %X %Z %Y")
232 | #
233 | elem_class = ET.SubElement(elem_annotations, "classMention")
234 | elem_class.set('id', mention_id)
235 | elem_mention_class = ET.SubElement(elem_class, "mentionClass")
236 | if ann.str.lower() == 'haemorrhage' or ann.str.lower() == 'blood' or ann.str.lower() == 'bleed' or ann.str.lower().startswith('collection'):
237 | ann.type = 'bleeding'
238 | elem_mention_class.set('id', ann.type)
239 | elem_mention_class.text = ann.str
240 | tree = ET.ElementTree(elem_annotations)
241 | logging.info('gd file saved to %s - %s' % (gold_standard_folder, f))
242 | utils.save_string(ET.tostring(elem_annotations, encoding='utf8', method='xml'), join(gold_standard_folder, f))
243 |
244 |
245 | def analyse_trajectory_subjects(file, output_file):
246 | t2subs = utils.load_json_data(file)
247 | t2freq = {}
248 | for t in t2subs:
249 | if t not in t2freq:
250 | t2freq[t] = {'subject': {}, 'root': {}}
251 | for sub in t2subs[t]:
252 | add_key_freq(t2freq[t]['subject'], ','.join(sub['subject']))
253 | add_key_freq(t2freq[t]['root'], sub['root'])
254 |
255 | s = ''
256 | for t in t2freq:
257 | freqs = t2freq[t]
258 | subs = sorted([(k, freqs['subject'][k]) for k in freqs['subject']], key=itemgetter(1), reverse=True)
259 | s += '***%s [subjects]***\n%s\n\n' % (t, freq_to_str(subs))
260 | roots = sorted([(k, freqs['root'][k]) for k in freqs['root']], key=itemgetter(1), reverse=True)
261 | s += '***%s [roots]***\n%s\n\n' % (t, freq_to_str(roots))
262 | logging.info(s)
263 | utils.save_string(s, output_file)
264 |
265 |
266 | def freq_to_str(freq):
267 | return '\n'.join(['%s\t%s' % (t[0], t[1]) for t in freq])
268 |
269 |
270 | def add_key_freq(d, key):
271 | if key in d:
272 | d[key] += 1
273 | else:
274 | d[key] = 1
275 |
276 |
277 | def summarise_validation_results(folder):
278 | files = [f for f in listdir(folder) if isfile(join(folder, f))]
279 | t2freq = {}
280 | for f in files:
281 | gen_doc = eHostGenedDoc(join(folder, f))
282 | logging.debug('processing: %s / %s' % (folder, f))
283 | for g in gen_doc.get_ess_entities():
284 | logging.debug('validation label: %s' % g.type)
285 | if g.type not in t2freq:
286 | t2freq[g.type] = 0
287 | t2freq[g.type] += 1
288 | s = '\n'.join(['%s\t%s' % (t, t2freq[t]) for t in t2freq])
289 | logging.info(s)
290 | return s
291 |
292 |
293 |
294 | if __name__ == "__main__":
295 | log_level = 'DEBUG'
296 | log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s'
297 | logging.basicConfig(level=log_level, format=log_format)
298 | # compute_iaa()
299 | # analysing_label_performance('S:/NLP/annotation_it02/annotation_Steven/iteration_02/saved',
300 | # 'P:/wuh/label2performce_steve.tsv')
301 | # generate_gold_stand_from_validation('P:/wuh/SemEHR-working/outputs_it2/nlp2phenome',
302 | # 'S:/NLP/annotation_it02/annotation_Steven/iteration_02/saved',
303 | # 'P:/wuh/SemEHR-working/outputs_it2/gold_stand_results')
304 | sub_json_file = './diabetes_subs.json'
305 | analyse_trajectory_subjects(sub_json_file, './traject_sub_analysis_result.txt')
306 | # if len(sys.argv) != 4:
307 | # print('the syntax is [python ann_utils.py ann_folder, text_folder, result_file]')
308 | # else:
309 | # logging.info('working...')
310 | # get_what_is_changing(sys.argv[1], sys.argv[2], sys.argv[3], eHostAnnFile=False)
311 | # summarise_validation_results('/data/val/it2')
--------------------------------------------------------------------------------
/annotation_docs.py:
--------------------------------------------------------------------------------
1 | import utils
2 | from os import listdir
3 | from os.path import basename, isfile, join
4 | import logging
5 | import re
6 | from learners import LabelPerformance
7 | # import reportreader as rr
8 |
9 |
10 | class BasicAnn(object):
11 | """
12 | a simple NLP (Named Entity) annotation class
13 | """
14 |
15 | def __init__(self, str, start, end):
16 | self._str = str
17 | self._start = start
18 | self._end = end
19 | self._id = -1
20 |
21 | @property
22 | def id(self):
23 | return self._id
24 |
25 | @id.setter
26 | def id(self, value):
27 | self._id = value
28 |
29 | @property
30 | def str(self):
31 | return self._str
32 |
33 | @str.setter
34 | def str(self, value):
35 | self._str = value
36 |
37 | @property
38 | def start(self):
39 | return self._start
40 |
41 | @start.setter
42 | def start(self, value):
43 | self._start = value
44 |
45 | @property
46 | def end(self):
47 | return self._end
48 |
49 | @end.setter
50 | def end(self, value):
51 | self._end = value
52 |
53 | def overlap(self, other_ann):
54 | if (other_ann.start <= self.start <= other_ann.end or other_ann.start <= self.end <= other_ann.end) or \
55 | (self.start <= other_ann.start <= self.end or self.start <= other_ann.end <= self.end):
56 | return True
57 | else:
58 | return False
59 |
60 | def is_larger(self, other_ann):
61 | return self.start <= other_ann.start and self.end >= other_ann.end \
62 | and not (self.start == other_ann.start and self.end == other_ann.end)
63 |
64 | def serialise_json(self):
65 | return {'start': self.start, 'end': self.end, 'str': self.str, 'id': self.id}
66 |
67 | @staticmethod
68 | def deserialise(jo):
69 | ann = BasicAnn(jo['str'], jo['start'], jo['end'])
70 | ann.id = jo['id']
71 | return ann
72 |
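# NOTE (editorial addition, not part of the original module): a minimal sketch of how the
# span helpers above behave; the annotation text and offsets are invented for illustration.
def _demo_basic_ann_spans():
    a = BasicAnn('small vessel disease', 10, 30)
    b = BasicAnn('vessel', 16, 22)
    assert a.overlap(b)        # the two spans share offsets
    assert a.is_larger(b)      # a fully contains b and is strictly longer
    assert not b.is_larger(a)
    return a.serialise_json()  # {'start': 10, 'end': 30, 'str': 'small vessel disease', 'id': -1}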
73 |
74 | class EDIRAnn(BasicAnn):
75 | """
76 | EDIR annotation class
77 | """
78 |
79 | def __init__(self, str, start, end, type):
80 | self._type = type
81 | super(EDIRAnn, self).__init__(str, start, end)
82 | self._negated = False
83 |
84 | @property
85 | def type(self):
86 | return self._type
87 |
88 | @type.setter
89 | def type(self, value):
90 | self._type = value
91 |
92 | @property
93 | def negated(self):
94 | return self._negated
95 |
96 | @negated.setter
97 | def negated(self, value):
98 | self._negated = value
99 |
100 | @property
101 | def label(self):
102 | t = self.type
103 | if self.negated:
104 | t = 'neg_' + t
105 | return t
106 |
107 |
108 | class ContextedAnn(BasicAnn):
109 | """
110 | a contextualised annotation class (negation/temporality/experiencer)
111 | """
112 |
113 | def __init__(self, str, start, end, negation, temporality, experiencer):
114 | self._neg = negation
115 | self._temp = temporality
116 | self._exp = experiencer
117 | super(ContextedAnn, self).__init__(str, start, end)
118 |
119 | @property
120 | def negation(self):
121 | return self._neg
122 |
123 | @negation.setter
124 | def negation(self, value):
125 | self._neg = value
126 |
127 | @property
128 | def temporality(self):
129 | return self._temp
130 |
131 | @temporality.setter
132 | def temporality(self, value):
133 | self._temp = value
134 |
135 | @property
136 | def experiencer(self):
137 | return self._exp
138 |
139 | @experiencer.setter
140 | def experiencer(self, value):
141 | self._exp = value
142 |
143 |
144 | class PhenotypeAnn(ContextedAnn):
145 | """
146 | a simple customisable phenotype annotation (major_type/minor_type as the two customisable attributes)
147 | """
148 |
149 | def __init__(self, str, start, end,
150 | negation, temporality, experiencer,
151 | major_type, minor_type):
152 | super(PhenotypeAnn, self).__init__(str, start, end, negation, temporality, experiencer)
153 | self._major_type = major_type
154 | self._minor_type = minor_type
155 |
156 | @property
157 | def major_type(self):
158 | return self._major_type
159 |
160 | @major_type.setter
161 | def major_type(self, value):
162 | self._major_type = value
163 |
164 | @property
165 | def minor_type(self):
166 | return self._minor_type
167 |
168 | @minor_type.setter
169 | def minor_type(self, value):
170 | self._minor_type = value
171 |
172 | def to_dict(self):
173 | return {
174 | 'str': self.str,
175 | 'start': self.start,
176 | 'end': self.end,
177 | 'negation': self.negation,
178 | 'temporality': self.temporality,
179 | 'experiencer': self.experiencer,
180 | 'majorType': self.major_type,
181 | 'minorType': self.minor_type
182 | }
183 |
184 | def serialise_json(self):
185 | dict = super(PhenotypeAnn, self).serialise_json()
186 | dict['major_type'] = self.major_type
187 | dict['minor_type'] = self.minor_type
188 | return dict
189 |
190 | @staticmethod
191 | def deserialise(jo):
192 | ann = PhenotypeAnn(jo['str'], jo['start'], jo['end'], jo['negation'], jo['temporality'],
193 | jo['experiencer'], jo['major_type'], jo['minor_type'])
194 | ann.id = jo['id']
195 | return ann
196 |
197 |
198 | class SemEHRAnn(ContextedAnn):
199 | """
200 | SemEHR Annotation Class
201 | """
202 |
203 | def __init__(self, str, start, end,
204 | negation, temporality, experiencer,
205 | cui, sty, pref, ann_type):
206 | super(SemEHRAnn, self).__init__(str, start, end, negation, temporality, experiencer)
207 | self._cui = cui
208 | self._sty = sty
209 | self._pref = pref
210 | self._ann_type = ann_type
211 | self._ruled_by = []
212 |
213 | @property
214 | def ruled_by(self):
215 | return self._ruled_by
216 |
217 | @property
218 | def cui(self):
219 | return self._cui
220 |
221 | @cui.setter
222 | def cui(self, value):
223 | self._cui = value
224 |
225 | @property
226 | def sty(self):
227 | return self._sty
228 |
229 | @sty.setter
230 | def sty(self, value):
231 | self._sty = value
232 |
233 | @property
234 | def ann_type(self):
235 | return self._ann_type
236 |
237 | @ann_type.setter
238 | def ann_type(self, value):
239 | self._ann_type = value
240 |
241 | @property
242 | def pref(self):
243 | return self._pref
244 |
245 | @pref.setter
246 | def pref(self, value):
247 | self._pref = value
248 |
249 | @staticmethod
250 | def deserialise(jo):
251 | ann = SemEHRAnn(jo['str'], jo['start'], jo['end'], jo['negation'], jo['temporality'],
252 | jo['experiencer'], jo['cui'], jo['sty'], jo['pref'], 'mention')
253 | ann.id = jo['id']
254 | if 'ruled_by' in jo:
255 | ann._ruled_by = jo['ruled_by']
256 | if 'study_concepts' in jo:
257 | ann._study_concepts = jo['study_concepts']
258 | return ann
259 |
260 |
261 | class SemEHRAnnDoc(object):
262 | """
263 | SemEHR annotation Doc
264 | """
265 |
266 | def __init__(self, file_path, ann_doc=None):
267 | if ann_doc is not None:
268 | self._doc = ann_doc
269 | else:
270 | self._doc = utils.load_json_data(file_path)
271 | self._anns = []
272 | self._phenotype_anns = []
273 | self._sentences = []
274 | self._others = []
275 | self.load_anns()
276 |
277 | def load_anns(self):
278 | all_anns = self._anns
279 | panns = self._phenotype_anns
280 | if 'sentences' in self._doc:
281 | # is a SemEHRAnnDoc serialisation
282 | self._anns = [SemEHRAnn.deserialise(a) for a in self._doc['annotations']]
283 | if 'phenotypes' in self._doc:
284 | self._phenotype_anns = [PhenotypeAnn.deserialise(a) for a in self._doc['phenotypes']]
285 | self._sentences = [BasicAnn.deserialise(a) for a in self._doc['sentences']]
286 | else:
287 | for anns in self._doc['annotations']:
288 | for ann in anns:
289 | t = ann['type']
290 | if t == 'Mention':
291 | a = SemEHRAnn(ann['features']['string_orig'],
292 | int(ann['startNode']['offset']),
293 | int(ann['endNode']['offset']),
294 |
295 | ann['features']['Negation'],
296 | ann['features']['Temporality'],
297 | ann['features']['Experiencer'],
298 |
299 | ann['features']['inst'],
300 | ann['features']['STY'],
301 | ann['features']['PREF'],
302 | t)
303 | all_anns.append(a)
304 | a.id = 'cui-%s' % len(all_anns)
305 | elif t == 'Phenotype':
306 | a = PhenotypeAnn(ann['features']['string_orig'],
307 | int(ann['startNode']['offset']),
308 | int(ann['endNode']['offset']),
309 |
310 | ann['features']['Negation'],
311 | ann['features']['Temporality'],
312 | ann['features']['Experiencer'],
313 |
314 | ann['features']['majorType'],
315 | ann['features']['minorType'])
316 | panns.append(a)
317 | a.id = 'phe-%s' % len(panns)
318 | elif t == 'Sentence':
319 | a = BasicAnn('Sentence',
320 | int(ann['startNode']['offset']),
321 | int(ann['endNode']['offset']))
322 | self._sentences.append(a)
323 | self._sentences = sorted(self._sentences, key=lambda x: x.start)
324 | a.id = 'sent-%s' % len(self._sentences)
325 | else:
326 | self._others.append(ann)
327 |
328 | self._anns = sorted(self._anns, key=lambda x: x.start)
329 |
330 | @property
331 | def annotations(self):
332 | return self._anns
333 |
334 | @property
335 | def sentences(self):
336 | return self._sentences
337 |
338 | @sentences.setter
339 | def sentences(self, value):
340 | self._sentences = value
341 |
342 | @property
343 | def phenotypes(self):
344 | return self._phenotype_anns
345 |
346 | def learn_mappings_from_labelled(self, labelled_doc, lbl2insts, lbl2missed):
347 | ed = labelled_doc
348 | sd = self
349 | for e in ed.get_ess_entities():
350 | matched = False
351 | for a in sd.annotations:
352 | if a.overlap(e) and not e.is_larger(a):
353 | matched = True
354 | if e.type not in lbl2insts:
355 | lbl2insts[e.type] = set()
356 | lbl2insts[e.type].add('\t'.join([a.cui, a.pref, a.sty]))
357 | continue
358 | # if not matched:
359 | if True:
360 | if e.type not in lbl2missed:
361 | lbl2missed[e.type] = []
362 | lbl2missed[e.type].append(e.str.lower())
363 |
364 | @staticmethod
365 | def keep_max_len_anns(anns):
366 | ann2remove = set()
367 | for idx in range(len(anns)):
368 | a = anns[idx]
369 | for ni in range(idx + 1, len(anns)):
370 | b = anns[ni]
371 | if a.overlap(b):
372 | if a.is_larger(b):
373 | ann2remove.add(b)
374 | elif b.is_larger(a):
375 | ann2remove.add(a)
376 | for a in ann2remove:
377 | anns.remove(a)
378 |
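# NOTE (editorial addition): the GATE/SemEHR output consumed by load_anns() above has roughly
# the shape sketched below (reconstructed from the parsing logic; all values are invented).
# When the JSON instead contains a top-level 'sentences' key, it is treated as a previously
# serialised SemEHRAnnDoc and deserialised directly.
# {
#   "annotations": [[
#     {"type": "Mention",
#      "startNode": {"offset": 120}, "endNode": {"offset": 126},
#      "features": {"string_orig": "stroke", "Negation": "Affirmed", "Temporality": "Recent",
#                   "Experiencer": "Patient", "inst": "C0038454",
#                   "STY": "Disease or Syndrome", "PREF": "Stroke"}},
#     {"type": "Sentence", "startNode": {"offset": 100}, "endNode": {"offset": 180}}
#   ]]
# }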
379 |
380 | class Concept2Mapping(object):
381 | """
382 | a mapping from UMLS concepts (CUIs) to phenotype labels, optionally extended with gazetteer entries
383 | """
384 |
385 | def __init__(self, concept_map_file):
386 | self._concept_map_file = concept_map_file
387 | self._cui2label = {}
388 | self._concept2label = None
389 | self._type2concept = {}
390 | self._type2gaz = {}
391 | self._all_entities = []
392 | self.load_concept_mappings()
393 |
394 | def load_concept_mappings(self):
395 | concept_mapping = utils.load_json_data(self._concept_map_file)
396 | concept2types = {}
397 | for t in concept_mapping:
398 | self._type2concept[t] = []
399 | for text in concept_mapping[t]:
400 | c = text[:8] # only to get the CUI
401 | self._type2concept[t].append(c)
402 | arr = text.split('\t')
403 | self._cui2label[c] = arr[1]
404 | if c not in concept2types:
405 | concept2types[c] = []
406 | concept2types[c].append(t)
407 | self._all_entities.append(c.lower())
408 | self._concept2label = concept2types
409 |
410 | def load_gaz_dir(self, gaz_dir):
411 | files = [f for f in listdir(gaz_dir) if isfile(join(gaz_dir, f))]
412 | for f in files:
413 | if f.endswith('.lst'):
414 | t = f.split('.')[0]
415 | self._type2gaz[t] = utils.read_text_file(join(gaz_dir, f))
416 | self._all_entities += [t.lower() for t in self._type2gaz[t]]
417 |
418 | @property
419 | def cui2label(self):
420 | return self._cui2label
421 |
422 | @property
423 | def concept2label(self):
424 | return self._concept2label
425 |
426 | @concept2label.setter
427 | def concept2label(self, value):
428 | self._concept2label = value
429 |
430 | def type2cocnepts(self, type):
431 | return self._type2concept[type]
432 |
433 | @property
434 | def type2gaz(self):
435 | return self._type2gaz
436 |
437 | @property
438 | def all_entities(self):
439 | return self._all_entities
440 |
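# NOTE (editorial addition): the concept mapping JSON read by load_concept_mappings() above
# maps each phenotype label to a list of strings whose first 8 characters are the CUI, followed
# by tab-separated preferred label and semantic type (see settings/concept_mapping_stroke_sample.json
# for the real file); the entries below are illustrative only.
# {
#   "ischaemic_stroke": ["C0948008\tIschemic stroke\tT047"],
#   "subdural_haematoma": ["C0018946\tHematoma, Subdural\tT046"]
# }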
441 |
442 | class CustomisedRecoginiser(SemEHRAnnDoc):
443 | """
444 | recognise target labels based on identified UMLS entities and
445 | customised labels
446 | """
447 |
448 | def __init__(self, file_path, concept_mapping, ann_doc=None):
449 | super(CustomisedRecoginiser, self).__init__(file_path=file_path, ann_doc=ann_doc)
450 | self._concept_mapping = concept_mapping
451 | self._mapped = None
452 | self._phenotypes = None
453 | self._combined = None
454 | self._full_text_folder = None
455 | self._full_text_file_pattern = '%s.txt'
456 | self._full_text = None
457 |
458 | @property
459 | def full_text_folder(self):
460 | return self._full_text_folder
461 |
462 | @full_text_folder.setter
463 | def full_text_folder(self, value):
464 | self._full_text_folder = value
465 |
466 | @property
467 | def full_text_file_pattern(self):
468 | return self._full_text_file_pattern
469 |
470 | @full_text_file_pattern.setter
471 | def full_text_file_pattern(self, value):
472 | self._full_text_file_pattern = value
473 |
474 | @property
475 | def concept2label(self):
476 | return self._concept_mapping.concept2label
477 |
478 | def get_mapped_labels(self):
479 | if self._mapped is not None:
480 | return self._mapped
481 | mapped = []
482 | for ann in self.annotations:
483 | if ann.cui in self.concept2label:
484 | for t in self.concept2label[ann.cui]:
485 | ea = EDIRAnn(ann.str, ann.start, ann.end, t)
486 | ea.negated = ann.negation == 'Negated'
487 | ea.id = ann.id
488 | mapped.append(ea)
489 | self._mapped = mapped
490 | return mapped
491 |
492 | def get_customised_phenotypes(self):
493 | if self._phenotypes is not None:
494 | return self._phenotypes
495 | self._phenotypes = []
496 | for ann in self.phenotypes:
497 | ea = EDIRAnn(ann.str, ann.start, ann.end, ann.minor_type)
498 | ea.negated = ann.negation == 'Negated'
499 | ea.id = ann.id
500 | self._phenotypes.append(ea)
501 | return self._phenotypes
502 |
503 | def get_ann_sentence(self, ann):
504 | sent = None
505 | for s in self.sentences:
506 | if ann.overlap(s):
507 | sent = s
508 | break
509 | if sent is None:
510 | print('sentence not found for %s' % ann.__dict__)
511 | return None
512 | return sent
513 |
514 | def get_previous_sentences(self, ann, include_self=True):
515 | sent = self.get_ann_sentence(ann)
516 | if sent is None:
517 | return None
518 | sents = []
519 | for s in self.sentences:
520 | if s.start < sent.start:
521 | sents.append(s)
522 | return sorted(sents + ([] if not include_self else [sent]), key=lambda s: s.start)
523 |
524 | def get_sent_anns(self, sent, ann_ignore=None, filter_fun=None, filter_param=None):
525 | ret = {'umls': [], 'phenotype': []}
526 | for a in self.annotations:
527 | if a.overlap(sent):
528 | if ann_ignore is not None and ann_ignore.overlap(a):
529 | continue
530 | if filter_fun is not None and filter_fun(a, filter_param):
531 | continue
532 | ret['umls'].append(a)
533 | for a in self.phenotypes:
534 | if a.overlap(sent):
535 | if ann_ignore is not None and ann_ignore.overlap(a):
536 | continue
537 | if filter_fun is not None and filter_fun(a, filter_param):
538 | continue
539 | ret['phenotype'].append(a)
540 | return ret
541 |
542 | def get_same_sentence_anns(self, ann):
543 | sent = self.get_ann_sentence(ann)
544 | if sent is None:
545 | return None
546 | return self.get_sent_anns(sent, ann)
547 |
548 | def get_prior_anns(self, ann, filter_fun=None, filter_param=None, contenxt_depth=-1):
549 | sents = self.get_previous_sentences(ann)
550 | ret = {'umls': [], 'phenotype': []}
551 | for s in sents[contenxt_depth:]:
552 | r = self.get_sent_anns(s, ann_ignore=ann, filter_fun=filter_fun, filter_param=filter_param)
553 | ret['umls'] += r['umls']
554 | ret['phenotype'] += r['phenotype']
555 | return ret
556 |
557 | def get_containing_anns(self, ann):
558 | c_anns = []
559 | for a in self.phenotypes:
560 | if ann != a and ann.str.lower() in a.str.lower() and len(a.str) > len(ann.str):
561 | c_anns.append(a)
562 | return c_anns
563 |
564 | @property
565 | def full_text(self):
566 | return self._full_text
567 |
568 | @full_text.setter
569 | def full_text(self, value):
570 | self._full_text = value
571 |
572 | def get_full_text(self, fk):
573 | if self._full_text is None and self._full_text_folder is not None and self._full_text_file_pattern is not None:
574 | self._full_text = utils.read_text_file_as_string(
575 | join(self._full_text_folder,
576 | self._full_text_file_pattern % fk), encoding='utf-8')
577 | return self._full_text
578 |
579 | def relocate_all_anns(self, fk):
580 | t = self.get_full_text(fk)
581 | for a in self.phenotypes + self.annotations:
582 | s, e = relocate_annotation_pos(t, a.start, a.end, a.str)
583 | a.start = s
584 | a.end = e
585 |
586 | def re_segment_sentences(self, fk):
587 | text = self.get_full_text(fk)
588 | if text is not None:
589 | self.sentences = rr.get_sentences_as_anns(rr.get_nlp_instance(), text)  # NOTE: requires 'import reportreader as rr', which is commented out at the top of this file
590 |
591 | def get_context_words(self, ann, file_key, n_words=2):
592 | sent = self.get_ann_sentence(ann)
593 | t = self.get_full_text(file_key)
594 | words = []
595 | if t is not None:
596 | s = t[sent.start:sent.end]
597 | context_start = ann.start - sent.start + len(ann.str)
598 | str = s[context_start:]
599 | p = re.compile(r'[A-Za-z]{0,2}\b(\w+)\b')  # NOTE: removed a stray backslash that made the original pattern match a literal '['
600 | idx = 0
601 | for m in p.finditer(str):
602 | if idx <= n_words - 1:
603 | words.append(str[m.span(1)[0]:m.span(1)[1]])
604 | else:
605 | break
606 | idx += 1
607 |
608 | # use dependency tree to get context words
609 | # abss = rr.AbstractedSentence(1)
610 | # abss.text = s
611 | # result = abss.get_abstaction_by_pos(abss.locate_pos(ann.str), rr.get_nlp_instance())
612 | # dep_words = []
613 | # if result is not None:
614 | # # subject
615 | # dep_words.append(result.subject[0].text if len(result.subject) > 0 else 'empty')
616 |
617 | # # first verb other than root verb
618 | # dep_words.append(result.verbs[0].text if len(result.verbs) > 0 else 'empty')
619 |
620 | # # root verb
621 | # dep_words.append(result.root.text if result.root is not None else 'empty')
622 |
623 | # # first child
624 | # dep_words.append(result.children[0].text if len(result.children) > 0 else 'empty')
625 | # else:
626 | # dep_words += ['empty'] *4
627 | # logging.debug('not found [%s]' % s)
628 | # words += dep_words
629 | if len(words) == 0:
630 | words = ['empty']
631 | return words
632 |
633 | def get_anns_by_label(self, label, ignore_mappings=[], no_context=False):
634 | anns = []
635 | t = label.replace('neg_', '')
636 | for a in self.annotations:
637 | if a.cui not in self.concept2label:
638 | continue
639 | if a.cui in ignore_mappings:
640 | continue
641 | if len(a.ruled_by) > 0:
642 | continue
643 | if t in self.concept2label[a.cui]:
644 | if no_context:
645 | anns.append(a)
646 | elif label.startswith('neg_') and a.negation == 'Negated':
647 | anns.append(a)
648 | elif not label.startswith('neg_') and a.negation != 'Negated':
649 | anns.append(a)
650 | # anns = []
651 | phenotypes = []
652 | smaller_to_remove = []
653 | for a in self.phenotypes:
654 | if a.minor_type == t:
655 | if a.str.lower() in [s.lower() for s in ignore_mappings]:
656 | continue
657 | if no_context or (label.startswith('neg_') and a.negation == 'Negated') or \
658 | (not label.startswith('neg_') and a.negation != 'Negated'):
659 | overlaped = False
660 | for ann in anns + phenotypes:
661 | if ann.overlap(a):
662 | if a.is_larger(ann):
663 | smaller_to_remove.append(ann)
664 | else:
665 | overlaped = True
666 | break
667 | if not overlaped:
668 | phenotypes.append(a)
669 | for o in smaller_to_remove:
670 | if o in anns:
671 | anns.remove(o)
672 | if o in phenotypes:
673 | phenotypes.remove(o)
674 | return anns + phenotypes
675 |
676 | def get_combined_anns(self):
677 | if self._combined is not None:
678 | return self._combined
679 | anns = [] + self.get_mapped_labels()
680 | for ann in self.get_customised_phenotypes():
681 | overlaped = False
682 | for m in self.get_mapped_labels():
683 | if ann.overlap(m):
684 | overlaped = True
685 | break
686 | if not overlaped:
687 | anns.append(ann)
688 | self._combined = anns
689 | return anns
690 |
691 | def validate_mapped_performance(self, gold_anns, label2performance):
692 | CustomisedRecoginiser.validate(gold_anns, self.get_mapped_labels(), label2performance)
693 |
694 | def validate_combined_performance(self, gold_anns, label2performance):
695 | CustomisedRecoginiser.validate(gold_anns,
696 | self.get_combined_anns(),
697 | label2performance)
698 |
699 | @staticmethod
700 | def validate(gold_anns, learnt_anns, label2performance):
701 | matched_ann_ids = []
702 | for ga in gold_anns:
703 | l = ga.label
704 | if l not in label2performance:
705 | label2performance[l] = LabelPerformance(l)
706 | performance = label2performance[l]
707 | matched = False
708 | for la in learnt_anns:
709 | if la.label == l and la.overlap(ga):
710 | matched = True
711 | performance.increase_true_positive()
712 | matched_ann_ids.append(la.id)
713 | break
714 | if not matched:
715 | performance.increase_false_negative()
716 | for la in learnt_anns:
717 | if la.id not in matched_ann_ids:
718 | l = la.label
719 | if l not in label2performance:
720 | label2performance[l] = LabelPerformance(l)
721 | performance = label2performance[l]
722 | performance.increase_false_positive()
723 |
724 | @staticmethod
725 | def print_performances(label2performances):
726 | s = ''.join(['*' * 10, 'performance', '*' * 10])
727 | s += '\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('label', 'precision', 'recall', 'f1', '#insts', 'false positive',
728 | 'false negative', 'true positive')
729 | for t in label2performances:
730 | p = label2performances[t]
731 | s += '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (t, p.precision, p.recall, p.f1,
732 | p.true_positive + p.false_negative,
733 | p.false_positive, p.false_negative, p.true_positive)
734 | logging.getLogger('performance').info(s)
735 | return s
736 |
737 |
738 | def relocate_annotation_pos(t, s, e, string_orig):
739 | if t[s:e] == string_orig:
740 | return [s, e]
741 | candidates = []
742 | ito = re.finditer(r'[\s\.;\,\?\!\:\/$^](' + string_orig + r')[\s\.;\,\?\!\:\/$^]',
743 | t, re.IGNORECASE)
744 | for mo in ito:
745 | # print mo.start(1), mo.end(1), mo.group(1)
746 | candidates.append({'dis': abs(s - mo.start(1)), 's': mo.start(1), 'e': mo.end(1), 'matched': mo.group(1)})
747 | if len(candidates) == 0:
748 | return [s, e]
749 | candidates.sort(key=lambda x: x['dis'])  # the 'cmp' argument is not supported in Python 3
750 | # print candidates[0]
751 | return [candidates[0]['s'], candidates[0]['e']]
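# NOTE (editorial addition, not part of the original module): a minimal sketch of
# relocate_annotation_pos() - given the full text and a drifted span, it returns the offsets
# of the nearest surrounding occurrence of the original string; the text below is invented.
def _demo_relocate_annotation_pos():
    text = 'No acute infarct. Chronic small vessel disease is noted.'
    start, end = relocate_annotation_pos(text, 20, 27, 'infarct')
    return text[start:end]  # 'infarct' (relocated to offsets 9-16)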
--------------------------------------------------------------------------------
/data/entity_types.txt:
--------------------------------------------------------------------------------
1 | haemorrhagic_stroke
2 | haemorrhagic_transformation
3 | ischaemic_stroke
4 | loc_cortical
5 | loc_deep
6 | mening_tumour
7 | metast_tumour
8 | microhaemorrhage
9 | neg_atrophy
10 | neg_haemorrhagic_stroke
11 | neg_haemorrhagic_transformation
12 | neg_ischaemic_stroke
13 | neg_loc_cortical
14 | neg_loc_deep
15 | neg_mening_tumour
16 | neg_metast_tumour
17 | neg_microhaemorrhage
18 | neg_small_vessel_disease
19 | neg_stroke
20 | neg_subarachnoid_haemorrhage
21 | neg_subdural_haematoma
22 | neg_time_old
23 | neg_time_recent
24 | neg_tumour
25 | small_vessel_disease
26 | stroke
27 | subarachnoid_haemorrhage
28 | subdural_haematoma
29 | time_old
30 | time_recent
31 | tumour
32 | atrophy
--------------------------------------------------------------------------------
/data/entity_types_modifiers.txt:
--------------------------------------------------------------------------------
1 | loc_cortical
2 | loc_deep
3 | time_old
4 | time_recent
--------------------------------------------------------------------------------
/data/entity_types_no_context.txt:
--------------------------------------------------------------------------------
1 | atrophy
2 | glioma_tumour
3 | haemorrhagic_stroke
4 | haemorrhagic_transformation
5 | ischaemic_stroke
6 | loc_cortical
7 | loc_deep
8 | mening_tumour
9 | metast_tumour
10 | microhaemorrhage
11 | small_vessel_disease
12 | stroke
13 | subarachnoid_haemorrhage
14 | subdural_haematoma
15 | time_old
16 | time_recent
17 | tumour
--------------------------------------------------------------------------------
/data/entity_types_phenotypes.txt:
--------------------------------------------------------------------------------
1 | atrophy
2 | glioma_tumour
3 | haemorrhagic_stroke
4 | haemorrhagic_transformation
5 | ischaemic_stroke
6 | mening_tumour
7 | metast_tumour
8 | microhaemorrhage
9 | small_vessel_disease
10 | stroke
11 | subarachnoid_haemorrhage
12 | subdural_haematoma
13 | tumour
--------------------------------------------------------------------------------
/data/entity_types_times.txt:
--------------------------------------------------------------------------------
1 | time_old
2 | time_recent
--------------------------------------------------------------------------------
/doc_inference.py:
--------------------------------------------------------------------------------
1 | import utils
2 | import re
3 | import json
4 | import sys
5 |
6 |
7 | class RuleConstruct(object):
8 | def __init__(self, phenotype):
9 | self._phenotype = phenotype
10 | self._negation = 'Affirmed'
11 | self._temporality = 'Recent'
12 | self._experiencer = 'Patient'
13 |
14 | @property
15 | def phenotype(self):
16 | return self._phenotype
17 |
18 | @phenotype.setter
19 | def phenotype(self, value):
20 | self._phenotype = value
21 |
22 | @property
23 | def negation(self):
24 | return self._negation
25 |
26 | @negation.setter
27 | def negation(self, value):
28 | self._negation = value
29 |
30 | @property
31 | def temporality(self):
32 | return self._temporality
33 |
34 | @temporality.setter
35 | def temporality(self, value):
36 | self._temporality = value
37 |
38 | @property
39 | def experiencer(self):
40 | return self._experiencer
41 |
42 | @experiencer.setter
43 | def experiencer(self, value):
44 | self._experiencer = value
45 |
46 |
47 | class PhenotypeRule(object):
48 | def __init__(self):
49 | self._inclusion = []
50 | self._exclusion = []
51 | self._rule_label = None
52 |
53 | def inclusion_constructs(self):
54 | return self._inclusion
55 |
56 | def exclusion_units(self):
57 | return self._exclusion
58 |
59 | @property
60 | def rule_label(self):
61 | return self._rule_label
62 |
63 | @rule_label.setter
64 | def rule_label(self, value):
65 | self._rule_label = value
66 |
67 | @staticmethod
68 | def load_rules(rule_file):
69 | rules = utils.load_json_data(rule_file)
70 | prs = []
71 | for r in rules:
72 | pr = PhenotypeRule()
73 | pr.rule_label = r['label']
74 | prs.append(pr)
75 | pr.inclusion_constructs = [PhenotypeRule.get_rule_construct(c) for c in r['inclusions']]
76 | pr.exclusion_units = []
77 | for u in r['exclusion_units']:
78 | pr.exclusion_units.append([PhenotypeRule.get_rule_construct(c) for c in u])
79 | return prs
80 |
81 | @staticmethod
82 | def get_rule_construct(c):
83 | rc = RuleConstruct(c['phenotype'])
84 | if 'negation' in c:
85 | rc.negation = c['negation']
86 | if 'temporality' in c:
87 | rc.temporality = c['temporality']
88 | if 'experiencer' in c:
89 | rc.experiencer = c['experiencer']
90 | return rc
91 |
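# NOTE (editorial addition): the rule file consumed by PhenotypeRule.load_rules() above has
# roughly the structure sketched below (see settings/stroke-subtype-rules.json for the real
# rules); the label and phenotype names here are illustrative only.
# [
#   {
#     "label": "ischaemic stroke",
#     "inclusions": [
#       {"phenotype": "ischaemic_stroke", "negation": "Affirmed", "temporality": "Recent"}
#     ],
#     "exclusion_units": [
#       [{"phenotype": "haemorrhagic_stroke"}]
#     ]
#   }
# ]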
92 |
93 | class PhenotypeRuleExecutor(object):
94 | def __init__(self):
95 | pass
96 |
97 | @staticmethod
98 | def apply_rules(doc_anns, rules):
99 | label_prov = []
100 | anns = [t['ann'] for t in doc_anns]
101 | for r in rules:
102 | prov = {"exclusion": [], "inclusion": None}
103 | label = ''
104 | inclusion_matched = PhenotypeRuleExecutor.match_rule_construct(r.inclusion_constructs, anns)
105 | if len(inclusion_matched) > 0:
106 | prov['inclusion'] = inclusion_matched
107 | for ec in r.exclusion_units:
108 | exclusion_matched = PhenotypeRuleExecutor.match_rule_construct(ec, anns)
109 | if len(exclusion_matched) > 0:
110 | prov['exclusion'].append({'ec': ec, 'matched': exclusion_matched})
111 | if len(prov['exclusion']) == 0:
112 | label = r.rule_label
113 | if label != '': # or len(prov['exclusion']) > 0:
114 | label_prov.append({'label': label, 'prov': prov})
115 | return label_prov
116 |
117 | @staticmethod
118 | def match_ann_rule(rc, ann):
119 | return ann['minorType'] == rc.phenotype and ann['negation'] == rc.negation and ann[
120 | 'temporality'] == rc.temporality and ann['experiencer'] == rc.experiencer
121 |
122 | @staticmethod
123 | def match_rule_construct(rc_list, anns):
124 | matched = []
125 | for ann in anns:
126 | m = True
127 | for rc in rc_list:
128 | if not PhenotypeRuleExecutor.match_ann_rule(rc, ann):
129 | m = False
130 | break
131 | if m:
132 | matched.append(ann)
133 | return matched
134 |
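# NOTE (editorial addition, not part of the original module): a minimal sketch of running the
# executor over one document's annotations; the annotation dict mirrors the fields checked by
# match_ann_rule() and the default rule file path is a placeholder.
def _demo_apply_rules(rule_file='./settings/stroke-subtype-rules.json'):
    rules = PhenotypeRule.load_rules(rule_file)
    doc_anns = [{'ann': {'minorType': 'ischaemic_stroke', 'negation': 'Affirmed',
                         'temporality': 'Recent', 'experiencer': 'Patient'}}]
    # returns a list like [{'label': <rule label>, 'prov': {...}}], one entry per matched rule
    return PhenotypeRuleExecutor.apply_rules(doc_anns, rules)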
135 |
136 | def load_patient_truth(truth_file):
137 | all_pids = []
138 | lines = utils.read_text_file(truth_file)
139 | type2ids = {}
140 | for l in lines:
141 | arr = l.split('\t')
142 | if arr[2] not in type2ids:
143 | type2ids[arr[2]] = []
144 | type2ids[arr[2]].append(arr[0])
145 | all_pids.append(arr[0])
146 | return type2ids, all_pids
147 |
148 |
149 | def cal_performance(pids, no_reports_pids, type2ids, doc_type2id, gd_labels, pred_label):
150 | gt_list = []
151 | for lbl in gd_labels:
152 | gt_list += type2ids[lbl]
153 | gt_ids = set(gt_list)
154 | pr_ids = set(doc_type2id[pred_label])
155 | print('\n*****%s******' % pred_label)
156 |
157 | false_negative = gt_ids - no_reports_pids - pr_ids
158 | false_positive = pr_ids - gt_ids
159 | print('total reported patients: %s, total truth: %s, predicted: %s, false negative:%s, false positive:%s'
160 | % (len(pids), len(gt_ids - no_reports_pids), len(pr_ids), len(false_negative), len(false_positive)))
161 | print('false negative: %s' % (false_negative))
162 | print('false positive: %s' % false_positive)
163 |
164 |
165 | def doc_infer_with_ground_truth(patient_level_tsv, pids, doc_type2id):
166 | type2ids, all_pids = load_patient_truth(patient_level_tsv)
167 | no_reports_pids = set(all_pids) - set(pids)
168 | cal_performance(pids, no_reports_pids, type2ids, doc_type2id, ['SAH', 'ICH'], 'primary haemorrhagic stroke')
169 | cal_performance(pids, no_reports_pids, type2ids, doc_type2id, ['SAH'], 'subarachnoid haemorrhage')
170 | cal_performance(pids, no_reports_pids, type2ids, doc_type2id, ['ICH'], 'intracerebra haemorrhage')
171 | cal_performance(pids, no_reports_pids, type2ids, doc_type2id, ['Ischaemic'], 'ischaemic stroke')
172 |
173 |
174 | def doc_infer(settings):
175 | rules = PhenotypeRule.load_rules(settings['rule_file'])
176 | d2predicted = utils.load_json_data(settings['doc_nlp_results'])
177 | doc_labels_output = settings['doc_inference_output']
178 | s = ''
179 | doc_type2id = {}
180 | pids = []
181 | for d in d2predicted:
182 | m = re.match(r'Stroke\_id\_(\d+)(\.\d+){0,1}', d)
183 | pid = d
184 | if m is not None:
185 | pid = m.group(1)
186 | pids.append(pid)
187 | label_provs = PhenotypeRuleExecutor.apply_rules(d2predicted[d], rules)
188 | print(pid, d, label_provs)
189 | for lp in label_provs:
190 | if lp['label'] != '':
191 | s += '%s\t%s\n' % (pid, lp['label'])
192 | if lp['label'] not in doc_type2id:
193 | doc_type2id[lp['label']] = []
194 | doc_type2id[lp['label']].append(pid)
195 |
196 | pids = list(set(pids))
197 | print(json.dumps(pids))
198 | utils.save_string(s, doc_labels_output)
199 | if 'patient_level_truth_tsv' in settings:
200 | doc_infer_with_ground_truth(settings['patient_level_truth_tsv'], pids, doc_type2id)
201 |
202 |
203 | if __name__ == "__main__":
204 | if len(sys.argv) != 2:
205 | print('the syntax is [python doc_inference.py PROCESS_SETTINGS_FILE_PATH]')
206 | else:
207 | infer_settings = utils.load_json_data(sys.argv[1])
208 | doc_infer(infer_settings)
209 |
--------------------------------------------------------------------------------
/learners.py:
--------------------------------------------------------------------------------
1 | import joblib as jl
2 | from sklearn import tree
3 | from sklearn.ensemble import RandomForestClassifier
4 | from sklearn.gaussian_process import GaussianProcessClassifier
5 | from sklearn.naive_bayes import GaussianNB
6 | from sklearn import svm
7 | from sklearn.decomposition import PCA
8 | from sklearn.cluster import DBSCAN
9 | from sklearn.neighbors import KNeighborsClassifier, KDTree
10 | from sklearn.metrics.pairwise import cosine_similarity
11 | import logging
12 | from os.path import basename, isfile, join, split
13 | from os import listdir, remove
14 | import graphviz
15 | import numpy
16 |
17 |
18 | class PhenomeLearners(object):
19 | def __init__(self, setting):
20 | self._setting = setting
21 |
22 | @property
23 | def min_sample_size(self):
24 | return self._setting['min_sample_size']
25 |
26 | @staticmethod
27 | def decision_tree_learning(X, Y, lm, output_file=None, pca_dim=None, pca_file=None, tree_viz_file=None,
28 | lbl='united', min_sample_size=25):  # NOTE: stray 'self' removed - this is a @staticmethod
29 | if len(X) <= min_sample_size:
30 | logging.warning('not enough data found for prediction: %s' % lm.label)
31 | if isfile(output_file):
32 | remove(output_file)
33 | return
34 | pca = None
35 | if pca_dim is not None:
36 | pca = PCA(n_components=pca_dim)
37 | X_new = pca.fit_transform(X)
38 | else:
39 | X_new = X
40 | clf = tree.DecisionTreeClassifier()
41 | clf = clf.fit(X_new, Y)
42 | if output_file is not None:
43 | jl.dump(clf, output_file)
44 | logging.info('model file saved to %s' % output_file)
45 | if pca is not None and pca_file is not None:
46 | jl.dump(pca, pca_file)
47 | if tree_viz_file is not None:
48 | label_feature_names = []
49 | if lm.use_one_dimension_for_label:
50 | label_feature_names.append('label')
51 | else:
52 | for l in lm.label_dimensions:
53 | if l.upper() in lm.cui2label:
54 | label_feature_names.append('lbl: ' + lm.cui2label[l.upper()])
55 | else:
56 | label_feature_names.append('lbl: ' + l.upper())
57 | dot_data = tree.export_graphviz(clf, out_file=None,
58 | filled=True, rounded=True,
59 | feature_names=label_feature_names +
60 | [(str(lm.cui2label[
61 | l.upper()]) + '(' + l.upper() + ')') if l.upper() in lm.cui2label else l
62 | for l in lm.context_dimensions(lbl)],
63 | class_names=['Yes', 'No'],
64 | special_characters=True)
65 | graph = graphviz.Source(dot_data)
66 | graph.render(tree_viz_file)
67 |
68 | @staticmethod
69 | def random_forest_learning(X, Y, output_file=None):
70 | if len(X) == 0:
71 | logging.warning('no data found for prediction')
72 | return
73 | clf = RandomForestClassifier()
74 | clf = clf.fit(X, Y)
75 | if output_file is not None:
76 | jl.dump(clf, output_file)
77 | logging.info('model file saved to %s' % output_file)
78 |
79 | @staticmethod
80 | def svm_learning(X, Y, output_file=None):
81 | if len(X) == 0:
82 | logging.info('no data found for prediction')
83 | return
84 | v = -1
85 | all_same = True
86 | for y in Y:
87 | if v == -1:
88 | v = y[0]
89 | if v != y[0]:
90 | all_same = False
91 | break
92 | if all_same:
93 | logging.warning('all same labels %s' % Y)
94 | return
95 | clf = svm.SVC(kernel='sigmoid')
96 | clf = clf.fit(X, Y)
97 | if output_file is not None:
98 | jl.dump(clf, output_file)
99 | logging.info('model file saved to %s' % output_file)
100 |
101 | @staticmethod
102 | def gpc_learning(X, Y, output_file=None):
103 | gpc = GaussianProcessClassifier().fit(X, Y)
104 | if output_file is not None:
105 | jl.dump(gpc, output_file)
106 | logging.info('model file saved to %s' % output_file)
107 |
108 | @staticmethod
109 | def gaussian_nb(X, Y, output_file=None):
110 | gnb = GaussianNB().fit(X, Y)
111 | if output_file is not None:
112 | jl.dump(gnb, output_file)
113 | logging.info('model file saved to %s' % output_file)
114 |
115 | @staticmethod
116 | def cluster(X, Y, output_file=None):
117 | dbm = DBSCAN(eps=.50).fit(X)
118 | cls2label = {}
119 | for idx in range(len(dbm.labels_)):
120 | c = dbm.labels_[idx]
121 | cls = 'cls%s' % c
122 | if cls not in cls2label:
123 | cls2label[cls] = {'t': 0, 'f': 0}
124 | if Y[idx] == [0]:
125 | cls2label[cls]['f'] += 1
126 | else:
127 | cls2label[cls]['t'] += 1
128 | logging.info(cls2label)
129 | kdt = KDTree(X)
130 | if output_file is not None:
131 | jl.dump({'dbm': dbm, 'X': X, 'Y': Y, 'kdt': kdt, 'cls2label': cls2label}, output_file)
132 | logging.info('complex model file saved to %s' % output_file)
133 |
134 | @staticmethod
135 | def cluster_predict(X, Y, fns, multiple_tps, model_file, performance,
136 | separate_performance=None, min_sample_size=25):
137 | all_true = False
138 | if not isfile(model_file):
139 | logging.info('model file NOT FOUND: %s' % model_file)
140 | all_true = True
141 | else:
142 | m = jl.load(model_file)
143 | dbm = m['dbm']
144 | kdt = m['kdt']
145 | P = m.predict(X)  # FIXME: cluster() above saves a dict, which has no predict(); this call will fail if a model file is present
146 | if fns > 0:
147 | logging.debug('missed instances: %s' % fns)
148 | performance.increase_false_negative(fns)
149 | if multiple_tps > 0:
150 | performance.increase_true_positive(multiple_tps)
151 | if all_true or len(X) <= min_sample_size:
152 | logging.warning('using querying instead of predicting')
153 | P = numpy.ones(len(X))
154 | else:
155 | logging.info('instance size %s' % len(P))
156 | for idx in range(len(P)):
157 | LabelPerformance.evaluate_to_performance(P[idx], Y[idx], [performance, separate_performance])
158 |
159 | @staticmethod
160 | def knn_classify(X, Y, output_file=None):
161 | knn = KNeighborsClassifier(n_neighbors=2).fit(X, Y)
162 | if output_file is not None:
163 | jl.dump(knn, output_file)
164 | logging.info('model file saved to %s' % output_file)
165 |
166 | @staticmethod
167 | def predict_use_simple_stats(tp_ratio, Y, multiple_tps, performance, ratio_cut_off=0.15, separate_performance=None,
168 | id2conll=None, doc_anns=None, file_pattern=None, doc_folder=None,
169 | label_whitelist=None, mp_predicted=None):
170 | P = numpy.ones(len(Y)) if tp_ratio >= ratio_cut_off else numpy.zeros(len(Y))
171 | P = PhenomeLearners.merge_with_pattern_prediction(P, mp_predicted)
172 | if multiple_tps > 0:
173 | performance.increase_true_positive(multiple_tps)
174 | if separate_performance is not None:
175 | separate_performance.increase_true_positive(multiple_tps)
176 | PhenomeLearners.cal_performance(P, Y, performance, separate_performance,
177 | id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern,
178 | doc_folder=doc_folder,
179 | label_whitelist=label_whitelist)
180 |
181 | @staticmethod
182 | def merge_with_pattern_prediction(y_pred, mp_predict):
183 | if mp_predict is None:
184 | return y_pred
185 | y_merged = []
186 | print('>>>', y_pred, mp_predict)
187 | for idx in range(len(y_pred)):
188 | y_merged.append(y_pred[idx])
189 | if y_pred[idx] == 1 and mp_predict[idx] == 0:
190 | y_merged[idx] = 0
191 | return y_merged
192 |
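# NOTE (editorial addition): merge_with_pattern_prediction() above only ever downgrades a
# positive model prediction when the mention-pattern classifier explicitly rejects it (0);
# e.g. merge_with_pattern_prediction([1, 1, 0], [0, 1, 1]) -> [0, 1, 0].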
193 | @staticmethod
194 | def predict_use_simple_stats_in_action(tp_ratio, item_size, ratio_cut_off=0.15,
195 | doc2predicted=None, doc_anns=None, mp_predicted=None):
196 | P = numpy.ones(item_size) if tp_ratio >= ratio_cut_off else numpy.zeros(item_size)
197 | P = PhenomeLearners.merge_with_pattern_prediction(P, mp_predicted)
198 | PhenomeLearners.collect_prediction(P, doc2predicted=doc2predicted, doc_anns=doc_anns)
199 |
200 | @staticmethod
201 | def cal_performance(P, Y, performance, separate_performance=None,
202 | id2conll=None, doc_anns=None, file_pattern=None, doc_folder=None, label_whitelist=None):
203 |
204 | P = numpy.asarray(P).flatten().tolist()
205 | Y = numpy.asarray(Y).flatten().tolist()
206 | doc2predicted = {}
207 | for idx in range(len(P)):
208 | LabelPerformance.evaluate_to_performance(P[idx], Y[idx], [performance, separate_performance])
209 | if P[idx] == 1.0 and id2conll is not None and doc_anns is not None and doc_folder is not None:
210 | PhenomeLearners.collect_prediction(P, doc_anns, doc2predicted)
211 | # comment the following out to skip conll outputs
212 | # for d in doc2predicted:
213 | # if d not in id2conll:
214 | # id2conll[d] = ConllDoc(join(doc_folder, file_pattern % d))
215 | # if label_whitelist is not None:
216 | # id2conll[d].set_label_white_list(label_whitelist)
217 | # cnll = id2conll[d]
218 | # for anns in doc2predicted[d]:
219 | # cnll.add_predicted_labels(anns)
220 |
221 | @staticmethod
222 | def predict_use_model(X, Y, fns, multiple_tps, model_file, performance,
223 | pca_model_file=None, separate_performance=None,
224 | id2conll=None, doc_anns=None, file_pattern=None, doc_folder=None,
225 | label_whitelist=None, mp_predicted=None):
226 | all_true = False
227 | if not isfile(model_file):
228 | logging.info('model file NOT FOUND: %s' % model_file)
229 | all_true = True
230 | else:
231 | if pca_model_file is not None:
232 | pca = jl.load(pca_model_file)
233 | X_new = pca.transform(X)
234 | else:
235 | X_new = X
236 | m = jl.load(model_file)
237 | P = m.predict(X_new)
238 | if fns > 0:
239 | logging.debug('missed instances: %s' % fns)
240 | performance.increase_false_negative(fns)
241 | if multiple_tps > 0:
242 | performance.increase_true_positive(multiple_tps)
243 | if separate_performance is not None:
244 | separate_performance.increase_true_positive(multiple_tps)
245 | if all_true: # or len(X) <= _min_sample_size:
246 | logging.warning('using querying instead of predicting')
247 | P = numpy.ones(len(X))
248 | else:
249 | logging.info('instance size %s' % len(P))
250 | P = PhenomeLearners.merge_with_pattern_prediction(P, mp_predicted)
251 | PhenomeLearners.cal_performance(P, Y, performance, separate_performance,
252 | id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern,
253 | doc_folder=doc_folder, label_whitelist=label_whitelist)
254 |
255 | @staticmethod
256 | def predict_use_model_in_action(X, model_file, pca_model_file=None,
257 | doc2predicted=None, doc_anns=None, mp_predicted=None):
258 | all_true = False
259 | if not isfile(model_file):
260 | logging.info('model file NOT FOUND: %s' % model_file)
261 | all_true = True
262 | else:
263 | if pca_model_file is not None:
264 | pca = jl.load(pca_model_file)
265 | X_new = pca.transform(X)
266 | else:
267 | X_new = X
268 | m = jl.load(model_file)
269 | P = m.predict(X_new)
270 |
271 | if all_true: # or len(X) <= _min_sample_size:
272 | logging.warning('using querying instead of predicting')
273 | P = numpy.ones(len(X))
274 | else:
275 | logging.info('instance size %s' % len(P))
276 | P = PhenomeLearners.merge_with_pattern_prediction(P, mp_predicted)
277 | PhenomeLearners.collect_prediction(P, doc2predicted=doc2predicted, doc_anns=doc_anns)
278 |
279 | @staticmethod
280 | def collect_prediction(P, doc_anns, doc2predicted):
281 | for idx in range(len(P)):
282 | if P[idx] == 1.0 and doc_anns is not None:
283 | d = doc_anns[idx]['d']
284 | labeled_ann = {'label': doc_anns[idx]['label'],
285 | 'ann': doc_anns[idx]['ann']}
286 | if d not in doc2predicted:
287 | doc2predicted[d] = [labeled_ann]
288 | else:
289 | doc2predicted[d].append(labeled_ann)
290 |
291 |
292 | class LabelPerformance(object):
293 | """
294 | precision/recall/f1 calculation on TP/FN/FP values
295 | """
296 |
297 | def __init__(self, label):
298 | self._label = label
299 | self._tp = 0
300 | self._fn = 0
301 | self._fp = 0
302 |
303 | def increase_true_positive(self, k=1):
304 | self._tp += k
305 |
306 | def increase_false_negative(self, k=1):
307 | self._fn += k
308 |
309 | def increase_false_positive(self, k=1):
310 | self._fp += k
311 |
312 | @property
313 | def true_positive(self):
314 | return self._tp
315 |
316 | @property
317 | def false_negative(self):
318 | return self._fn
319 |
320 | @property
321 | def false_positive(self):
322 | return self._fp
323 |
324 | @property
325 | def precision(self):
326 | if self._tp + self._fp == 0:
327 | return -1
328 | else:
329 | return 1.0 * self._tp / (self._tp + self._fp)
330 |
331 | @property
332 | def recall(self):
333 | if self._tp + self._fn == 0:
334 | return -1
335 | else:
336 | return 1.0 * self._tp / (self._tp + self._fn)
337 |
338 | @property
339 | def f1(self):
340 | if self.precision == -1 or self.recall == -1 or self.precision == 0 or self.recall == 0:
341 | return -1
342 | else:
343 | return 2 / (1 / self.precision + 1 / self.recall)
344 |
345 | @staticmethod
346 | def evaluate_to_performance(predicted, labelled, performance_objects):
347 | if predicted == labelled:
348 | if predicted == 1.0:
349 | for pf in performance_objects:
350 | if pf is not None:
351 | pf.increase_true_positive()
352 | elif predicted == 1.0:
353 | for pf in performance_objects:
354 | if pf is not None:
355 | pf.increase_false_positive()
356 | else:
357 | for pf in performance_objects:
358 | if pf is not None:
359 | pf.increase_false_negative()
360 |
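# NOTE (editorial addition, not part of the original module): a minimal sketch of accumulating
# counts and reading the derived metrics; the numbers are invented (8 TP, 2 FP, 2 FN).
def _demo_label_performance():
    p = LabelPerformance('ischaemic_stroke')
    p.increase_true_positive(8)
    p.increase_false_positive(2)
    p.increase_false_negative(2)
    return p.precision, p.recall, p.f1  # (0.8, 0.8, 0.8)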
361 |
362 | class BinaryClusterClassifier(object):
363 | def __init__(self, label):
364 | self._name = label
365 | self._class1reps = None
366 | self._class2reps = None
367 |
368 | @property
369 | def class1reps(self):
370 | return self._class1reps
371 |
372 | @property
373 | def class2reps(self):
374 | return self._class2reps
375 |
376 | def cluster(self, class1_data, class2_data):
377 | self._class1reps = BinaryClusterClassifier.do_clustering(class1_data, class_prefix='cls1:')
378 | self._class2reps = BinaryClusterClassifier.do_clustering(class2_data, class_prefix='cls2:')
379 |
380 | def classify(self, x, threshold=0.5, complementary_classifiers=None):
381 | p = BinaryClusterClassifier.calculate_most_similar(self, x)
382 | mp = p
383 | if p[1] < threshold and complementary_classifiers is not None:
384 | for classifer in complementary_classifiers:
385 | logging.debug('do extra classifying when the similarity is too low ...')
386 | p = BinaryClusterClassifier.calculate_most_similar(classifer, x)
387 | logging.debug('extra result @ %s' % p[1])
388 | mp = p if p[1] > mp[1] else mp
389 | if p[1] > threshold:
390 | # stop once the similarity exceeds the threshold
391 | break
392 | return mp, 0 if mp[0].startswith('cls2:') else 1
393 |
394 | @staticmethod
395 | def calculate_most_similar(classifier, x):
396 | results = []
397 | xa = numpy.array(x).reshape(1, -1)
398 | for cls in classifier.class1reps:
399 | results.append((cls, cosine_similarity(xa, classifier.class1reps[cls])))
400 | for cls in classifier.class2reps:
401 | results.append((cls, cosine_similarity(xa, classifier.class2reps[cls])))
402 | return sorted(results, key=lambda x: -x[1])[0]
403 |
404 | @staticmethod
405 | def do_clustering(X, class_prefix='cls:'):
406 | dbm = DBSCAN(eps=1.0).fit(X)
407 | cls2insts = {}
408 | for idx in range(len(dbm.labels_)):
409 | c = dbm.labels_[idx]
410 | cls = '%s%s' % (class_prefix, c)
411 | if cls not in cls2insts:
412 | cls2insts[cls] = [X[idx]]
413 | else:
414 | cls2insts[cls].append(X[idx])
415 | cls2mean = {}
416 | for cls in cls2insts:
417 | cls2mean[cls] = numpy.mean(cls2insts[cls], axis=0).reshape(1, -1)
418 | return cls2mean
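# NOTE (editorial addition, not part of the original module): a minimal sketch of the
# cluster-then-classify workflow above; the 2-d vectors are invented toy data (with only three
# points per class DBSCAN marks them as noise, but the per-cluster means still support the
# cosine comparison in classify()).
def _demo_binary_cluster_classifier():
    cls1_data = numpy.array([[1.0, 0.1], [0.9, 0.0], [1.1, 0.2]])
    cls2_data = numpy.array([[0.1, 1.0], [0.0, 0.9], [0.2, 1.1]])
    bcc = BinaryClusterClassifier('toy')
    bcc.cluster(cls1_data, cls2_data)
    (best_cluster, similarity), label = bcc.classify([1.0, 0.05])
    return best_cluster, label  # expected to land in a 'cls1:*' cluster, i.e. label 1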
--------------------------------------------------------------------------------
/mention_pattern.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | import utils
3 | import pandas as pd
4 | from os import listdir
5 | from os.path import isfile, join, split
6 |
7 |
8 | class AbstractedSentence(object):
9 | def __init__(self, seq):
10 | self._seq = seq
11 | self._abstracted_tokens = []
12 | self._text = None
13 | self._parsed = None
14 |
15 | @property
16 | def seq(self):
17 | return self._seq
18 |
19 | @seq.setter
20 | def seq(self, value):
21 | self._seq = value
22 |
23 | def add_token(self, t):
24 | self._abstracted_tokens.append(t)
25 |
26 | @property
27 | def tokens(self):
28 | return self._abstracted_tokens
29 |
30 | @property
31 | def text(self):
32 | return self._text
33 |
34 | @text.setter
35 | def text(self, value):
36 | self._text = value
37 |
38 | def get_parsed_tree(self, nlp):
39 | """
40 | use spacy instance to parse the sentence
41 | :param nlp: a spacy instance
42 | :return: dependency tree
43 | """
44 | if self._parsed is not None:
45 | return self._parsed
46 | if self.text is None:
47 | return None
48 | self._parsed = nlp(self.text)
49 | return self._parsed
50 |
51 | def locate_pos(self, str):
52 | return self._text.find(str)
53 |
54 | def get_abstaction_by_pos(self, pos, nlp):
55 | doc = self.get_parsed_tree(nlp)
56 | token = None
57 | if doc is not None:
58 | for t in doc:
59 | if t.idx + len(t.text) == pos:
60 | token = t
61 | if token is not None:
62 | ta = TokenAbstraction(token, doc)
63 | else:
64 | return None
65 | return ta
66 |
67 | def get_related_tokens(self, t):
68 | ret = []
69 | for tk in self._parsed:
70 | if tk.head == t:
71 | ret.append(tk)
72 | print(tk.text, tk.dep_, tk.head)
73 | return ret
74 |
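# NOTE (editorial addition, not part of the original module): a minimal sketch of abstracting a
# mention from one sentence; the sentence is invented and the call needs the en_core_web_lg
# model loaded by get_nlp_lg() at the bottom of this file. get_abstaction_by_pos() expects the
# *end* offset of the mention token.
def _demo_abstracted_sentence():
    abss = AbstractedSentence(1)
    abss.text = 'The scan shows an acute ischaemic stroke.'
    end_pos = abss.locate_pos('stroke') + len('stroke')
    ta = abss.get_abstaction_by_pos(end_pos, get_nlp_lg())
    return None if ta is None else ta.to_dict()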
75 |
76 | class TokenAbstraction(object):
77 | def __init__(self, token, doc):
78 | self._t = token
79 | self._d = doc
80 | self._children = []
81 | self._root = None
82 | self._subject = None
83 | self._verbs = None
84 | self._vcontext = []
85 | self.do_abstract()
86 |
87 | @property
88 | def vcontext(self):
89 | return self._vcontext
90 |
91 | @property
92 | def children(self):
93 | return self._children
94 |
95 | @property
96 | def root(self):
97 | return self._root
98 |
99 | @property
100 | def subject(self):
101 | return self._subject
102 |
103 | @property
104 | def verbs(self):
105 | return self._verbs
106 |
107 | @property
108 | def token(self):
109 | return self._t
110 |
111 | def do_abstract(self):
112 | self._children = [t for t in self._t.children]
113 | t = self._t
114 | r = t
115 | while (t.head != t) and t.dep_ not in ['ROOT', 'relcl', 'acl', 'advcl']:
116 | t = t.head
117 | if t.dep_ in ['ccomp']:
118 | self._subject = [s for s in t.children if s.dep_ in [u"nsubj", 'nsubjpass', 'ROOT', 'pobj']]
119 | if t.pos_ in ['VERB']:
120 | self._vcontext += [s for s in t.children if s.dep_ in ["neg", 'advmod']]
121 | r = t
122 | if t is not None:
123 | self._verbs = [v for v in t.children if v.pos_ == u"VERB"]
124 | if t.dep_ in ['relcl', 'acl']:
125 | self._subject = [t.head]
126 | else:
127 | if len(self._vcontext) == 0:
128 | self._vcontext += [s for s in t.children if s.dep_ in ["neg", 'advmod']]
129 | if self._subject is None:
130 | self._subject = [s for s in t.children if s.dep_ in [u"nsubj", 'nsubjpass', 'ROOT']]
131 | self._root = r
132 |
133 | def do_abstract_waterfall(self, entity_start, entity_end):
134 | t = self._t
135 | seq = []
136 | while (t.head != t) and t.dep_ not in ['ROOT', 'relcl', 'acl', 'advcl']:
137 | t = t.head
138 | if t.idx > entity_end or (t.idx + len(t.text) < entity_start):
139 | seq.append((t.text, t.dep_, t.pos_))
140 | seq.reverse()
141 | return seq
142 |
143 | def do_abstract_descendent(self):
144 | return [c for c in self._t.children]
145 |
146 | def to_dict(self):
147 | return {'children': [t.text for t in self.children], 'root': self.root.text,
148 | 'subject': [s.text for s in self.subject], 'verbs': [v.text for v in self.verbs]}
149 |
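# NOTE (editorial addition): do_abstract_waterfall() above walks up the dependency heads of the
# mention token, recording (text, dep_, pos_) triples that lie outside the mention span and
# returning them root-first; for 'The scan shows an acute ischaemic stroke.' with the mention
# 'stroke', the pattern is roughly [('shows', 'ROOT', 'VERB')] (the exact output depends on the
# spaCy parse).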
150 |
151 | class MentionPattern(object):
152 | def __init__(self, pattern_folder, cui2icd, csv_file=None, ann_folder=None, in_action=False):
153 | self._ptn_folder = pattern_folder
154 | self._ref_good_ptns = None
155 | self._ref_bad_ptns = None
156 | self._csv_file = csv_file
157 | self._cui2icd = cui2icd
158 | self._df = None
159 | self._nlp = get_nlp_lg()
160 | self._ann_folder = ann_folder
161 | self._good_ptns = None
162 | self._bad_ptns = None
163 | self._in_action = in_action
164 | self.load()
165 |
166 | def load(self):
167 | if self._csv_file is not None:
168 | self._df = pd.read_csv(self._csv_file)
169 | if self._in_action:
170 | g, b = MentionPattern.load_ref_patterns(self._ptn_folder, 'zzzz')
171 | self._good_ptns = g
172 | self._bad_ptns = b
173 |
174 | @staticmethod
175 | def load_ref_patterns(ptn_folder, ignore_chapter):
176 | good_p = MentionPattern.load_patterns(ptn_folder, to_load=lambda f: f.find('good') > 0 and f.find(
177 | '%s_' % ignore_chapter) != 0)
178 | bad_p = MentionPattern.load_patterns(ptn_folder, to_load=lambda f: f.find('bad') > 0 and f.find(
179 | '%s_' % ignore_chapter) != 0)
180 | return good_p, bad_p
181 |
182 | @staticmethod
183 | def get_sent_by_pos(sents, s, e):
184 | for sent in sents:
185 | if sent['start'] <= s and sent['end'] >= e:
186 | return sent
187 | return None
188 |
189 | def read_semehr_anns(self, doc_anns, container):
190 | """
191 | doc_anns - [{'d': fk, 'ann': a, 'label': self.label}]
192 | """
193 | self.read_semehr_anns_by_functions(doc_anns,
194 | get_sent_func=lambda dd: utils.load_json_data(dd)['sentences'],
195 | get_text_func=lambda dd: self._df[self._df['doc_id'] == dd]['text'].iloc[0],
196 | container=container)
197 |
198 | def read_semehr_anns_by_functions(self, doc_anns, get_sent_func, get_text_func, container):
199 | cur_d = None
200 | cur_sents = None
201 | for da in doc_anns:
202 | d = 'se_ann_%s.json' % da['d']
203 | if d != cur_d:
204 | cur_sents = get_sent_func(join(self._ann_folder, d))
205 | cur_d = d
206 | a = da['ann']
207 | ch = self._cui2icd[a.cui]
208 | sent = MentionPattern.get_sent_by_pos(cur_sents, a.start, a.end)
209 | win = get_text_func(da['d'])[sent['start']:sent['end']]
210 | container.append(
211 | {'ch': ch, 'd': da['d'], 's': a.start, 'e': a.end, 's_s': sent['start'], 's_e': sent['end'],
212 | 'win': win})
213 |
214 | def abstract_ann_pattern(self, ann):
215 | abss = AbstractedSentence(2)
216 | abss.text = ann['win']
217 | result = abss.get_abstaction_by_pos(ann['e'] - ann['s_s'], self._nlp)
218 | if result is not None:
219 | # abss.get_related_tokens(result.token)
220 | ptn = result.do_abstract_waterfall(ann['s'] - ann['s_s'], ann['e'] - ann['s_s'])
221 | return {'pattern': ptn, "subject": result.subject, "vcontect": result.vcontext}
222 | else:
223 | return None
224 |
225 | def classify_anns(self, anns):
226 | preds = []
227 | for ann in anns:
228 | ret = self.abstract_ann_pattern(ann)
229 | if ret is not None:
230 | good_ref = self._good_ptns
231 | bad_ref = self._bad_ptns
232 | if not self._in_action:
233 | good_ref, bad_ref = MentionPattern.load_ref_patterns(self._ptn_folder, ann['ch'])
234 | good_match = MentionPattern.compute_similar_from_ref(ret, good_ref, self._nlp)
235 | bad_match = MentionPattern.compute_similar_from_ref(ret, bad_ref, self._nlp)
236 | # ctx = '|'.join([e[0] for e in ret['pattern']])
237 | cls = MentionPattern.classify_by_pattern_matches(good_match, bad_match, self._nlp)
238 | preds.append(cls)
239 | else:
240 | preds.append(-1)
241 | return preds
242 |
243 | def predict(self, doc_anns, cr=None):
244 | anns = []
245 | if cr is None:
246 | self.read_semehr_anns(doc_anns, anns)
247 | else:
248 | # single document anns to be read by CustomisedRecoginiser
249 | self.read_semehr_anns_by_functions(doc_anns, get_sent_func=lambda dd: cr.sentences,
250 | get_text_func=lambda dd:cr.full_text, container=anns)
251 | return self.classify_anns(anns)
252 |
253 | @staticmethod
254 | def load_patterns(ptn_folder, to_load=lambda f: True):
255 | return [utils.load_json_data(join(ptn_folder, f)) for f in listdir(ptn_folder) if
256 | to_load(f) and isfile(join(ptn_folder, f))]
257 |
258 | @staticmethod
259 | def sim_seqs(s1, s2, nlp, last_k=2):
260 | scores = 0.0
261 | k = min(last_k, len(s1), len(s2))
262 | for i in range(1, k + 1):
263 | t1, t2 = nlp(' '.join([s1[-1 * i], s2[-1 * i]]))
264 | if t1.vector_norm > 0 and t2.vector_norm > 0:
265 | scores += t1.similarity(t2)
266 | return scores / k
267 |
268 | @staticmethod
269 | def get_pattern_group(p):
270 | mp = p if len(p) <= 2 else p[-2:]
271 | return '-'.join([e[2] for e in mp])
272 |
273 | @staticmethod
274 | def compute_similar_from_ref(ret, ref_good_ptns, nlp, threshold=0.7):
275 | p = ret['pattern']
276 | ctxt = '|'.join([e[0] for e in p])
277 | # print('>>>working on %s' % ctxt)
278 | if len(ctxt) == 0:
279 | return None
280 | grp = MentionPattern.get_pattern_group(p)
281 | entried_scores = []
282 | for ref_ptn in ref_good_ptns:
283 | if grp in ref_ptn:
284 | for inst in ref_ptn[grp]:
285 | score = MentionPattern.sim_seqs([e[0] for e in p], ref_ptn[grp][inst]['list'], nlp)
286 | if score > threshold:
287 | entried_scores.append((score, ref_ptn[grp][inst]['freq']))
288 | # print('\tvs %s: score %s, %s' % (inst, score, ref_good_ptns[grp][inst]['freq']))
289 | if len(entried_scores) > 0:
290 | total = sum([s[0] * s[1] for s in entried_scores])
291 | supports = sum([s[1] for s in entried_scores])
292 | avg_score = total / supports
293 | # print('\tscore %s, support %s, %s|%s' % (avg_score, supports, ret['subject'], ret['vcontect']))
294 | return {'score': avg_score, 'supports': supports, 'subject': [t.text for t in ret['subject']],
295 | 'context': [t.text for t in ret['vcontect']]}
296 | else:
297 | return None
298 |
299 | @staticmethod
300 | def classify_by_pattern_matches(good_match, bad_match, nlp,
301 | bad_subjs=None,
302 | bad_context=None):
303 | if bad_context is None:
304 | bad_context = ['not', 'mistakenly', 'likely', 'ie']
305 | if bad_subjs is None:
306 | bad_subjs = ['son', 'daughter', 'manager', 'wife', 'I', 'one', 'anyone', "questions",
307 |                          "someone", "child", "neighbour", "investigation", "screening",
308 | "assessment"]
309 | if good_match is None and bad_match is None:
310 | return -1
311 | if good_match is None:
312 | return 0
313 | # elif bad_match is None:
314 | # return 1
315 | else:
316 | sub = good_match['subject']
317 | ctx = good_match['context']
318 | if MentionPattern.lists_sim_enough(sub, bad_subjs, nlp) == 1:
319 | return 0
320 | if MentionPattern.lists_sim_enough(ctx, bad_context, nlp) == 1:
321 | return 0
322 | # return -1
323 | if bad_match is None:
324 | return 1
325 | else:
326 | return 1 if good_match['score'] * good_match['supports'] >= bad_match['score'] * bad_match[
327 | 'supports'] else 0
328 |
329 | @staticmethod
330 | def lists_sim_enough(l1, l2, nlp, threshold=0.8):
331 | if len(l1) == 0 or len(l2) == 0:
332 | return -1
333 | d1 = nlp(' '.join(l1))
334 | d2 = nlp(' '.join(l2))
335 | for t1 in d1:
336 | for t2 in d2:
337 | if t1.similarity(t2) > threshold:
338 | return 1
339 | return 0
340 |
341 |
342 | _nlp_lg = None
343 |
344 |
345 | def get_nlp_lg():
346 | global _nlp_lg
347 | if _nlp_lg is None:
348 | _nlp_lg = spacy.load('en_core_web_lg')
349 | return _nlp_lg
350 |
--------------------------------------------------------------------------------
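
MentionPattern classifies each candidate mention by abstracting its sentence into a dependency pattern and scoring it against previously learnt "good" and "bad" pattern files. A minimal usage sketch, assuming a trained pattern folder and SemEHR annotations are already available (the paths below are placeholders; the constructor call mirrors run_learning_v0() in nlp_to_phenome.py):

    import mention_pattern as mp
    from annotation_docs import Concept2Mapping

    # the concept mapping provides the CUI -> phenotype label lookup the pattern model uses
    cm = Concept2Mapping('./settings/concept_mapping_stroke_sample.json')
    pattern_model = mp.MentionPattern('./patterns',           # learnt good/bad pattern JSONs (placeholder)
                                      cm.cui2label,
                                      csv_file='./docs.csv',  # doc_id/text table (placeholder)
                                      ann_folder='./anns')    # folder of se_ann_<doc>.json files (placeholder)
    # doc_anns follows the shape documented in read_semehr_anns():
    # [{'d': <doc id>, 'ann': <SemEHR annotation>, 'label': <phenotype label>}, ...]
    doc_anns = []
    print(pattern_model.predict(doc_anns))  # 1 (kept), 0 (rejected) or -1 (no pattern) per annotation
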
/neg-tumour-dt-learnt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/nlp2phenome/795fb4e14c4d19e02f7352351ab34b679aeb2432/neg-tumour-dt-learnt.png
--------------------------------------------------------------------------------
/nlp_to_phenome.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | nlp2phenome
6 | using AI models to infer patient phenotypes from identified named entities (instances of biomedical concepts)
7 | """
8 | import utils
9 | from os.path import basename, isfile, join
10 | from os import listdir
11 | import json
12 | import logging
13 | from LabelModel import LabelModel
14 | import mention_pattern as mp
15 | from annotation_docs import SemEHRAnnDoc, CustomisedRecoginiser, Concept2Mapping
16 | from EDI_ann_doc import EDIRDoc, ConllDoc, eHostDoc
17 | from learners import LabelPerformance, PhenomeLearners
18 |
19 |
20 | class StrokeSettings(object):
21 | """
22 | json based configuration setting
23 | """
24 |
25 | def __init__(self, setting_file):
26 | self._file = setting_file
27 | self._setting = {}
28 | self.load()
29 |
30 | def load(self):
31 | self._setting = utils.load_json_data(self._file)
32 |
33 | @property
34 | def settings(self):
35 | return self._setting
36 |
37 |
38 | def extract_doc_level_ann(ann_dump, output_folder):
39 | """
40 |
41 | extract doc level annotations and save to separate files
42 | :param ann_dump:
43 | :param output_folder:
44 | :return:
45 | """
46 | lines = utils.read_text_file(ann_dump)
47 | for l in lines:
48 | doc_ann = json.loads(l)
49 | utils.save_string(l, join(output_folder, doc_ann['docId'].split('.')[0] + '.json'))
50 |
51 |
52 | def extract_all_doc_anns(dump_folder, output_folder):
53 | dumps = [f for f in listdir(dump_folder) if isfile(join(dump_folder, f))]
54 | for d in dumps:
55 | extract_doc_level_ann(join(dump_folder, d), output_folder)
56 |
57 |
58 | def save_full_text(xml_file, output_dir):
59 | """
60 | recover full text from Informatics' xml format
61 | :param xml_file:
62 | :param output_dir:
63 | :return:
64 | """
65 | if not isfile(xml_file):
66 | return
67 | ed = EDIRDoc(xml_file)
68 | fn = basename(xml_file)
69 | name = fn.replace(r'-ann.xml', '.txt')
70 | logging.info('%s processed to be %s' % (fn, name))
71 | utils.save_string(ed.get_full_text, join(output_dir, name))
72 |
73 |
74 | def process_files(read_dir, write_dir):
75 | utils.multi_thread_process_files(read_dir, file_extension='xml', num_threads=10,
76 | process_func=save_full_text, args=[write_dir])
77 |
78 |
79 | def get_doc_level_inference(label_dir, ann_dir, file_key, type2insts, type2inst_2, t2missed):
80 | """
81 | learn concept to label inference from gold standard - i.e. querying SemEHR annotations to
82 | draw conclusions
83 | :param label_dir:
84 | :param ann_dir:
85 | :param file_key:
86 | :param type2insts:
87 | :param type2inst_2:
88 | :return:
89 | """
90 | label_file = '%s-ann.xml' % file_key
91 | ann_file = '%s.json' % file_key
92 | logging.info('working on %s' % join(label_dir, label_file))
93 |     if not isfile(join(label_dir, label_file)):
94 |         print('not a file: %s' % join(label_dir, label_file))
95 |         return
96 |     ed = EDIRDoc(join(label_dir, label_file))
97 | sd = SemEHRAnnDoc(join(ann_dir, ann_file))
98 | sd.learn_mappings_from_labelled(ed, type2insts, t2missed)
99 |
100 |
101 | def learn_concept_mappings(output_lst_folder):
102 | type2insts = {}
103 | type2insts_2 = {}
104 | label_dir = _gold_dir
105 | ann_dir = _ann_dir
106 | file_keys = [f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
107 | t2missed = {}
108 | for fk in file_keys:
109 | get_doc_level_inference(label_dir,
110 | ann_dir,
111 | fk,
112 | type2insts,
113 | type2insts_2,
114 | t2missed)
115 | for t in type2insts:
116 | type2insts[t] = list(type2insts[t])
117 | logging.info(json.dumps(type2insts))
118 |
119 | s = '\n' * 2
120 | for t in type2insts_2:
121 | type2insts_2[t] = list(type2insts_2[t])
122 | s += json.dumps(type2insts_2)
123 |
124 | s += '\n' * 2
125 | labels = []
126 | defs = []
127 | for t in t2missed:
128 | t2missed[t] = list(set(t2missed[t]))
129 | utils.save_string('\n'.join(t2missed[t]) + '\n', join(output_lst_folder, t + '.lst'))
130 | labels += [l.lower() for l in t2missed[t]]
131 | defs.append(t + '.lst' + ':StrokeStudy:' + t)
132 | s += '\n' * 2
133 | s += '\n'.join(defs)
134 | s += json.dumps(t2missed)
135 | logging.info(s)
136 |
137 |
138 | def learn_prediction_model(label, ann_dir=None, gold_dir=None, model_file=None, model_dir=None,
139 | ml_model_file_ptn=None,
140 | pca_dim=None,
141 | pca_model_file=None,
142 | max_dimension=None,
143 | ignore_mappings=[],
144 | viz_file=None, ignore_context=False, separate_by_label=False, full_text_dir=None,
145 | eHostGD=False):
146 | model_changed = False
147 | if model_file is not None:
148 | lm = LabelModel.deserialise(model_file)
149 | else:
150 | model_changed = True
151 | lm = LabelModel(label, _cm_obj)
152 | lm.collect_tfidf_dimensions(ann_dir=ann_dir, gold_dir=gold_dir, ignore_context=ignore_context,
153 | separate_by_label=separate_by_label, full_text_dir=full_text_dir, eHostGD=eHostGD)
154 | lm.use_one_dimension_for_label = False
155 | lm.max_dimensions = max_dimension
156 | if ann_dir is not None:
157 | # bad_lables = lm.get_low_quality_labels(ann_dir, gold_dir)
158 | # logging.info(bad_lables)
159 | bad_lables = []
160 | data = lm.load_data(ann_dir, gold_dir, ignore_mappings=bad_lables, ignore_context=ignore_context,
161 | separate_by_label=separate_by_label, ful_text_dir=full_text_dir, eHostGD=eHostGD,
162 | annotated_anns=_annotated_anns)
163 | # if separate_by_label:
164 | for lbl in data['lbl2data']:
165 | X = data['lbl2data'][lbl]['X']
166 | Y = data['lbl2data'][lbl]['Y']
167 | n_true = 0
168 | for y in Y:
169 | if y == [1]:
170 | n_true += 1
171 | logging.debug('training data: %s, dimensions %s, insts %s' % (lbl, len(X[0]), len(X)))
172 | if len(X) <= _min_sample_size:
173 | lm.add_rare_label(lbl, n_true * 1.0 / len(X))
174 | continue
175 | # ignore_mappings += data['bad_labels']
176 | PhenomeLearners.random_forest_learning(X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl))
177 | # lm.svm_learning(X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl))
178 | # lm.gaussian_nb(X, Y, output_file=ml_model_file_ptn % escape_lable_to_filename(lbl))
179 | logging.debug('%s, #insts: %s, #tps: %s' % (lbl, len(X), n_true))
180 |
181 | if model_dir is not None and model_changed:
182 | lm.serialise(join(model_dir, '%s.lm' % label))
183 | logging.debug('%s.lm saved' % label)
184 |
185 |
186 | def predict_label(model_file, test_ann_dir, test_gold_dir, ml_model_file_ptn, performance,
187 | pca_model_file=None,
188 | max_dimension=None,
189 | ignore_mappings=[],
190 | ignore_context=False,
191 | separate_by_label=False,
192 | full_text_dir=None,
193 | file_pattern='%s-ann.xml',
194 | id2conll=None,
195 | label_whitelist=None,
196 | eHostGD=False, mention_pattern=None):
197 | lm = LabelModel.deserialise(model_file)
198 | lm.max_dimensions = max_dimension
199 | data = lm.load_data(test_ann_dir, test_gold_dir, ignore_mappings=ignore_mappings, ignore_context=ignore_context,
200 | separate_by_label=separate_by_label, verbose=False, ful_text_dir=full_text_dir, eHostGD=eHostGD,
201 | annotated_anns=_annotated_anns)
202 |
203 | files = data['files']
204 | for d in files:
205 | d = d.replace('se_ann_', '')
206 | if d not in id2conll:
207 | id2conll[d] = ConllDoc(join(test_gold_dir, file_pattern % d))
208 | if label_whitelist is not None:
209 | id2conll[d].set_label_white_list(label_whitelist)
210 | lbl2performances = {}
211 | for lbl in data['lbl2data']:
212 | this_performance = LabelPerformance(lbl)
213 | X = data['lbl2data'][lbl]['X']
214 | Y = data['lbl2data'][lbl]['Y']
215 | mtp = data['lbl2data'][lbl]['multiple_tps']
216 | doc_anns = data['lbl2data'][lbl]['doc_anns']
217 | mp_predicted = None
218 | if mention_pattern is not None:
219 | mp_predicted = mention_pattern.predict(doc_anns)
220 | if lbl in lm.rare_labels:
221 | logging.info('%s to be predicted using %s' % (lbl, lm.rare_labels[lbl]))
222 | PhenomeLearners.predict_use_simple_stats(
223 | lm.rare_labels[lbl], Y, mtp,
224 | performance, separate_performance=this_performance,
225 | id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern,
226 | doc_folder=test_gold_dir,
227 | label_whitelist=label_whitelist, mp_predicted=mp_predicted
228 | )
229 | else:
230 | if len(X) > 0:
231 | logging.debug('predict data: %s, dimensions %s, insts %s' % (lbl, len(X[0]), len(X)))
232 | bc = lm.get_binary_cluster_classifier(lbl)
233 | if bc is not None:
234 | complementary_classifiers = []
235 | for l in lm.cluster_classifier_dict:
236 | if l != lbl:
237 | complementary_classifiers.append(lm.cluster_classifier_dict[l])
238 | for idx in range(len(X)):
239 | logging.debug(
240 | '%s => %s' % (bc.classify(X[idx], complementary_classifiers=complementary_classifiers), Y[idx]))
241 | PhenomeLearners.predict_use_model(X, Y, 0, mtp, ml_model_file_ptn % escape_lable_to_filename(lbl),
242 | performance,
243 | pca_model_file=pca_model_file,
244 | separate_performance=this_performance,
245 | id2conll=id2conll, doc_anns=doc_anns, file_pattern=file_pattern,
246 | doc_folder=test_gold_dir,
247 | label_whitelist=label_whitelist, mp_predicted=mp_predicted)
248 | lbl2performances[lbl] = this_performance
249 | perform_str = CustomisedRecoginiser.print_performances(lbl2performances)
250 | logging.debug('missed instances: %s' % data['fns'])
251 | performance.increase_false_negative(data['fns'])
252 | return perform_str
253 |
254 |
255 | def escape_lable_to_filename(s):
256 | return s.replace('\\', '_').replace('/', '_')
257 |
258 |
259 | def populate_semehr_results(label_dir, ann_dir, file_key,
260 | label2performances, using_combined=False):
261 | label_file = '%s-ann.xml' % file_key
262 | ann_file = '%s.json' % file_key
263 | print(join(label_dir, label_file))
264 | if not isfile(join(label_dir, label_file)):
265 | return
266 |
267 | ed = EDIRDoc(join(label_dir, label_file))
268 | cm = Concept2Mapping(_concept_mapping)
269 | cr = CustomisedRecoginiser(join(ann_dir, ann_file), cm)
270 | if using_combined:
271 | cr.validate_combined_performance(ed.get_ess_entities(), label2performances)
272 | else:
273 | cr.validate_mapped_performance(ed.get_ess_entities(), label2performances)
274 |
275 |
276 | def populate_validation_results():
277 | label_dir = _gold_dir
278 | ann_dir = _ann_dir
279 |
280 | label2performances = {}
281 | file_keys = [f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
282 | for fk in file_keys:
283 | populate_semehr_results(label_dir, ann_dir, fk, label2performances, using_combined=False)
284 | CustomisedRecoginiser.print_performances(label2performances)
285 |
286 |
287 | def do_learn_exp(viz_file, num_dimensions=[20], ignore_context=False, separate_by_label=False, conll_output_file=None,
288 | eHostGD=False, mention_pattern=None):
289 | results = {}
290 | id2conll = {}
291 | result_str = ''
292 | for lbl in _labels:
293 | logging.info('working on [%s]' % lbl)
294 | _learning_model_file = _learning_model_dir + '/%s.lm' % lbl
295 | _ml_model_file_ptn = _learning_model_dir + '/' + lbl + '_%s_DT.model'
296 | _pca_model_file = None
297 | pca_dim = None
298 | max_dimensions = num_dimensions
299 |
300 | t = lbl.replace('neg_', '')
301 | ignore_mappings = _ignore_mappings[t] if t in _ignore_mappings else []
302 |         # remove previous model files: logging.debug('removing previously learnt models...')
303 |         # for f in [f for f in listdir(_learning_model_dir) if isfile(join(_learning_model_dir, f)) and f.endswith('.model')]:
304 |         #     remove(join(_learning_model_dir, f))
305 | for dim in max_dimensions:
306 | logging.info('dimension setting: %s' % dim)
307 | learn_prediction_model(lbl,
308 | ann_dir=_ann_dir,
309 | gold_dir=_gold_dir,
310 | ml_model_file_ptn=_ml_model_file_ptn,
311 | model_dir=_learning_model_dir,
312 | pca_dim=pca_dim,
313 | pca_model_file=_pca_model_file,
314 | max_dimension=dim,
315 | ignore_mappings=ignore_mappings,
316 | viz_file=viz_file,
317 | ignore_context=ignore_context,
318 | separate_by_label=separate_by_label,
319 | full_text_dir=_gold_text_dir,
320 | eHostGD=eHostGD)
321 | logging.debug('bad labels: %s' % ignore_mappings)
322 | pl = '%s dim[%s]' % (lbl, dim)
323 | performance = LabelPerformance(pl)
324 | results[pl] = performance
325 | predict_label(_learning_model_file,
326 | _test_ann_dir,
327 | _test_gold_dir,
328 | _ml_model_file_ptn,
329 | performance,
330 | pca_model_file=_pca_model_file,
331 | max_dimension=dim,
332 | ignore_mappings=ignore_mappings,
333 | ignore_context=ignore_context,
334 | separate_by_label=separate_by_label,
335 | full_text_dir=_test_text_dir,
336 | file_pattern=_gold_file_pattern,
337 | id2conll=id2conll,
338 | label_whitelist=_labels,
339 | eHostGD=eHostGD, mention_pattern=mention_pattern)
340 | result_str = CustomisedRecoginiser.print_performances(results)
341 | return result_str
342 |
343 |
344 | def save_text_files(xml_dir, text_dr):
345 | process_files(xml_dir, text_dr)
346 |
347 |
348 | def extact_doc_anns(semoutput_dir, doc_ann_dir):
349 | extract_all_doc_anns(semoutput_dir,
350 | doc_ann_dir)
351 |
352 |
353 | def merge_mappings_dictionary(map_files, dict_dirs, new_map_file, new_dict_folder):
354 | maps = [utils.load_json_data(mf) for mf in map_files]
355 | new_m = {}
356 | for m in maps:
357 | new_m.update(m)
358 | t2list = {}
359 | for dd in dict_dirs:
360 | lst_files = [f for f in listdir(dd) if isfile(join(dd, f)) and f.endswith('.lst')]
361 | for f in lst_files:
362 | t = f[:f.index('.')]
363 | labels = utils.read_text_file(join(dd, f))
364 | if t not in t2list:
365 | t2list[t] = set()
366 | for l in labels:
367 | if len(l) > 0:
368 | t2list[t].add(l)
369 | utils.save_json_array(new_m, new_map_file)
370 | logging.info('mapping saved to %s' % new_map_file)
371 | for t in t2list:
372 | utils.save_string('\n'.join(list(t2list[t])) + '\n', join(new_dict_folder, t + '.lst'))
373 | logging.info('%s.lst saved' % t)
374 | logging.info('all done')
375 |
376 |
377 | def test_eHost_doc():
378 | d = eHostDoc('/Users/honghan.wu/Desktop/ehost_sample.xml')
379 | print([(e.label, e.start, e.end, e.str) for e in d.get_ess_entities()])
380 |
381 |
382 | def run_learning_v0():
383 | log_level = 'DEBUG'
384 | log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s'
385 | logging.basicConfig(level='DEBUG', format=log_format)
386 | log_file = './settings/processing.log'
387 | logging.basicConfig(level=log_level, format=log_format)
388 | ss = StrokeSettings('./settings/settings.json')
389 | settings = ss.settings
390 | global _min_sample_size, _ann_dir, _gold_dir, _test_ann_dir, _test_gold_dir, _gold_text_dir, _test_text_dir, _concept_mapping, _learning_model_dir
391 | global _labels, _gold_file_pattern, _ignore_mappings, _eHostGD, _cm_obj
392 | global _annotated_anns
393 | _annotated_anns = {}
394 |     if 'annotated_anns_file' in settings:
395 | _annotated_anns = utils.load_json_data(settings['annotated_anns_file'])
396 | _min_sample_size = settings['min_sample_size']
397 | _ann_dir = settings['ann_dir']
398 | _gold_dir = settings['gold_dir']
399 | _test_ann_dir = settings['test_ann_dir']
400 | _test_gold_dir = settings['test_gold_dir']
401 | _gold_text_dir = settings['dev_full_text_dir']
402 | _test_text_dir = settings['test_fulltext_dir']
403 | _concept_mapping = settings['concept_mapping_file']
404 | _learning_model_dir = settings['learning_model_dir']
405 | _labels = utils.read_text_file(settings['entity_types_file'])
406 | _gold_file_pattern = "%s_ann.xml" if 'gold_file_pattern' not in settings else settings['gold_file_pattern']
407 | _ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
408 | _eHostGD = settings['eHostGD'] if 'eHostGD' in settings else False
409 | _cm_obj = Concept2Mapping(_concept_mapping)
410 |
411 | mp_inst = mp.MentionPattern(settings['pattern_folder'], _cm_obj.cui2label,
412 | csv_file=settings['csv_file'], ann_folder=_test_ann_dir)
413 | return do_learn_exp(settings['viz_file'],
414 | num_dimensions=[50],
415 | ignore_context=settings['ignore_context'] if 'ignore_context' in settings else False,
416 | separate_by_label=True,
417 | conll_output_file=settings['conll_output_file'], eHostGD=_eHostGD, mention_pattern=mp_inst)
418 |
419 |
420 | def run_learning(
421 | train_ann_dir, train_gold_dir, train_text_dir,
422 | test_ann_dir, test_gold_dir, test_text_dir,
423 | settings):
424 | log_level = 'DEBUG'
425 | log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s'
426 | logging.basicConfig(level='DEBUG', format=log_format)
427 | log_file = './settings/processing.log'
428 | logging.basicConfig(level=log_level, format=log_format)
429 | global _min_sample_size, _ann_dir, _gold_dir, _test_ann_dir, _test_gold_dir, _gold_text_dir, _test_text_dir, _concept_mapping, _learning_model_dir
430 | global _labels, _gold_file_pattern, _ignore_mappings, _eHostGD, _cm_obj
431 | global _annotated_anns
432 | _annotated_anns = {}
433 | _min_sample_size = settings['min_sample_size']
434 | _ann_dir = train_ann_dir
435 | _gold_dir = train_gold_dir
436 | _test_ann_dir = test_ann_dir
437 | _test_gold_dir = test_gold_dir
438 | _gold_text_dir = train_text_dir
439 | _test_text_dir = test_text_dir
440 | _concept_mapping = settings['concept_mapping_file']
441 | _learning_model_dir = settings['learning_model_dir']
442 | _labels = utils.read_text_file(settings['entity_types_file'])
443 | _gold_file_pattern = "%s_ann.xml" if 'gold_file_pattern' not in settings else settings['gold_file_pattern']
444 | _ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
445 | _eHostGD = settings['eHostGD'] if 'eHostGD' in settings else False
446 | _cm_obj = Concept2Mapping(_concept_mapping)
447 |
448 |     # not using mention patterns for prediction as this is still an in-development feature
449 | mp_inst = None
450 | return do_learn_exp(settings['viz_file'],
451 | num_dimensions=[50],
452 | ignore_context=settings['ignore_context'] if 'ignore_context' in settings else False,
453 | separate_by_label=True,
454 | conll_output_file=settings['conll_output_file'], eHostGD=_eHostGD, mention_pattern=mp_inst)
455 |
456 |
457 | if __name__ == "__main__":
458 | log_level = 'DEBUG'
459 | log_format = '[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s'
460 | logging.basicConfig(level='DEBUG', format=log_format)
461 | log_file = './settings/processing.log'
462 | logging.basicConfig(level=log_level, format=log_format)
463 | # _cm_obj.load_gaz_dir(settings['concept_gaz_dir'])
464 |
465 | # 0. merging mapping & dictionaries
466 | # merge_mappings_dictionary(['/afs/inf.ed.ac.uk/group/project/biomedTM/users/hwu/tayside_concept_mapping.json',
467 | # '/afs/inf.ed.ac.uk/group/project/biomedTM/users/hwu/concept_mapping.json'],
468 | # ['/Users/honghan.wu/Documents/working/SemEHR-Working/toolkits/bio-yodie-1-2-1/finalize/tayside_gazetteer',
469 | # '/Users/honghan.wu/Documents/working/SemEHR-Working/toolkits/bio-yodie-1-2-1/finalize/ess_gazetteer'],
470 | # '/afs/inf.ed.ac.uk/group/project/biomedTM/users/hwu/merged_concept_mapping.json',
471 | # '/Users/honghan.wu/Documents/working/SemEHR-Working/toolkits/bio-yodie-1-2-1/finalize/merged_gzetteer')
472 |
473 | # 1. extract text files for annotation
474 | # save_text_files(settings['gold_dir'], settings['dev_full_text_dir'])
475 | # 2. run SemEHR on the text files
476 | # 3. extract doc anns into separate files from dumped JSON files
477 | # extact_doc_anns(settings['test_semehr_output_dir'],
478 | # settings['test_ann_dir'])
479 | # 4. learn umls concept to phenotype mappping
480 | # learn_concept_mappings(settings['gazetteer_dir'])
481 | # 5. learn phenotype inference
482 |
--------------------------------------------------------------------------------
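
The commented __main__ block above sketches the training pipeline as numbered steps. A hedged sketch of how those steps map onto this module's functions, assuming a settings JSON shaped like ./settings/sample_setting.json (run_learning_v0() itself reads ./settings/settings.json and additionally expects keys such as 'pattern_folder' and 'csv_file' for the mention-pattern model):

    import utils
    from nlp_to_phenome import save_text_files, extact_doc_anns, run_learning_v0

    settings = utils.load_json_data('./settings/sample_setting.json')
    # step 1: recover plain-text documents from the EDIR XML gold standard
    save_text_files(settings['gold_dir'], settings['dev_full_text_dir'])
    # step 2: run SemEHR over those text files (outside this repository)
    # step 3: split SemEHR's dumped JSON into one se_ann_*.json file per document
    extact_doc_anns(settings['test_semehr_output_dir'], settings['test_ann_dir'])
    # step 4: learn_concept_mappings() builds gazetteer .lst files, but it relies on module-level
    #         globals (_gold_dir, _ann_dir), so it is normally driven from a run_learning_v0-style setup
    # step 5: train and evaluate the phenotype inference models
    print(run_learning_v0())
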
/predict_helper.py:
--------------------------------------------------------------------------------
1 | from nlp_to_phenome import StrokeSettings, Concept2Mapping, escape_lable_to_filename
2 | from LabelModel import LabelModel, CustomisedRecoginiser
3 | from annotation_docs import PhenotypeAnn
4 | from learners import PhenomeLearners
5 | import utils
6 | import logging
7 | from os.path import join
8 | from ann_converter import AnnConverter
9 | from os import listdir
10 | from os.path import isfile, exists
11 | import sys
12 |
13 |
14 | def predict(settings):
15 | ann_dir = settings['test_ann_dir']
16 | test_text_dir = settings['test_fulltext_dir']
17 | _concept_mapping = settings['concept_mapping_file']
18 | _learning_model_dir = settings['learning_model_dir']
19 | _labels = utils.read_text_file(settings['entity_types_file'])
20 | ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
21 | _cm_obj = Concept2Mapping(_concept_mapping)
22 |
23 | doc2predicted = {}
24 | no_models_labels = []
25 | for phenotype in _labels:
26 | logging.info('working on [%s]' % phenotype)
27 | _learning_model_file = _learning_model_dir + '/%s.lm' % phenotype
28 |
29 | if not exists(_learning_model_file):
30 |         # skip this phenotype if no previously learnt model exists
31 | no_models_labels.append(phenotype)
32 | continue
33 |
34 | _ml_model_file_ptn = _learning_model_dir + '/' + phenotype + '_%s_DT.model'
35 |
36 | lm = LabelModel.deserialise(_learning_model_file)
37 | # pass the concept2mapping object to the label model instance
38 | lm.concept_mapping = _cm_obj
39 | lm.max_dimensions = 30
40 | data = lm.load_data_for_predict(
41 | ann_dir=ann_dir,
42 | ignore_mappings=ignore_mappings, ignore_context=True,
43 | separate_by_label=True,
44 | full_text_dir=test_text_dir)
45 | for lbl in data['lbl2data']:
46 | X = data['lbl2data'][lbl]['X']
47 | logging.debug(X)
48 | doc_anns = data['lbl2data'][lbl]['doc_anns']
49 | label_model_predict(lm, _ml_model_file_ptn, data['lbl2data'], doc2predicted)
50 | return doc2predicted, no_models_labels
51 |
52 |
53 | def label_model_predict(lm, model_file_pattern, lbl2data, doc2predicted,
54 | mention_pattern=None, mention_prediction_param=None):
55 | for lbl in lbl2data:
56 | mp_predicted = None
57 | if mention_pattern is not None:
58 | mp_predicted = mention_pattern.predict(lbl2data[lbl]['doc_anns'], cr=mention_prediction_param)
59 | X = lbl2data[lbl]['X']
60 | doc_anns = lbl2data[lbl]['doc_anns']
61 | if lbl in lm.rare_labels:
62 | logging.info('%s to be predicted using %s' % (lbl, lm.rare_labels[lbl]))
63 | PhenomeLearners.predict_use_simple_stats_in_action(lm.rare_labels[lbl],
64 | item_size=len(X),
65 | doc2predicted=doc2predicted,
66 | doc_anns=doc_anns,
67 | mp_predicted=mp_predicted)
68 | else:
69 | if len(X) > 0:
70 | logging.debug('%s, dimensions %s' % (lbl, len(X[0])))
71 | PhenomeLearners.predict_use_model_in_action(X, model_file=model_file_pattern % escape_lable_to_filename(lbl),
72 | pca_model_file=None,
73 | doc2predicted=doc2predicted,
74 | doc_anns=doc_anns,
75 | mp_predicted=mp_predicted)
76 |
77 |
78 | def hybrid_prediciton(settings):
79 | d2p, labels2work = predict(settings)
80 | ann_dir = settings['test_ann_dir']
81 | test_text_dir = settings['test_fulltext_dir']
82 | _concept_mapping = settings['concept_mapping_file']
83 | _learning_model_dir = settings['learning_model_dir']
84 | _labels = utils.read_text_file(settings['entity_types_file'])
85 | ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
86 | _cm_obj = Concept2Mapping(_concept_mapping)
87 | file_keys = [f[:f.rfind('.')].replace('se_ann_', '') for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
88 | logging.info('labels to use direct nlp prediction: [%s]' % labels2work)
89 |
90 | # convert SemEHRAnn to PhenotypeAnn
91 | doc2predicted = {}
92 | for d in d2p:
93 | for t in d2p[d]:
94 | ann = t['ann']
95 | if hasattr(ann, 'cui'):
96 | lbl = _cm_obj.concept2label[ann.cui][0]
97 | pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end, ann.negation, ann.temporality, ann.experiencer,
98 | 'StudyName', lbl)
99 | put_ann_label(lbl, pheAnn, doc2predicted, d)
100 | else:
101 | put_ann_label(ann.minor_type, ann, doc2predicted, d)
102 | for fk in file_keys:
103 | cr = CustomisedRecoginiser(join(ann_dir, 'se_ann_%s.json' % fk), _concept_mapping)
104 | d = fk
105 | for ann in cr.annotations:
106 | if ann.cui in _cm_obj.concept2label:
107 | lbl = _cm_obj.concept2label[ann.cui][0]
108 | if lbl in labels2work:
109 | pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end, ann.negation, ann.temporality, ann.experiencer,
110 | 'StudyName', lbl)
111 | put_ann_label(lbl, pheAnn, doc2predicted, d)
112 | for ann in cr.phenotypes:
113 | if ann.minor_type in labels2work:
114 | put_ann_label(ann.minor_type, ann, doc2predicted, d)
115 | return doc2predicted
116 |
117 |
118 | def direct_nlp_prediction(settings):
119 | ann_dir = settings['test_ann_dir']
120 | test_text_dir = settings['test_fulltext_dir']
121 | _concept_mapping = settings['concept_mapping_file']
122 | _learning_model_dir = settings['learning_model_dir']
123 | _labels = utils.read_text_file(settings['entity_types_file'])
124 | ignore_mappings = utils.load_json_data(settings['ignore_mapping_file'])
125 | _cm_obj = Concept2Mapping(_concept_mapping)
126 | file_keys = [f[:f.rfind('.')].replace('se_ann_', '') for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
127 | doc2predicted = {}
128 | for fk in file_keys:
129 | cr = CustomisedRecoginiser(join(ann_dir, 'se_ann_%s.json' % fk), _concept_mapping)
130 | d = fk
131 | for ann in cr.annotations:
132 | if ann.cui in _cm_obj.concept2label:
133 | lbl = _cm_obj.concept2label[ann.cui][0]
134 | pheAnn = PhenotypeAnn(ann.str, ann.start, ann.end, ann.negation, ann.temporality, ann.experiencer,
135 | 'StudyName', lbl)
136 | if ann.negation != 'Affirmed' or len(ann.ruled_by) > 0:
137 | continue
138 | put_ann_label(lbl, pheAnn, doc2predicted, d)
139 | for ann in cr.phenotypes:
140 | put_ann_label(ann.minor_type, ann, doc2predicted, d)
141 | return doc2predicted
142 |
143 |
144 | def put_ann_label(lbl, pheAnn, doc2predicted, d):
145 | labeled_ann = {'label': lbl,
146 | 'ann': pheAnn}
147 | if d not in doc2predicted:
148 | doc2predicted[d] = [labeled_ann]
149 | else:
150 | doc2predicted[d].append(labeled_ann)
151 |
152 |
153 | def output_eHOST_format(doc2precited, output_folder):
154 | for d in doc2precited:
155 | xml = AnnConverter.to_eHOST(d, doc2precited[d])
156 | utils.save_string(str(xml), join(output_folder, '%s.txt.knowtator.xml' % d))
157 |
158 |
159 | def predict_to_eHOST_results(predict_setting):
160 | ss = StrokeSettings(predict_setting)
161 | if 'predict_mode' in ss.settings and ss.settings['predict_mode'] == 'direct_nlp':
162 | logging.info('predicting with direct nlp...')
163 | predicted_results = direct_nlp_prediction(ss.settings)
164 | elif 'predict_mode' in ss.settings and ss.settings['predict_mode'] == 'hybrid':
165 | predicted_results = hybrid_prediciton(ss.settings)
166 | else:
167 | logging.info('predicting...')
168 |         predicted_results, _ = predict(ss.settings)  # predict() returns (doc2predicted, no_models_labels)
169 | output_eHOST_format(predicted_results, ss.settings['output_folder'])
170 | logging.info('results saved to %s' % ss.settings['output_folder'])
171 | if 'output_file' in ss.settings:
172 | d2ann = {}
173 | for d in predicted_results:
174 | d2ann[d] = [{'label': t['label'], 'ann': t['ann'].to_dict()} for t in predicted_results[d]]
175 | utils.save_json_array(d2ann, ss.settings['output_file'])
176 |
177 |
178 | if __name__ == "__main__":
179 | logging.basicConfig(level='DEBUG', format='[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s')
180 | # predict_to_eHOST_results('./settings/prediction_task_direct.json')
181 | if len(sys.argv) != 2:
182 |         print('the syntax is [python predict_helper.py PROCESS_SETTINGS_FILE_PATH]')
183 | else:
184 | predict_to_eHOST_results(sys.argv[1])
--------------------------------------------------------------------------------
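
predict_to_eHOST_results() drives the whole prediction step from a single settings JSON. A hedged example of such a file (the data paths are hypothetical; the keys are the ones read by predict(), direct_nlp_prediction() and hybrid_prediciton() above), written out and then run:

    import utils
    from predict_helper import predict_to_eHOST_results

    prediction_settings = {
        "test_ann_dir": "/data/predict/semehr_anns",          # se_ann_<doc>.json files
        "test_fulltext_dir": "/data/predict/corpus",
        "concept_mapping_file": "./settings/concept_mapping_stroke_sample.json",
        "learning_model_dir": "./models",                     # <phenotype>.lm and *_DT.model files
        "entity_types_file": "./settings/entity_types_phenotypes_stroke_sample.txt",
        "ignore_mapping_file": "./settings/ignore_mappings_stroke_sample.json",
        "predict_mode": "hybrid",                             # or "direct_nlp"; omit to use predict() only
        "output_folder": "/data/predict/ehost_output",
        "output_file": "/data/predict/predictions.json"       # optional JSON dump of all predictions
    }
    utils.save_json_array(prediction_settings, './settings/my_prediction_task.json')
    predict_to_eHOST_results('./settings/my_prediction_task.json')

The same settings file can also be passed on the command line: python predict_helper.py ./settings/my_prediction_task.json
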
/pretrained_models/stroke_settings.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/nlp2phenome/795fb4e14c4d19e02f7352351ab34b679aeb2432/pretrained_models/stroke_settings.zip
--------------------------------------------------------------------------------
/pretrained_models/stroke_subtype_models.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/nlp2phenome/795fb4e14c4d19e02f7352351ab34b679aeb2432/pretrained_models/stroke_subtype_models.zip
--------------------------------------------------------------------------------
/pretrained_models/stroke_supplemental-gazetteer.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CogStack/nlp2phenome/795fb4e14c4d19e02f7352351ab34b679aeb2432/pretrained_models/stroke_supplemental-gazetteer.zip
--------------------------------------------------------------------------------
/reportreader.py:
--------------------------------------------------------------------------------
1 | from annotation_docs import SemEHRAnnDoc, BasicAnn
2 | import logging
3 | from os.path import isfile, join
4 | from os import listdir
5 | import spacy
6 |
7 | _spacy_nlp = None
8 |
9 |
10 | def get_nlp_instance():
11 | global _spacy_nlp
12 | if _spacy_nlp is None:
13 | _spacy_nlp = spacy.load("en_core_web_sm")
14 | return _spacy_nlp
15 |
16 |
17 | def get_sentences_as_anns(nlp, text):
18 | doc = nlp(text)
19 | anns = []
20 | for s in doc.sents:
21 | anns.append(BasicAnn(s.text, s.start_char, s.end_char))
22 | return anns
23 |
24 |
25 | class AbstractedSentence(object):
26 | def __init__(self, seq):
27 |         self._seq = seq
28 | self._abstracted_tokens = []
29 | self._text = None
30 | self._parsed = None
31 |
32 | @property
33 | def seq(self):
34 | return self._seq
35 |
36 | @seq.setter
37 | def seq(self, value):
38 | self._seq = value
39 |
40 | def add_token(self, t):
41 | self._abstracted_tokens.append(t)
42 |
43 | @property
44 | def tokens(self):
45 | return self._abstracted_tokens
46 |
47 | @property
48 | def text(self):
49 | return self._text
50 |
51 | @text.setter
52 | def text(self, value):
53 | self._text = value
54 |
55 | def get_parsed_tree(self, nlp):
56 | """
57 | use spacy instance to parse the sentence
58 | :param nlp: a spacy instance
59 | :return: dependency tree
60 | """
61 | if self._parsed is not None:
62 | return self._parsed
63 | if self.text is None:
64 | return None
65 | self._parsed = nlp(self.text)
66 | return self._parsed
67 |
68 | def locate_pos(self, str):
69 | return self._text.find(str)
70 |
71 | def get_abstaction_by_pos(self, pos, nlp):
72 | doc = self.get_parsed_tree(nlp)
73 | token = None
74 | if doc is not None:
75 | for t in doc:
76 | if t.idx == pos:
77 | token = t
78 | if token is not None:
79 | ta = TokenAbstraction(token, doc)
80 | else:
81 | return None
82 | return ta
83 |
84 |
85 | class TokenAbstraction(object):
86 | def __init__(self, token, doc):
87 | self._t = token
88 | self._d = doc
89 | self._children = []
90 | self._root = None
91 | self._subject = None
92 | self._verbs = None
93 | self.do_abstract()
94 |
95 | @property
96 | def children(self):
97 | return self._children
98 |
99 | @property
100 | def root(self):
101 | return self._root
102 |
103 | @property
104 | def subject(self):
105 | return self._subject
106 |
107 | @property
108 | def verbs(self):
109 | return self._verbs
110 |
111 | def do_abstract(self):
112 | self._children = [t for t in self._t.children]
113 | t = self._t
114 | r = t
115 | while (t.head != t) and t.pos_ != u"VERB":
116 | t = t.head
117 | r = t
118 | if t is not None:
119 | self._verbs = [v for v in t.children if v.pos_ == u"VERB"]
120 | self._subject = [s for s in t.children if s.dep_ == u"nsubj"]
121 | self._root = r
122 |
123 | def to_dict(self):
124 | return {'children': [t.text for t in self.children], 'root': self.root.text, 'subject': [s.text for s in self.subject], 'verbs': [v.text for v in self.verbs]}
125 |
126 |
127 | class ReportAbstractor(SemEHRAnnDoc):
128 | def __init__(self, ann_file):
129 | super(ReportAbstractor, self).__init__(ann_file)
130 | self._abstracted_sents = []
131 |
132 | def get_abstracted_sents(self):
133 | seq = 0
134 | for s in self.sentences:
135 | a_sent = AbstractedSentence(seq)
136 | seq += 1
137 | anns = sorted(self.annotations, key=lambda x: x.start)
138 | for a in anns:
139 | if a.overlap(s):
140 | a_sent.add_token('%s%s[%s]' % ("%s: " % a.negation if a.negation == "Negated" else "", a.str, a.sty))
141 | self._abstracted_sents.append(a_sent)
142 | logging.debug(a_sent.tokens)
143 |
144 |
145 | def test():
146 | ann_dir = 'C:/Users/hwu33/Downloads/working/semehr-runtime/radiology-reports/semehr_results/'
147 | files = [f for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
148 | for f in files:
149 | logging.debug('%s' % f)
150 | ra = ReportAbstractor(join(ann_dir, f))
151 | ra.get_abstracted_sents()
152 | logging.debug('\n')
153 |
154 |
155 | def test_spacy():
156 | nlp = spacy.load("en_core_web_sm")
157 | doc = nlp(u"She said he might be getting better soon.")
158 | for token in doc:
159 | print(token.text, token.pos_, token.dep_, token.head.text, token.head.pos_,
160 | [child for child in token.children], token.idx, token.shape_)
161 |
162 |
163 | def test_abstract_sentence():
164 | nlp = get_nlp_instance()
165 | abss = AbstractedSentence(1)
166 | abss.text = u"She said he might be getting better soon"
167 | result = abss.get_abstaction_by_pos(29, nlp)
168 | if result is not None:
169 | print(result.root, result.children, result.verbs, result.subject)
170 |
171 |
172 | def test_sentences():
173 | nlp = get_nlp_instance()
174 | sents = get_sentences_as_anns(nlp, u"""
175 | Circumstances leading to assessment.
176 | Over the past week ZZZZZ.
177 | """)
178 | print([s.serialise_json() for s in sents])
179 |
180 |
181 | if __name__ == "__main__":
182 | logging.basicConfig(level='DEBUG', format='[%(filename)s:%(lineno)d] %(name)s %(asctime)s %(message)s')
183 | # test_spacy()
184 | # test_abstract_sentence()
185 | test_sentences()
186 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | blis==0.7.3
2 | catalogue==1.0.0
3 | certifi==2020.11.8
4 | chardet==3.0.4
5 | cymem==2.0.4
6 | graphviz==0.15
7 | idna==2.10
8 | importlib-metadata==3.1.0
9 | joblib==0.17.0
10 | murmurhash==1.0.4
11 | numpy==1.19.4
12 | pandas==1.1.4
13 | plac==1.1.3
14 | preshed==3.0.4
15 | python-dateutil==2.8.1
16 | pytz==2020.4
17 | requests==2.25.0
18 | scikit-learn==0.23.2
19 | scipy==1.5.4
20 | six==1.15.0
21 | sklearn==0.0
22 | spacy==2.3.4
23 | srsly==1.0.4
24 | thinc==7.4.3
25 | threadpoolctl==2.1.0
26 | tqdm==4.54.0
27 | urllib3==1.26.2
28 | wasabi==0.8.0
29 | zipp==3.4.0
30 |
--------------------------------------------------------------------------------
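
The pinned spaCy 2.3.4 above does not bundle the statistical models this repository loads (en_core_web_sm in reportreader.py, en_core_web_lg in mention_pattern.py); they have to be downloaded separately. A quick sanity check:

    import spacy

    for model in ("en_core_web_sm", "en_core_web_lg"):
        try:
            spacy.load(model)
            print("%s is available" % model)
        except OSError:
            print("%s is missing - install it with: python -m spacy download %s" % (model, model))
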
/run_learning.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from sklearn.model_selection import KFold
3 | from os import listdir, makedirs
4 | from os.path import isfile, join, isdir
5 | import shutil
6 | from nlp_to_phenome import run_learning
7 | import utils
8 | import logging
9 |
10 |
11 | def run_kfold_learning(settings):
12 | corpus_folder = settings['corpus_folder']
13 | semehr_folder = settings['semehr_folder']
14 | gold_folder = settings['gold_folder']
15 | working_folder = settings['working_folder']
16 | kf = KFold(n_splits=settings["kfold"])
17 | files = [f for f in listdir(corpus_folder) if isfile(join(corpus_folder, f))]
18 | k = 0
19 | for train_idx, test_idx in kf.split(files):
20 | reset_folder(working_folder)
21 | # copy files
22 | train_ann_dir = join(working_folder, 'ann')
23 | train_gold_dir = join(working_folder, 'gold')
24 | train_text_dir = join(working_folder, 'train_corpus')
25 | test_ann_dir = join(working_folder, 'test_ann')
26 | test_gold_dir = join(working_folder, 'test_gold')
27 | test_text_dir = join(working_folder, 'test_corpus')
28 |
29 | for idx in train_idx:
30 | shutil.copy(join(corpus_folder, files[idx]), join(train_text_dir, files[idx]))
31 | ann_file = 'se_ann_%s.json' % files[idx].replace('.txt', '')
32 | gold_file = '%s.knowtator.xml' % files[idx]
33 | shutil.copy(join(semehr_folder, ann_file), join(train_ann_dir, ann_file))
34 | shutil.copy(join(gold_folder, gold_file), join(train_gold_dir, gold_file))
35 |
36 | for idx in test_idx:
37 | shutil.copy(join(corpus_folder, files[idx]), join(test_text_dir, files[idx]))
38 | ann_file = 'se_ann_%s.json' % files[idx].replace('.txt', '')
39 | gold_file = '%s.knowtator.xml' % files[idx]
40 | shutil.copy(join(semehr_folder, ann_file), join(test_ann_dir, ann_file))
41 | shutil.copy(join(gold_folder, gold_file), join(test_gold_dir, gold_file))
42 | performance = run_learning(train_ann_dir, train_gold_dir, train_text_dir,
43 | test_ann_dir, test_gold_dir, test_text_dir,
44 | settings)
45 | utils.save_string(performance, join(working_folder, 'folder_%s_perf.tsv' % k))
46 | k += 1
47 | logging.info('round %s done' % k)
48 |
49 |
50 | def reset_folder(working_folder):
51 | # clear working folder
52 | for d in listdir(working_folder):
53 | if isdir(join(working_folder, d)):
54 | shutil.rmtree(join(working_folder, d))
55 |
56 | train_ann_dir = join(working_folder, 'ann')
57 | train_gold_dir = join(working_folder, 'gold')
58 | train_text_dir = join(working_folder, 'train_corpus')
59 | test_ann_dir = join(working_folder, 'test_ann')
60 | test_gold_dir = join(working_folder, 'test_gold')
61 | test_text_dir = join(working_folder, 'test_corpus')
62 | learning_model_dir = join(working_folder, 'models')
63 | makedirs(train_ann_dir)
64 | makedirs(train_gold_dir)
65 | makedirs(train_text_dir)
66 | makedirs(test_ann_dir)
67 | makedirs(test_gold_dir)
68 | makedirs(test_text_dir)
69 | makedirs(learning_model_dir)
70 |
71 |
72 | def run_it(learnging_config_file):
73 | settings = utils.load_json_data(learnging_config_file)
74 | run_kfold_learning(settings)
75 |
76 |
77 | if __name__ == "__main__":
78 |     if len(sys.argv) != 2:
79 |         print('the syntax is [python run_learning.py LEARNING_SETTINGS_FILE_PATH]')
80 |     else:
81 |         run_it(sys.argv[1])
--------------------------------------------------------------------------------
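
run_learning.py is driven entirely by a k-fold settings JSON (see ./settings/sample_setting_kfold_learning.json). A minimal programmatic invocation, equivalent to python run_learning.py <settings file>; note that reset_folder() wipes and recreates the sub-directories of working_folder on every fold, so it should point at a dedicated scratch directory:

    from run_learning import run_it

    run_it('./settings/sample_setting_kfold_learning.json')
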
/settings/concept_mapping_stroke_sample.json:
--------------------------------------------------------------------------------
1 | {
2 | "loc_deep": [
3 | "C2949882\tRight lentiform nucleus\tBody Part, Organ, or Organ Component",
4 | "C1548801\tExternal\tBody Location or Region",
5 | "C0305578\tcentrum\tPharmacologic Substance",
6 | "C0740279\tCerebellar atrophy\tDisease or Syndrome",
7 | "C0007759\tCortex Cerebelli\tBody Part, Organ, or Organ Component",
8 | "C0007765\tCerebellar\tBody Part, Organ, or Organ Component",
9 | "C0039729\tThalamus\tBody Part, Organ, or Organ Component",
10 | "C2334778\tRight internal capsule\tBody Part, Organ, or Organ Component",
11 | "C0546019\tLeft basal ganglia\tBody Part, Organ, or Organ Component",
12 | "C0241970\tLACUNE\tAcquired Abnormality",
13 | "C0737244\tCorona radiata\tBody Part, Organ, or Organ Component",
14 | "C2328150\tRight thalamus\tBody Part, Organ, or Organ Component",
15 | "C3178801\tLacunar Stroke\tDisease or Syndrome",
16 | "C0164707\tEPI\tPharmacologic Substance",
17 | "C0149854\tCerebellar hemorrhage\tPathologic Function",
18 | "C0152341\tCapsula Interna\tBody Part, Organ, or Organ Component",
19 | "C2330009\tAnterior limb of left internal capsule\tBody Part, Organ, or Organ Component",
20 | "C0007776\tCortex\tBody Part, Organ, or Organ Component",
21 | "C1389280\tBasal ganglia calcification\tPathologic Function",
22 | "C2951935\tRight side of pons\tBody Part, Organ, or Organ Component",
23 | "C0228465\tCerebellar hemisphere\tBody Part, Organ, or Organ Component",
24 | "C0004781\tBasal Nuclei\tBody Part, Organ, or Organ Component",
25 | "C0032639\tPontes\tBody Part, Organ, or Organ Component",
26 | "C0871456\tSubcortical lesions\tDisease or Syndrome",
27 | "C0162342\tLentiform Nuclei\tBody Part, Organ, or Organ Component",
28 | "C0017067\tGanglia\tBody Part, Organ, or Organ Component",
29 | "C2339807\tLeft internal capsule\tBody Part, Organ, or Organ Component",
30 | "C0815275\tsubcortical\tBody Location or Region",
31 | "C0228515\tCerebellar Peduncle\tBody Part, Organ, or Organ Component",
32 | "C0545733\tvertebrobasilar\tBody Part, Organ, or Organ Component",
33 | "C0333559\tLacunar Infarct\tDisease or Syndrome",
34 | "C0006121\tBrainstem\tBody Part, Organ, or Organ Component",
35 | "C2340044\tLeft thalamus\tBody Part, Organ, or Organ Component",
36 | "C0228291\tBasal ganglia and capsules\tBody Part, Organ, or Organ Component",
37 | "C2330887\tRight external capsule\tBody Part, Organ, or Organ Component",
38 | "C0006104\tBrains\tBody Part, Organ, or Organ Component",
39 | "C2950746\tLeft lentiform nucleus\tBody Part, Organ, or Organ Component",
40 | "C0010090\tCorpus Callosum\tBody Part, Organ, or Organ Component",
41 | "C2338227\tLeft external capsule\tBody Part, Organ, or Organ Component",
42 | "C0018827\tVentricle\tBody Part, Organ, or Organ Component",
43 | "C2337761\tLobe of cerebellum\tBody Part, Organ, or Organ Component",
44 | "C0228181\tCentrum ovale\tBody Part, Organ, or Organ Component",
45 | "C0152321\tGenu corpus callosi\tBody Part, Organ, or Organ Component",
46 | "C1446220\tBasal ganglion stroke\tPathologic Function",
47 | "C1116439\tPosterior fossa\tBody Part, Organ, or Organ Component",
48 | "C0025462\tMidbrain\tBody Part, Organ, or Organ Component",
49 | "C0152345\tCapsula Externa\tBody Part, Organ, or Organ Component",
50 | "C0546018\tRight basal ganglia\tBody Part, Organ, or Organ Component"
51 | ],
52 | "atrophy": [
53 | "C1114365\tAge\tClinical Attribute",
54 | "C0740279\tCerebellar atrophy\tDisease or Syndrome",
55 | "C1265891\tFocal atrophy\tPathologic Function",
56 | "C0333641\tAtrophy\tPathologic Function",
57 | "C2700258\tVolume\tLaboratory Procedure",
58 | "C3151195\tCerebral and cerebellar atrophy\tFinding",
59 | "C0598275\tDiffuse cerebral atrophy\tFinding",
60 | "C3273211\tAge-Related Atrophy\tPathologic Function",
61 | "C0006104\tBrains\tBody Part, Organ, or Organ Component",
62 | "C0235946\tBrain atrophy\tDisease or Syndrome"
63 | ],
64 | "stroke": [
65 | "C0018944\tHematoma\tPathologic Function",
66 | "C0005847\tVessel\tBody Part, Organ, or Organ Component",
67 | "C0487602\tStaining\tLaboratory Procedure",
68 | "C4019010\tEvent\tClinical Attribute",
69 | "C0751956\tAcute Stroke\tDisease or Syndrome",
70 | "C0038454\tStroke\tDisease or Syndrome"
71 | ],
72 | "metast_tumour": [
73 | "C0555278\tCerebral metastases\tNeoplastic Process",
74 | "C0221198\tLesion\tFinding",
75 | "C0027627\tMetastases\tNeoplastic Process",
76 | "C0233520\tDestructive\tIndividual Behavior"
77 | ],
78 | "time_recent": [
79 | "C1279919\tEarlier\tTemporal Concept",
80 | "C0333276\tacute bleed\tPathologic Function",
81 | "C0332185\tRecent\tTemporal Concept",
82 | "C1578513\tnew\tFinding",
83 | "C1513491\tMost Recent\tTemporal Concept",
84 | "C0439588\tacute chronic\tTemporal Concept",
85 | "C0333548\tAcute infarct\tPathologic Function",
86 | "C0333277\tSubacute bleeding\tPathologic Function",
87 | "C0751956\tAcute Stroke\tDisease or Syndrome",
88 | "C0205365\tSubacute\tTemporal Concept",
89 | "C0205178\tAcuteness\tTemporal Concept",
90 | "C0333560\tEvolving infarct\tPathologic Function"
91 | ],
92 | "subdural_haematoma": [
93 | "C0018944\tHematoma\tPathologic Function",
94 | "C0038541\tSubdural\tBody Space or Junction",
95 | "C0018946\tSubdural Hematoma\tPathologic Function",
96 | "C0749095\tChronic Subdural Hematoma\tPathologic Function",
97 | "C0019080\tBleeding\tPathologic Function"
98 | ],
99 | "ischaemic_stroke": [
100 | "C0031001\tPerfusion\tTherapeutic or Preventive Procedure",
101 | "C0149566\tSylvian artery\tBody Part, Organ, or Organ Component",
102 | "C0585229\tMultiple lacunar infarcts\tDisease or Syndrome",
103 | "C3178801\tLacunar Stroke\tDisease or Syndrome",
104 | "C0017639\tGlioses\tPathologic Function",
105 | "C0164707\tEPI\tPharmacologic Substance",
106 | "C0001365\tCva\tDisease or Syndrome",
107 | "C1165245\tPacis\tPharmacologic Substance",
108 | "C0948008\tIschemic stroke\tDisease or Syndrome",
109 | "C0740392\tINFARCT MCA\tDisease or Syndrome",
110 | "C0038454\tStroke\tDisease or Syndrome",
111 | "C1446220\tBasal ganglion stroke\tPathologic Function",
112 | "C0333560\tEvolving infarct\tPathologic Function",
113 | "C4019010\tEvent\tClinical Attribute",
114 | "C0022116\tIschemia\tPathologic Function",
115 | "C0752132\tINFARCT PCA\tDisease or Syndrome",
116 | "C0333559\tLacunar Infarct\tDisease or Syndrome",
117 | "C0007785\tCerebral infarct\tDisease or Syndrome",
118 | "C1511938\tDifferentiation\tClinical Attribute",
119 | "C0021308\tInfarct\tPathologic Function",
120 | "C0333551\tOld infarct\tPathologic Function",
121 | "C0241970\tLACUNE\tAcquired Abnormality",
122 | "C0585629\tLeft sided cerebral infarction\tPathologic Function",
123 | "C0333548\tAcute infarct\tPathologic Function",
124 | "C0333542\tFocal infarct\tPathologic Function",
125 | "C0751587\tCADASIL\tDisease or Syndrome"
126 | ],
127 | "haemorrhagic_transformation": [
128 | "C1510411\tTransformed\tPathologic Function"
129 | ],
130 | "microhaemorrhage": [
131 | "C0019080\tBleeding\tPathologic Function",
132 | "C0859253\tMicrohaemorrhage\tPathologic Function",
133 | "C2750293\tMicrobleeds\tFinding"
134 | ],
135 | "subarachnoid_haemorrhage": [
136 | "C0005767\tSanguis\tTissue",
137 | "C0038525\tSubarachnoid Hemorrhage\tDisease or Syndrome"
138 | ],
139 | "tumour": [
140 | "C1333071\tClival Chordoma\tNeoplastic Process",
141 | "C0270614\tIntracranial lump\tFinding",
142 | "C2931822\tCancer of Nasopharynx\tNeoplastic Process",
143 | "C0027651\tTumors\tNeoplastic Process",
144 | "C0577559\tA mass\tFinding",
145 | "C0001430\tAdenoma\tNeoplastic Process",
146 | "C0346308\tPituitary macroadenoma\tNeoplastic Process",
147 | "C0342419\tPituitary mass\tAnatomical Abnormality",
148 | "C0032000\tPituitary Adenoma\tNeoplastic Process",
149 | "C0746408\tMass/lesion\tNeoplastic Process",
150 | "C0221198\tLesion\tFinding",
151 | "C0871456\tSubcortical lesions\tDisease or Syndrome",
152 | "C0457193\tSoft tissue mass\tAnatomical Abnormality",
153 | "C0024299\tLymphoma\tNeoplastic Process",
154 | "C2752009\tWhite matter lesions\tFinding",
155 | "C0746405\tcystic mass\tDisease or Syndrome"
156 | ],
157 | "small_vessel_disease": [
158 | "C0228157\tPeriventricular white matter\tBody Part, Organ, or Organ Component",
159 | "C1114365\tAge\tClinical Attribute",
160 | "C0010957\tDamage\tInjury or Poisoning",
161 | "C1843516\tDilated perivascular spaces\tFinding",
162 | "C0815275\tsubcortical\tBody Location or Region",
163 | "C0022116\tIschemia\tPathologic Function",
164 | "C0042373\tAngiopathy\tDisease or Syndrome",
165 | "C1282841\tSmall vessels\tBody Part, Organ, or Organ Component",
166 | "C0152295\tCerebral White Matter\tTissue",
167 | "C2733158\tCerebral Microangiopathy\tDisease or Syndrome",
168 | "C1833300\tWhite matter changes\tFinding",
169 | "C0221198\tLesion\tFinding",
170 | "C0225988\tSmall vessel\tBody Part, Organ, or Organ Component",
171 | "C0012634\tDisease\tDisease or Syndrome",
172 | "C0682708\tWhite Matter\tTissue",
173 | "C0270612\tLeukoencephalopathy\tDisease or Syndrome",
174 | "C0006104\tBrains\tBody Part, Organ, or Organ Component",
175 | "C1853380\tPeriventricular white matter changes\tFinding"
176 | ],
177 | "glioma_tumour": [
178 | "C0017636\tGlioblastoma\tNeoplastic Process",
179 | "C1997217\tLow grade glioma\tNeoplastic Process",
180 | "C0028945\tOligodendroglioma\tNeoplastic Process",
181 | "C0017638\tGlioma\tNeoplastic Process",
182 | "C0334583\tPiloid astrocytoma\tNeoplastic Process",
183 | "C0004114\tAstrocytoma\tNeoplastic Process"
184 | ],
185 | "loc_cortical": [
186 | "C0030560\tParietal Lobe\tBody Part, Organ, or Organ Component",
187 | "C0228218\tRight occipital lobe\tBody Part, Organ, or Organ Component",
188 | "C0003842\tArtery\tBody Part, Organ, or Organ Component",
189 | "C0230010\tTemporal Fossa\tBody Space or Junction",
190 | "C0228193\tRight frontal lobe\tBody Part, Organ, or Organ Component",
191 | "C0751438\tPosterior\tDisease or Syndrome",
192 | "C0039484\tOs temporale\tBody Part, Organ, or Organ Component",
193 | "C0016733\tFrontal Lobe\tBody Part, Organ, or Organ Component",
194 | "C0078944\tIV PCA\tTherapeutic or Preventive Procedure",
195 | "C0226213\tRight middle cerebral artery\tBody Part, Organ, or Organ Component",
196 | "C0149566\tSylvian artery\tBody Part, Organ, or Organ Component",
197 | "C3495441\tMotor Strip\tBody Location or Region",
198 | "C0742901\tcraniotomy temporal\tTherapeutic or Preventive Procedure",
199 | "C0039452\tCerebrum\tBody Part, Organ, or Organ Component",
200 | "C2331118\tRight insula\tBody Part, Organ, or Organ Component",
201 | "C0226214\tLeft middle cerebral artery\tBody Part, Organ, or Organ Component",
202 | "C1165245\tPacis\tPharmacologic Substance",
203 | "C0751437\tAnterior\tDisease or Syndrome",
204 | "C0152299\tPrecentral Gyrus\tBody Part, Organ, or Organ Component",
205 | "C0016732\tOs frontale\tBody Part, Organ, or Organ Component",
206 | "C0007776\tCortex\tBody Part, Organ, or Organ Component",
207 | "C0031873\tPica Disease\tMental or Behavioral Dysfunction",
208 | "C0228207\tRight parietal lobe\tBody Part, Organ, or Organ Component",
209 | "C0228208\tLeft parietal lobe\tBody Part, Organ, or Organ Component",
210 | "C0459388\tFrontal Sulcus\tBody Part, Organ, or Organ Component",
211 | "C0028784\tOs occipitale\tBody Part, Organ, or Organ Component",
212 | "C0740392\tINFARCT MCA\tDisease or Syndrome",
213 | "C0149561\tAnterior Cerebral Artery\tBody Part, Organ, or Organ Component",
214 | "C0228219\tLeft occipital lobe\tBody Part, Organ, or Organ Component",
215 | "C0152302\tPostcentral Gyrus\tBody Part, Organ, or Organ Component",
216 | "C1184145\tOccipital\tBody Location or Region",
217 | "C0226247\tRight posterior cerebral artery\tBody Part, Organ, or Organ Component",
218 | "C0152283\tTemporal Horn\tBody Part, Organ, or Organ Component",
219 | "C0028785\tOccipital Lobe\tBody Part, Organ, or Organ Component",
220 | "C2339924\tRight insular cortex\tBody Part, Organ, or Organ Component",
221 | "C0796494\tLobe\tBody Part, Organ, or Organ Component",
222 | "C2362314\tTemporal\tTemporal Concept",
223 | "C0228194\tLeft frontal lobe\tBody Part, Organ, or Organ Component",
224 | "C0152296\tMarginal Gyrus\tBody Part, Organ, or Organ Component",
225 | "C0748512\tOccipital Scalp\tBody Location or Region",
226 | "C0030625\tPCA\tLaboratory Procedure",
227 | "C0228233\tLeft temporal lobe\tBody Part, Organ, or Organ Component",
228 | "C0752132\tINFARCT PCA\tDisease or Syndrome",
229 | "C0149554\tFRONTAL HORN\tBody Part, Organ, or Organ Component",
230 | "C0149576\tArteria cerebri posterior\tBody Part, Organ, or Organ Component",
231 | "C0597434\tsensory cortex\tBody Part, Organ, or Organ Component",
232 | "C0228202\tPremotor Area\tBody Part, Organ, or Organ Component",
233 | "C0021640\tInsula\tBody Part, Organ, or Organ Component",
234 | "C0228232\tRight temporal lobe\tBody Part, Organ, or Organ Component",
235 | "C3496562\tcortical white matter\tBody Part, Organ, or Organ Component",
236 | "C0039485\tTemporal Lobe\tBody Part, Organ, or Organ Component",
237 | "C0235946\tBrain atrophy\tDisease or Syndrome",
238 | "C3496378\tparietal white matter\tBody Part, Organ, or Organ Component",
239 | "C0272451\tParietal fracture\tInjury or Poisoning"
240 | ],
241 | "haemorrhagic_stroke": [
242 | "C0018944\tHematoma\tPathologic Function",
243 | "C0342406\tPituitary Hemorrhage\tPathologic Function",
244 | "C0333629\tHemosiderin Deposition\tPathologic Function",
245 | "C0333276\tacute bleed\tPathologic Function",
246 | "C3665429\tRecurrent hemorrhage\tPathologic Function",
247 | "C0333277\tSubacute bleeding\tPathologic Function",
248 | "C2937358\tBrain bleeding\tPathologic Function",
249 | "C0019080\tBleeding\tPathologic Function",
250 | "C0456388\tBlood Product\tPharmacologic Substance",
251 | "C0151699\tIntracranial bleed\tPathologic Function",
252 | "C1861265\tNo hemorrhage\tFinding",
253 | "C0149854\tCerebellar hemorrhage\tPathologic Function"
254 | ],
255 | "time_old": [
256 | "C0333629\tHemosiderin Deposition\tPathologic Function",
257 | "C0749095\tChronic Subdural Hematoma\tPathologic Function",
258 | "C0205156\tFormer\tTemporal Concept",
259 | "C0332152\tBefore\tTemporal Concept",
260 | "C0439588\tacute chronic\tTemporal Concept",
261 | "C0580836\tOld\tTemporal Concept",
262 | "C0205191\tChronic\tTemporal Concept",
263 | "C0333551\tOld infarct\tPathologic Function",
264 | "C3714811\tResolved\tFinding"
265 | ],
266 | "mening_tumour": [
267 | "C0025286\tMengioma\tNeoplastic Process"
268 | ]
269 | }
--------------------------------------------------------------------------------
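
Each phenotype key in this mapping file points to a list of tab-separated strings of the form CUI<TAB>preferred label<TAB>UMLS semantic type. A small sketch of reading the file directly (independent of the repository's Concept2Mapping loader):

    import json

    with open('./settings/concept_mapping_stroke_sample.json') as f:
        phenotype2concepts = json.load(f)

    for phenotype, entries in phenotype2concepts.items():
        cuis = [e.split('\t')[0] for e in entries]
        print('%s: %d mapped concepts, e.g. %s' % (phenotype, len(cuis), cuis[:3]))
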
/settings/entity_types_phenotypes_stroke_sample.txt:
--------------------------------------------------------------------------------
1 | atrophy
2 | glioma_tumour
3 | haemorrhagic_stroke
4 | haemorrhagic_transformation
5 | ischaemic_stroke
6 | mening_tumour
7 | metast_tumour
8 | microhaemorrhage
9 | small_vessel_disease
10 | stroke
11 | subarachnoid_haemorrhage
12 | subdural_haematoma
13 | tumour
14 | intracerebral_haemorrhage
15 | intracranial_haemorrhage
16 | bleeding
17 | aneurysm
--------------------------------------------------------------------------------
/settings/ignore_mappings_stroke_sample.json:
--------------------------------------------------------------------------------
1 | {
2 | "stroke": [
3 | "C0018944",
4 | "haematoma",
5 | "C4019010",
6 | "C0005847"
7 | ],
8 | "microhaemorrhage": [
9 | "C0019080"
10 | ],
11 | "atrophy": [
12 | "C1114365",
13 | "C0006104",
14 | "C2700258"
15 | ],
16 | "subarachnoid_haemorrhage": [
17 | "C0005767"
18 | ],
19 | "subdural_haematoma": [
20 | "C0019080",
21 | "C0038541",
22 | "C0018944",
23 | "haematoma",
24 | "collections"
25 | ],
26 | "small_vessel_disease": [
27 | "C1114365",
28 | "C0006104",
29 | "C0022116",
30 | "C0682708",
31 | "C0221198",
32 | "C0228157",
33 | "white matter"
34 | ],
35 | "tumour": [
36 | "C0577559",
37 | "C0221198",
38 | "lesions",
39 | "lesion",
40 | "mass"
41 | ],
42 | "loc_deep":[
43 | "brain",
44 | "C0006104"
45 | ],
46 | "time_old":[
47 | "C0332152"
48 | ]
49 | }
--------------------------------------------------------------------------------
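In the ignore mappings, each key is a phenotype and each value mixes UMLS CUIs with lowercase surface strings that should not be accepted as evidence for that phenotype. A hedged sketch of how such a blacklist could be applied; the function and the annotation fields (cui, mention text) are illustrative, not the repo's actual API:

    def is_ignored(phenotype, cui, mention_text, ignore_mappings):
        # an annotation is ignored if either its CUI or its lower-cased
        # surface text appears in the phenotype's blacklist
        blacklist = ignore_mappings.get(phenotype, [])
        return cui in blacklist or mention_text.lower() in blacklist

    ignore_mappings = {"microhaemorrhage": ["C0019080"]}
    print(is_ignored("microhaemorrhage", "C0019080", "bleeding", ignore_mappings))  # True
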
/settings/sample_setting.json:
--------------------------------------------------------------------------------
1 | {
2 | "ann_dir": "U:/semehr-usecases/trajectories/train_anns",
3 | "gold_dir": "U:/semehr-usecases/trajectories/train_gold_anns",
4 | "dev_full_text_dir": "U:/semehr-usecases/trajectories/train_corpus",
5 | "test_semehr_output_dir": "U:/semehr-usecases/trajectories/lia_anns",
6 | "test_ann_dir": "U:/semehr-usecases/trajectories/nadia_anns",
7 | "test_fulltext_dir": "U:/semehr-usecases/trajectories/annotator_Nadia/ehost-mk5.1/2nd_iteration/2nd_iteration_Nadia/corpus",
8 | "test_gold_dir": "U:/semehr-usecases/trajectories/annotator_Nadia/ehost-mk5.1/2nd_iteration/2nd_iteration_Nadia/saved",
9 | "concept_mapping_file": "./settings/empty_concept_mapping.json",
10 | "learning_model_dir": "./models",
11 | "entity_types_file": "./settings/better_worse_entity_types.txt",
12 | "ignore_mapping_file": "./settings/ignore_mapping.json",
13 | "min_sample_size": 25,
14 | "viz_file": "./settings/viz_%s.pdf",
15 | "conll_output_file": "./settings/conll_output_file.txt",
16 | "gold_file_pattern": "%s.txt.knowtator.xml",
17 | "eHostGD": true
18 | }
--------------------------------------------------------------------------------
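The settings file is plain JSON; values such as viz_file and gold_file_pattern contain %s placeholders that are presumably filled in per entity type or per document at run time. A minimal sketch of reading it with the load_json_data helper from utils.py further below:

    from utils import load_json_data

    settings = load_json_data('./settings/sample_setting.json')
    # 'doc_0001' is a made-up document id, just to show how the pattern expands
    gold_file = settings['gold_file_pattern'] % 'doc_0001'  # -> 'doc_0001.txt.knowtator.xml'
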
/settings/sample_setting_kfold_learning.json:
--------------------------------------------------------------------------------
1 | {
2 | "kfold": 10,
3 | "corpus_folder": "/data/annotated_data/corpus",
4 | "gold_folder": "/data/annotated_data/gold",
5 | "semehr_folder": "/data/semehr_results",
6 | "working_folder": "/data/learning",
7 | "concept_mapping_file": "./settings/concept_mapping_stroke_sample.json",
8 | "learning_model_dir": "./models",
9 | "entity_types_file": "./settings/entity_types_phenotypes_stroke_sample.txt",
10 | "ignore_mapping_file": "./settings/ignore_mappings_stroke_sample.json",
11 | "min_sample_size": 25,
12 | "viz_file": "./settings/viz_%s.pdf",
13 | "conll_output_file": "./settings/conll_output_file.txt",
14 | "gold_file_pattern": "%s.txt.knowtator.xml",
15 | "eHostGD": true
16 | }
--------------------------------------------------------------------------------
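The k-fold configuration only names the input/output folders and the number of folds. A hedged sketch of one way the documents in corpus_folder could be split into the configured number of folds; this is illustrative and not necessarily how run_learning.py performs the split:

    from os import listdir
    from os.path import isfile, join

    def kfold_split(corpus_folder, k):
        # deterministic round-robin split of the corpus files into k folds
        files = sorted(f for f in listdir(corpus_folder) if isfile(join(corpus_folder, f)))
        return [files[i::k] for i in range(k)]

    folds = kfold_split('/data/annotated_data/corpus', 10)
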
/settings/stroke-subtype-rules-full.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "label": "primary haemorrhagic stroke",
4 | "inclusions": [{"phenotype": "subarachnoid_haemorrhage"}],
5 | "exclusion_units": [
6 | [{"phenotype": "mening_tumour"}],
7 | [{"phenotype": "metast_tumour"}],
8 | [{"phenotype": "tumour"}],
9 | [{"phenotype": "ischaemic_stroke"}],
10 | [{"phenotype": "contusion"}],
11 | [{"phenotype": "trauma"}],
12 | [{"phenotype": "subdural_haematoma"}],
13 | [{"phenotype": "transformation"}]
14 | ]
15 | },
16 |
17 | {
18 | "label": "primary haemorrhagic stroke",
19 | "inclusions": [{"phenotype": "intracerebral_haemorrhage"}],
20 | "exclusion_units": [
21 | [{"phenotype": "mening_tumour"}],
22 | [{"phenotype": "metast_tumour"}],
23 | [{"phenotype": "tumour"}],
24 | [ {"phenotype": "ischaemic_stroke"}],
25 | [{"phenotype": "contusion"}],
26 | [{"phenotype": "trauma"}],
27 | [{"phenotype": "subdural_haematoma"}],
28 | [{"phenotype": "transformation"}]
29 | ]
30 | },
31 |
32 | {
33 | "label": "subarachnoid haemorrhage",
34 | "inclusions": [{"phenotype": "subarachnoid_haemorrhage"}],
35 | "exclusion_units": [
36 | [{"phenotype": "mening_tumour"}],
37 | [{"phenotype": "metast_tumour"}],
38 | [{"phenotype": "tumour"}],
39 | [{"phenotype": "contusion"}],
40 | [{"phenotype": "trauma"}],
41 | [{"phenotype": "subdural_haematoma"}],
42 | [{"phenotype": "transformation"}],
43 | [{"phenotype": "intracerebral_haemorrhage"}]
44 | ]
45 | },
46 |
47 | {
48 | "label": "intracerebra haemorrhage",
49 | "inclusions": [{"phenotype": "intracerebral_haemorrhage"}],
50 | "exclusion_units": [
51 | [{"phenotype": "mening_tumour"}],
52 | [{"phenotype": "metast_tumour"}],
53 | [{"phenotype": "tumour"}],
54 | [ {"phenotype": "ischaemic_stroke"}],
55 | [{"phenotype": "contusion"}],
56 | [{"phenotype": "trauma"}],
57 | [{"phenotype": "subdural_haematoma"}],
58 | [{"phenotype": "transformation"}],
59 | [{"phenotype": "subarachnoid_haemorrhage"}, {"phenotype": "aneurysm"}]
60 | ]
61 | },
62 |
63 | {
64 | "label": "ischaemic stroke",
65 | "inclusions": [{"phenotype": "ischaemic_stroke"}],
66 | "exclusion_units": [
67 | ]
68 | },
69 | {
70 | "label": "small_vessel_disease",
71 | "inclusions": [{"phenotype": "small_vessel_disease"}],
72 | "exclusion_units": [
73 | ]
74 | },
75 | {
76 | "label": "atrophy",
77 | "inclusions": [{"phenotype": "atrophy"}],
78 | "exclusion_units": [
79 | ]
80 | },
81 | {
82 | "label": "atrophy",
83 | "inclusions": [{"phenotype": "atrophy"}],
84 | "exclusion_units": [
85 | ]
86 | },
87 | {
88 | "label": "tumour",
89 | "inclusions": [
90 | {"phenotype": "tumour"},
91 | {"phenotype": "glioma_tumour"},
92 | {"phenotype": "mening_tumour"},
93 | {"phenotype": "metast_tumour"}
94 | ],
95 | "exclusion_units": [
96 | ]
97 | }
98 | ]
99 |
--------------------------------------------------------------------------------
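Each rule above carries a label, a list of inclusions and a list of exclusion_units, where every unit is itself a list (the intracerebral-haemorrhage rule, for instance, has a two-phenotype unit combining subarachnoid_haemorrhage with aneurysm). The real evaluation presumably lives in doc_inference.py; the sketch below assumes that a rule's inclusions are any-of, that an exclusion unit fires only when all of its phenotypes are present, and that any firing unit blocks the rule. These are illustrative readings of the file, not confirmed semantics:

    def infer_labels(detected_phenotypes, rules):
        # detected_phenotypes: set of phenotype names found positive in a document
        labels = []
        for rule in rules:
            included = any(c['phenotype'] in detected_phenotypes for c in rule['inclusions'])
            excluded = any(
                all(c['phenotype'] in detected_phenotypes for c in unit)
                for unit in rule['exclusion_units']
            )
            if included and not excluded:
                labels.append(rule['label'])
        return labels
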
/settings/stroke-subtype-rules.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "label": "primary haemorrhagic stroke",
4 | "inclusions": [{"phenotype": "subarachnoid_haemorrhage"}],
5 | "exclusion_units": [
6 | [{"phenotype": "mening_tumour"}],
7 | [{"phenotype": "metast_tumour"}],
8 | [{"phenotype": "tumour"}],
9 | [{"phenotype": "ischaemic_stroke"}],
10 | [{"phenotype": "contusion"}],
11 | [{"phenotype": "trauma"}],
12 | [{"phenotype": "subdural_haematoma"}],
13 | [{"phenotype": "transformation"}]
14 | ]
15 | },
16 |
17 | {
18 | "label": "primary haemorrhagic stroke",
19 | "inclusions": [{"phenotype": "intracerebral_haemorrhage"}],
20 | "exclusion_units": [
21 | [{"phenotype": "mening_tumour"}],
22 | [{"phenotype": "metast_tumour"}],
23 | [{"phenotype": "tumour"}],
24 | [{"phenotype": "ischaemic_stroke"}],
25 | [{"phenotype": "contusion"}],
26 | [{"phenotype": "trauma"}],
27 | [{"phenotype": "subdural_haematoma"}],
28 | [{"phenotype": "transformation"}]
29 | ]
30 | },
31 |
32 | {
33 | "label": "subarachnoid haemorrhage",
34 | "inclusions": [{"phenotype": "subarachnoid_haemorrhage"}],
35 | "exclusion_units": [
36 | [{"phenotype": "mening_tumour"}],
37 | [{"phenotype": "metast_tumour"}],
38 | [{"phenotype": "tumour"}],
39 | [{"phenotype": "contusion"}],
40 | [{"phenotype": "trauma"}],
41 | [{"phenotype": "subdural_haematoma"}],
42 | [{"phenotype": "transformation"}],
43 | [{"phenotype": "intracerebral_haemorrhage"}]
44 | ]
45 | },
46 |
47 | {
48 | "label": "intracerebra haemorrhage",
49 | "inclusions": [{"phenotype": "intracerebral_haemorrhage"}],
50 | "exclusion_units": [
51 | [{"phenotype": "mening_tumour"}],
52 | [{"phenotype": "metast_tumour"}],
53 | [{"phenotype": "tumour"}],
54 | [{"phenotype": "ischaemic_stroke"}],
55 | [{"phenotype": "contusion"}],
56 | [{"phenotype": "trauma"}],
57 | [{"phenotype": "subdural_haematoma"}],
58 | [{"phenotype": "transformation"}],
59 | [{"phenotype": "subarachnoid_haemorrhage"}, {"phenotype": "aneurysm"}]
60 | ]
61 | },
62 |
63 | {
64 | "label": "ischaemic stroke",
65 | "inclusions": [{"phenotype": "ischaemic_stroke"}],
66 | "exclusion_units": [
67 | ]
68 | }
69 | ]
70 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | from os import listdir, rename
2 | from os.path import isfile, join, split
3 | import queue as Queue
4 | import threading
5 | import json
6 | import codecs
7 | import requests
8 |
9 |
10 | # list files in a folder and put them into a queue for multi-threaded processing
11 | def multi_thread_process_files(dir_path, file_extension, num_threads, process_func,
12 | proc_desc='processed', args=None, multi=None,
13 | file_filter_func=None, callback_func=None,
14 | thread_wise_objs=None):
15 | onlyfiles = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
16 | num_pdfs = 0
17 | files = None if multi is None else []
18 | lst = []
19 | for f in onlyfiles:
20 | if f.endswith('.' + file_extension) if file_filter_func is None \
21 | else file_filter_func(f):
22 | if multi is None:
23 | lst.append(join(dir_path, f))
24 | else:
25 | files.append(join(dir_path, f))
26 | if len(files) >= multi:
27 | lst.append(files)
28 | files = []
29 | num_pdfs += 1
30 | if files is not None and len(files) > 0:
31 | lst.append(files)
32 | multi_thread_tasking(lst, num_threads, process_func, proc_desc, args, multi, file_filter_func,
33 | callback_func,
34 | thread_wise_objs=thread_wise_objs)
35 |
36 |
37 | def multi_thread_tasking(lst, num_threads, process_func,
38 | proc_desc='processed', args=None, multi=None,
39 | file_filter_func=None, callback_func=None, thread_wise_objs=None,
40 | thread_init_func=None, thread_end_func=None,):
41 | num_pdfs = len(lst)
42 | pdf_queque = Queue.Queue(num_pdfs)
43 | # print('putting list into queue...')
44 | for item in lst:
45 | pdf_queque.put_nowait(item)
46 | thread_num = min(num_pdfs, num_threads)
47 | arr = [process_func] if args is None else [process_func] + args
48 | arr.insert(0, pdf_queque)
49 | # print('queue filled, threading...')
50 | thread_objs = []
51 | for i in range(thread_num):
52 | tarr = arr[:]
53 | thread_obj = None
54 | if thread_wise_objs is not None and isinstance(thread_wise_objs, list):
55 | thread_obj = thread_wise_objs[i]
56 | if thread_obj is None and thread_init_func is not None:
57 | thread_obj = thread_init_func()
58 | thread_objs.append(thread_obj)
59 | tarr.insert(0, thread_obj)
60 | t = threading.Thread(target=multi_thread_do, args=tuple(tarr))
61 | t.daemon = True
62 | t.start()
63 |
64 | # print('waiting jobs to finish')
65 | pdf_queque.join()
66 | if thread_end_func is not None:
67 | for to in thread_objs:
68 | if to is not None:
69 | thread_end_func(to)
70 | # print('{0} files {1}'.format(num_pdfs, proc_desc))
71 | if callback_func is not None:
72 |         callback_func(*(args or []))  # guard: args may be None
73 |
74 |
75 | def multi_thread_tasking_it(it_lst, num_threads, process_func,
76 | proc_desc='processed', args=None, multi=None,
77 | file_filter_func=None, callback_func=None, thread_wise_objs=None):
78 | pdf_queque = Queue.Queue(1000)
79 | thread_num = num_threads
80 | arr = [process_func] if args is None else [process_func] + args
81 | arr.insert(0, pdf_queque)
82 | # print('queue filled, threading...')
83 | for i in range(thread_num):
84 | tarr = arr[:]
85 | thread_obj = None
86 | if thread_wise_objs is not None and isinstance(thread_wise_objs, list):
87 | thread_obj = thread_wise_objs[i]
88 | tarr.insert(0, thread_obj)
89 | t = threading.Thread(target=multi_thread_do, args=tuple(tarr))
90 | t.daemon = True
91 | t.start()
92 |
93 | # print('waiting jobs to finish')
94 | # print('putting list into queue...')
95 | for item in it_lst:
96 | pdf_queque.put(item)
97 | pdf_queque.join()
98 | # print('{0} files {1}'.format(num_pdfs, proc_desc))
99 | if callback_func is not None:
100 |         callback_func(*(args or []))  # guard: args may be None
101 |
102 |
103 | def multi_thread_do(thread_obj, q, func, *args):
104 | while True:
105 | p = q.get()
106 | try:
107 | if thread_obj is not None:
108 | func(thread_obj, p, *args)
109 | else:
110 | func(p, *args)
111 | except Exception as e:
112 | print(u'error doing {0} on {1} \n{2}'.format(func, p, str(e)))
113 | q.task_done()
114 |
115 |
116 | def save_json_array(lst, file_path, encoding='utf-8'):
117 | with codecs.open(file_path, 'w', encoding=encoding) as wf:
118 | json.dump(lst, wf)
119 |
120 |
121 | def save_string(txt, file_path, encoding='utf-8'):
122 | with codecs.open(file_path, 'w', encoding=encoding) as wf:
123 | wf.write(txt)
124 |
125 |
126 | def load_json_data(file_path):
127 | data = None
128 | with codecs.open(file_path, encoding='utf-8') as rf:
129 |         data = json.load(rf)  # codecs.open already decodes; json.load's 'encoding' kwarg was removed in Python 3.9
130 | return data
131 |
132 |
133 | def http_post_result(url, payload, headers=None, auth=None):
134 | req = requests.post(
135 | url, headers=headers,
136 | data=payload, auth=auth)
137 |     return req.content.decode("utf-8")  # return text rather than the repr of a bytes object
138 |
139 |
140 | def multi_thread_large_file_tasking(large_file, num_threads, process_func,
141 | proc_desc='processed', args=None, multi=None,
142 | file_filter_func=None, callback_func=None,
143 | thread_init_func=None, thread_end_func=None,
144 | file_encoding='utf-8'):
145 | num_queue_size = 1000
146 | pdf_queque = Queue.Queue(num_queue_size)
147 | print('queue filled, threading...')
148 | thread_objs = []
149 | for i in range(num_threads):
150 | arr = [process_func] if args is None else [process_func] + args
151 | to = None
152 | if thread_init_func is not None:
153 | to = thread_init_func()
154 | thread_objs.append(to)
155 | arr.insert(0, to)
156 | arr.insert(1, pdf_queque)
157 | t = threading.Thread(target=multi_thread_do, args=tuple(arr))
158 | t.daemon = True
159 | t.start()
160 |
161 | print('putting list into queue...')
162 | num_lines = 0
163 | with codecs.open(large_file, encoding=file_encoding) as lf:
164 | for line in lf:
165 | num_lines += 1
166 | pdf_queque.put(line)
167 |
168 | print('waiting jobs to finish')
169 | pdf_queque.join()
170 | if thread_end_func is not None:
171 | for to in thread_objs:
172 | if to is not None:
173 | thread_end_func(to)
174 | print('{0} lines {1}'.format(num_lines, proc_desc))
175 | if callback_func is not None:
176 |         callback_func(*(args or []))  # guard: args may be None
177 |
178 |
179 | def read_text_file(file_path, encoding='utf-8'):
180 | lines = []
181 | with codecs.open(file_path, encoding=encoding) as rf:
182 | lines += rf.readlines()
183 | return [l.strip() for l in lines]
184 |
185 |
186 | def read_text_file_as_string(file_path, encoding='utf-8'):
187 | s = None
188 | with codecs.open(file_path, encoding=encoding) as rf:
189 | s = rf.read()
190 | return s
191 |
192 |
193 | def main():
194 | ann_dir = '/data/annotated_data/gold/'
195 | files = [f for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
196 | for f in files:
197 | rename(join(ann_dir, f), join(ann_dir, f[:-14] + '.txt.knowtator.xml'))
198 |
199 | if __name__ == "__main__":
200 | main()
201 |
--------------------------------------------------------------------------------
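A minimal usage sketch for multi_thread_tasking above: each item from the work list is handed to process_func as its first argument (preceded by a per-thread object when thread_wise_objs or thread_init_func is used), followed by the entries of args:

    from utils import multi_thread_tasking

    results = []

    def square_item(item, out):
        # worker: receives one queue item plus the extra args
        out.append(item * item)

    multi_thread_tasking(list(range(10)), num_threads=4, process_func=square_item,
                         args=[results])
    print(sorted(results))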