├── .gitignore ├── COPYING ├── README.md ├── commands ├── debug.sh ├── german.sh ├── german_cnn.sh └── random.sh ├── dataloaders ├── __init__.py ├── data_loader.py ├── data_loader_orig.py └── dataloader_unicode.py ├── datasets └── english │ ├── eng.dev.bio.conll │ ├── eng.test.bio.conll │ └── eng.train.bio.conll ├── eval ├── IO2BIO.py ├── IO2BIOES.py ├── conlleval ├── conlleval.v2 ├── eval.sh └── format.py ├── main.py ├── models ├── __init__.py ├── decoders.py ├── encoders.py └── model_builder.py └── utils ├── Convert_Output_Darpa.py ├── Convert_to_darpa_xml.py ├── __init__.py ├── extract_authors.py ├── features.py ├── gaz.csv ├── old_segnerfts.py ├── orm_morph.py ├── orm_norm ├── __init__.py ├── lexicon_supplement.txt ├── morpar_orm.py ├── orm_gaz.txt ├── orm_lexicon.txt ├── orm_lexicon_wikibooks.txt ├── orm_morph.py └── ormnorm.py ├── post_process.py ├── segnerfts ├── README.md ├── __init__.py ├── gaz.csv ├── morpar.py ├── morpar_orm.py ├── orm_morph.py ├── segnerfts.py └── tir_morph.py ├── segnerfts_2.py ├── split_train_ensemble.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # macOS files 2 | **/*.DS_Store 3 | *.pyc 4 | # JetBrains 5 | .idea/ 6 | *.iml 7 | *.txt 8 | datasets/embedding/ 9 | maxma/ 10 | eval/ 11 | 12 | !utils/orm_norm/*.txt 13 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright 2018 cmu-ner team. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cmu-ner 2 | 3 | by Chunting Zhou, Aditi Chaudhary, David Mortenson 4 | (in collaboration w/ Graham Neubig and Jaime Carbonell) 5 | 6 | CMU-NER is a suite of software to detect Named Entities, such as people, locations, geo-political entities, in text in different languages. This software suite is primarily based on deep learning methods. 
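## Quick start

A minimal English training run can be launched from the `commands/` directory. The sketch below simply mirrors `commands/debug.sh` (shown in full further down); it assumes a working DyNet installation and relies on `main.py`'s defaults for anything not listed here (the training/dev/test paths presumably default to the bundled English CoNLL data under `datasets/english`, so adjust them for your setup).

```bash
cd commands
python ../main.py \
    --dynet-seed 3278657 \
    --word_emb_dim 100 \
    --batch_size 10 \
    --model_name "eng" \
    --lang eng \
    --valid_freq 1300
```

`commands/german.sh` shows a fuller configuration (pretrained embeddings, dropout rates, and the character-level BiRNN sizes); it expects a log-file name as its first argument.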
7 | 8 | ## TODO 9 | 10 | * Cross-lingual transfer learning with cheap translation. 11 | * Reproduce the morphology tag features. 12 | * Add IPA transfer learning. 13 | 14 | ## Acknowledgements 15 | 16 | The development of this software has been sponsored by a DARPA-funded project at CMU called AIRES under DARPA's LORELEI initiative. 17 | 18 | ## License 19 | 20 | This software is available under the BSD license (see COPYING for details). 21 | -------------------------------------------------------------------------------- /commands/debug.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python ../main.py \ 3 | --dynet-seed 3278657 \ 4 | --word_emb_dim 100 \ 5 | --batch_size 10 \ 6 | --model_name "eng" \ 7 | --lang eng \ 8 | --valid_freq 1300 9 | 10 | # --pretrain_emb_path ../new_datasets/embs/glove.6B.100d.txt\ 11 | -------------------------------------------------------------------------------- /commands/german.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | MODEL_NAME=$1 3 | python ../main.py \ 4 | --dynet-seed 5783287 \ 5 | --word_emb_dim 64 \ 6 | --batch_size 10 \ 7 | --train_path ../datasets/german/deu.train.utf8.conll \ 8 | --dev_path ../datasets/german/deu.testa.utf8.conll \ 9 | --test_path ../datasets/german/deu.testb.utf8.conll \ 10 | --pretrain_emb_path ../datasets/embs/sskip/ger_emb.txt \ 11 | --emb_dropout_rate 0.0 \ 12 | --output_dropout_rate 0.5 \ 13 | --init_lr 0.01 \ 14 | --model_arc char_birnn \ 15 | --tag_emb_dim 100 \ 16 | --hidden_dim 100 \ 17 | --char_emb_dim 30\ 18 | --char_hidden_dim 25 \ 19 | --lang german \ 20 | --replace_unk_rate 0.5 \ 21 | --valid_freq 1300 2>&1 | tee ${MODEL_NAME} 22 | -------------------------------------------------------------------------------- /commands/german_cnn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash -------------------------------------------------------------------------------- /commands/random.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python ../main.py \ 3 | --dynet-seed 3278657 \ 4 | --word_emb_dim 100 \ 5 | --batch_size 10 \ 6 | --lang eng 7 | -------------------------------------------------------------------------------- /dataloaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/cmu-ner/d35d57fe453d81cc98e3ee55bac58f9ca618f59b/dataloaders/__init__.py -------------------------------------------------------------------------------- /dataloaders/data_loader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | import os 3 | from utils.util import * 4 | from utils.features import * 5 | from utils.segnerfts import orm_morph as ormnorm 6 | 7 | class NER_DataLoader(): 8 | def __init__(self, args, special_normal=False): 9 | # This is the data loader as well as the feature extractor. 10 | '''Data format: id word pos_tag syntactic_tag NER_tag''' 11 | ''' TODO: 1. normalizing all digits 12 | 2.
Using full vocabulary from GloVe, when testing, lower case first''' 13 | self.args = args 14 | if args.train_ensemble: 15 | self.train_path = args.full_data_path 16 | else: 17 | self.train_path = args.train_path 18 | self.test_path = args.test_path 19 | self.dev_path = args.dev_path 20 | self.args = args 21 | 22 | self.tag_vocab_path = self.train_path + ".tag_vocab" 23 | self.word_vocab_path = self.train_path + ".word_vocab" 24 | self.char_vocab_path = self.train_path + ".char_vocab" 25 | 26 | self.pretrained_embedding_path = args.pretrain_emb_path 27 | self.use_discrete_feature = args.use_discrete_features 28 | self.use_brown_cluster = args.use_brown_cluster 29 | self.orm_norm = args.oromo_normalize 30 | self.orm_lower = args.train_lowercase_oromo 31 | 32 | if special_normal: 33 | self.orm_norm = False 34 | self.orm_lower = False 35 | 36 | if self.use_brown_cluster: 37 | self.brown_cluster_dicts = get_brown_cluster(args.brown_cluster_path) 38 | self.brown_cluster_dicts[''] = 499 39 | else: 40 | self.brown_cluster_dicts = None 41 | 42 | if False and os.path.exists(self.tag_vocab_path) and os.path.exists(self.word_vocab_path) and os.path.exists(self.char_vocab_path): 43 | # TODO: encoding? 44 | print("Load vocabs from file ....") 45 | self.tag_to_id = pkl_load(self.tag_vocab_path) 46 | self.word_to_id = pkl_load(self.word_vocab_path) 47 | self.char_to_id = pkl_load(self.char_vocab_path) 48 | print("Done!") 49 | else: 50 | print("Generating vocabs from training file ....") 51 | if not self.args.isLr: 52 | paths_to_read = [self.train_path, self.test_path, self.dev_path] 53 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files(paths_to_read) 54 | else: 55 | paths_to_read = [self.train_path] 56 | setEpaths = [self.dev_path, self.test_path] 57 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files_lr(paths_to_read, setEpaths) 58 | # FIXME: Remember dictionary value for char and word has been shifted by 1 59 | print "Size of vocab before: ", len(self.word_to_id) 60 | self.word_to_id[''] = len(self.word_to_id) + 1 61 | self.char_to_id[''] = len(self.char_to_id) + 1 62 | 63 | self.word_to_id['<\s>'] = 0 64 | self.char_to_id[''] = 0 65 | print "Size of vocab after: ", len(self.word_to_id) 66 | pkl_dump(self.tag_to_id, self.tag_vocab_path) 67 | pkl_dump(self.char_to_id, self.char_vocab_path) 68 | pkl_dump(self.word_to_id, self.word_vocab_path) 69 | 70 | self.word_padding_token = 0 71 | self.char_padding_token = 0 72 | 73 | if self.pretrained_embedding_path is not None: 74 | self.pretrain_word_emb, self.word_to_id = get_pretrained_emb(self.pretrained_embedding_path, 75 | self.word_to_id, args.word_emb_dim) 76 | # for char vocab and word vocab, we reserve id 0 for the eos padding, and len(vocab)-1 for the 77 | self.id_to_tag = {v: k for k, v in self.tag_to_id.iteritems()} 78 | self.id_to_word = {v: k for k, v in self.word_to_id.iteritems()} 79 | self.id_to_char = {v: k for k, v in self.char_to_id.iteritems()} 80 | 81 | self.ner_vocab_size = len(self.id_to_tag) 82 | self.word_vocab_size = len(self.id_to_word) 83 | self.char_vocab_size = len(self.id_to_char) 84 | 85 | print "Size of vocab after: ", len(self.word_to_id) 86 | print("NER tag num=%d, Word vocab size=%d, Char Vocab size=%d" % (self.ner_vocab_size, self.word_vocab_size, self.char_vocab_size)) 87 | 88 | @staticmethod 89 | def exists(path): 90 | return os.path.exists(path) 91 | 92 | def read_one_line(self, line, tag_set, word_dict, char_set): 93 | for w in line: 94 | fields = w.split() 95 | word = fields[0] 96 | 
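# Column convention used by this loader: the first whitespace-separated field of a
# CoNLL row is taken as the surface token and the last field as the NER tag; any
# columns in between (e.g. POS or chunk tags) are ignored.
# e.g. "EU NNP B-NP B-ORG" -> token "EU", NER tag "B-ORG"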
ner_tag = fields[-1] 97 | for c in word: 98 | char_set.add(c) 99 | tag_set.add(ner_tag) 100 | if self.orm_lower: 101 | word = word.lower() 102 | if self.orm_norm: 103 | #word = orm_morph.best_parse(word) 104 | word = ormnorm.normalize(word) 105 | word_dict[word] += 1 106 | 107 | def get_vocab_from_set(self, a_set, shift=0): 108 | vocab = {} 109 | for i, elem in enumerate(a_set): 110 | vocab[elem] = i + shift 111 | 112 | return vocab 113 | 114 | def get_vocab_from_dict(self, a_dict, shift=0, remove_singleton=False): 115 | vocab = {} 116 | i = 0 117 | self.singleton_words = set() 118 | for k, v in a_dict.iteritems(): 119 | if v == 1: 120 | self.singleton_words.add(i + shift) 121 | if remove_singleton: 122 | if v > 1: 123 | # print k, v 124 | vocab[k] = i + shift 125 | i += 1 126 | else: 127 | vocab[k] = i + shift 128 | i += 1 129 | print "Singleton words number: ", len(self.singleton_words) 130 | return vocab 131 | 132 | def read_files(self, paths): 133 | # word_list = [] 134 | # char_list = [] 135 | # tag_list = [] 136 | word_dict = defaultdict(lambda: 0) 137 | char_set = set() 138 | tag_set = set() 139 | 140 | def _read_a_file(path): 141 | with codecs.open(path, "r", "utf-8") as fin: 142 | to_read_line = [] 143 | for line in fin: 144 | if line.strip() == "": 145 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 146 | to_read_line = [] 147 | else: 148 | to_read_line.append(line.strip()) 149 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 150 | 151 | for path in paths: 152 | _read_a_file(path) 153 | 154 | tag_vocab = self.get_vocab_from_set(tag_set) 155 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 156 | char_vocab = self.get_vocab_from_set(char_set, 1) 157 | 158 | return tag_vocab, word_vocab, char_vocab 159 | 160 | def read_files_lr(self, paths, setEpaths): 161 | # word_list = [] 162 | # char_list = [] 163 | # tag_list = [] 164 | word_dict = defaultdict(lambda: 0) 165 | char_set = set() 166 | tag_set = set() 167 | 168 | def _read_a_file(path): 169 | with codecs.open(path, "r", "utf-8") as fin: 170 | to_read_line = [] 171 | for line in fin: 172 | if line.strip() == "": 173 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 174 | to_read_line = [] 175 | else: 176 | to_read_line.append(line.strip()) 177 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 178 | 179 | for path in paths: 180 | _read_a_file(path) 181 | 182 | #reading from SetE 183 | for path in setEpaths: 184 | with codecs.open(path, "r", "utf-8") as fin: 185 | for line in fin: 186 | fields = line.strip().split() 187 | for word in fields: 188 | for c in word: 189 | char_set.add(c) 190 | if self.orm_lower: 191 | word = word.lower() 192 | if self.orm_norm: 193 | #word = orm_morph.best_parse(word) 194 | word = ormnorm.normalize(word) 195 | word_dict[word] += 1 196 | 197 | tag_vocab = self.get_vocab_from_set(tag_set) 198 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 199 | char_vocab = self.get_vocab_from_set(char_set, 1) 200 | 201 | return tag_vocab, word_vocab, char_vocab 202 | 203 | def get_data_set(self, path, lang): 204 | sents = [] 205 | char_sents = [] 206 | tgt_tags = [] 207 | discrete_features = [] 208 | bc_features = [] 209 | 210 | def add_sent(one_sent): 211 | temp_sent = [] 212 | temp_ner = [] 213 | temp_char = [] 214 | temp_bc = [] 215 | for w in one_sent: 216 | fields = w.split() 217 | word = fields[0] 218 | ner_tag = fields[-1] 219 | if self.use_brown_cluster: 220 | 
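# Brown-cluster feature: look up the cluster id of the raw surface token (i.e. before
# the optional Oromo lowercasing/normalization applied below); tokens not covered by
# the cluster file fall back to the placeholder cluster registered in __init__.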
temp_bc.append(self.brown_cluster_dicts[word] if word in self.brown_cluster_dicts else self.brown_cluster_dicts[""]) 221 | 222 | if self.orm_lower: 223 | word = word.lower() 224 | 225 | if self.orm_norm: 226 | #word = orm_morph.best_parse(word) # Not sure whether it would be better adding this line behind or after temp_char 227 | word = ormnorm.normalize(word) 228 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 229 | temp_ner.append(self.tag_to_id[ner_tag]) 230 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 231 | 232 | sents.append(temp_sent) 233 | char_sents.append(temp_char) 234 | tgt_tags.append(temp_ner) 235 | bc_features.append(temp_bc) 236 | if not self.args.isLr: 237 | discrete_features.append([]) 238 | else: 239 | discrete_features.append(get_feature_sent(lang, one_sent, self.args) if self.use_discrete_feature else []) 240 | 241 | # print len(discrete_features[-1]) 242 | 243 | with codecs.open(path, "r", "utf-8") as fin: 244 | i = 0 245 | one_sent = [] 246 | for line in fin: 247 | if line.strip() == "": 248 | if len(one_sent) > 0: 249 | add_sent(one_sent) 250 | i += 1 251 | if i % 1000 == 0: 252 | print("Processed %d training data." % (i,)) 253 | one_sent = [] 254 | else: 255 | one_sent.append(line.strip()) 256 | 257 | if len(one_sent) > 0: 258 | add_sent(one_sent) 259 | 260 | if self.use_discrete_feature: 261 | self.num_feats = len(discrete_features[0][0]) 262 | else: 263 | self.num_feats = 0 264 | return sents, char_sents, tgt_tags, discrete_features, bc_features 265 | 266 | def get_lr_test(self, path, lang): 267 | # setE.txt 268 | sents = [] 269 | char_sents = [] 270 | discrete_features = [] 271 | bc_features = [] 272 | 273 | def add_sent(one_sent): 274 | temp_sent = [] 275 | temp_char = [] 276 | temp_bc = [] 277 | for word in one_sent: 278 | if self.use_brown_cluster: 279 | temp_bc.append(self.brown_cluster_dicts[word] if word in self.brown_cluster_dicts else self.brown_cluster_dicts[""]) 280 | if self.orm_lower: 281 | word = word.lower() 282 | if self.orm_norm: 283 | #word = orm_morph.best_parse(word) # Not sure whether it would be better adding this line behind or after temp_char 284 | word = ormnorm.normalize(word) 285 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 286 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 287 | 288 | sents.append(temp_sent) 289 | char_sents.append(temp_char) 290 | discrete_features.append(get_feature_sent(lang, one_sent, self.args) if self.use_discrete_feature else []) 291 | bc_features.append(temp_bc) 292 | 293 | original_sents = [] 294 | with codecs.open(path, "r", "utf-8") as fin: 295 | i = 0 296 | for line in fin: 297 | one_sent = line.rstrip().split() 298 | if line: 299 | add_sent(one_sent) 300 | original_sents.append(one_sent) 301 | i += 1 302 | if i % 1000 == 0: 303 | print("Processed %d testing data." 
% (i,)) 304 | 305 | if self.use_discrete_feature: 306 | self.num_feats = len(discrete_features[0][0]) 307 | else: 308 | self.num_feats = 0 309 | 310 | return sents, char_sents, discrete_features, original_sents, bc_features 311 | 312 | def get_lr_test_setE(self, path, lang): 313 | # setE.conll 314 | sents = [] 315 | char_sents = [] 316 | discrete_features = [] 317 | bc_features = [] 318 | doc_ids = [] 319 | original_sents = [] 320 | 321 | def add_sent(one_sent): 322 | temp_sent = [] 323 | temp_char = [] 324 | temp_bc = [] 325 | temp_ori_sent = [] 326 | for w in one_sent: 327 | tokens = w.split('\t') 328 | word = tokens[0] 329 | temp_ori_sent.append(word) 330 | docfile = tokens[3] 331 | doc_type = docfile.split('_')[1] 332 | if self.use_brown_cluster: 333 | temp_bc.append(self.brown_cluster_dicts[word] if word in self.brown_cluster_dicts else self.brown_cluster_dicts[""]) 334 | 335 | if self.orm_lower: 336 | word = word.lower() 337 | 338 | if self.orm_norm: 339 | #word = orm_morph.best_parse(word) 340 | word = ormnorm.normalize(word) 341 | 342 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 343 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 344 | 345 | doc_ids.append(docfile.split('_')[1]) 346 | sents.append(temp_sent) 347 | char_sents.append(temp_char) 348 | bc_features.append(temp_bc) 349 | discrete_features.append(get_feature_sent(lang, one_sent, self.args) if self.use_discrete_feature else []) 350 | original_sents.append(temp_ori_sent) 351 | # print len(discrete_features[-1]) 352 | 353 | with codecs.open(path, "r", "utf-8") as fin: 354 | i = 0 355 | one_sent = [] 356 | for line in fin: 357 | if line.strip() == "": 358 | if len(one_sent) > 0: 359 | add_sent(one_sent) 360 | one_sent = [] 361 | else: 362 | one_sent.append(line.strip()) 363 | i += 1 364 | if i % 1000 == 0: 365 | print("Processed %d testing data." 
% (i,)) 366 | 367 | if len(one_sent) > 0: 368 | add_sent(one_sent) 369 | 370 | if self.use_discrete_feature: 371 | self.num_feats = len(discrete_features[0][0]) 372 | else: 373 | self.num_feats = 0 374 | 375 | return sents, char_sents, discrete_features, bc_features, original_sents, doc_ids 376 | 377 | 378 | class Dataloader_Combine(): 379 | def __init__(self, args, normal_vocab, lower_vocab, char_to_id, brown_cluster_dicts=None, lower_brown_dicts=None): 380 | self.word_to_id = normal_vocab 381 | self.lower_word_to_id = lower_vocab 382 | self.args = args 383 | 384 | self.char_to_id = char_to_id 385 | self.brown_cluster_dicts = brown_cluster_dicts 386 | self.lower_brown_dicts = lower_brown_dicts 387 | 388 | self.use_discrete_feature = args.use_discrete_features 389 | self.use_brown_cluster = args.use_brown_cluster 390 | self.orm_norm = args.oromo_normalize 391 | self.orm_lower = args.train_lowercase_oromo 392 | 393 | def get_lr_test_setE(self, path, lang): 394 | # setE.conll 395 | sents = [] 396 | char_sents = [] 397 | discrete_features = [] 398 | bc_features = [] 399 | doc_ids = [] 400 | original_sents = [] 401 | 402 | def add_sent(one_sent): 403 | temp_sent = [] 404 | temp_char = [] 405 | temp_bc = [] 406 | temp_ori_sent = [] 407 | for w in one_sent: 408 | tokens = w.split('\t') 409 | word = tokens[0] 410 | temp_ori_sent.append(word) 411 | docfile = tokens[3] 412 | doc_type = docfile.split('_')[1] 413 | if self.use_brown_cluster: 414 | if doc_type == "SN": 415 | temp_bc.append(self.lower_brown_dicts[word] if word in self.lower_brown_dicts else self.lower_brown_dicts[""]) 416 | else: 417 | temp_bc.append(self.brown_cluster_dicts[word] if word in self.brown_cluster_dicts else self.brown_cluster_dicts[""]) 418 | 419 | if doc_type == "SN": 420 | if self.orm_lower: 421 | word = word.lower() 422 | 423 | if self.orm_norm: 424 | #word = orm_morph.best_parse(word) # Not sure whether it would be better adding this line behind or after temp_char 425 | word = ormnorm.normalize(word) 426 | temp_sent.append(self.lower_word_to_id[word] if word in self.lower_word_to_id else self.lower_word_to_id[""]) 427 | else: 428 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 429 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 430 | 431 | doc_ids.append(docfile.split('_')[1]) 432 | sents.append(temp_sent) 433 | char_sents.append(temp_char) 434 | bc_features.append(temp_bc) 435 | discrete_features.append(get_feature_sent(lang, one_sent, self.args) if self.use_discrete_feature else []) 436 | original_sents.append(temp_ori_sent) 437 | # print len(discrete_features[-1]) 438 | 439 | with codecs.open(path, "r", "utf-8") as fin: 440 | i = 0 441 | one_sent = [] 442 | for line in fin: 443 | if line.strip() == "": 444 | if len(one_sent) > 0: 445 | add_sent(one_sent) 446 | one_sent = [] 447 | else: 448 | one_sent.append(line.strip()) 449 | i += 1 450 | if i % 1000 == 0: 451 | print("Processed %d testing data." 
% (i,)) 452 | 453 | if len(one_sent) > 0: 454 | add_sent(one_sent) 455 | 456 | if self.use_discrete_feature: 457 | self.num_feats = len(discrete_features[0][0]) 458 | else: 459 | self.num_feats = 0 460 | 461 | return sents, char_sents, discrete_features, bc_features, original_sents, doc_ids 462 | -------------------------------------------------------------------------------- /dataloaders/data_loader_orig.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | from models.utils import * 3 | import codecs 4 | import os 5 | from models.features import * 6 | 7 | class NER_DataLoader(): 8 | def __init__(self, args): 9 | '''Data format: id word pos_tag syntactic_tag NER_tag''' 10 | ''' TODO: 1. normalizing all digits 11 | 2. Using full vocabulary from GloVe, when testing, lower case first''' 12 | self.train_path = args.train_path 13 | self.test_path = args.test_path 14 | self.dev_path = args.dev_path 15 | self.args = args 16 | 17 | self.tag_vocab_path = self.train_path + ".tag_vocab" 18 | self.word_vocab_path = self.train_path + ".word_vocab" 19 | self.char_vocab_path = self.train_path + ".char_vocab" 20 | 21 | self.pretrained_embedding_path = args.pretrain_emb_path 22 | self.use_discrete_feature = args.use_discrete_features 23 | 24 | if False and os.path.exists(self.tag_vocab_path) and os.path.exists(self.word_vocab_path) and os.path.exists(self.char_vocab_path): 25 | # TODO: encoding? 26 | print("Load vocabs from file ....") 27 | self.tag_to_id = pkl_load(self.tag_vocab_path) 28 | self.word_to_id = pkl_load(self.word_vocab_path) 29 | self.char_to_id = pkl_load(self.char_vocab_path) 30 | print("Done!") 31 | else: 32 | print("Generating vocabs from training file ....") 33 | paths_to_read = [self.train_path, self.test_path, self.dev_path] 34 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files(paths_to_read) 35 | # FIXME: Remember dictionary value for char and word has been shifted by 1 36 | print "Size of vocab before: ", len(self.word_to_id) 37 | self.word_to_id[''] = len(self.word_to_id) + 1 38 | self.char_to_id[''] = len(self.char_to_id) + 1 39 | 40 | self.word_to_id[''] = 0 41 | self.char_to_id[''] = 0 42 | print "Size of vocab after: ", len(self.word_to_id) 43 | pkl_dump(self.tag_to_id, self.tag_vocab_path) 44 | pkl_dump(self.char_to_id, self.char_vocab_path) 45 | pkl_dump(self.word_to_id, self.word_vocab_path) 46 | 47 | self.word_padding_token = 0 48 | self.char_padding_token = 0 49 | 50 | if self.pretrained_embedding_path is not None: 51 | self.pretrain_word_emb, self.word_to_id = get_pretrained_emb(self.pretrained_embedding_path, 52 | self.word_to_id, args.word_emb_dim) 53 | # for char vocab and word vocab, we reserve id 0 for the eos padding, and len(vocab)-1 for the 54 | self.id_to_tag = {v: k for k, v in self.tag_to_id.iteritems()} 55 | self.id_to_word = {v: k for k, v in self.word_to_id.iteritems()} 56 | self.id_to_char = {v: k for k, v in self.char_to_id.iteritems()} 57 | 58 | self.ner_vocab_size = len(self.id_to_tag) 59 | self.word_vocab_size = len(self.id_to_word) 60 | self.char_vocab_size = len(self.id_to_char) 61 | 62 | print "Size of vocab after: ", len(self.word_to_id) 63 | print("NER tag num=%d, Word vocab size=%d, Char Vocab size=%d" % (self.ner_vocab_size, self.word_vocab_size, self.char_vocab_size)) 64 | 65 | @staticmethod 66 | def exists(path): 67 | return os.path.exists(path) 68 | 69 | def read_one_line(self, line, tag_set, word_dict, char_set): 70 | for w in line: 71 | fields = w.split() 
72 | word = fields[0] 73 | ner_tag = fields[-1] 74 | for c in word: 75 | char_set.add(c) 76 | tag_set.add(ner_tag) 77 | word_dict[word] += 1 78 | 79 | def get_vocab_from_set(self, a_set, shift=0): 80 | vocab = {} 81 | for i, elem in enumerate(a_set): 82 | vocab[elem] = i + shift 83 | 84 | return vocab 85 | 86 | def get_vocab_from_dict(self, a_dict, shift=0, remove_singleton=False): 87 | vocab = {} 88 | i = 0 89 | self.singleton_words = set() 90 | for k, v in a_dict.iteritems(): 91 | if v == 1: 92 | self.singleton_words.add(i + shift) 93 | if remove_singleton: 94 | if v > 1: 95 | # print k, v 96 | vocab[k] = i + shift 97 | i += 1 98 | else: 99 | vocab[k] = i + shift 100 | i += 1 101 | return vocab 102 | 103 | def read_files(self, paths): 104 | # word_list = [] 105 | # char_list = [] 106 | # tag_list = [] 107 | word_dict = defaultdict(lambda: 0) 108 | char_set = set() 109 | tag_set = set() 110 | 111 | def _read_a_file(path): 112 | with codecs.open(path, "r", "utf-8") as fin: 113 | to_read_line = [] 114 | for line in fin: 115 | if line.strip() == "": 116 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 117 | to_read_line = [] 118 | else: 119 | to_read_line.append(line.strip()) 120 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 121 | 122 | for path in paths: 123 | _read_a_file(path) 124 | 125 | tag_vocab = self.get_vocab_from_set(tag_set) 126 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 127 | char_vocab = self.get_vocab_from_set(char_set, 1) 128 | 129 | return tag_vocab, word_vocab, char_vocab 130 | 131 | def get_data_set(self, path, lang, training=True): 132 | sents = [] 133 | char_sents = [] 134 | tgt_tags = [] 135 | discrete_features = [] 136 | 137 | def add_sent(one_sent): 138 | temp_sent = [] 139 | temp_ner = [] 140 | temp_char = [] 141 | 142 | for w in one_sent: 143 | fields = w.split() 144 | word = fields[0] 145 | ner_tag = fields[-1] 146 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 147 | temp_ner.append(self.tag_to_id[ner_tag]) 148 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 149 | sents.append(temp_sent) 150 | char_sents.append(temp_char) 151 | tgt_tags.append(temp_ner) 152 | discrete_features.append(get_feature_w(lang, one_sent)[0] if self.use_discrete_feature else []) 153 | 154 | with codecs.open(path, "r", "utf-8") as fin: 155 | one_sent = [] 156 | for line in fin: 157 | if line.strip() == "": 158 | if len(one_sent) > 0: 159 | add_sent(one_sent) 160 | one_sent = [] 161 | else: 162 | one_sent.append(line.strip()) 163 | if len(one_sent) > 0: 164 | add_sent(one_sent) 165 | 166 | if self.use_discrete_feature: 167 | self.num_feats = len(discrete_features[0][0]) 168 | else: 169 | self.num_feats = 0 170 | return sents, char_sents, tgt_tags, discrete_features 171 | 172 | def get_lr_test(self, path, lang): 173 | sents = [] 174 | char_sents = [] 175 | discrete_features = [] 176 | 177 | def add_sent(one_sent): 178 | temp_sent = [] 179 | temp_char = [] 180 | temp_discrete = [] 181 | for word in one_sent: 182 | if self.use_discrete_feature: 183 | temp_discrete.append(get_feature_w(lang, word)) 184 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 185 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 186 | sents.append(temp_sent) 187 | char_sents.append(temp_char) 188 | discrete_features.append(temp_discrete) 189 | 190 | 
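# Unlabeled (setE-style) test input: one whitespace-tokenized sentence per line with
# no gold tags, so only word ids, character ids and the optional discrete features
# are produced here.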
original_sents = [] 191 | with codecs.open(path, "r", "utf-8") as fin: 192 | for line in fin: 193 | one_sent = line.rstrip().split() 194 | if line: 195 | add_sent(one_sent) 196 | original_sents.append(one_sent) 197 | 198 | if self.use_discrete_feature: 199 | self.num_feats = len(discrete_features[0][0]) 200 | else: 201 | self.num_feats = 0 202 | 203 | return sents, char_sents, discrete_features, original_sents -------------------------------------------------------------------------------- /dataloaders/dataloader_unicode.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | import os 3 | 4 | from utils.features import * 5 | from utils.util import * 6 | 7 | 8 | class NER_DataLoader(): 9 | def __init__(self, args): 10 | '''Data format: id word pos_tag syntactic_tag NER_tag''' 11 | ''' TODO: 1. normalizing all digits 12 | 2. Using full vocabulary from GloVe, when testing, lower case first''' 13 | self.train_path = args.train_path 14 | self.test_path = args.test_path 15 | self.dev_path = args.dev_path 16 | self.args = args 17 | 18 | self.tag_vocab_path = self.train_path + ".tag_vocab" 19 | self.word_vocab_path = self.train_path + ".word_vocab" 20 | self.char_vocab_path = self.train_path + ".char_vocab" 21 | 22 | self.pretrained_embedding_path = args.pretrain_emb_path 23 | self.use_discrete_feature = args.use_discrete_features 24 | 25 | if False and os.path.exists(self.tag_vocab_path) and os.path.exists(self.word_vocab_path) and os.path.exists(self.char_vocab_path): 26 | # TODO: encoding? 27 | print("Load vocabs from file ....") 28 | self.tag_to_id = pkl_load(self.tag_vocab_path) 29 | self.word_to_id = pkl_load(self.word_vocab_path) 30 | self.char_to_id = pkl_load(self.char_vocab_path) 31 | print("Done!") 32 | else: 33 | print("Generating vocabs from training file ....") 34 | if not self.args.isLr: 35 | paths_to_read = [self.train_path, self.test_path, self.dev_path] 36 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files(paths_to_read) 37 | else: 38 | paths_to_read = [self.train_path, self.dev_path] 39 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files_lr(paths_to_read,self.test_path) 40 | # FIXME: Remember dictionary value for char and word has been shifted by 1 41 | print "Size of vocab before: ", len(self.word_to_id) 42 | self.word_to_id[''] = len(self.word_to_id) + 1 43 | self.char_to_id[''] = len(self.char_to_id) + 1 44 | 45 | self.word_to_id['<\s>'] = 0 46 | self.char_to_id[''] = 0 47 | print "Size of vocab after: ", len(self.word_to_id) 48 | pkl_dump(self.tag_to_id, self.tag_vocab_path) 49 | pkl_dump(self.char_to_id, self.char_vocab_path) 50 | pkl_dump(self.word_to_id, self.word_vocab_path) 51 | 52 | self.word_padding_token = 0 53 | self.char_padding_token = 0 54 | 55 | if self.pretrained_embedding_path is not None: 56 | self.pretrain_word_emb, self.word_to_id = get_pretrained_emb(self.pretrained_embedding_path, 57 | self.word_to_id, args.word_emb_dim) 58 | # for char vocab and word vocab, we reserve id 0 for the eos padding, and len(vocab)-1 for the 59 | self.id_to_tag = {v: k for k, v in self.tag_to_id.iteritems()} 60 | self.id_to_word = {v: k for k, v in self.word_to_id.iteritems()} 61 | self.id_to_char = {v: k for k, v in self.char_to_id.iteritems()} 62 | 63 | self.ner_vocab_size = len(self.id_to_tag) 64 | self.word_vocab_size = len(self.id_to_word) 65 | self.char_vocab_size = len(self.id_to_char) 66 | 67 | print "Size of vocab after: ", len(self.word_to_id) 68 | print("NER tag 
num=%d, Word vocab size=%d, Char Vocab size=%d" % (self.ner_vocab_size, self.word_vocab_size, self.char_vocab_size)) 69 | 70 | @staticmethod 71 | def exists(path): 72 | return os.path.exists(path) 73 | 74 | def read_one_line(self, line, tag_set, word_dict, char_set): 75 | for w in line: 76 | fields = w.split() 77 | word = fields[0] 78 | ner_tag = fields[-1] 79 | for c in word: 80 | char_set.add(c) 81 | tag_set.add(ner_tag) 82 | word_dict[word] += 1 83 | 84 | def get_vocab_from_set(self, a_set, shift=0): 85 | vocab = {} 86 | for i, elem in enumerate(a_set): 87 | vocab[elem] = i + shift 88 | 89 | return vocab 90 | 91 | def get_vocab_from_dict(self, a_dict, shift=0, remove_singleton=False): 92 | vocab = {} 93 | i = 0 94 | self.singleton_words = set() 95 | for k, v in a_dict.iteritems(): 96 | if v == 1: 97 | self.singleton_words.add(i + shift) 98 | if remove_singleton: 99 | if v > 1: 100 | # print k, v 101 | vocab[k] = i + shift 102 | i += 1 103 | else: 104 | vocab[k] = i + shift 105 | i += 1 106 | return vocab 107 | 108 | def read_files(self, paths): 109 | # word_list = [] 110 | # char_list = [] 111 | # tag_list = [] 112 | word_dict = defaultdict(lambda: 0) 113 | char_set = set() 114 | tag_set = set() 115 | 116 | def _read_a_file(path): 117 | with codecs.open(path, "r") as fin: 118 | to_read_line = [] 119 | for line in fin: 120 | if line.strip() == "": 121 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 122 | to_read_line = [] 123 | else: 124 | to_read_line.append(line.strip()) 125 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 126 | 127 | for path in paths: 128 | _read_a_file(path) 129 | 130 | tag_vocab = self.get_vocab_from_set(tag_set) 131 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 132 | char_vocab = self.get_vocab_from_set(char_set, 1) 133 | 134 | return tag_vocab, word_vocab, char_vocab 135 | 136 | def read_files_lr(self, paths, test_path): 137 | # word_list = [] 138 | # char_list = [] 139 | # tag_list = [] 140 | word_dict = defaultdict(lambda: 0) 141 | char_set = set() 142 | tag_set = set() 143 | 144 | def _read_a_file(path): 145 | with codecs.open(path, "r") as fin: 146 | to_read_line = [] 147 | for line in fin: 148 | if line.strip() == "": 149 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 150 | to_read_line = [] 151 | else: 152 | to_read_line.append(line.strip()) 153 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 154 | 155 | for path in paths: 156 | _read_a_file(path) 157 | 158 | #reading from SetE 159 | with codecs.open(test_path, "r") as fin: 160 | for line in fin: 161 | fields = line.strip().split() 162 | for word in fields: 163 | for c in word: 164 | char_set.add(c) 165 | word_dict[word] += 1 166 | 167 | tag_vocab = self.get_vocab_from_set(tag_set) 168 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 169 | char_vocab = self.get_vocab_from_set(char_set, 1) 170 | 171 | return tag_vocab, word_vocab, char_vocab 172 | 173 | def get_data_set(self, path, lang): 174 | sents = [] 175 | char_sents = [] 176 | tgt_tags = [] 177 | discrete_features = [] 178 | 179 | def add_sent(one_sent): 180 | temp_sent = [] 181 | temp_ner = [] 182 | temp_char = [] 183 | 184 | for w in one_sent: 185 | fields = w.split() 186 | word = fields[0] 187 | ner_tag = fields[-1] 188 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 189 | temp_ner.append(self.tag_to_id[ner_tag]) 190 | temp_char.append([self.char_to_id[c] if c in 
self.char_to_id else self.char_to_id[""] for c in word]) 191 | sents.append(temp_sent) 192 | char_sents.append(temp_char) 193 | tgt_tags.append(temp_ner) 194 | discrete_features.append(get_feature_w(lang, one_sent) if self.use_discrete_feature else []) 195 | 196 | # print len(discrete_features[-1]) 197 | 198 | with codecs.open(path, "r") as fin: 199 | one_sent = [] 200 | for line in fin: 201 | if line.strip() == "": 202 | if len(one_sent) > 0: 203 | add_sent(one_sent) 204 | one_sent = [] 205 | else: 206 | one_sent.append(line.strip()) 207 | if len(one_sent) > 0: 208 | add_sent(one_sent) 209 | 210 | if self.use_discrete_feature: 211 | self.num_feats = len(discrete_features[0][0]) 212 | else: 213 | self.num_feats = 0 214 | return sents, char_sents, tgt_tags, discrete_features 215 | 216 | def get_lr_test(self, path, lang): 217 | sents = [] 218 | char_sents = [] 219 | discrete_features = [] 220 | 221 | def add_sent(one_sent): 222 | temp_sent = [] 223 | temp_char = [] 224 | for word in one_sent: 225 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 226 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 227 | sents.append(temp_sent) 228 | char_sents.append(temp_char) 229 | discrete_features.append(get_feature_w(lang, one_sent) if self.use_discrete_feature else []) 230 | 231 | original_sents = [] 232 | with codecs.open(path, "r") as fin: 233 | for line in fin: 234 | one_sent = line.rstrip().split() 235 | if line: 236 | add_sent(one_sent) 237 | original_sents.append(one_sent) 238 | 239 | if self.use_discrete_feature: 240 | self.num_feats = len(discrete_features[0][0]) 241 | else: 242 | self.num_feats = 0 243 | 244 | return sents, char_sents, discrete_features, original_sents 245 | -------------------------------------------------------------------------------- /eval/IO2BIO.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def transform(ifile, ofile): 4 | with open(ifile, 'r') as reader, open(ofile, 'w') as writer: 5 | prev = 'O' 6 | for line in reader: 7 | line = line.strip() 8 | if len(line) == 0: 9 | prev = 'O' 10 | writer.write('\n') 11 | continue 12 | 13 | tokens = line.split() 14 | # print tokens 15 | label = tokens[-1] 16 | if label != 'O' and label != prev: 17 | if prev == 'O': 18 | label = 'B-' + label[2:] 19 | elif label[2:] != prev[2:]: 20 | label = 'B-' + label[2:] 21 | else: 22 | label = label 23 | writer.write(" ".join(tokens[:-1]) + " " + label) 24 | writer.write('\n') 25 | prev = tokens[-1] 26 | 27 | if __name__ == '__main__': 28 | transform('eng.train.conll', 'eng.train.bio.conll') 29 | transform('eng.dev.conll', 'eng.dev.bio.conll') 30 | transform('eng.test.conll', 'eng.test.bio.conll') 31 | -------------------------------------------------------------------------------- /eval/IO2BIOES.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def transform(ifile, ofile): 4 | with open(ifile, 'r') as reader, open(ofile, 'w') as writer: 5 | sents = [] 6 | sent = [] 7 | for line in reader: 8 | line = line.strip() 9 | if len(line) == 0: 10 | sents.append(sent) 11 | sent = [] 12 | continue 13 | 14 | sent.append(line) 15 | if len(sent) > 0: 16 | sents.append(sent) 17 | 18 | for sent in sents: 19 | length = len(sent) 20 | labels = [] 21 | for line in sent: 22 | tokens = line.split() 23 | label = tokens[-1] 24 | labels.append(label) 25 | 26 | # print "%d %d" % (length, len(labels)) 27 | 
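# BIO -> BIOES relabelling (implemented by the loop below): a B- tag that is
# sentence-final or not followed by an I- tag becomes S-; an I- tag in the same
# position becomes E-; all other tags are written out unchanged.
# e.g. [B-PER, I-PER, O, B-LOC] -> [B-PER, E-PER, O, S-LOC]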
28 | for i in range(length): 29 | tokens = sent[i].split() 30 | label = labels[i] 31 | new_label = label 32 | if label != 'O': 33 | if label.startswith('B-'): 34 | if i + 1 == length or not labels[i + 1].startswith('I-'): 35 | new_label = 'S-' + label[2:] 36 | elif label.startswith('I-'): 37 | if i + 1 == length or not labels[i + 1].startswith('I-'): 38 | new_label = 'E-' + label[2:] 39 | writer.write(" ".join(tokens[:-1]) + " " + new_label) 40 | writer.write('\n') 41 | writer.write('\n') 42 | 43 | 44 | if __name__ == '__main__': 45 | transform('../datasets/english/eng.train.bio.conll', '../datasets/english/eng.train.bioes.conll') 46 | transform('../datasets/english/eng.dev.bio.conll', '../datasets/english/eng.dev.bioes.conll') 47 | transform('../datasets/english/eng.test.bio.conll', '../datasets/english/eng.test.bioes.conll') 48 | -------------------------------------------------------------------------------- /eval/conlleval: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 
32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while () { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if ($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = 
$features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 265 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 266 | 267 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 269 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 270 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 271 | 272 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 273 | $chunkEnd = $true; 274 | } 275 | 276 | # corrected 1998-12-22: these chunks are assumed to have length 1 277 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 279 | 280 | return($chunkEnd); 281 | } 282 | 283 | # startOfChunk: checks if a chunk started between the previous and current word 284 | # arguments: previous and current chunk tags, previous and current types 285 | # note: this code is capable of handling other chunk representations 286 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 287 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 288 | 289 | sub startOfChunk { 290 | my $prevTag = shift(@_); 291 | my $tag = shift(@_); 292 | my $prevType = shift(@_); 293 | my $type = shift(@_); 294 | my $chunkStart = $false; 295 | 296 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 297 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 298 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 299 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 300 | 301 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 302 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 303 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 304 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 305 | 306 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 307 | $chunkStart = $true; 308 | } 309 | 310 | # corrected 1998-12-22: these chunks are assumed to have length 1 311 | if ( $tag eq "[" ) { $chunkStart = $true; } 312 | if ( $tag eq "]" ) { $chunkStart = $true; } 313 | 314 | return($chunkStart); 315 | } 316 | -------------------------------------------------------------------------------- /eval/conlleval.v2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 
32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while () { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if ($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = 
$features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkEnd = $true; } 265 | 266 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 267 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 269 | 270 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 271 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 272 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 273 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkEnd = $true; } 274 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkEnd = $true; } 275 | 276 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkEnd = $true; } 277 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "S" and $tag eq "O" ) { $chunkEnd = $true; } 279 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkEnd = $true; } 280 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkEnd = $true; } 281 | 282 | 283 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 284 | $chunkEnd = $true; 285 | } 286 | 287 | # corrected 1998-12-22: these chunks are assumed to have length 1 288 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 289 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 290 | 291 | return($chunkEnd); 292 | } 293 | 294 | # startOfChunk: checks if a chunk started between the previous and current word 295 | # arguments: previous and current chunk tags, previous and current types 296 | # note: this code is capable of handling other chunk representations 297 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 298 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 299 | 300 | sub startOfChunk { 301 | my $prevTag = shift(@_); 302 | my $tag = shift(@_); 303 | my $prevType = shift(@_); 304 | my $type = shift(@_); 305 | my $chunkStart = $false; 306 | 307 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 308 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 309 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 310 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkStart = $true; } 311 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkStart = $true; } 312 | 313 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkStart = $true; } 314 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkStart = $true; } 315 | if ( $prevTag eq "O" and $tag eq "S" ) { $chunkStart = $true; } 316 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkStart = $true; } 317 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkStart = $true; } 318 | 319 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 320 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkStart = $true; } 321 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 322 | 323 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkStart = $true; } 324 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 325 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 326 | 327 | if ($tag ne "O" and $tag ne "." 
and $prevType ne $type) { 328 | $chunkStart = $true; 329 | } 330 | 331 | # corrected 1998-12-22: these chunks are assumed to have length 1 332 | if ( $tag eq "[" ) { $chunkStart = $true; } 333 | if ( $tag eq "]" ) { $chunkStart = $true; } 334 | 335 | return($chunkStart); 336 | } 337 | -------------------------------------------------------------------------------- /eval/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for file in $(ls $1) 3 | do 4 | echo evaluating $file 5 | ./conlleval < $1$file 6 | done 7 | -------------------------------------------------------------------------------- /eval/format.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def format(ifile, ofile): 5 | with open(ifile, 'r') as reader, open(ofile, 'w') as writer: 6 | i = 1 7 | for line in reader: 8 | line = line.strip() 9 | if len(line) == 0: 10 | i = 1 11 | writer.write('\n') 12 | else: 13 | writer.write('%d %s\n' % (i, line)) 14 | i += 1 15 | 16 | 17 | if __name__ == '__main__': 18 | format('eng.train', 'eng.train.conll') 19 | format('eng.dev', 'eng.dev.conll') 20 | format('eng.test', 'eng.test.conll') 21 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/cmu-ner/d35d57fe453d81cc98e3ee55bac58f9ca618f59b/models/__init__.py -------------------------------------------------------------------------------- /models/decoders.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | from utils.util import * 3 | 4 | 5 | class Decoder(): 6 | def __init__(self, tag_size): 7 | # type: () -> object 8 | pass 9 | 10 | def decode_loss(self): 11 | raise NotImplementedError 12 | 13 | def decoding(self): 14 | raise NotImplementedError 15 | 16 | 17 | def constrained_transition_init(transition_matrix, contraints): 18 | ''' 19 | :param transition_matrix: numpy array, (from, to) 20 | :param contraints: [[from_indexes], [to_indexes]] 21 | :return: newly initialized transition matrix 22 | ''' 23 | for cons in contraints: 24 | transition_matrix[cons[0], cons[1]] = -1000.0 25 | return transition_matrix 26 | 27 | 28 | class chain_CRF_decoder(Decoder): 29 | ''' For NER and POS Tagging. 
''' 30 | 31 | def __init__(self, args, model, src_output_dim, tag_emb_dim, tag_size, constraints=None): 32 | Decoder.__init__(self, tag_size) 33 | self.model = model 34 | self.start_id = tag_size 35 | self.end_id = tag_size + 1 36 | self.tag_size = tag_size + 2 37 | tag_size = tag_size + 2 38 | 39 | # optional: transform the hidden space of src encodings into the tag embedding space 40 | self.W_src2tag_readout = model.add_parameters((tag_emb_dim, src_output_dim)) 41 | self.b_src2tag_readout = model.add_parameters((tag_emb_dim)) 42 | self.b_src2tag_readout.zero() 43 | 44 | self.W_scores_readout2tag = model.add_parameters((tag_size, tag_emb_dim)) 45 | self.b_scores_readout2tag = model.add_parameters((tag_size)) 46 | self.b_scores_readout2tag.zero() 47 | 48 | # (to, from), trans[i] is the transition score to i 49 | init_transition_matrix = np.random.randn(tag_size, tag_size) # from, to 50 | # init_transition_matrix[self.start_id, :] = -1000.0 51 | # init_transition_matrix[:, self.end_id] = -1000.0 52 | init_transition_matrix[self.end_id, :] = -1000.0 53 | init_transition_matrix[:, self.start_id] = -1000.0 54 | if constraints is not None: 55 | init_transition_matrix = constrained_transition_init(init_transition_matrix, constraints) 56 | # print init_transition_matrix 57 | self.transition_matrix = model.add_lookup_parameters((tag_size, tag_size), 58 | init=dy.NumpyInitializer(init_transition_matrix)) 59 | 60 | self.interpolation = args.interp_crf_score 61 | if self.interpolation: 62 | self.W_weight_transition = model.add_parameters((1, tag_emb_dim)) 63 | self.b_weight_transition = model.add_parameters((1)) 64 | self.b_weight_transition.zero() 65 | 66 | def forward_alg(self, tag_scores): 67 | ''' Forward DP for CRF. 68 | tag_scores (list of batched dy.Tensor): (tag_size, batchsize) 69 | ''' 70 | # Be aware: if a is lookup_parameter with 2 dimension, then a[i] returns one row; 71 | # if b = dy.parameter(a), then b[i] returns one column; which means dy.parameter(a) already transpose a 72 | transpose_transition_score = dy.parameter(self.transition_matrix) 73 | # transpose_transition_score = dy.transpose(transition_score) 74 | # alpha(t', s) = the score of sequence from t=0 to t=t' in log space 75 | # np_init_alphas = -100.0 * np.ones((self.tag_size, batch_size)) 76 | # np_init_alphas[self.start_id, :] = 0.0 77 | # alpha_tm1 = dy.inputTensor(np_init_alphas, batched=True) 78 | 79 | alpha_tm1 = transpose_transition_score[self.start_id] + tag_scores[0] 80 | # self.transition_matrix[i]: from i, column 81 | # transpose_score[i]: to i, row 82 | # transpose_score: to, from 83 | 84 | for tag_score in tag_scores[1:]: 85 | # extend for each transit 86 | alpha_tm1 = dy.concatenate_cols([alpha_tm1] * self.tag_size) # (from, to, batch_size) 87 | # each column i of tag_score will be the repeated emission score to tag i 88 | tag_score = dy.transpose(dy.concatenate_cols([tag_score] * self.tag_size)) 89 | alpha_t = alpha_tm1 + transpose_transition_score + tag_score 90 | alpha_tm1 = log_sum_exp_dim_0(alpha_t) # (tag_size, batch_size) 91 | 92 | terminal_alpha = log_sum_exp_dim_0(alpha_tm1 + self.transition_matrix[self.end_id]) # (1, batch_size) 93 | return terminal_alpha 94 | 95 | def score_one_sequence(self, tag_scores, tags, batch_size): 96 | ''' tags: list of tag ids at each time step ''' 97 | # print tags, batch_size 98 | # print batch_size 99 | # print "scoring one sentence" 100 | tags = [[self.start_id] * batch_size] + tags # len(tag_scores) = len(tags) - 1 101 | score = dy.inputTensor(np.zeros(batch_size), 
batched=True) 102 | # tag_scores = dy.concatenate_cols(tag_scores) # tot_tags, sent_len, batch_size 103 | # print "tag dim: ", tag_scores.dim() 104 | for i in range(len(tags) - 1): 105 | score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, tags[i + 1]), tags[i]) \ 106 | + dy.pick_batch(tag_scores[i], tags[i + 1]) 107 | score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, [self.end_id]*batch_size), tags[-1]) 108 | return score 109 | 110 | def decode_loss(self, src_encodings, tgt_tags): 111 | # This is the batched version which requires bucketed batch input with the same length. 112 | ''' 113 | The length of src_encodings and tgt_tags are time_steps. 114 | src_encodings: list of dynet.Tensor (src_output_dim, batch_size) 115 | tgt_tags: list of tag ids [(1, batch_size)] 116 | return: average of negative log likelihood 117 | ''' 118 | # TODO: transpose tgt tags first 119 | batch_size = len(tgt_tags) 120 | tgt_tags, tgt_mask = transpose_input(tgt_tags, 0) 121 | W_src2tag_readout = dy.parameter(self.W_src2tag_readout) 122 | b_src2tag_readout = dy.parameter(self.b_src2tag_readout) 123 | W_score_tag = dy.parameter(self.W_scores_readout2tag) 124 | b_score_tag = dy.parameter(self.b_scores_readout2tag) 125 | 126 | tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) for src_encoding 127 | in src_encodings] 128 | if self.interpolation: 129 | W_transit = dy.parameter(self.W_weight_transition) 130 | b_transit = dy.parameter(self.b_weight_transition) 131 | step_weight_on_transit = [dy.logistic(dy.affine_transform([b_transit, W_transit, tag_emb])) for tag_emb in tag_embs] 132 | 133 | tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs] 134 | 135 | # scores over all paths, all scores are in log-space 136 | forward_scores = self.forward_alg(tag_scores) 137 | gold_score = self.score_one_sequence(tag_scores, tgt_tags, batch_size) 138 | # negative log likelihood 139 | loss = dy.sum_batches(forward_scores - gold_score) / batch_size 140 | return loss #, dy.sum_batches(forward_scores)/batch_size, dy.sum_batches(gold_score) / batch_size 141 | 142 | def get_crf_scores(self, src_encodings): 143 | W_src2tag_readout = dy.parameter(self.W_src2tag_readout) 144 | b_src2tag_readout = dy.parameter(self.b_src2tag_readout) 145 | W_score_tag = dy.parameter(self.W_scores_readout2tag) 146 | b_score_tag = dy.parameter(self.b_scores_readout2tag) 147 | 148 | tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) 149 | for src_encoding in src_encodings] 150 | tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs] 151 | 152 | transpose_transition_score = dy.parameter(self.transition_matrix) # (to, from) 153 | 154 | return transpose_transition_score.npvalue(), [ts.npvalue() for ts in tag_scores] 155 | 156 | def decoding(self, src_encodings): 157 | ''' Viterbi decoding for a single sequence. 
''' 158 | W_src2tag_readout = dy.parameter(self.W_src2tag_readout) 159 | b_src2tag_readout = dy.parameter(self.b_src2tag_readout) 160 | W_score_tag = dy.parameter(self.W_scores_readout2tag) 161 | b_score_tag = dy.parameter(self.b_scores_readout2tag) 162 | 163 | tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) 164 | for src_encoding in src_encodings] 165 | tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs] 166 | 167 | back_trace_tags = [] 168 | np_init_alpha = np.ones(self.tag_size) * -2000.0 169 | np_init_alpha[self.start_id] = 0.0 170 | max_tm1 = dy.inputTensor(np_init_alpha) 171 | transpose_transition_score = dy.parameter(self.transition_matrix) # (to, from) 172 | 173 | for i, tag_score in enumerate(tag_scores): 174 | max_tm1 = dy.concatenate_cols([max_tm1] * self.tag_size) 175 | max_t = max_tm1 + transpose_transition_score 176 | if i != 0: 177 | eval_score = max_t.npvalue()[:-2, :] 178 | else: 179 | eval_score = max_t.npvalue() 180 | best_tag = np.argmax(eval_score, axis=0) 181 | back_trace_tags.append(best_tag) 182 | max_tm1 = dy.inputTensor(eval_score[best_tag, range(self.tag_size)]) + tag_score 183 | 184 | terminal_max_T = max_tm1 + self.transition_matrix[self.end_id] 185 | eval_terminal = terminal_max_T.npvalue()[:-2] 186 | best_tag = np.argmax(eval_terminal, axis=0) 187 | best_path_score = eval_terminal[best_tag] 188 | 189 | best_path = [best_tag] 190 | for btpoint in reversed(back_trace_tags): 191 | best_tag = btpoint[best_tag] 192 | best_path.append(best_tag) 193 | start = best_path.pop() 194 | assert start == self.start_id 195 | best_path.reverse() 196 | return best_path_score, best_path 197 | 198 | def cal_accuracy(self, pred_path, true_path): 199 | return np.sum(np.equal(pred_path, true_path).astype(np.float32)) / len(pred_path) 200 | 201 | 202 | def ensemble_viterbi_decoding(l_tag_scores, l_transit_score, tag_size): 203 | back_trace_tags = [] 204 | tag_size = tag_size + 2 205 | start_id = tag_size - 2 206 | end_id = tag_size - 1 207 | max_tm1 = np.ones(tag_size) * -2000.0 208 | max_tm1[start_id] = 0.0 209 | 210 | tag_scores = [] 211 | for i in range(len(l_tag_scores[0])): 212 | tag_scores.append(sum([ts[i] for ts in l_tag_scores]) / len(l_tag_scores)) 213 | transpose_transition_score = sum(l_transit_score) / len(l_transit_score) # (from, to) 214 | 215 | for i, tag_score in enumerate(tag_scores): 216 | max_tm1 = np.tile(np.expand_dims(max_tm1, axis=1), (1, tag_size)) 217 | max_t = max_tm1 + transpose_transition_score 218 | if i != 0: 219 | eval_score = max_t[:-2, :] 220 | else: 221 | eval_score = max_t 222 | best_tag = np.argmax(eval_score, axis=0) 223 | back_trace_tags.append(best_tag) 224 | max_tm1 = eval_score[best_tag, range(tag_size)] + tag_score 225 | 226 | terminal_max_T = max_tm1 + transpose_transition_score[:, end_id] 227 | eval_terminal = terminal_max_T[:-2] 228 | best_tag = np.argmax(eval_terminal, axis=0) 229 | best_path_score = eval_terminal[best_tag] 230 | 231 | best_path = [best_tag] 232 | for btpoint in reversed(back_trace_tags): 233 | best_tag = btpoint[best_tag] 234 | best_path.append(best_tag) 235 | start = best_path.pop() 236 | assert start == start_id 237 | best_path.reverse() 238 | return best_path_score, best_path 239 | 240 | 241 | class classifier(Decoder): 242 | def __init__(self, model, input_dim, tag_size): 243 | self.W_softmax = model.add_parameters((tag_size, input_dim)) 244 | self.b_softmax = model.add_parameters((tag_size)) 245 | 246 | def decode_loss(self, 
src_encoding, tgt_tags): 247 | batch_size = len(tgt_tags) 248 | tgt_tags, tgt_mask = transpose_input(tgt_tags, 0) 249 | 250 | assert len(src_encoding) == len(tgt_tags) 251 | 252 | W_softmax = dy.parameter(self.W_softmax) 253 | b_softmax = dy.parameter(self.b_softmax) 254 | 255 | predictions = [dy.affine_transform([b_softmax, W_softmax, src_emb]) for src_emb in src_encoding] 256 | 257 | losses = [dy.pickneglogsoftmax_batch(pred, tgt) for pred, tgt in zip(predictions, tgt_tags)] 258 | 259 | loss = dy.sum_batches(dy.esum(losses)) / (batch_size * len(src_encoding)) 260 | 261 | return loss 262 | 263 | def decoding(self, src_encoding): 264 | W_softmax = dy.parameter(self.W_softmax) 265 | b_softmax = dy.parameter(self.b_softmax) 266 | predictions = [dy.affine_transform([b_softmax, W_softmax, src_emb]) for src_emb in src_encoding] 267 | 268 | predictions = [np.argmax(pred.npvalue()) for pred in predictions] 269 | 270 | return None, predictions 271 | -------------------------------------------------------------------------------- /models/encoders.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | from utils.util import * 3 | 4 | ''' Designing idea: the encoder should be agnostic to the input, it can be either 5 | arbitrary spans, characters, or words, or even raw feature. However, user has to specify 6 | whether to have the lookup table for any input. 7 | 8 | There are also two ways to feed in multiple input features: 9 | (a) First concatenate all features for each position, and then use them as features for one encoder, e.g. bilstm 10 | (b) Use multiple encoders for multiple features then combine outputs from multiple encoders, either concat them 11 | or feed them to another encoder.''' 12 | 13 | 14 | class Encoder(): 15 | def __init__(self): 16 | pass 17 | 18 | def encode(self): 19 | raise NotImplementedError 20 | 21 | # class concat_input_encoder(encoder): 22 | # def __init__(self, model, lookups, lookup_table_dims): 23 | # # length of elements in lookup_table_dims == number of elements in lookups which are true 24 | # self.num_inputs = len(lookups) 25 | # self.lookups = lookups 26 | # self.lookup_params = [] 27 | # for i, lookup in enumerate(lookups): 28 | # if lookup == 1: 29 | # # add loop up parameters 30 | # self.lookup_params.append(model.add_lookup_parameters((lookup_table_dims[i][0], lookup_table_dims[i][1]))) 31 | # elif lookup == 2: 32 | # # add normal transformation parameters 33 | # # dims: discrete_feature_num, continuous_emb_dim 34 | # # the input should concatenate all the discrete features together first 35 | # self.lookup_params.append(model.add_parameters((lookup_table_dims[i][0], lookup_table_dims[i][1]))) 36 | # else: 37 | # self.lookup_params.append(0) 38 | # 39 | # def prepare_inputs(self, inputs): 40 | # # inputs: (a) 41 | # input_features = [] 42 | # for i, lookup in enumerate(self.lookups): 43 | # if lookup == 1: 44 | 45 | 46 | class Lookup_Encoder(Encoder): 47 | def __init__(self, model, args, vocab_size, emb_size, padding_token=None, pretrain_embedding=None, isFeatureEmb=False): 48 | Encoder.__init__(self) 49 | self.padding_token = padding_token 50 | self.map_pretrain = args.map_pretrain 51 | self.pretrain_fix = args.pretrain_fix 52 | self.isFeatureEmb = isFeatureEmb 53 | if args.map_pretrain: 54 | self.W_map = model.add_parameters((args.map_dim, emb_size)) 55 | self.b_map = model.add_parameters(args.map_dim) 56 | self.b_map.zero() 57 | if pretrain_embedding is not None: 58 | self.lookup_table = 
model.lookup_parameters_from_numpy(pretrain_embedding) 59 | else: 60 | self.lookup_table = model.add_lookup_parameters((vocab_size, emb_size)) 61 | 62 | def encode(self, input_seqs): 63 | transpose_inputs, _ = transpose_input(input_seqs, self.padding_token) 64 | embs = [dy.lookup_batch(self.lookup_table, wids) for wids in transpose_inputs] 65 | if self.pretrain_fix and not self.isFeatureEmb: 66 | embs = [dy.nobackprop(emb) for emb in embs] 67 | # TODO: initialize with ones vector, initialize W_map with identity matrix 68 | if self.map_pretrain and not self.isFeatureEmb: 69 | if not self.pretrain_fix: 70 | embs = [dy.nobackprop(emb) for emb in embs] 71 | W_map = dy.parameter(self.W_map) 72 | b_map = dy.parameter(self.b_map) 73 | embs = [dy.affine_transform([b_map, W_map, emb]) for emb in embs] 74 | return embs 75 | 76 | 77 | class Discrete_Feature_Encoder(Encoder): 78 | def __init__(self, model, num_feats, to_dim): 79 | Encoder.__init__(self) 80 | self.num_feats = num_feats 81 | self.to_dim = to_dim 82 | self.W_feat_emb = model.add_parameters((to_dim, num_feats)) 83 | 84 | def encode(self, input_feats): 85 | batch_size = len(input_feats) 86 | # after transpose: input_feats: [(num_feats, batch_size)] 87 | input_feats = transpose_discrete_features(input_feats) 88 | W_feat_emb = dy.parameter(self.W_feat_emb) 89 | output_emb = [] 90 | for wif in input_feats: 91 | extend_wif = dy.transpose(dy.concatenate_cols([wif for _ in range(self.to_dim)])) 92 | feature_emb = dy.cmult(extend_wif, W_feat_emb) 93 | output_emb.append(dy.reshape(feature_emb, (self.to_dim * self.num_feats, ), batch_size=batch_size)) 94 | return output_emb 95 | 96 | 97 | class CNN_Encoder(Encoder): 98 | def __init__(self, model, emb_size, win_size=3, filter_size=64, dropout=0.5, vocab_size=0, padding_token=0, lookup_emb=None): 99 | Encoder.__init__(self) 100 | self.vocab_size = vocab_size # if 0, no lookup tables 101 | self.win_size = win_size 102 | self.filter_size = filter_size 103 | self.emb_size = emb_size 104 | self.dropout_rate = dropout 105 | self.paddding_token = padding_token 106 | if vocab_size != 0: 107 | print "In CNN encoder: creating lookup embedding!" 108 | self.lookup_emb = model.add_lookup_parameters((vocab_size, 1, 1, emb_size)) 109 | else: 110 | assert lookup_emb is not None 111 | print "In CNN encoder: reusing lookup embedding!" 
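            # Note: the externally supplied table is assumed to share the
            # (vocab_size, 1, 1, emb_size) layout created in the branch above,
            # so that per-character dy.lookup results can be concatenated along
            # d=1 and fed directly into dy.conv2d_bias in _cnn_emb.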
112 | self.lookup_emb = lookup_emb 113 | 114 | self.W_cnn = model.add_parameters((1, win_size, emb_size, filter_size)) 115 | self.b_cnn = model.add_parameters((filter_size)) 116 | self.b_cnn.zero() 117 | 118 | def _cnn_emb(self, input_embs, training): 119 | # input_embs: (h, time_step, dim, batch_size), h=1 120 | if self.dropout_rate > 0 and training: 121 | input_embs = dy.dropout(input_embs, self.dropout_rate) 122 | W_cnn = dy.parameter(self.W_cnn) 123 | b_cnn = dy.parameter(self.b_cnn) 124 | 125 | cnn_encs = dy.conv2d_bias(input_embs, W_cnn, b_cnn, stride=(1, 1), is_valid=False) 126 | tanh_cnn_encs = dy.tanh(cnn_encs) 127 | max_pool_out = dy.reshape(dy.max_dim(tanh_cnn_encs, d=1), (self.filter_size,)) 128 | # rec_pool_out = dy.rectify(max_pool_out) 129 | return max_pool_out 130 | 131 | def encode(self, input_seqs, training=True, char=True): 132 | batch_size = len(input_seqs) 133 | sents_embs = [] 134 | if char: 135 | # we don't batch at first, we batch after cnn 136 | for sent in input_seqs: 137 | sent_emb = [] 138 | for w in sent: 139 | if len(w) < self.win_size: 140 | w += [self.paddding_token] * (self.win_size - len(w)) 141 | input_embs = dy.concatenate([dy.lookup(self.lookup_emb, c) for c in w], d=1) 142 | w_emb = self._cnn_emb(input_embs, training) # (filter_size, 1) 143 | sent_emb.append(w_emb) 144 | sents_embs.append(sent_emb) 145 | sents_embs, sents_mask = transpose_and_batch_embs(sents_embs, self.filter_size) # [(filter_size, batch_size)] 146 | else: 147 | for sent in input_seqs: 148 | if self.vocab_size != 0: 149 | if len(sent) < self.win_size: 150 | sent += [0] * (self.win_size - len(sent)) 151 | input_embs = dy.concatenate([dy.lookup(self.lookup_emb, w) for w in sent], d=1) 152 | else: 153 | # input_seqs: [(emb_size, batch_size)] 154 | if len(sent) < self.win_size: 155 | sent += [dy.zeros(self.emb_size)] * (self.win_size - len(sent)) 156 | input_embs = dy.transpose(dy.concatenate_cols(sent)) # (time_step, emb_size, bs) 157 | input_embs = dy.reshape(input_embs, (1, len(sent), self.emb_size), ) 158 | 159 | sent_emb = self._cnn_emb(input_embs, training) # (filter_size, 1) 160 | sents_embs.append(sent_emb) 161 | sents_embs = dy.reshape(dy.concatenate(sents_embs, d=1), (self.filter_size,), batch_size =batch_size) # (filter_size, batch_size) 162 | 163 | return sents_embs 164 | 165 | 166 | class BiRNN_Encoder(Encoder): 167 | def __init__(self, 168 | model, 169 | input_dim, 170 | hidden_dim, 171 | emb_dropout_rate=0.3, 172 | output_dropout_rate=0.5, 173 | padding_token=None, 174 | vocab_size=0, 175 | emb_size=0, 176 | layer=1, 177 | rnn="lstm", 178 | vocab_emb=None): 179 | Encoder.__init__(self) 180 | # self.birnn = dy.BiRNNBuilder(layer, input_dim, hidden_dim, model, dy.LSTMBuilder if rnn == "lstm" else dy.GRUBuilder) 181 | self.fwd_RNN = dy.LSTMBuilder(layer, input_dim, hidden_dim, model) if rnn == "lstm" else dy.GRUBuilder(layer, input_dim, hidden_dim, model) 182 | self.bwd_RNN = dy.LSTMBuilder(layer, input_dim, hidden_dim, model) if rnn == "lstm" else dy.GRUBuilder(layer, input_dim, hidden_dim, model) 183 | 184 | self.input_dim = input_dim 185 | self.vocab_size = vocab_size 186 | self.padding_token = padding_token 187 | self.drop_out_rate = output_dropout_rate 188 | self.emb_drop_rate = emb_dropout_rate 189 | self.hidden_dim = hidden_dim 190 | if vocab_size > 0: 191 | print "In BiRNN, creating lookup table!" 
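            # When vocab_size > 0 the encoder owns its own (vocab_size, emb_size)
            # lookup table; otherwise a table built elsewhere (e.g. the CNN char
            # lookup) may be passed in as vocab_emb, and encode_seq reshapes it
            # with dy.reshape before applying dy.pick_batch.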
192 | self.vocab_emb = model.add_lookup_parameters((vocab_size, emb_size)) 193 | else: 194 | if vocab_emb is not None: 195 | # assert vocab_emb is not None 196 | self.vocab_emb = vocab_emb 197 | else: 198 | self.vocab_emb = None 199 | 200 | def encode(self, input_seqs, training=True, char=False): 201 | if char: 202 | return self.encode_word(input_seqs, training=training) 203 | else: 204 | return self.encode_seq(input_seqs, training=training) 205 | 206 | def encode_seq(self, input_seqs, training=True, char=False): 207 | if self.vocab_emb is not None: 208 | # input_seqs = [[w1, w2],[]] 209 | transpose_inputs, _ = transpose_input(input_seqs, self.padding_token) 210 | if self.vocab_size != 0: 211 | w_embs = [dy.dropout(dy.lookup_batch(self.vocab_emb, wids), 212 | self.emb_drop_rate) if self.emb_drop_rate > 0. and training 213 | else dy.lookup_batch(self.vocab_emb, wids) 214 | for wids in transpose_inputs] 215 | else: 216 | # print "In BiRNN, reusing lookup table!" 217 | # print "In our case, use parameters shared by CNN char encoder, need conversion!" 218 | vocab_emb = dy.parameter(self.vocab_emb) 219 | vocab_size = vocab_emb.dim()[0][-1] 220 | # print "In BiRNN Char vocab size: ", vocab_size 221 | vocab_emb = dy.reshape(vocab_emb, (self.input_dim, vocab_size)) # expression, not lookup_parameters 222 | 223 | # for wids in transpose_inputs: 224 | # print wids 225 | # print vocab_emb.dim() 226 | # a = dy.pick_batch(vocab_emb, wids, dim=1) 227 | # print a.value() 228 | # Special case handler: use pick_batch 229 | w_embs = [dy.dropout(dy.pick_batch(vocab_emb, wids, dim=1), 230 | self.emb_drop_rate) if self.emb_drop_rate > 0. and training 231 | else dy.pick_batch(vocab_emb, wids, dim=1) 232 | for wids in transpose_inputs] 233 | # print "In BiRNN char: ", w_embs[0].dim() 234 | else: 235 | w_embs = [dy.dropout(emb, self.emb_drop_rate) if self.emb_drop_rate > 0. and training else emb for emb in input_seqs] 236 | # if vocab_size = 0: input_seqs = [(input_dim, batch_size)] 237 | 238 | w_embs_r = w_embs[::-1] 239 | # birnn_outputs = [dy.dropout(emb, self.drop_out_rate) if self.drop_out_rate > 0. 
else emb for emb in self.birnn.transduce(w_embs)] 240 | fwd_vectors = self.fwd_RNN.initial_state().transduce(w_embs) 241 | bwd_vectors = self.bwd_RNN.initial_state().transduce(w_embs_r)[::-1] 242 | 243 | if char: 244 | return dy.concatenate([fwd_vectors[-1], bwd_vectors[0]]) 245 | 246 | birnn_outputs = [dy.dropout(dy.concatenate([fwd_v, bwd_v]), self.drop_out_rate) if self.drop_out_rate > 0.0 and training 247 | else dy.concatenate([fwd_v, bwd_v]) 248 | for (fwd_v, bwd_v) in zip(fwd_vectors, bwd_vectors)] 249 | return birnn_outputs 250 | 251 | def encode_word(self, input_seqs, training=True): 252 | # embedding dropout rate is 0.0, because we dropout at the later stage of RNN 253 | sents_embs = [] 254 | 255 | for sent in input_seqs: 256 | sent_emb = [] 257 | for w in sent: 258 | w_emb = self.encode_seq([w], training=training, char=True) 259 | sent_emb.append(w_emb) 260 | sents_embs.append(sent_emb) 261 | sents_embs, sents_mask = transpose_and_batch_embs(sents_embs, self.hidden_dim*2) # [(hidden_dim*2, batch_size)] 262 | return sents_embs -------------------------------------------------------------------------------- /utils/Convert_Output_Darpa.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import codecs 4 | 5 | 6 | def run_program(input, output, setEconll): 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') 9 | if input is not None and setEconll is not None: 10 | with codecs.open(input, 'r', encoding='utf-8', errors='ignore') as input_file: 11 | lines = input_file.readlines() 12 | tags = [] 13 | for i, line in enumerate(lines): 14 | if len(line) >= 2: 15 | line_split = line.strip().split() 16 | # sys.stderr.write('line: ' + line.strip() + '\n') 17 | # sys.stderr.flush() 18 | assert len(line_split) == 4 19 | tags.append(line_split[-1]) 20 | 21 | output_lines = lines 22 | 23 | with codecs.open(setEconll, 'r',encoding='utf-8', errors='ignore') as input_file: 24 | lines = input_file.readlines() 25 | assert len(output_lines) == len(lines) 26 | with codecs.open(output,'w',encoding='utf-8') as output_file: 27 | ctr = -1 28 | for line in lines: 29 | if len(line) > 2: 30 | ctr += 1 31 | line_split = line.strip().split() 32 | assert len(line_split) == 10 33 | # print '\t'.join(line_split) + '\t' + tags[ctr] 34 | output_file.write('\t'.join(line_split) + '\t' + tags[ctr] +"\n") 35 | else: 36 | # print "" 37 | output_file.write("\n") 38 | assert ctr + 1 == len(tags) 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--input", type=str, default=None) 43 | parser.add_argument("--setEconll", type=str, default=None) 44 | parser.add_argument("--output", type=str, default=None) 45 | args = parser.parse_args() 46 | run_program(args.input, args.output, args.setEconll) 47 | -------------------------------------------------------------------------------- /utils/Convert_to_darpa_xml.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import codecs 3 | import argparse 4 | 5 | 6 | def print_entities(fout,entities, curr_docum, curr_anot): 7 | # print 'CMU_NER_LOREAL_CP1_TB_GS' + '\t' + curr_docum + '-ann-' + str(curr_anot) + '\t' + ' '.join( 8 | # entities[0]) + '\t' + curr_docum + ':' + str(entities[2]) + '-' + str(entities[3]) + '\t' + 'NIL' + '\t' + \ 9 | # entities[1] + '\t' + 'NAM' + '\t' + '1.0' 10 | fout.write('CMU_NER_LOREAL_CP1_TB_GS' + '\t' + curr_docum + '-ann-' + str(curr_anot) + '\t' + ' '.join( 11 | entities[0]) + '\t' + 
curr_docum + ':' + str(entities[2]) + '-' + str(entities[3]) + '\t' + 'NIL' + '\t' + \ 12 | entities[1] + '\t' + 'NAM' + '\t' + '1.0' + "\n") 13 | 14 | 15 | def run_program_darpa(input, output): 16 | reload(sys) 17 | sys.setdefaultencoding('utf-8') 18 | if input is not None and output is not None: 19 | with codecs.open(input, encoding='utf-8', mode='r') as input_file: 20 | lines = input_file.readlines() 21 | 22 | entities = [[], None, -1, -1] 23 | in_entity = False 24 | curr_docum = None 25 | curr_anot = 1 26 | fout = codecs.open(output,'w',encoding='utf-8') 27 | for i, line in enumerate(lines): 28 | if len(line) > 2: 29 | # print 'Line number: ' + str(i + 1) + '\n' 30 | #sys.stderr.flush() 31 | line_split = line.strip().split() 32 | if curr_docum != line_split[3]: 33 | curr_docum = line_split[3] 34 | curr_anot = 1 35 | # print '' 36 | if len(line_split) != 11: 37 | print line 38 | print 'Error in line: ' + str(i + 1) + '\n' 39 | assert len(line_split) == 11 40 | if line_split[-1][0] == 'B': 41 | if in_entity: 42 | print_entities(fout, entities, curr_docum, curr_anot) 43 | # restart 44 | entities[0] = [] 45 | entities[1] = None 46 | entities[2] = -1 47 | entities[3] = -1 48 | curr_anot += 1 49 | in_entity = False 50 | else: 51 | assert len(entities[0]) == 0 and entities[1] is None and entities[2] == -1 and entities[3] == -1 52 | assert not (in_entity) 53 | in_entity = True 54 | assert line_split[-1][1] == '-' 55 | entities[0].append(line_split[0]) 56 | entities[1] = ''.join(line_split[-1][2:]) 57 | entities[2] = int(line_split[-5]) 58 | entities[3] = int(line_split[-4]) 59 | elif line_split[-1][0] == 'I': 60 | # print 'line num: ' + str(i + 1) + '\n' 61 | 62 | if not in_entity or (len(entities[0]) > 0 and line_split[-1][2:] != entities[1]):# when first tag is I-PER treat it as B-PER 63 | in_entity = True 64 | entities[0].append(line_split[0]) 65 | entities[1] = ''.join(line_split[-1][2:]) 66 | entities[2] = int(line_split[-5]) 67 | entities[3] = int(line_split[-4]) 68 | 69 | else: 70 | assert in_entity and len(entities[0]) > 0 and not (entities[0] is None) and ''.join( 71 | line_split[-1][2:]) == entities[1] and entities[2] >= 0 and entities[3] >= 0 72 | entities[0].append(line_split[0]) 73 | assert entities[2] >= 0 74 | assert int(line_split[-4]) > entities[3] 75 | entities[3] = int(line_split[-4]) 76 | elif line_split[-1][0] == 'O': 77 | if in_entity: 78 | print_entities(fout,entities, curr_docum, curr_anot) 79 | entities[0] = [] 80 | entities[1] = None 81 | entities[2] = -1 82 | entities[3] = -1 83 | curr_anot += 1 84 | in_entity = False 85 | else: 86 | if in_entity: 87 | # print 'We are in an entity and met sentence boundary, line: ' + str(i + 1) + '\n' 88 | print_entities(fout, entities, curr_docum, curr_anot) 89 | entities[0] = [] 90 | entities[1] = None 91 | entities[2] = -1 92 | entities[3] = -1 93 | curr_anot += 1 94 | in_entity = False 95 | 96 | 97 | if __name__ == "__main__": 98 | parser = argparse.ArgumentParser() 99 | parser.add_argument("--input", type=str, default=None) 100 | parser.add_argument("--output", type=str, default=None) 101 | args = parser.parse_args() 102 | run_program_darpa(args.input, args.output) 103 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/cmu-ner/d35d57fe453d81cc98e3ee55bac58f9ca618f59b/utils/__init__.py 
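For orientation, Convert_to_darpa_xml.py above writes one tab-separated mention row per B-/I- span. A minimal sketch of that row layout (the document ID, offsets, and mention text below are hypothetical; only the field order mirrors print_entities):

    fields = [
        'CMU_NER_LOREAL_CP1_TB_GS',  # system ID, as in print_entities
        'DOC_001-ann-1',             # hypothetical document ID + running annotation counter
        'Abebe Girma',               # mention tokens joined by spaces
        'DOC_001:17-27',             # hypothetical start-end character offsets
        'NIL',                       # no KB link
        'PER',                       # entity type taken from the B-/I- tag
        'NAM',                       # named mention
        '1.0',                       # confidence
    ]
    print('\t'.join(fields))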
-------------------------------------------------------------------------------- /utils/extract_authors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import codecs 3 | import xml.etree.ElementTree as ET 4 | import sys 5 | 6 | def extract_authors(dir_name, output_fname): 7 | author_set = set() 8 | for fname in os.listdir(dir_name): 9 | fin_name = os.path.join(dir_name, fname) 10 | if os.path.isfile(fin_name): 11 | fs = fname.split('_') 12 | if fs[1] != "WL": 13 | continue 14 | print fname 15 | tree = ET.parse(fin_name) 16 | root = tree.getroot() 17 | # elems = root.findall(".//*[@type='post']/[@name='author']") 18 | elems = root.findall(".//*[@type='post']/attribute") 19 | for elem in elems: 20 | if elem.get('name') == u'author': 21 | author = elem.get(u'value') 22 | author_set.add(author) 23 | 24 | with codecs.open(output_fname, "w", "utf-8") as fout: 25 | for elem in author_set: 26 | fout.write(elem + '\n') 27 | 28 | if __name__ == "__main__": 29 | dname = sys.argv[1] 30 | fout_name = sys.argv[2] 31 | extract_authors(dname, fout_name) -------------------------------------------------------------------------------- /utils/features.py: -------------------------------------------------------------------------------- 1 | from utils.segnerfts import segnerfts 2 | import codecs 3 | 4 | 5 | def get_feature_sent(lang, sent, args): 6 | if args.use_gazatter and args.use_morph: 7 | return segnerfts.extract(lang, sent) 8 | elif args.use_gazatter: 9 | return segnerfts.extract_type_token_gaz(lang, sent) 10 | elif args.use_morph: 11 | return segnerfts.extract_type_token_morph(lang, sent) 12 | else: 13 | return segnerfts.extract_type_token_level(lang, sent) 14 | 15 | 16 | def get_brown_cluster(path): 17 | bc_dict = dict() 18 | linear_map = dict() 19 | with codecs.open(path, "r", "utf-8") as fin: 20 | for line in fin: 21 | fields = line.strip().split('\t') 22 | if len(fields) == 3: 23 | word = fields[1] 24 | binary_string = fields[0] 25 | bid = int(binary_string, 2) 26 | if bid not in linear_map: 27 | linear_map[bid] = len(linear_map) 28 | bc_dict[word] = linear_map[bid] 29 | return bc_dict -------------------------------------------------------------------------------- /utils/old_segnerfts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | import regex as re 6 | 7 | 8 | LONG_TOKEN_THRESH = 8 9 | 10 | 11 | def ex_capitalized(ws): 12 | return [w[0].isupper() for w in ws] 13 | 14 | 15 | def ex_all_uppercased(ws): 16 | return [all(x.isupper() for x in w) for w in ws] 17 | 18 | 19 | def ex_mixed_case(ws): 20 | def mixed_case(w): 21 | noninit = [x.isupper() for x in w[1:]] 22 | return True in noninit and False in noninit 23 | return map(mixed_case, ws) 24 | 25 | 26 | def ex_internal_period(ws): 27 | return [len(w) > 2 and '.' 
in w[1:-1] for w in ws] 28 | 29 | 30 | def ex_non_letter(ws): 31 | return [bool(re.search(r'[^\p{Letter}\p{Mark}]', w)) for w in ws] 32 | 33 | 34 | def ex_digits(ws): 35 | return [bool(re.search(r'[\p{Number}]', w)) for w in ws] 36 | 37 | 38 | def ex_long_token(ws): 39 | return [len(w) > LONG_TOKEN_THRESH for w in ws] 40 | 41 | 42 | def ex_contains_latin(ws): 43 | return [bool(re.search(r'\p{Latin}', w)) for w in ws] 44 | 45 | 46 | def ex_contains_ethiopic(ws): 47 | return [bool(re.search(r'\p{Ethiopic}', w)) for w in ws] 48 | 49 | 50 | ex_title = { 51 | 'eng': lambda ws: [False] + [w in { 52 | 'Mister', 53 | 'Mr.', 54 | 'Mr', 55 | 'Misses', 56 | 'Mrs.', 57 | 'Mrs', 58 | 'Miss', 59 | 'Ms.', 60 | 'Ms', 61 | 'Doctor', 62 | 'Dr.', 63 | 'Dr', 64 | 'Professor', 65 | 'Prof.', 66 | 'Prof', 67 | 'Father', 68 | 'Fr.', 69 | 'Fr', 70 | 'Reverend', 71 | 'Rev.', 72 | 'Rev', 73 | 'Revd', 74 | 'Pastor', 75 | 'Bishop', 76 | 'Bp.', 77 | 'Bp', 78 | 'President', 79 | 'Pres.', 80 | 'Representative', 81 | 'Rep.', 82 | 'Rep', 83 | 'Congressman', 84 | 'Congresswoman', 85 | 'Congressperson', 86 | 'Senator', 87 | 'Sen.', 88 | 'Sen', 89 | 'Secretary', 90 | 'Sec.', 91 | 'Sec', 92 | 'Lord', 93 | 'Lady', 94 | 'Justice', 95 | 'Sheriff', 96 | 'Principal', 97 | 'Mayor', 98 | } for w in ws[:-1]], 99 | 'deu': lambda ws: [False] + [w in { 100 | 'Herr', 101 | 'Hr.', 102 | 'Frau', 103 | 'Fr.', 104 | 'Fraulein', 105 | 'Frl.', 106 | 'Doktor', 107 | 'Dr.', 108 | 'Dr.med.', 109 | 'Dr.phil.', 110 | 'Dr.rer.nat.', 111 | 'Dr.jur.', 112 | 'Dr.theol.', 113 | 'Professor', 114 | 'Prof.', 115 | 'a.o.Prof.', 116 | 'o.Pr.', 117 | 'Dozent', 118 | 'Doz.', 119 | 'Richter', 120 | 'Senator', 121 | 'Sen.', 122 | 'Ministerpräsident', 123 | 'Ministerpräsidentin', 124 | 'Bürgermeister', 125 | 'Abgeordenete', 126 | 'Abg.', 127 | 'Bundeskanzler', 128 | 'Landeshauptmann', 129 | 'Kaiser', 130 | 'Kaiserin', 131 | 'König', 132 | 'Königin', 133 | 'Kurfürst', 134 | 'Kurfürstin', 135 | 'Erzherzog', 136 | 'Erzherzogin', 137 | 'Großherzog', 138 | 'Großherzogin', 139 | 'Großfürst', 140 | 'Großfürstin', 141 | 'Herzog', 142 | 'Herzogin', 143 | 'Pfalzgraf', 144 | 'Pfalzgräfin', 145 | 'Markgraf', 146 | 'Markgräfin', 147 | 'Landgraf', 148 | 'Landgräfin', 149 | 'Reichsfürst', 150 | 'Reichsfürstin', 151 | 'Reichsgraf', 152 | 'Reichsgräfin', 153 | 'Burggraf', 154 | 'Burggräfin', 155 | 'Altgraf', 156 | 'Altgräfin', 157 | 'Reichsfreiherr', 158 | 'Reichsfreifrau', 159 | 'Reichsfreiin', 160 | 'Reichsritter', 161 | 'Ritter', 162 | 'Graf', 163 | 'Gräfin', 164 | 'Edler', 165 | 'Edle', 166 | 'Freifrau', 167 | 'Frfr.', 168 | 'Freiherr', 169 | 'Frhr.', 170 | 'Hochwürden', 171 | 'Pater', 172 | 'Pfarrer', 173 | 'Pastor', 174 | 'P.', 175 | 'Pfarrhelfer', 176 | 'Kaplan', 177 | 'Vikar', 178 | 'Dekan', 179 | 'Bischof', 180 | 'Kapitän', 181 | 'Kpt.', 182 | 'Leutnant', 183 | 'Lt.', 184 | 'Vorsitzender', 185 | 'Vors.', 186 | } for w in ws[:-1]], 187 | 'amh': lambda ws: [False] + [w in { 188 | 'አቶ', # Mr. 
189 | 'ወይዘሮ', 190 | 'ወይዘሪት', 191 | 'ፕሮፌሰር', 192 | 'ፕሬዚዳንት', 193 | 'ፐሬዝዳንት', 194 | 'ፕሬዝዳንት', 195 | 'ኮለኔል', 196 | 'ጄኔራል', 197 | 'አቡነ', 198 | 'ቀስ', 199 | 'ሰላም', 200 | 'ሼኽ', 201 | 'ራስ', 202 | 'ቢትወደድ', 203 | 'ወ/ሮ', 204 | 'ወ/ሪት', 205 | 'ድ/ር', 206 | 'ፕ/ር', 207 | 'ፕ/ት', 208 | 'ኮ/ል', 209 | 'ጄ/ል', 210 | 'ሼኽ', 211 | 'ራስ', 212 | 'ቢትወደድ', 213 | 'አዛዥና', 214 | 'ልዑል', 215 | 'ሚኒስቴር', 216 | 'ዕድሜው', 217 | 'ወታደር', 218 | 'ም/ል', 219 | 'ጸሃፊ', 220 | 'ረዳት', 221 | 'ጸሐፊ', 222 | 'አምባሳደር', 223 | 'አስተዳዳሪ', 224 | 'ሪፖርተራችን', 225 | } for w in ws[:-1]], 226 | 'orm': lambda ws: [False] + [w.lower() in { 227 | 'obbo', # Mister 228 | 'obboo', # Mister 229 | 'obo', # Mister 230 | 'abbaa', # Father 231 | 'aba', 232 | 'ministeeraa', # Minister 233 | } for w in ws[:-1]], 234 | 'tir': lambda ws: [False] + [w in { 235 | 'ኣቶ', # Mister_1 236 | 'ጐይታይ', # Mister_2 237 | 'ሓላፊ', # President_1 238 | 'ሓለቓ', # President_2 239 | 'ወዘተ', # President_3 240 | 'ፕረሲደንት', # President_4 241 | 'ፕሬዝዳንት', # President_5 242 | 'ኣቦ', # Father 243 | } for w in ws[:-1]], 244 | 'som': lambda ws: [w in {} for w in ws], 245 | } 246 | 247 | 248 | ex_head_org = { 249 | 'eng': lambda ws: [w in { 250 | 'Ministry', 251 | 'Department', 252 | 'Agency', 253 | 'Bureau', 254 | 'Company', 255 | 'Corporation', 256 | 'Inc.', 257 | 'Inc', 258 | 'Corp.', 259 | 'Corp', 260 | 'Authority', 261 | 'Organization', 262 | 'Organisation', 263 | 'Committee', 264 | 'Bank', 265 | } for w in ws], 266 | 'deu': lambda ws: [w in { 267 | 'Amt', 268 | 'Ministerium', 269 | 'Agentur', 270 | 'Büro', 271 | 'Organisation', 272 | 'Abteilung', 273 | 'Abt.', 274 | 'Aktiengesellschaft', 275 | 'AG', 276 | 'Union', 277 | 'Genossenschaft', 278 | 'Gen.', 279 | 'Gesellschaft', 280 | 'GmbH', 281 | 'HTL', 282 | 'Regierung', 283 | 'Verband', 284 | 'Kommission', 285 | 'Bank', 286 | } for w in ws], 287 | 'amh': lambda ws: [w in { 288 | 'ሚኒስቴር', 289 | 'ኤጀንሲ', 290 | 'ኮሚሽን', 291 | 'ኮርፖሬሽን', # corporation 292 | 'ድርጅት', 293 | 'ባለሥልጣን', 294 | 'ባንክ', 295 | 'ቢሮ', 296 | 'ኮሚቴ', 297 | 'ኮርፖሬሽን', 298 | 'ምንጮች', 299 | 'ፓርቲ', # party 300 | 'ፓርቲን', # party_2 301 | 'ጋዜጣ', # newpaper 302 | } for w in ws], 303 | 'orm': lambda ws: [w.lower() in { 304 | 'ministirii', # Ministry 305 | 'ministiri', 306 | 'damiyyaa', # Department 307 | 'damiyya', 308 | 'wakkiila', # Agency 309 | 'wakila', 310 | 'dhaabbata', # Organization 311 | 'dhabata', 312 | 'koree', # Committee 313 | 'kore', 314 | 'baankii', # Bank 315 | 'banki', 316 | 'waldaa', # Society 317 | 'walda', 318 | 'waraanni', # Front 319 | 'warnani', 320 | } for w in ws], 321 | 'tir': lambda ws: [w in { 322 | 'ክፍሊ', # Department_1 323 | 'ጨንፈር', # Department_2 324 | 'ዋኒን', # Agency_1 325 | 'ተግባር', # Agency_2 326 | 'ስርሒት', # Agency_3 327 | 'ኤጄንሲ', # Agency_4 328 | 'ሰደቓ', # Bureau 329 | 'ኮርፖረሽን', # Corporation 330 | 'ውድብ', # Organization_1 331 | 'ኣወዳድባ', # Organization_2 332 | 'ኣመሰራርታ', # Organization_3 333 | 'ኮመት', # Committee_1 334 | 'ሽማግለ', # Committee_2 335 | 'ሰራዊት', # Army 336 | 'ስርዓት', # Regime 337 | } for w in ws], 338 | 'som': lambda ws: [w.lower() in { 339 | 'dowladda', # government 340 | 'maamulka', # administration 341 | 'xafiiska', # office 342 | 'wasaaradda', # ministry 343 | 'hay\'adda', # agency 344 | 'shirkadda', # corporation 345 | 'saacadaha', # organization 346 | 'guddi', # board 347 | 'bankiga', # bank 348 | 'ciidamada', # army 349 | 'kooxda', # faction 350 | 'shabakada', # network 351 | } for w in ws], 352 | } 353 | 354 | 355 | ex_head_loc = { 356 | 'eng': lambda ws: [w in { 357 | 'Island', 358 | 'Lake', 359 | 'River', 360 | 'Sea', 361 | 'Ocean', 362 | 'Mountain', 363 
| 'Mountains', 364 | 'Valley', 365 | 'Bay', 366 | 'Mosque', 367 | 'Cathedral', 368 | 'Church', 369 | } for w in ws], 370 | 'deu': lambda ws: [any([ 371 | re.search('[Bb]erg$', w), 372 | re.search('[Gg]ebirge$', w), 373 | re.search('[Ss]ee$', w), 374 | re.search('[Mm]eer$', w), 375 | re.search('[Oo]zean$', w), 376 | re.search('[Tt]al$', w), 377 | re.search('wald$', w), 378 | re.search('[Bb]ucht$', w), 379 | re.search('[Kk]irche$', w), 380 | re.search('[Mm]oschee$', w), 381 | ]) for w in ws], 382 | 'amh': lambda ws: [w in { 383 | 'ደሴት', 384 | 'ሐይክ', 385 | 'ወንዝ', 386 | 'ባህር', 387 | 'ወቅያኖስ', 388 | 'ተራራ', 389 | 'ሸለቆ', 390 | 'ሰፈር', 391 | 'ወሽመጥ', 392 | 'መስጊድ', 393 | 'ሀገር', 394 | 'ሆስፒታል', # hospital 395 | } for w in ws], 396 | 'orm': lambda ws: [w.lower() in { 397 | 'odoola', # Island 398 | 'odola', 399 | 'odoolota', # Islands 400 | 'odolota', 401 | 'calalaqa', # Lake_1 402 | 'dabbal', # Lake_2 403 | 'dabal', 404 | 'hara', # Lake_3 405 | 'laaqii', # Lake_4 406 | 'laqi', 407 | 'lagaa', # River 408 | 'laga', 409 | 'garba', # Sea 410 | 'maanya', # Ocean 411 | 'manya', 412 | 'gooroo', # Mountains 413 | 'goro', 414 | 'gaara', # Mountain 415 | 'sulula', # Valley 416 | 'bataskaana', # Church 417 | 'masqiida', # Mosque 418 | } for w in ws], 419 | 'tir': lambda ws: [w in { 420 | 'ደሴት', # Island_1 421 | 'ግሉል', # Island_2 422 | 'ብሕቱው', # Island_3 423 | 'ቀላይ', # Lake_1 424 | 'ወይናይ', # Lake_2 425 | 'ፈለግ', # River 426 | 'ባሕሪ', # Sea 427 | 'ሰፊሕ', # Ocean 428 | 'ጎቦ', # Mountain_1 429 | 'እምባ', # Mountain_2 430 | 'ሩባ', # Valley_1 431 | 'ለሰ', # Valley_2 432 | 'ሕሉም', # Valley_3 433 | 'ስንጭሮ', # Valley_4 434 | 'በተኽስያን', # Church 435 | 'መስጊድ', # Mosque 436 | } for w in ws], 437 | 'som': lambda ws: [w.lower() in { 438 | 'jasiirad', # island 439 | 'harada', # lake 440 | 'buurta', # mountain 441 | 'dooxada', # valley 442 | 'badweynta', # ocean 443 | 'webiga', # river 444 | 'masaajid', # mosque 445 | 'hoteel', # hotel 446 | 'hotelka', # hotel 447 | 'hotel', # hotel 448 | 'degmada', # district 449 | 'deegaanka', # district 450 | } for w in ws], 451 | } 452 | 453 | 454 | ex_head_gpe = { 455 | 'eng': lambda ws: [w in { 456 | 'District', 457 | 'Zone', 458 | 'Region', 459 | 'Province', 460 | 'Division', 461 | 'Republic', 462 | 'Nation', 463 | 'City', 464 | 'Town', 465 | 'Village', 466 | 'State', 467 | } for w in ws], 468 | 'deu': lambda ws: [any([ 469 | re.search('[rR]epublik$', w), 470 | re.search('land$', w), 471 | re.search('stan$', w), 472 | re.search('[sS]tadt$', w), 473 | re.search('heim$', w), 474 | re.search('dorf$', w), 475 | re.search('hausen$', w), 476 | re.search('burg$', w), 477 | re.search('berg$', w), 478 | re.search('gau$', w), 479 | re.search('[pP]rovinz$', w) 480 | ]) for w in ws], 481 | 'amh': lambda ws: [w in { 482 | 'ከተማ', 483 | 'መንደር', 484 | 'ቀበሌ', 485 | 'ወረዳ', 486 | 'ዞን', 487 | 'ክልል', 488 | 'አውራጃ', 489 | 'መንግስት', 490 | 'ክፍላት', 491 | 'ጦር', 492 | 'ዙሪያ', 493 | 'ላይ', 494 | 'ተከማ', # town 495 | } for w in ws], 496 | 'orm': lambda ws: [w.lower() in { 497 | 'koonyaa', # District_1 498 | 'konya', 499 | 'aanaa', # District_2 500 | 'ana', 501 | 'goltaa', # Zone_1 502 | 'golta', 503 | 'godina', # Zone_2 504 | 'naannoo', # Region 505 | 'nano', 506 | 'jamuriyaa', # Republic_1 507 | 'jamuriya', 508 | 'republika', # Republic_2 509 | 'magaalaa', # City 510 | 'magala', 511 | 'magaalaan', 512 | 'magalan', 513 | 'daabbaa', # Town 514 | 'daba', 515 | 'dira', # Big Town 516 | 'gandaa', # Village 517 | 'ganda', 518 | 'mootummaa', 519 | 'motuma', 520 | } for w in ws], 521 | 'tir': lambda ws: [w in { 522 | 'ወረዳ', # District 523 | 
'ዞባ', # Zone 524 | 'ከተማ', # City 525 | 'ዞና', # Region 526 | 'መንግስቲ', # State 527 | 'ኣውራጃ', # Prefecture/Province 528 | 'ረፑብሊክ', # Republic 529 | 'ከተማ', # City 530 | 'ገጠር', # Village_1 531 | 'ቁሸት', # Village_2 532 | 'ዓዲ', # Village_3 533 | } for w in ws], 534 | 'som': lambda ws: [w.lower() in { 535 | 'dalka', # country 536 | 'dalalka', # country 537 | 'gobolka', # province, state 538 | 'magaalada', # city 539 | 'tuulo', # village 540 | 'jamhuuriyadda', # republic 541 | } for w in ws], 542 | } 543 | 544 | 545 | ex_prep_from = { 546 | 'eng': lambda ws: [w.lower() == 'from' for w in ws], 547 | 'deu': lambda ws: [w.lower() in {'von', 'vom'} for w in ws], 548 | 'amh': lambda ws: [bool(re.match('ከ', w)) for w in ws], 549 | 'orm': lambda ws: [w.lower() in {'irraa', 'ira'} for w in ws], 550 | 'tir': lambda ws: [w in {'ካብ'} for w in ws], 551 | 'som': lambda ws: [w in {'ilaa'} for w in ws], 552 | } 553 | 554 | 555 | ex_prep_in = { 556 | 'eng': lambda ws: [w.lower() == 'in' for w in ws], 557 | 'deu': lambda ws: [w.lower() in {'in', 'im'} for w in ws], 558 | 'amh': lambda ws: [bool(re.match('በ', w)) for w in ws], 559 | 'orm': lambda ws: [w.lower() in {'keessa', 'kesa', 'itti', 'iti'} for w in ws], 560 | 'tir': lambda ws: [w in {'ኣብ'} for w in ws], 561 | 'som': lambda ws: [w in {'ee'} for w in ws], 562 | } 563 | 564 | 565 | extractors = [ 566 | lambda lang: ex_capitalized, 567 | lambda lang: ex_all_uppercased, 568 | lambda lang: ex_mixed_case, 569 | lambda lang: ex_internal_period, 570 | lambda lang: ex_non_letter, 571 | lambda lang: ex_digits, 572 | lambda lang: ex_long_token, 573 | lambda lang: ex_contains_latin, 574 | lambda lang: ex_contains_ethiopic, 575 | lambda lang: ex_title[lang], 576 | lambda lang: ex_head_org[lang], 577 | lambda lang: ex_head_loc[lang], 578 | lambda lang: ex_head_gpe[lang], 579 | lambda lang: ex_prep_from[lang], 580 | lambda lang: ex_prep_in[lang], 581 | ] 582 | 583 | 584 | TYPE_START, TYPE_END = 0, 9 585 | TOKEN_START, TOKEN_END = 9, 15 586 | 587 | 588 | def extract(lang, seg): 589 | fts = zip(*[ex(lang)(seg) for ex in extractors]) 590 | return [map(int, f) for f in fts] 591 | 592 | 593 | def extract_type_level(lang, seg): 594 | fts = extract(lang, seg) 595 | return [v[TYPE_START:TYPE_END] for v in fts] 596 | 597 | 598 | def extract_token_level(lang, seg): 599 | fts = extract(lang, seg) 600 | return [v[TOKEN_START:TOKEN_END] for v in fts] 601 | 602 | 603 | def extractIndicatorFeatures(lang, seg): 604 | fts = extract(lang, seg) 605 | return fts 606 | 607 | if __name__ == "__main__": 608 | seg = [u'\u121d\u12dd\u1263\u12d5', u'\u12a3\u12e8\u122d', u'-', u'\u12f6\u1265', u'\u12a3\u120d\u1266', u'\u12c8\u1325\u122a', u'\u12d3\u1208\u121d'] 609 | b = extract("tir", seg) 610 | print(b) -------------------------------------------------------------------------------- /utils/orm_morph.py: -------------------------------------------------------------------------------- 1 | def best_parse(a): 2 | return "www" -------------------------------------------------------------------------------- /utils/orm_norm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/cmu-ner/d35d57fe453d81cc98e3ee55bac58f9ca618f59b/utils/orm_norm/__init__.py -------------------------------------------------------------------------------- /utils/orm_norm/orm_gaz.txt: -------------------------------------------------------------------------------- 1 | Abadula Gemeda Abbaaduulaa Gammadaa 2 | Abalti Mountains Gooroo Abbaltii 3 | Abbalti 
Abbaltii 4 | Abbaya Abbaya 5 | Abbaya Abbaayaa 6 | Abomsa Abboomsaa 7 | Abyssinia Habashaa 8 | Adabba Adaabbaa 9 | Adama Adaamaa 10 | Adama Hadaamaa 11 | Adami Tullu Adaamii Tulluu 12 | Addalle Addallee 13 | Addelle Addellee 14 | Addis Ababa Finfinne 15 | Addis Ababa Shaggar 16 | Adoolaa Adoolaa 17 | Adulala Adulaala 18 | Afan Oromo (language) Afaan Oromoo 19 | Afar Region Naannoo Affaar 20 | Afghanistan Afgaanistaan 21 | Africa Afrikaa 22 | Africa Aafrikaa 23 | Africa Afrikaa 24 | Agafra Agafraa 25 | Aggaro Aggaaroo 26 | Aji Ajjee 27 | Ajje Ajjee 28 | Akaki Aqaaqii 29 | Akaki Basaka Aqaaqii Basaqaa 30 | Alaltu Alaltuu 31 | Albania Albaaniyaa 32 | Alexandria Iskindiriyaa 33 | Algeria Aljeeriyaa 34 | Ambo Amboo 35 | Ambo Ambo 36 | Ambo University Yuunibarsiitii Amboo 37 | Amhara Region Naannoo Amaaraa 38 | Amigna Amiinyaa 39 | Amnesty International Amnistii Internaashinaal 40 | Andorra Andooraa 41 | Angar Angar 42 | Angettu Angeettuu 43 | Angola Angoolaa 44 | Antarctic Antaartikaa 45 | Antarctic Ocean Maanya Kibbacabbii 46 | Antigua and Barbuda Antiguwaa fi Barbuda 47 | Arba Bordode Arba Bordodee 48 | Arba Gugu Mountains Gooroo Arba Guguu 49 | Arboyye Arbooyyee 50 | Arctic Ocean Maanya Kaabacabbii 51 | Arero Areeroo 52 | Argentina Arjantiinaa 53 | Arjo Arjoo 54 | Armenia Armeeniyaa 55 | Arsi Arsii 56 | Arsi Zone Arsii 57 | Arsi Zone Godina Arsii 58 | Asabot Asaboot 59 | Asalla Asallaa 60 | Asandabo Asandaaboo 61 | Asasa Asaasa 62 | Aseko Asako 63 | Asgori Abebe Asgorii Abeebee 64 | Asgori Bacho Asgorii Bachoo 65 | Asha Ashaa 66 | Asia Eshiyaatti 67 | Asia Asiyaa 68 | Asosa Asaasaa 69 | Atlanta Atlaantaa 70 | Australia Awustiraaliyaa 71 | Australia Awustaraaliyaa 72 | Austria Ostiriyaa 73 | Awaash Malkasa Awaash Malkaasaa 74 | Awadai Awaday 75 | Awash Awaash 76 | Awash Baldho Awaash Baldhoo 77 | Awash Kunture Awaash Qunxuree 78 | Ayira Ayiraa 79 | Azerbaijan Azarbajaan 80 | Baabbile Baabbilee 81 | Babbicha Baabbicha 82 | Babylon Baabiloon 83 | Badda Kerro Baddaa Qeerroo 84 | Badda Rogge Baddaa Roggee 85 | Baddalle Baddallee 86 | Baddanno Baddannoo 87 | Baddessa Baddeessaa 88 | Bahamas Bahaamaa 89 | Bahir Dar Baahir Daar 90 | Bahrain Baahireen 91 | Bakke Bakkee 92 | Bakko Baakkoo 93 | Bako Tibe Baakkoo Tibbee 94 | Bale Baale 95 | Bale (zone) Baale 96 | Bale (zone) Godina Baale 97 | Bambasi Baambasii 98 | Bangladesh Baangiladeesh 99 | Bantu Baantuu 100 | Barbados Barbaadoos 101 | Barcelona Barseloonaa 102 | Baro Baaroo 103 | Bata (town in Equatorial Guinea) Bataa 104 | Bati Baatii 105 | Batu Baatuu 106 | Becho Bacho 107 | Begi Beegii 108 | Belarus Belarus 109 | Belgium Beeljigii 110 | Belize Beliz 111 | Bench Maji Beenchii Maajii 112 | Benin Bineen 113 | Benishangul-Gumuz (region) Benishangul-Gumuzii 114 | Bhutan Butaan 115 | Bijata Abijaataa 116 | Bila Biilaa 117 | Birbir Birbir 118 | Bishoftu Bishooftuu 119 | Bodda Bodda 120 | Bofa Bofaa 121 | Boke Tikko Bookee Xiqqoo 122 | Bokojjii Boqojjii 123 | Bokona Borkanaa 124 | Bole Bulbula Boolee Bulbulaa 125 | Bolivia Boliibiyaa 126 | Bombay Bombee 127 | Bora Booraa 128 | Borana Boorana 129 | Bore Booree 130 | Borena Zone Godina Booranaa 131 | Borana Zone Godina Booranaa 132 | Boroda Borodaa 133 | Boru Jawwi Boruu Jaawwii 134 | Bosnia and Herzegovina Bosniyaa fi Hersigobenaa 135 | Botswana Botiswaanaa 136 | Brazil Biraazil 137 | Brunei Birunaay 138 | Bulbuloo Bulbuloo 139 | Bulgaria Bulgaariyaa 140 | Burayu Buraayyuu 141 | Bure Buree 142 | Burka Burqaa 143 | Burkina Faso Burkiinaa Faasoo 144 | Burkina Faso Burkinaa Faasoo 145 | Burma Barmaa 146 | 
Burundi Burundii 147 | Busa Buusaa 148 | Calcutta Kaalikutaa 149 | Cambodia Kamboodiyaa 150 | Cameroon Kaameruun 151 | Cameroon Kamero 152 | Canada Kanaadaa 153 | Cape Town Keep Taawon 154 | Cape Verde Keppe Verdee 155 | Casablanca Kaasabilaankaa 156 | Central African Republic Republika Afrikaa Jiddugalee 157 | Chacha Caaca 158 | Chad Chaad 159 | Chaffa Robi Caffaa Roobii 160 | Chaffe Donsa Caffee Doonsaa 161 | Chaffo Caffo 162 | Chalalaka Calalqaa 163 | Chalanko Calanqoo 164 | Chalbi Calbii 165 | Chamo Caamoo 166 | Chancho Caancoo 167 | Chanka Caanqaa 168 | Cheliya Calliyaa 169 | Chile Chiilee 170 | China Chaayinaa 171 | Chira Ciraa 172 | Chiro Ciro 173 | Chirracha Cirracha 174 | Chittu Cittuu 175 | Chukkala Hara Cuqqaalaa 176 | Chulta Gurre Birki Cuultaa Gurree Birqii 177 | Colombia Kolombiyaa 178 | Comoros Komooroo 179 | Congo (idk if it's DRC or Republic of the Congo) Koongoo 180 | Congo Brazzaville Kongoo Biraazabil 181 | Congo Kinshasa Koongoo Kinshaasaa 182 | Costa Rica Kostaa Rikaa 183 | Cote D'ivoire Kotee Dibiwaar 184 | Croatia Kirowaatiyaa 185 | Cuba Kuubaa 186 | Cyprus Sippiras 187 | Czech Republic Republika Cheekii 188 | Dabana Daabanaa 189 | Dabbus Daabbus 190 | Dadar Dadar 191 | Dagaga Dagaagaa 192 | Dagam Dagam 193 | Dalatti Daalattii 194 | Dallo Sarbo Dalloo Sarboo 195 | Dambi Dambii 196 | Dambi Dollo Dambi Dolloo 197 | Dandi Mountains Gooroo Dandii 198 | Dannaba Dannaba 199 | Dano Daannoo 200 | Dase Dasee 201 | Dawwa Dawwaa 202 | Dawwe Daawwee 203 | Dembidolo Dambi Dooloo 204 | Denmark Denmaarkii 205 | Dhera Dheeraa 206 | Diddessa Dhiddheessa 207 | Diksis Diksiis 208 | Dilalla Diilallaa 209 | Dima Diimaa 210 | Dimtuu Diimtuu 211 | Dinsho Diinsho 212 | Dire Dawa Dirre Dhawaa 213 | Dire Dawa Dirre Dawaa 214 | Dire Dawa Dirree Dhawaa 215 | Dirre Dhawa Dirre Dhawaa 216 | Dirre Dhawa Dirre Dawaa 217 | Dirre Dhawa Dirree Dhawaa 218 | Djibouti Jibuutii 219 | Djibouti Jabuutii 220 | Dobba Doobbaa 221 | Dodola woreda Dodola 222 | Dollo Noonnoo 223 | Dominica Dominikaa 224 | Dominican Republic Republika Dominikaa 225 | Dongoro Dongoroo 226 | Doni Doonii 227 | Doriya Dooriyaa 228 | Dubar Dubar 229 | Dukam Duukam 230 | East Hararge Harargee Bahaa 231 | East Hararghe Zone Harargee Bahaa 232 | East Hararghe Zone Godina Harargee Bahaa 233 | East Shawa Shawaa Bahaa 234 | East Shewa Zone Baha Shawaa 235 | East Shewa Zone Godina Baha Shawaa 236 | East Welega Zone Wallagga Bahaa 237 | East Welega Zone Godina Wallagga Bahaa 238 | East Wellega Zone Wallagga Bahaa 239 | East Wellega Zone Godina Wallagga Bahaa 240 | East Wollega Zone Wallagga Bahaa 241 | East Wollega Zone Godina Wallagga Bahaa 242 | East Wallagga Wallagga Bahaa 243 | East Wallagga Godina Wallagga Bahaa 244 | Ebibiyin (city in Equatorial Guinea) Ebeebiyiin 245 | Ecuador Ekuwadoor 246 | Eddo Eddoo 247 | Egypt Gibxii 248 | Ejere Ejeree 249 | Ejersa Ejersa 250 | Ejersa Goro Ejersa Gooroo 251 | El Salvador Elsalbadoor 252 | Equatorial Guinea Ekuwaatooriyaal Giinii 253 | Equatorial Guinea Giinii Mundhilafaa 254 | Erer Erer 255 | Erer Gota Erer Gootaa 256 | Eritrea Eeritiraa, Eertiraa 257 | Eritrea Ertiraa 258 | Estonia Istooniyaa 259 | Ethiopia Itiyoophiyaa 260 | Ethiopia Itiyoopiyaa 261 | Ethiopia Itoophiyaa 262 | Ethiopia Itoopiyaa 263 | Ethiopia Toophiyaa 264 | Europe Yuurooppi 265 | Europe Oroppaa 266 | Fafan Faafan 267 | Feyisa Lilesa Fayyisaa Leellisaa 268 | Fiche Fiche 269 | Fiji Fiijii 270 | Filtu Filtuu 271 | Fincha'a Fincaa'aa 272 | Fincha Fincaa'aa 273 | Finchawa Fincaawa 274 | Finland Finlaandii 275 | France Firaansi 276 
| France Faransaa 277 | Frankfurt Firaankifurtii 278 | Funyan Bira Funyaan Biraa 279 | Gabba Gabba 280 | Gabon Gaabon 281 | Gabon Gaaboon 282 | Gachi Gachii 283 | Gafarsa Hara Gafarsa 284 | Galamso Galamso 285 | Gambela Region Naannoo Gaambeellaa 286 | Gambia Gaambiyaa 287 | Gamo Gofa (zone) Gamuu-Gofaa 288 | Ganji Ganjii 289 | Gannale Gannaalee 290 | Garba Garbaa 291 | Garba Gurracha Garba Gurraacha 292 | Gasara Gasaraa 293 | Gattira Gaattiraa 294 | Gedo Geedoo 295 | Gembe Gembee 296 | Geneva Jenebaa 297 | Georgia Joorjiyaa 298 | Germany Biyya Jarmanii 299 | Ghana Gaanaa 300 | Gibe Gibe 301 | Gibe Kalla Gibe Qalldhaa 302 | Gidami Gidaamii 303 | Gidda Ayyana Giddaa Ayyaanaa 304 | Gimbi Gimbii 305 | Ginchi Giincii 306 | Gindo Gindo 307 | Ginde Beret Gindabarat 308 | Ginnir Gindhir 309 | Girma Seifu Girmaa Sayifuu 310 | Gobba Gobba 311 | Gobessa Gobeessa 312 | Gojjo Goojjoo 313 | Gojjota Goojjota 314 | Gore Goree 315 | Gori Gorii 316 | Goro Bale Gooroo Baalee 317 | Goro Sole Gooroo Soolee 318 | Goro Waliso Gooroo Walisoo 319 | Greece Giriik 320 | Greece Biyaa Giriikii 321 | Grenada Girenaadaa 322 | Guatemala Guwatimaalaa 323 | Gudar Gudar 324 | Guinea Giinii 325 | Guinea Bissau Giinii Bisaawu 326 | Guji Zone Godina Gujii 327 | Gullallee (district of Addis Ababa) Gullallee 328 | Gulliso Gullisoo 329 | Guraferda Gura Ferdaa 330 | Gurage Guraagee 331 | Guraghe Guraagee 332 | Gurra Dhamole Gurra Dhaamolee 333 | Gurura Gur'uraa 334 | Guruwa Guruwaa 335 | Guyana Guyaanaa 336 | Haile Selassie Hayila Sillaasee 337 | Haiti Hayitii 338 | Hamburg Hamburgii 339 | Hara Maya Hara Maayaa 340 | Haramaya Hara Maayaa 341 | Harar Harar 342 | Harari (region) Naannoo Hararii 343 | Harato Haratoo 344 | Harawacha Harawaaca 345 | Harbu Harbuu 346 | Harbu Chululle Harbuu Culullee 347 | Haro Dibbe Haroo Dibbee 348 | Haro Dumal Haroo Dumaal 349 | Haro Hara Liban Haroo Hara Liiban 350 | Harsadi Harsadii 351 | Hatayye Haxaayyee 352 | Herero Hereero 353 | Hiddi Lola Hiddii Lolaa 354 | Hirna Hirnaa 355 | Ho Chi Minh City Magaalaa Hochi Minii 356 | Holota Hoolota 357 | Holota Holota 358 | Honduras Honduraas 359 | Honqolloo Honqolloo 360 | Horn of Africa Gaanfi Afrikaa, Gaafa Afrikaa 361 | Horo Guduru Welega Zone Horroo Guduruu 362 | Horo Guduru Welega Zone Godina Horroo Guduruu 363 | Hulluka Hulluuqaa 364 | Human Rights Watch Hiyumaan Raaytis Waach 365 | Hungary Hungaariyaa 366 | Hurrumu Hurruumuu 367 | Hursa Huursaa 368 | Huruta Hurutaa 369 | Ibadan Ibaadan 370 | Ibsa (name) Ibsaa 371 | Iceland Islaandii 372 | Ijajji Ijaajjii 373 | Illubabor Zone Illuu Abbaa Booraa 374 | Illubabor Zone Illuu Abbaaboor 375 | Ilubba Bora Ilubbaa Booraa 376 | Inango Inaangoo 377 | Inchinni Metta Incinnii Meettaa 378 | Inchinnii Liban Incinnii Liiban 379 | India Biyya Hindii 380 | Indian Ocean Garba Indiyaa 381 | Indian Ocean Maanya Hindii 382 | Indonesia Indoneesiyaa 383 | Iran Iraan 384 | Iraq Iraaq 385 | Ireland Irlaandii 386 | Israel Israa'el 387 | Istanbul Istaanbul 388 | Italy Ixaaliyaa 389 | Italy Biyya Xaaliyaanii 390 | Itayya Itayyaa 391 | Ivory Coast Ayvorii Koost, Iyvoorikoost 392 | Jaja Jaaja 393 | Jajjabe Jajjabee 394 | Jalliyan Jalliyaan 395 | Jamaica Jamaayikaa 396 | Jamma Jammaa 397 | Japan Jaappaan 398 | Jarra Jaarraa 399 | Jarso Jaarso 400 | Jerusalem Yerusaalem 401 | Jido Jidda 402 | Jimma Mountains Gooroo Jimmaa 403 | Jimma University Yuunivarsiitii Jimmaa 404 | Jimma Zone Godina Jimmaa 405 | Jimma Jimma 406 | Jimma Jimaan 407 | Jimma Jimmam 408 | Jima Jimma 409 | Jima Jimaan 410 | Jima Jimmam 411 | Johannesburg 
Johaannisburgii 412 | Jordan Joordaan 413 | Kachisi Kaachisii 414 | Kake Qaaqee 415 | Kakka Kaakkaa 416 | Kamise Kamisee 417 | Kara Mille Karaa Millee 418 | Karachi Karaachii 419 | Karra Kore Kaarra Qoree 420 | Karsa Qarsaa 421 | Kazakhstan Kazaakistaan 422 | Kebbe Qebbee 423 | Keffa Kafaa 424 | Kelam Welega Zone Qeellam Wallaggaa 425 | Kelam Welega Zone Godina Qeellam Wallaggaa 426 | Kelam Wellega Zone Qeellam Wallaggaa 427 | Kelam Wellega Zone Godina Qeellam Wallaggaa 428 | Kelam Wollega Zone Qeellam Wallaggaa 429 | Kelam Wollega Zone Godina Qeellam Wallaggaa 430 | Kenya Keniyaa,Keeniyaa 431 | Kerransa Qeerransa 432 | Kersa Qarsaa 433 | Kiribati Kiribatii 434 | Kobbo Barento Qobboo Barentoo 435 | Kobbo Rayya Qobboo Raayyaa 436 | Kofale Kofalee 437 | Koka Qooqaa 438 | Kokossa Kokossa 439 | Kolobo Koloboo 440 | Kombolcha Guduru Kombolcha Guduruu 441 | Kombolcha Nole Kombolcha Noolee 442 | Kore Qoree 443 | Kotoba Kotoba 444 | Kullubbi Qullubbii 445 | Kunni Qundhii 446 | Kurfa Challe Kurfaa Callee 447 | Kuwait Kuweet 448 | Kuyera Kuyeeraa 449 | Kyrgyzstan Kirgizistaan 450 | Laga Dadhi Laga Daadhii 451 | Lagos Laagos 452 | Lake Awasa Laga Awaash 453 | Lake Dadi Hara Daadhii 454 | Lake Dambal Laaqii Dambal 455 | Lake Dandi Dabbal dandii 456 | Lake Maya Hara Maayaa 457 | Lake Wanchi Calalaqa Wanci 458 | Laki Dambal Laaqii Dambal 459 | Langanno Laangannoo 460 | Laos Laa'os 461 | Latvia Laatbiyaa 462 | Lebanon Lebaanon 463 | Lemman Leemman 464 | Lesotho Lesootoo 465 | Liben Zone Liiban 466 | Liberia Libeeriyaa 467 | Libya Liibiyaa 468 | Liechtenstein Lishtenistaayin 469 | Limmu Shaye Limmu Shaayee 470 | Lithuania Lituweeniyaa 471 | Los Angeles Los Anjeles 472 | Luxembourg Luksamburgii 473 | Macedonia Maasedoniyaa 474 | Machara Machaaraa 475 | Madagascar Madagaaskaar 476 | Maddo Gashi Maddo Gaashii 477 | Maki Maqii 478 | Malabo (town in Equatorial Guinea) Malabo 479 | Malawi Malaawii 480 | Malaysia Maleesiyaa 481 | Maldives Maaldibis 482 | Mali Maalii 483 | Malka Rafu Malkaa Raafuu 484 | Malka Wakkanna Malkaa Waakkannaa 485 | Malta Maaltaa 486 | Manchester United Manchastar Yunaayitid, Manchastar Siitii 487 | Mandi Mandii 488 | Mandida Mandiidaa 489 | Manna Maannaa 490 | Mararo Mararoo 491 | Marsa Marsaa 492 | Marseilles Maarsee 493 | Marshal Islands Odoolota Maarshaal 494 | Marti Martii 495 | Mata Hara Mata Haaraa 496 | Mattu Mattuu 497 | Mauritania Mawurtaaniyaa 498 | Mauritius Mawurishees 499 | Mecca Makkaa 500 | Mecha and Tulama Self-Help Association Waldaa Wal-gargaarsa Maccaa fi Tuulamaa 501 | Medina Madiinaa 502 | Mediterranean Sea Galaana Mediteraaniyaanii 503 | Mediterranean Sea Garba Meditraaniyaa 504 | Megga Meeggaa 505 | Meles Zenawi Meles Zeenaawwii 506 | Menelik (Emperor) Minilik 507 | Meta (woreda) Meettaa 508 | Metta Gafarsa Meettaa Gafarsaa 509 | Metti Meexxii 510 | Mexico Meeksikoo 511 | Michata Miiccataa 512 | Micronesia Mikroneeisyaa 513 | Miesso Mi'eesso 514 | Milan Milaanoo 515 | Minnesota Minnesootaa 516 | Minnesota Minisootaa 517 | Mogadishu Moqaadishoo 518 | Mogor Mogor 519 | Mojo Mojo 520 | Moldova Moldoobaa 521 | Mombasa Mombaasaa 522 | Monaco Monaakoo 523 | Mongolia Mongooliyaa 524 | Mongomo (town in Equatorial Guinea) Mongoma 525 | Montreal Montireel 526 | Mormor Mormor 527 | Morocco Morokko 528 | Morocco Morokoo 529 | Moyale Mooyalee 530 | Mozambique Moozaambik 531 | Mozambique Mozaambik 532 | Mt. Abella Gaara Abeellaa 533 | Mt. Adi Gaara Adii 534 | Mt. Asabot Agaar Asaboot 535 | Mt. Asha Gaara Ashaa 536 | Mt. Batu Gaara Baatuu 537 | Mt. Bora Gaara Booraa 538 | Mt. 
Chilalo Gaara Cilaaloo 539 | Mt. Chukkala Tulluu Cuqqaalaa 540 | Mt. Dalota Tulluu Daalotaa 541 | Mt. Dannaba Gaara Dannabaa 542 | Mt. Dimtu Tullu Diimtuu 543 | Mt. Erer Gaara Erer 544 | Mt. Fantalle Gaara Fantaallee 545 | Mt. Furi Tulluu Furii 546 | Mt. Gorfo Gaara Gorfoo 547 | Mt. Gorte Gaara Gorxee 548 | Mt. Hatabella Gaara Haxabeellaa 549 | Mt. Hochocha Gaara Hococaa 550 | Mt. Jibat Tulluu Jibaat 551 | Mt. Jorgo Tulluu Joorgoo 552 | Mt. Kunduddo Gaara Qunduddoo 553 | Mt. Mao Gaara Maa'oo 554 | Mt. Mara Tulluu Maraa 555 | Mt. Mullata Gaara Muldhataa 556 | Mt. Salale Gaara Salaalee 557 | Mt. Walal Tulluu Walal 558 | Muggi Muggii 559 | Munich Muniik 560 | Nagalle Arsi Nageellee Arsii 561 | Nageellee Metama Nageellee Meexamaa 562 | Nagelle Borana Nageellee Booranaa 563 | Najjo Najjoo 564 | Nakamte Naqamte 565 | Namagna Amanya 566 | Namibia Namibiyaa 567 | Nauru Nawuruu 568 | Negele Arsi Arsii-Nageellee 569 | Nekemte Naqamte 570 | Nekemte Naqamtee 571 | Nakamti Naqamte 572 | Nakamti Naqamtee 573 | Nek’emtē Naqamte 574 | Nek’emtē Naqamtee 575 | Nek'emte Naqamte 576 | Nek'emte Naqamtee 577 | Nepal Neppaal 578 | Netherlands Neezarlaandii 579 | New York Niwu Yoorki 580 | New Zealand Niw Zilaadii 581 | Nicaragua Nikaraaguwaa 582 | Niger Nijeer 583 | Nigeria Naayijeeriyaa 584 | Nigeria Naajeriyaa 585 | Nono Noonoo 586 | North America Ameerikaa Kaabaa 587 | North Korea Koriyaa Kaabaa 588 | North Sea Garba Boroo 589 | North Shewa Zone Shawaa Kaabaa 590 | North Shawa Shawaa Kaabaa 591 | Norway Noorwee 592 | Obbi Obbi 593 | Oborra Oborraa 594 | Oborso Oborso 595 | Ogolcho Ogolchoo 596 | Olonkomi Olonkomii 597 | Oman Omaan 598 | Oromia Media Network (OMN) OMN 599 | Oromia Region Oromiyaa 600 | Oromiya Region Oromiyaa 601 | Oromia Regional State Oromiyaa 602 | Oromo (people) Oromoo 603 | Oromo Liberation Front (OLF) Adda Bilisummaa Oromoo 604 | Oromo People's Democratic Organization (OPDO) Dhaabbata Dimookraatawaa Ummata Oromoo 605 | Osaka Osaakaa 606 | Pacific Ocean Garba Paasifiik 607 | Pakistan Paakistaan 608 | Palau Palawuu 609 | Panama Panamaa 610 | Papua New Guinea Pappawaa Niw giinii 611 | Paraguay Paraguwaay 612 | Philippines Filippiin 613 | Poland Polandii 614 | Portugal Portugaal 615 | Qatar Kataar 616 | Rammis Raammis 617 | Ras Dejen Raash Daashin 618 | Ras Dashen Raash Daashin 619 | Ras Dashan Raash Daashin 620 | Red Sea Galaana Diimaa 621 | Red Sea Garba Diimaa 622 | Rejji Reejjii 623 | Rio de Janeiro Riiyoo Di Jeneroo 624 | Robe Arsi Roobe Arsii 625 | Robe Bale roobe Baalee 626 | Rogge Ammayya Roggee Ammayyaa 627 | Romania Rumaaniyaa 628 | Rotterdam Roterdaam 629 | Russia Rusiyaa 630 | Rwanda Ruwaandaa 631 | Sabbata Sabbata 632 | Sagan Saagan 633 | Sagure Saaguee 634 | Saint Kitts and Nevis Seenti Kitii fi Nebis 635 | Saint Lucia Seenti Luchiyaa 636 | Saint Vincent and Grenadines Seenti Binchentii fi Girenadiin 637 | Sakka Saqqaa 638 | Sakkata Saaqqata 639 | Saku Saaku 640 | Sambate Sambatee 641 | same as Garba Meditraaniyaa Garba Jiddugaleessa 642 | Samoa Samowaa 643 | San Marino Saan Mariinoo 644 | Sandafa Sandaafa 645 | Sao Tome and Principe Sawo Toomee fi Pirinsippii 646 | Sarbo Sarboo 647 | Sasigga Saasiggaa 648 | Saudi Arabia Saudi Arabiya 649 | Saudi Arabia Sa'uudi Arabiya 650 | Saudi Arabia Suud Arabiyaa 651 | Saudi Arabia Sawudii Arabiyaa 652 | Sayyo sayyoo 653 | Seden Sodo Sadan Sooddoo 654 | Senegal Sengaal 655 | Senegal Senegaal 656 | Serbia and Montenegro Sarbiyaa fi Montenegroo 657 | Serofta Seeroftaa 658 | Seru Seeruu 659 | Seychelles Sechiliis 660 | Shabbe Shabbee 661 | 
Shaggar Shaggar 662 | Shaki Shakii 663 | Shakkiso Shaakkisoo 664 | Shala Shaalaa 665 | Shambu Shaambu 666 | Shanan Shanan 667 | Shanghai Shaangaay 668 | Shano Shano 669 | Shashamanne Shaashamannee 670 | Shashemene Shashemane 671 | Shashemene Shaashemannee 672 | Shashamane Shashemane 673 | Shashamane Shaashemannee 674 | Shayya Shaayyaa 675 | Shirbo Shirboo 676 | Shire Arsi Shiree Arsii 677 | Shoa: Shewa Shawaa 678 | Shoboka Shoboka 679 | Sibu Sire Siibuu Siree 680 | Sidama Zone Sidaamoo 681 | Sidama Zone Sidaamaa 682 | Sierra Leone Seraliyoon 683 | Silt'e Zone Silxee 684 | Siltie Zone Silxee 685 | Singapore Singapoor 686 | Sire Arsi Siree Arsii 687 | Sire Robi Siree Roobii 688 | Slovakia Islobaakiyaa 689 | Slovakia Islobeeniyaa 690 | Sodare Soodaree 691 | Sokorru Sokorruu 692 | Solomo Soolomo 693 | Solomon Islands Odoolota Solomon 694 | Somali Somaalee 695 | Somalia Somaaliyaa 696 | Sor Soor 697 | South Africa Afrikaa Kibbaa 698 | South Africa Aafrikaa Kibbaa 699 | South America Ameerikaa Kibbaa 700 | South Korea Koriyaa Kibbaa 701 | South Sudan Sudaan Kibbaa 702 | Southern Nations, Nationalities, and Peoples' Region (SSNPR) Naannoo Sabaa fi Sab-lammoota Ummattoota Kibbaa 703 | Southwest Shewa Zone Shawaa Kibba-lixaa 704 | Southwest Shewa Zone Shawaa Kill Lixaa 705 | Soyyama Sooyyama 706 | Spain Ispaanyaa 707 | Sri Lanka Siri Laankaa 708 | St. Petersburg Seenti Peterburgii 709 | Strasbourg Istiraasburgii 710 | Sudan Sudaan 711 | Sululta Sululta 712 | Suriname Surinaam 713 | Swaiziland Iswaazilaandii 714 | Sweden Iswiidin 715 | Switzerland Biyya Iswiisii 716 | Sydney Sidinee 717 | Syria Sooriyaa 718 | Tafki Tafkii 719 | Taiwan Taayiwaan 720 | Taji Tajii 721 | Tajikistan Tajikistaan 722 | Taltalle Taltallee 723 | Tanzania Tanzaaniyaa 724 | Tanzania Tanzaaniyaa 725 | Thailand Taayilaandii 726 | The Hague Haag 727 | Tibbe Tibbee 728 | Tigray Region Naannoo Tigraay 729 | Timbuktu Tumbuktuu 730 | Tiyya Xiyyaa 731 | Tobba Toobbaa 732 | Togo Toogoo 733 | Toke Kutaye Kuutaayee 734 | Toke Kutaye Tokkee Kuutaayee 735 | Tokke Irressa Tokkee Irreessaa 736 | Tole Tole 737 | Tonga Tongaa 738 | Toronto Torontoo 739 | Trinidad and Tobago Tirindaadii fi Tobaagoo 740 | Tullu Milki Tulluu Milkii 741 | Tulu Bolo Tullu Boolloo 742 | Tunisia Tuniisiyaa 743 | Tunisia Tuniisiyaa 744 | Turkey Biyya Turkii 745 | Turkmenistan Turkemenistaan 746 | Tuvalu Tabuluu 747 | U.S.A. 
Ameerikaa 748 | Uganda Yugaandaa 749 | Uganda Ugaandaa 750 | Ukraine Ukraayin 751 | United Arab Emirates Tokkummaa Imiroota Arabaa 752 | United Kingdom Biritaaniyaa 753 | United Kingdom Yunaayitid Kingidem 754 | UK Yunaayitid Kingidem 755 | United Liberation Forces of Oromia Tokkummaa Humnoota Bilisummaa Oromiyaa 756 | United States Yunaayitid Isteesi 757 | United States of America Yunaayitid Isteesi 758 | US Yunaayitid Isteesi 759 | USA Yunaayitid Isteesi 760 | Urgessa Urgeessaa 761 | Uruguay Uruguwaay 762 | Uzbekistan Uzbeekistaan 763 | Vanatu Banuwaatuu 764 | Vatican Baatikaan 765 | Venezuela Benezuweelaa 766 | Venice Benesiyaa 767 | Vietnam Beetinaam 768 | Voice of America raadiyoon sagalee Amerikaa 769 | Wabe Waabee 770 | Wabe Gafarsa Waabee Gafarsaa 771 | Wachu Waaccuu 772 | Wadera Wadeera 773 | Walabu Haroo Walaabuu 774 | Walanchitti Walancittii 775 | Waldiya Waldiyaa 776 | Walga Waalgaa 777 | Waliso (wordea) Waliso 778 | Waliso (wordea) Walisoo 779 | Walmal Walmal 780 | Wama Waamaa 781 | Wanci Mountains Gooroo Wancii 782 | Wanji Wanjii 783 | Wanji Gafarsa Wanjii Gafarsaa 784 | Warka Warqa 785 | Watar Watar 786 | Wayane Wayyannee 787 | Wayane Wayyanne 788 | Wayane Wayyaane 789 | Tigrayan People's Liberation Front Wayyannee 790 | Tigrayan People's Liberation Front Wayyanne 791 | Tigrayan People's Liberation Front Wayyaane 792 | Weyane Wayyannee 793 | Weyane Wayyanne 794 | Weyane Wayyaane 795 | Second Weyane Wayyannee 796 | Second Weyane Wayyanne 797 | Second Weyane Wayyaane 798 | Wayyu Waayyuu 799 | Welega Wallagga 800 | Wellega Wallagga 801 | Wollega Wallagga 802 | West Arsi Zone Arsii Lixaa 803 | West Arsi Zone Arsii Dhihaa 804 | Mirab Arsi Zone Arsii Lixaa 805 | Mirab Arsi Zone Arsii Dhihaa 806 | West Gojjam Zone Goojjaam Dhihaa 807 | West Hararghe Zone Harargee Dhihaa 808 | West Hararghe Zone Godina Harargee Dhihaa 809 | West Hararghe Zone Harargee Lixaa 810 | West Shewa Zone Shawaa Lixaa 811 | West Shewa Zone Shawaa Dhihaa 812 | West Shewa Zone Godina Shawaa Lixaa 813 | West Shewa Zone Godina Shawaa Dhihaa 814 | West Welega Zone Wallagga Lixaa 815 | West Welega Zone Wallagga Dhihaa 816 | West Welega Zone Godina Wallagga Lixaa 817 | West Welega Zone Godina Wallagga Dhihaa 818 | West Wellega Zone Wallagga Lixaa 819 | West Wellega Zone Wallagga Dhihaa 820 | West Wellega Zone Godina Wallagga Lixaa 821 | West Wellega Zone Godina Wallagga Dhihaa 822 | West Wollega Zone Wallagga Lixaa 823 | West Wollega Zone Wallagga Dhihaa 824 | West Wollega Zone Godina Wallagga Lixaa 825 | West Wollega Zone Godina Wallagga Dhihaa 826 | Mirab Welega Wallagga Lixaa 827 | Mirab Welega Wallagga Dhihaa 828 | Mirab Welega Godina Wallagga Lixaa 829 | Mirab Welega Godina Wallagga Dhihaa 830 | West Wallagga Wallagga Lixaa 831 | West Wallagga Wallagga Dhihaa 832 | West Wallagga Godina Wallagga Lixaa 833 | West Wallagga Godina Wallagga Dhihaa 834 | Weyib Weeyib 835 | World Bank Baankii Addunyaa 836 | Yaballo Yaaballoo 837 | Yabbu Yabbuu 838 | Yadot Yaadot 839 | Yambaro Yambaroo 840 | Yayyu Yaayyoo 841 | Yemen Yaman 842 | Yirba Muda Yirbaa Muudaa 843 | Yubdo Yuubdoo 844 | Zambia Zaambiyaa 845 | Zimbabwe Zimbaabwee 846 | -------------------------------------------------------------------------------- /utils/orm_norm/ormnorm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | 6 | PATTERNS = [ 7 | (r'([aeiou])(\1)', r'\1'), 8 | (r'(b|c|ch|d|dh|f|g|h|j|k|l|m|n|ny|p|ph|q|r|s|sh|t|v|w|x|y|z)\1', r'\1'), 9 | 
(r'ph', r'p'), 10 | (r'q', r'k'), 11 | (r'x', r't'), 12 | (r'c([^h]|\b)', r'ch\1'), 13 | (r'ai', r'ayi'), 14 | (r's(b|c|ch|d|dh|f|g|h|j|k|l|m|n|ny|p|ph|q|r|s|sh|t|v|w|x|y|z)', r'f\1'), 15 | ] 16 | 17 | 18 | def normalize(text): 19 | if all([x.isupper() for x in text]): 20 | return text 21 | cap = True if text[0].isupper() and all([x.islower() for x in text[1:]]) else False 22 | text = text.lower() 23 | for pattern, repl in PATTERNS: 24 | text = re.sub(pattern, repl, text) 25 | if cap: 26 | return text.capitalize() 27 | else: 28 | return text 29 | -------------------------------------------------------------------------------- /utils/post_process.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | from collections import defaultdict 3 | # "GENERAL lookup table" 4 | tags = set(['GPE', 'PER', 'ORG', 'LOC']) 5 | 6 | 7 | def read_gold_file(gold_path): 8 | with codecs.open(gold_path, "r", "utf-8") as fin: 9 | doc_set = set() 10 | for line in fin: 11 | line = line.strip() 12 | if len(line) == 0: 13 | continue 14 | 15 | line = line.decode('utf-8') 16 | tokens = line.split('\t') 17 | 18 | doc_id = tokens[0] 19 | start = int(tokens[1]) 20 | end = int(tokens[2]) 21 | ner = tokens[5].split('/')[0] 22 | 23 | doc_set.add((doc_id, start, end)) 24 | 25 | print 'num of annotated doc: %d' % len(doc_set) 26 | return doc_set 27 | 28 | 29 | def make_darpa_format(span, curr_docum, curr_anot, start, end, tag): 30 | st = 'CMU_NER_LOREAL_CP1_TB_GS' + '\t' + curr_docum + '-ann-' + str(curr_anot) + '\t' + span\ 31 | + '\t' + curr_docum + ':' + str(start) + '-' + str(end) + '\t' + 'NIL' + '\t' + \ 32 | tag + '\t' + 'NAM' + '\t' + '1.0' + "\n" 33 | return st.split('\t') 34 | 35 | 36 | def combine_lookup_table(lookup_files): 37 | lookup_table = defaultdict(lambda: set()) 38 | 39 | for key, fname in lookup_files.iteritems(): 40 | if key in tags: 41 | with codecs.open(fname, "r", "utf-8") as fin: 42 | for line in fin: 43 | lookup_table[line.strip()].add(key) 44 | else: 45 | with codecs.open(fname, "r", "utf-8") as fin: 46 | for line in fin: 47 | fs = line.strip().split('\t') 48 | lookup_table[fs[0]].add(fs[1]) 49 | new_lookup_table = dict() 50 | 51 | # remove spans that are annotated with multiple entities 52 | for key, value in lookup_table.iteritems(): 53 | if len(value) == 1: 54 | new_lookup_table[key] = list(value)[0] 55 | return new_lookup_table 56 | 57 | 58 | def single_lookup_table(lookup_file, tag): 59 | lookup_table = dict() 60 | if tag in tags: 61 | with codecs.open(lookup_file, "r", "utf-8") as fin: 62 | for line in fin: 63 | lookup_table[line.strip()] = tag 64 | else: 65 | with codecs.open(lookup_file, "r", "utf-8") as fin: 66 | for line in fin: 67 | fs = line.strip().split('\t') 68 | lookup_table[fs[0]] = fs[1] 69 | return lookup_table 70 | 71 | 72 | def find_ngrams(sent, starts, ends, n): 73 | all_ngrams = [] 74 | all_starts = [] 75 | all_ends = [] 76 | for i in range(1, n+1): 77 | all_ngrams += zip(*[sent[j:] for j in range(i)]) 78 | all_starts += zip(*[starts[j:] for j in range(i)]) 79 | all_ends += zip(*[ends[j:] for j in range(i)]) 80 | return all_ngrams, all_starts, all_ends 81 | 82 | 83 | def post_processing(path_darpa_prediction, 84 | path_to_full_setE, 85 | path_to_author, 86 | output_file, 87 | lookup_files=None, 88 | label_propagate=True, 89 | conf_num=0, 90 | gold_file_path=None, 91 | most_freq_num=20, 92 | fout_conll_name=None): 93 | ''' 94 | 95 | :param path_darpa_prediction: Final output 96 | :param path_to_full_setE: setE.conll 97 | 
:param path_to_author: "path_to_author_list" 98 | :param output_file: 99 | :param lookup_files: {"GPE": "path_to_lexicon_1", "General": "path_to_lexicon_2"} 100 | :param label_propagate: BOOLEAN 101 | :return: 102 | ''' 103 | 104 | predicted_doc = defaultdict(lambda: dict()) # (doc_id: (span_token, start, end):NER) 105 | unpredicted_spans = defaultdict(lambda: list()) # (doc_id: [(ngram_token, start, end)]) 106 | MAX_NGRAM = 5 107 | prediction_list = [] 108 | predicted_spans = defaultdict(lambda: list()) 109 | 110 | if lookup_files is not None: 111 | lookup_table = combine_lookup_table(lookup_files) 112 | author_lookup = single_lookup_table(path_to_author, "PER") 113 | annot_id = defaultdict(lambda: 0) # doc_id:annotation num 114 | 115 | gold_spans = read_gold_file(gold_file_path) 116 | 117 | def _look_up(span, doc_attribute): 118 | if doc_attribute == "DF" and span in author_lookup: 119 | return 'PER' 120 | if lookup_files is not None and span in lookup_table: 121 | return lookup_table[span] 122 | return None 123 | 124 | def _is_overlap(s1, e1, s2, e2): 125 | # Condition: s1 < e1, s2 < e2 126 | return not(e1 < s2 or e2 < s1) 127 | 128 | def _check_cross_annotations(list_spans, target_start, target_end): 129 | flag = False 130 | for (s, e) in list_spans: 131 | if _is_overlap(s, e, target_start, target_end): 132 | flag = True 133 | break 134 | return flag 135 | 136 | add_labels = 0 # includes both fixed labels and added labels 137 | 138 | # First, use the lookup table to fix up the current predictions 139 | with codecs.open(path_darpa_prediction, "r", "utf-8") as fin: 140 | for line in fin: 141 | fields = line.strip().split('\t') 142 | span = fields[2] 143 | predict_tag = fields[5] 144 | doc_id_span = fields[3].split(":") 145 | doc_id = doc_id_span[0] 146 | doc_attribute = doc_id.split('_')[1] 147 | annot_id[doc_id] += 1 148 | span_id = [int(i.strip()) for i in doc_id_span[1].split('-')] 149 | start_id, end_id = span_id[0], span_id[1] 150 | 151 | lookup_tag = _look_up(span, doc_attribute) 152 | if lookup_tag is not None and lookup_tag != predict_tag and (doc_id, start_id, end_id) in gold_spans: 153 | add_labels += 1 154 | predict_tag = predict_tag if lookup_tag is None else lookup_tag 155 | 156 | predicted_doc[doc_id][(span, start_id, end_id)] = predict_tag 157 | prediction_list.append(make_darpa_format(span, doc_id, annot_id[doc_id], start_id, end_id, predict_tag)) 158 | predicted_spans[doc_id].append((start_id, end_id)) 159 | # Second, iterate over the full setE using the lookup tables to complete the predicted dict 160 | # In the meantime, give statistics of ngrams for label propagation.
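# --- Editor's note (illustrative only, not part of the original source): the second pass below assumes each
# non-blank row of setE.conll is tab-separated with tokens[0] = word, tokens[3] = doc_id, and
# tokens[6]/tokens[7] = span start/end offsets, and it enumerates every n-gram up to MAX_NGRAM as a
# candidate span to run through the same lookup tables. ---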
161 | ngram_freq = defaultdict(lambda: 0) 162 | full_setE_list = [] 163 | with codecs.open(path_to_full_setE, "r", "utf-8") as fin: 164 | one_sent = [] 165 | start_ids = [] 166 | end_ids = [] 167 | doc_attribute = "" 168 | for line in fin: 169 | tokens = line.split('\t') 170 | if len(tokens) == 0 or line == "" or line == "\n": 171 | one_sent_place_holder = [] 172 | for k, (w, s, e) in enumerate(zip(one_sent, start_ids, end_ids)): 173 | one_sent_place_holder.append((s, e, doc_id, w)) 174 | full_setE_list.append(one_sent_place_holder) 175 | 176 | ngrams, starts, ends = find_ngrams(one_sent, start_ids, end_ids, MAX_NGRAM) 177 | for ngram, s, e in zip(ngrams, starts, ends): 178 | ngram = " ".join(ngram) 179 | ngram_freq[ngram] += 1 180 | predict_tag = _look_up(ngram, doc_attribute) 181 | key = (ngram, s[0], e[-1]) 182 | if predict_tag is not None: 183 | if key not in predicted_doc[doc_id] and not _check_cross_annotations(predicted_spans[doc_id], s[0], s[-1]): 184 | predicted_doc[doc_id][key] = predict_tag 185 | annot_id[doc_id] += 1 186 | prediction_list.append(make_darpa_format(ngram, doc_id, annot_id[doc_id], s[0], e[-1], predict_tag)) 187 | 188 | predicted_spans[doc_id].append((s[0], e[-1])) 189 | if (doc_id, s[0], e[-1]) in gold_spans: 190 | add_labels += 1 191 | else: 192 | if key not in predicted_doc[doc_id]: 193 | unpredicted_spans[doc_id].append(key) 194 | one_sent = [] 195 | start_ids = [] 196 | end_ids = [] 197 | else: 198 | word = tokens[0] 199 | doc_id = tokens[3] 200 | doc_attribute = doc_id.split('_')[1] 201 | start = int(tokens[6]) 202 | end = int(tokens[7]) 203 | 204 | one_sent.append(word) 205 | start_ids.append(start) 206 | end_ids.append(end) 207 | 208 | if len(one_sent) != 0: 209 | one_sent_place_holder = [] 210 | for k, (w, s, e) in enumerate(zip(one_sent, start_ids, end_ids)): 211 | one_sent_place_holder.append((s, e, doc_id, w)) 212 | full_setE_list.append(one_sent_place_holder) 213 | 214 | ngrams, starts, ends = find_ngrams(one_sent, start_ids, end_ids, MAX_NGRAM) 215 | for ngram, s, e in zip(ngrams, starts, ends): 216 | ngram = " ".join(ngram) 217 | ngram_freq[ngram] += 1 218 | predict_tag = _look_up(ngram, doc_attribute) 219 | key = (ngram, s[0], e[-1]) 220 | if predict_tag is not None: 221 | if key not in predicted_doc[doc_id] and not _check_cross_annotations(predicted_spans[doc_id], s[0], 222 | s[-1]): 223 | predicted_doc[doc_id][key] = predict_tag 224 | annot_id[doc_id] += 1 225 | prediction_list.append( 226 | make_darpa_format(ngram, doc_id, annot_id[doc_id], s[0], e[-1], predict_tag)) 227 | 228 | predicted_spans[doc_id].append((s[0], e[-1])) 229 | if (doc_id, s[0], e[-1]) in gold_spans: 230 | add_labels += 1 231 | else: 232 | if key not in predicted_doc[doc_id]: 233 | unpredicted_spans[doc_id].append(key) 234 | 235 | print("Total %d labels in the gold spans get fixed by the lookup tables!" 
% (add_labels,)) 236 | 237 | def _print(dic): 238 | for k, v in dic.iteritems(): 239 | print k, v 240 | 241 | tot_prop_label = 0 242 | if label_propagate: 243 | # Label propagation 244 | # (a) Within document propagation 245 | for doc_id, span_infos in predicted_doc.iteritems(): 246 | vote_tag = defaultdict(lambda: defaultdict(list)) # span: tag:[(start, end)] 247 | for span_info, tag in span_infos.iteritems(): 248 | span = span_info[0] 249 | start = span_info[1] 250 | end = span_info[2] 251 | vote_tag[span][tag].append((start, end)) 252 | new_vote_tag = dict() 253 | for span, other in vote_tag.iteritems(): 254 | max_tag = "" 255 | max_vote = 0 256 | for tag in other.keys(): 257 | vote = len(other[tag]) 258 | if vote > max_vote: 259 | max_vote = vote 260 | max_tag = tag 261 | new_vote_tag[span] = (max_tag, vote_tag[span][max_tag], max_vote) 262 | 263 | add_label = 0 264 | for unpredict_span in unpredicted_spans[doc_id]: 265 | s2, e2 = unpredict_span[1], unpredict_span[2] 266 | uspan = unpredict_span[0] 267 | if uspan in new_vote_tag: 268 | # conservative propagation 269 | if new_vote_tag[uspan][2] <= conf_num: 270 | continue 271 | pred_tag = new_vote_tag[uspan][0] 272 | # check if there is an overlap between spans 273 | flag = True 274 | for s1, e1 in new_vote_tag[uspan][1]: 275 | if _is_overlap(s1, e1, s2, e2): 276 | print "There is overlap: ", (s1, e1), (s2, e2) 277 | flag = False 278 | break 279 | if flag and not _check_cross_annotations(predicted_spans[doc_id], s2, e2): 280 | # propagate the label 281 | if (doc_id, s2, e2) in gold_spans: 282 | add_label += 1 283 | annot_id[doc_id] += 1 284 | prediction_list.append(make_darpa_format(uspan, doc_id, annot_id[doc_id], s2, e2, pred_tag)) 285 | predicted_spans[doc_id].append((s2, e2)) 286 | unpredicted_spans[doc_id].remove(unpredict_span) 287 | if add_label > 0: 288 | tot_prop_label += add_label 289 | print("Within Document Label Propagation: Add %d labels for Doc %s. " % (add_label, doc_id)) 290 | 291 | print("Total %d labels get propagated within document for gold setE!" % (tot_prop_label, )) 292 | 293 | # (b) Cross document propagation 294 | freq_ngram_list = sorted(ngram_freq, key=ngram_freq.get)[-most_freq_num:] 295 | # for w in freq_ngram_list: 296 | # print w 297 | vote_tag = defaultdict(lambda: defaultdict(lambda :0)) 298 | for doc_id, span_infos in predicted_doc.iteritems(): 299 | for span_info, tag in span_infos.iteritems(): 300 | span = span_info[0] 301 | if span in freq_ngram_list: 302 | vote_tag[span][tag] += 1 303 | vote_out_ents = dict() 304 | vote_ent_freq = defaultdict(lambda: 0) 305 | for span, other in vote_tag.iteritems(): 306 | max_tag = "" 307 | max_vote = 0 308 | for tag, vote in other.iteritems(): 309 | vote_ent_freq[span] += vote 310 | if vote > max_vote: 311 | max_tag = tag 312 | max_vote = vote 313 | vote_out_ents[span] = max_tag 314 | print("###### Among %d most frequent ngram, %d of which are given labels by the model! ########### " 315 | "\n The original form and their voted labels are as follows: " % (most_freq_num, len(vote_out_ents))) 316 | print vote_out_ents 317 | print("#" * 6 + "More friendly format: " + "#" * 6) 318 | _print(vote_out_ents) 319 | print("######## Please do some correction or addition here if you are willing to! 
#########") 320 | vote_out_ents["#VOATigrigna"] = "ORG" 321 | vote_out_ents[u"\u12ad\u120d\u120d"] = "O" 322 | # vote_out_ents.__delitem__(u"\u12ad\u120d\u120d") 323 | print("#" * 6 + "After your correction, now they are: " + "#" * 6) 324 | _print(vote_out_ents) 325 | print("######## The model predictions are also fixed using the new dictionary! #########") 326 | fixed_pred = 0 327 | for i, items in enumerate(prediction_list): 328 | if items[2] in vote_out_ents: 329 | if vote_out_ents[items[2]] == "O": 330 | del prediction_list[i] 331 | fixed_pred += 1 332 | elif items[5] != vote_out_ents[items[2]]: 333 | prediction_list[i][5] = vote_out_ents[items[2]] 334 | fixed_pred += 1 335 | print("Total %d labels in previous predictions get fixed!" % (fixed_pred,)) 336 | add_label = 0 337 | vote_ent_add_freq = defaultdict(lambda :0) 338 | for doc_id, unpredict_span_list in unpredicted_spans.iteritems(): 339 | for unpredict_span in unpredict_span_list: 340 | start, end = unpredict_span[1], unpredict_span[2] 341 | uspan = unpredict_span[0] 342 | if uspan in vote_out_ents and not _check_cross_annotations(predicted_spans[doc_id], start, end) and vote_out_ents[uspan] != "O": 343 | # if (doc_id, start, end) in gold_spans: 344 | # add_label += 1 345 | add_label += 1 346 | vote_ent_add_freq[uspan] += 1 347 | annot_id[doc_id] += 1 348 | prediction_list.append( 349 | make_darpa_format(uspan, doc_id, annot_id[doc_id], start, end, vote_out_ents[uspan])) 350 | 351 | predicted_spans[doc_id].append((start, end)) 352 | unpredicted_spans[doc_id].remove(unpredict_span) 353 | print("\nTotal %d labels get propagated across document for gold setE!" % (add_label, )) 354 | print("\n####### Before label prop, the number of predictions have been assigned for each span: ########") 355 | _print(vote_ent_freq) 356 | print("####### Number of labels of each span ADDED in label prop: #########") 357 | _print(vote_ent_add_freq) 358 | with codecs.open(output_file, "w", encoding='utf-8') as fout: 359 | for item in prediction_list: 360 | one_sent = "\t".join(item) 361 | fout.write(one_sent) 362 | 363 | print "#" * 10 + "Starting converting to conll format! 
" + "#" * 10 364 | if fout_conll_name is not None: 365 | prediction_dict = dict() 366 | 367 | for items in prediction_list: 368 | doc_id = items[1].split('-')[0] 369 | s = int(items[3].split(":")[1].split("-")[0]) 370 | e = int(items[3].split(":")[1].split("-")[1]) 371 | word = items[2] 372 | tag = items[5] 373 | prediction_dict[(s, e, doc_id)] = (word, tag) 374 | 375 | def _check_predicted(word, s, e, doc_id, first_index, last_index): 376 | if (s, e, doc_id) in prediction_dict: 377 | pword, tag = prediction_dict[(s, e, doc_id)] 378 | if word == pword: 379 | return True, "B-" + tag 380 | else: 381 | for i in range(e+1, last_index+1): 382 | if (s, i, doc_id) in prediction_dict: 383 | pword, tag = prediction_dict[(s, i, doc_id)] 384 | if word == pword[0:len(word)]: 385 | return True, "B-" + tag 386 | for i in range(first_index, s): 387 | if (i, e, doc_id) in prediction_dict: 388 | pword, tag = prediction_dict[(i, e, doc_id)] 389 | if word == pword[len(pword)-len(word):]: 390 | return True, "I-" + tag 391 | for i in range(first_index, s): 392 | for j in range(e+1, last_index+1): 393 | if (i, j, doc_id) in prediction_dict: 394 | pword, tag = prediction_dict[(i, j, doc_id)] 395 | if word in pword: 396 | return True, "I-" + tag 397 | return False, "O" 398 | 399 | num_preded = 0 400 | lines = 0 401 | with codecs.open(fout_conll_name, "w", encoding="utf-8") as fout: 402 | for sent in full_setE_list: 403 | first_index = sent[0][1] 404 | last_index = sent[-1][1] 405 | for s, e, doc_id, w in sent: 406 | exist, tag = _check_predicted(w, s, e, doc_id, first_index, last_index) 407 | fout.write(w + "\tNNP\tNP\t" + tag + "\n") 408 | if exist: 409 | num_preded += 1 410 | fout.write("\n") 411 | lines += 1 412 | if lines % 1000 == 0: 413 | print("Converted %d lines to conll!" 
% lines) 414 | assert num_preded >= len(prediction_dict) 415 | 416 | # based on ngram frequency 417 | if __name__ == "__main__": 418 | author_list = "./debug/set012E_author.txt" 419 | author_list = "/home/chuntinz/LORELEI_NER/datasets/post_data/tig/set012E_author.txt" 420 | 421 | setE_conll = "../datasets/setE/tig/setE.conll" 422 | pred = "./debug/pred.conll" 423 | pred = "../eval/ensemble3_59df10_darpa_output.conll" 424 | # pred = "./post_test.txt" 425 | setE_conll = "../new_datasets/setE/tig/setE.conll" 426 | pred = "./debug/ensemble_67.conll" 427 | 428 | # lookup_file = {"Gen": "../eval/oromo/Oromo_Annotated.txt"} 429 | output_file = "post_output_67.txt" 430 | gold_file_path = "../ner_score/tir_setE_edl.tac" 431 | f_conll_out = "post_output_67.conll" 432 | 433 | post_processing(pred, setE_conll, author_list, output_file, lookup_files=None, label_propagate=True, 434 | gold_file_path=gold_file_path, conf_num=2, most_freq_num=100, fout_conll_name=f_conll_out) 435 | # post_process_lookup(pred, setE_conll, author_list, output_file, lookup_file) 436 | 437 | import os 438 | 439 | score_file = "../ner_score/score_tir.sh" 440 | fout_name_before = "./before_score.txt" 441 | fout_name = "./score.txt" 442 | os.system("bash %s %s %s" % (score_file, output_file, fout_name)) 443 | os.system("bash %s %s %s" % (score_file, pred, fout_name_before)) 444 | print open(fout_name).read() 445 | 446 | -------------------------------------------------------------------------------- /utils/segnerfts/README.md: -------------------------------------------------------------------------------- 1 | # Documentation for `segnerfts.py` 2 | 3 | The module `segnerfts` defines NER indicator feature extractors for the following languages: 4 | 5 | | Language | ISO 639-3 | 6 | |----------|-----------| 7 | | Amharic | amh | 8 | | English | eng | 9 | | German | deu | 10 | | Oromo | orm | 11 | | Somali | som | 12 | | Tigrinya | tir | 13 | 14 | ## Dependencies 15 | 16 | This code requires the `unicodecsv` package. 17 | 18 | ## Usage 19 | 20 | The function `extract` takes as arguments the ISO 639-3 code and a list of tokens (ideally, a sentence) and returns a list of feature-value lists, one for each token in the input. 21 | 22 | ```python 23 | >>> import segnerfts 24 | >>> segnerfts.extract('deu', u'Vereinigten Arabischen Republik'.split()) 25 | [[1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0]] 26 | ``` 27 | 28 | The functions `extract_type_level` and `extract_token_level` take arguments of the same types but return only type-level and token-level features, respectively. The function `extract_gaz_features` takes arguments of the same type and returns only the gazetteer features. 29 | 30 | ## Features 31 | 32 | The type-level feature extractors are functions. The token-level feature extractors are dictionaries mapping ISO 639-3 codes to functions. 33 | 34 | ### Type-Level Features 35 | 36 | * `ex_capitalized`: is the first character of the token upper-case? 37 | * `ex_all_uppercased`: are all characters of the token upper-case? 38 | * `ex_mixed_case`: among the non-initial characters, are there both upper case and lower case characters? 39 | * `ex_internal_period`: does the token include a period (full stop) that is non-initial and non-final? 40 | * `ex_non_letter`: does the token include a character that is not a letter and not a mark (according to Unicode definitions)?
41 | * `ex_digits`: does the token contain digits? 42 | * `ex_long_token`: is the token longer than a threshold (default=8 characters)? 43 | * `ex_contains_latin`: does the token include Latin characters? 44 | * `ex_contains_ethiopic`: does the token include Ethiopic characters? 45 | 46 | ### Token-Level Features 47 | 48 | * `ex_title`: is the preceding token a title? Note that in Somali, titles are not used before personal names. 49 | * `ex_head_org`: is the token a head word for an organization? 50 | * `ex_head_loc`: is the token a head word for a location or does it include such a word? 51 | * `ex_head_gpe`: is the token a head word for a geopolitical entity or does it include such a word? 52 | * `ex_prep_from`: is the token, or does the token include, a preposition meaning 'from'? 53 | * `ex_prep_in`: is the token, or does the token include, a preposition meaning 'in'? 54 | 55 | ### Gazetteer Features 56 | 57 | * `ex_b_gaz, LOC`: token is first token of LOC in gazetteer 58 | * `ex_b_gaz, GPE`: token is first token of GPE in gazetteer 59 | * `ex_b_gaz, ORG`: token is first token of ORG in gazetteer 60 | * `ex_b_gaz, PER`: token is first token of PER in gazetteer 61 | * `ex_i_gaz, LOC`: token is non-initial token of LOC in gazetteer 62 | * `ex_i_gaz, GPE`: token is non-initial token of GPE in gazetteer 63 | * `ex_i_gaz, ORG`: token is non-initial token of ORG in gazetteer 64 | * `ex_i_gaz, PER`: token is non-initial token of PER in gazetteer 65 | * `ex_o_gaz`: token is not in a gazetteer entry 66 | -------------------------------------------------------------------------------- /utils/segnerfts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/cmu-ner/d35d57fe453d81cc98e3ee55bac58f9ca618f59b/utils/segnerfts/__init__.py -------------------------------------------------------------------------------- /utils/segnerfts_2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import print_function 4 | 5 | import regex as re 6 | import unicodecsv as csv 7 | import copy 8 | from functools import partial 9 | 10 | 11 | def find_ngrams(input_list, n): 12 | return zip(*[input_list[i:] for i in range(n)]) 13 | 14 | 15 | def get_variants(raw): 16 | raw = raw.replace('; ', ';') 17 | return [tuple(v.split()) for v in raw.split(';')] 18 | 19 | 20 | def load_gaz(gaz_fn): 21 | template = {'GPE': [], 'LOC': [], 'ORG': [], 'PER': []} 22 | gaz = { 23 | 'amh': copy.copy(template), 24 | 'eng': copy.copy(template), 25 | 'deu': copy.copy(template), 26 | 'orm': copy.copy(template), 27 | 'som': copy.copy(template), 28 | 'tir': copy.copy(template), 29 | } 30 | with open(gaz_fn, 'rb') as f: 31 | reader = csv.reader(f, encoding='utf-8') 32 | next(reader) 33 | for fields in reader: 34 | eng, lab, tir, tir_ipa, orm, orm_ipa, wik, id_, _ = fields 35 | if not lab: 36 | if len(eng.split()) == 1: 37 | lab = 'GPE' 38 | if tir and lab: 39 | for v in get_variants(tir): 40 | gaz['tir'][lab].append(v) 41 | if orm and lab: 42 | for v in get_variants(orm): 43 | gaz['orm'][lab].append(v) 44 | return gaz 45 | 46 | 47 | gazetteer = load_gaz('../utils/gaz.csv') 48 | 49 | 50 | def ex_b_gaz(segment, language=None, label=None): 51 | fts = list(map(lambda x: False, segment)) 52 | for entry in gazetteer[language][label]: 53 | ngrams = find_ngrams(segment, len(entry)) 54 | for i, ngram in enumerate(ngrams): 55 | if entry == ngram: 56 | fts[i] = True 57 
| return fts 58 | 59 | 60 | def ex_i_gaz(segment, language=None, label=None): 61 | fts = list(map(lambda x: False, segment)) 62 | for entry in gazetteer[language][label]: 63 | ngrams = find_ngrams(segment, len(entry)) 64 | for i, ngram in enumerate(ngrams): 65 | if entry == ngram: 66 | for j in range(len(ngram) - 1): 67 | fts[i + j + 1] = True 68 | return fts 69 | 70 | 71 | def ex_o_gaz(segment, language=None): 72 | fts = list(map(lambda x: True, segment)) 73 | for label in gazetteer[language].keys(): 74 | for entry in gazetteer[language][label]: 75 | ngrams = find_ngrams(segment, len(entry)) 76 | for i, ngram in enumerate(ngrams): 77 | if entry == ngram: 78 | for j in range(len(ngram)): 79 | fts[i + j] = False 80 | return fts 81 | 82 | 83 | LONG_TOKEN_THRESH = 8 84 | 85 | 86 | def ex_capitalized(ws): 87 | return [w[0].isupper() for w in ws] 88 | 89 | 90 | def ex_all_uppercased(ws): 91 | return [all(x.isupper() for x in w) for w in ws] 92 | 93 | 94 | def ex_mixed_case(ws): 95 | def mixed_case(w): 96 | noninit = [x.isupper() for x in w[1:]] 97 | return True in noninit and False in noninit 98 | return map(mixed_case, ws) 99 | 100 | 101 | def ex_internal_period(ws): 102 | return [len(w) > 2 and '.' in w[1:-1] for w in ws] 103 | 104 | 105 | def ex_non_letter(ws): 106 | return [bool(re.search(r'[^\p{Letter}\p{Mark}]', w)) for w in ws] 107 | 108 | 109 | def ex_digits(ws): 110 | return [bool(re.search(r'[\p{Number}]', w)) for w in ws] 111 | 112 | 113 | def ex_long_token(ws): 114 | return [len(w) > LONG_TOKEN_THRESH for w in ws] 115 | 116 | 117 | def ex_contains_latin(ws): 118 | return [bool(re.search(r'\p{Latin}', w)) for w in ws] 119 | 120 | 121 | def ex_contains_ethiopic(ws): 122 | return [bool(re.search(r'\p{Ethiopic}', w)) for w in ws] 123 | 124 | 125 | ex_title = { 126 | 'eng': lambda ws: [w in { 127 | 'Mister', 128 | 'Mr.', 129 | 'Mr', 130 | 'Misses', 131 | 'Mrs.', 132 | 'Mrs', 133 | 'Miss', 134 | 'Ms.', 135 | 'Ms', 136 | 'Doctor', 137 | 'Dr.', 138 | 'Dr', 139 | 'Professor', 140 | 'Prof.', 141 | 'Prof', 142 | 'Father', 143 | 'Fr.', 144 | 'Fr', 145 | 'Reverend', 146 | 'Rev.', 147 | 'Rev', 148 | 'Revd', 149 | 'Pastor', 150 | 'Bishop', 151 | 'Bp.', 152 | 'Bp', 153 | 'President', 154 | 'Pres.', 155 | 'Representative', 156 | 'Rep.', 157 | 'Rep', 158 | 'Congressman', 159 | 'Congresswoman', 160 | 'Congressperson', 161 | 'Senator', 162 | 'Sen.', 163 | 'Sen', 164 | 'Secretary', 165 | 'Sec.', 166 | 'Sec', 167 | 'Lord', 168 | 'Lady', 169 | 'Justice', 170 | 'Sheriff', 171 | 'Principal', 172 | 'Mayor', 173 | } for w in ws], 174 | 'deu': lambda ws: [w in { 175 | 'Herr', 176 | 'Hr.', 177 | 'Frau', 178 | 'Fr.', 179 | 'Fraulein', 180 | 'Frl.', 181 | 'Doktor', 182 | 'Dr.', 183 | 'Dr.med.', 184 | 'Dr.phil.', 185 | 'Dr.rer.nat.', 186 | 'Dr.jur.', 187 | 'Dr.theol.', 188 | 'Professor', 189 | 'Prof.', 190 | 'a.o.Prof.', 191 | 'o.Pr.', 192 | 'Dozent', 193 | 'Doz.', 194 | 'Richter', 195 | 'Senator', 196 | 'Sen.', 197 | 'Ministerpräsident', 198 | 'Ministerpräsidentin', 199 | 'Bürgermeister', 200 | 'Abgeordenete', 201 | 'Abg.', 202 | 'Bundeskanzler', 203 | 'Landeshauptmann', 204 | 'Kaiser', 205 | 'Kaiserin', 206 | 'König', 207 | 'Königin', 208 | 'Kurfürst', 209 | 'Kurfürstin', 210 | 'Erzherzog', 211 | 'Erzherzogin', 212 | 'Großherzog', 213 | 'Großherzogin', 214 | 'Großfürst', 215 | 'Großfürstin', 216 | 'Herzog', 217 | 'Herzogin', 218 | 'Pfalzgraf', 219 | 'Pfalzgräfin', 220 | 'Markgraf', 221 | 'Markgräfin', 222 | 'Landgraf', 223 | 'Landgräfin', 224 | 'Reichsfürst', 225 | 'Reichsfürstin', 226 | 'Reichsgraf', 227 | 
'Reichsgräfin', 228 | 'Burggraf', 229 | 'Burggräfin', 230 | 'Altgraf', 231 | 'Altgräfin', 232 | 'Reichsfreiherr', 233 | 'Reichsfreifrau', 234 | 'Reichsfreiin', 235 | 'Reichsritter', 236 | 'Ritter', 237 | 'Graf', 238 | 'Gräfin', 239 | 'Edler', 240 | 'Edle', 241 | 'Freifrau', 242 | 'Frfr.', 243 | 'Freiherr', 244 | 'Frhr.', 245 | 'Hochwürden', 246 | 'Pater', 247 | 'Pfarrer', 248 | 'Pastor', 249 | 'P.', 250 | 'Pfarrhelfer', 251 | 'Kaplan', 252 | 'Vikar', 253 | 'Dekan', 254 | 'Bischof', 255 | 'Kapitän', 256 | 'Kpt.', 257 | 'Leutnant', 258 | 'Lt.', 259 | 'Vorsitzender', 260 | 'Vors.', 261 | } for w in ws], 262 | 'amh': lambda ws: [w in { 263 | 'አቶ', # Mr. 264 | 'ወይዘሮ', 265 | 'ወይዘሪት', 266 | 'ፕሮፌሰር', 267 | 'ፕሬዚዳንት', 268 | 'ፐሬዝዳንት', 269 | 'ፕሬዝዳንት', 270 | 'ኮለኔል', 271 | 'ጄኔራል', 272 | 'አቡነ', 273 | 'ቀስ', 274 | 'ሰላም', 275 | 'ሼኽ', 276 | 'ራስ', 277 | 'ቢትወደድ', 278 | 'ወ/ሮ', 279 | 'ወ/ሪት', 280 | 'ድ/ር', 281 | 'ፕ/ር', 282 | 'ፕ/ት', 283 | 'ኮ/ል', 284 | 'ጄ/ል', 285 | 'ሼኽ', 286 | 'ራስ', 287 | 'ቢትወደድ', 288 | 'አዛዥና', 289 | 'ልዑል', 290 | 'ሚኒስቴር', 291 | 'ዕድሜው', 292 | 'ወታደር', 293 | 'ም/ል', 294 | 'ጸሃፊ', 295 | 'ረዳት', 296 | 'ጸሐፊ', 297 | 'አምባሳደር', 298 | 'አስተዳዳሪ', 299 | 'ሪፖርተራችን', 300 | } for w in ws], 301 | 'orm': lambda ws: [w.lower() in { 302 | 'obbo', # Mister 303 | 'obboo', # Mister 304 | 'obo', # Mister 305 | 'abbaa', # Father 306 | 'aba', # Father 307 | 'ministeeraa', # Minister 308 | 'durataa\'aa', # President 309 | 'jeneraal', # General 310 | } for w in ws], 311 | 'tir': lambda ws: [w in { 312 | 'ኣቶ', # Mister_1 313 | 'ጐይታይ', # Mister_2 314 | 'ሓላፊ', # President_1 315 | 'ሓለቓ', # President_2 316 | 'ወዘተ', # President_3 317 | 'ፕረሲደንት', # President_4 318 | 'ፕሬዝዳንት', # President_5 319 | 'ኣቦ', # Father 320 | } for w in ws], 321 | 'som': lambda ws: [w in {} for w in ws], 322 | } 323 | 324 | 325 | ex_head_org = { 326 | 'eng': lambda ws: [w in { 327 | 'Ministry', 328 | 'Department', 329 | 'Agency', 330 | 'Bureau', 331 | 'Company', 332 | 'Corporation', 333 | 'Inc.', 334 | 'Inc', 335 | 'Corp.', 336 | 'Corp', 337 | 'Authority', 338 | 'Organization', 339 | 'Organisation', 340 | 'Committee', 341 | 'Bank', 342 | } for w in ws], 343 | 'deu': lambda ws: [w in { 344 | 'Amt', 345 | 'Ministerium', 346 | 'Agentur', 347 | 'Büro', 348 | 'Organisation', 349 | 'Abteilung', 350 | 'Abt.', 351 | 'Aktiengesellschaft', 352 | 'AG', 353 | 'Union', 354 | 'Genossenschaft', 355 | 'Gen.', 356 | 'Gesellschaft', 357 | 'GmbH', 358 | 'HTL', 359 | 'Regierung', 360 | 'Verband', 361 | 'Kommission', 362 | 'Bank', 363 | } for w in ws], 364 | 'amh': lambda ws: [w in { 365 | 'ሚኒስቴር', 366 | 'ኤጀንሲ', 367 | 'ኮሚሽን', 368 | 'ኮርፖሬሽን', # corporation 369 | 'ድርጅት', 370 | 'ባለሥልጣን', 371 | 'ባንክ', 372 | 'ቢሮ', 373 | 'ኮሚቴ', 374 | 'ኮርፖሬሽን', 375 | 'ምንጮች', 376 | 'ፓርቲ', # party 377 | 'ፓርቲን', # party_2 378 | 'ጋዜጣ', # newpaper 379 | } for w in ws], 380 | 'orm': lambda ws: [w.lower() in { 381 | 'ministirii', # Ministry 382 | 'ministiri', 383 | 'damiyyaa', # Department 384 | 'damiyya', 385 | 'wakkiila', # Agency 386 | 'wakila', 387 | 'dhaabbata', # Organization 388 | 'dhabata', 389 | 'koree', # Committee 390 | 'kore', 391 | 'baankii', # Bank 392 | 'banki', 393 | 'waldaa', # Society 394 | 'walda', 395 | 'waraanni', # Front 396 | 'warnani', 397 | } for w in ws], 398 | 'tir': lambda ws: [w in { 399 | 'ክፍሊ', # Department_1 400 | 'ጨንፈር', # Department_2 401 | 'ዋኒን', # Agency_1 402 | 'ተግባር', # Agency_2 403 | 'ስርሒት', # Agency_3 404 | 'ኤጄንሲ', # Agency_4 405 | 'ሰደቓ', # Bureau 406 | 'ኮርፖረሽን', # Corporation 407 | 'ውድብ', # Organization_1 408 | 'ኣወዳድባ', # Organization_2 409 | 'ኣመሰራርታ', # Organization_3 410 | 
'ኮመት', # Committee_1 411 | 'ሽማግለ', # Committee_2 412 | 'ሰራዊት', # Army 413 | 'ስርዓት', # Regime 414 | } for w in ws], 415 | 'som': lambda ws: [w.lower() in { 416 | 'dowladda', # government 417 | 'maamulka', # administration 418 | 'xafiiska', # office 419 | 'wasaaradda', # ministry 420 | 'hay\'adda', # agency 421 | 'shirkadda', # corporation 422 | 'saacadaha', # organization 423 | 'guddi', # board 424 | 'bankiga', # bank 425 | 'ciidamada', # army 426 | 'kooxda', # faction 427 | 'shabakada', # network 428 | } for w in ws], 429 | } 430 | 431 | 432 | ex_head_loc = { 433 | 'eng': lambda ws: [w in { 434 | 'Island', 435 | 'Lake', 436 | 'River', 437 | 'Sea', 438 | 'Ocean', 439 | 'Mountain', 440 | 'Mountains', 441 | 'Valley', 442 | 'Bay', 443 | 'Mosque', 444 | 'Cathedral', 445 | 'Church', 446 | } for w in ws], 447 | 'deu': lambda ws: [any([ 448 | re.search('[Bb]erg$', w), 449 | re.search('[Gg]ebirge$', w), 450 | re.search('[Ss]ee$', w), 451 | re.search('[Mm]eer$', w), 452 | re.search('[Oo]zean$', w), 453 | re.search('[Tt]al$', w), 454 | re.search('wald$', w), 455 | re.search('[Bb]ucht$', w), 456 | re.search('[Kk]irche$', w), 457 | re.search('[Mm]oschee$', w), 458 | ]) for w in ws], 459 | 'amh': lambda ws: [w in { 460 | 'ደሴት', 461 | 'ሐይክ', 462 | 'ወንዝ', 463 | 'ባህር', 464 | 'ወቅያኖስ', 465 | 'ተራራ', 466 | 'ሸለቆ', 467 | 'ሰፈር', 468 | 'ወሽመጥ', 469 | 'መስጊድ', 470 | 'ሀገር', 471 | 'ሆስፒታል', # hospital 472 | } for w in ws], 473 | 'orm': lambda ws: [w.lower() in { 474 | 'odoola', # Island 475 | 'odola', 476 | 'odoolota', # Islands 477 | 'odolota', 478 | 'calalaqa', # Lake_1 479 | 'dabbal', # Lake_2 480 | 'dabal', 481 | 'hara', # Lake_3 482 | 'laaqii', # Lake_4 483 | 'laqi', 484 | 'lagaa', # River 485 | 'laga', 486 | 'garba', # Sea 487 | 'maanya', # Ocean 488 | 'manya', 489 | 'gooroo', # Mountains 490 | 'goro', 491 | 'gaara', # Mountain 492 | 'sulula', # Valley 493 | 'bataskaana', # Church 494 | 'masqiida', # Mosque 495 | } for w in ws], 496 | 'tir': lambda ws: [w in { 497 | 'ደሴት', # Island_1 498 | 'ግሉል', # Island_2 499 | 'ብሕቱው', # Island_3 500 | 'ቀላይ', # Lake_1 501 | 'ወይናይ', # Lake_2 502 | 'ፈለግ', # River 503 | 'ባሕሪ', # Sea 504 | 'ሰፊሕ', # Ocean 505 | 'ጎቦ', # Mountain_1 506 | 'እምባ', # Mountain_2 507 | 'ሩባ', # Valley_1 508 | 'ለሰ', # Valley_2 509 | 'ሕሉም', # Valley_3 510 | 'ስንጭሮ', # Valley_4 511 | 'በተኽስያን', # Church 512 | 'መስጊድ', # Mosque 513 | } for w in ws], 514 | 'som': lambda ws: [w.lower() in { 515 | 'jasiirad', # island 516 | 'harada', # lake 517 | 'buurta', # mountain 518 | 'dooxada', # valley 519 | 'badweynta', # ocean 520 | 'webiga', # river 521 | 'masaajid', # mosque 522 | 'hoteel', # hotel 523 | 'hotelka', # hotel 524 | 'hotel', # hotel 525 | 'degmada', # district 526 | 'deegaanka', # district 527 | } for w in ws], 528 | } 529 | 530 | 531 | ex_head_gpe = { 532 | 'eng': lambda ws: [w in { 533 | 'District', 534 | 'Zone', 535 | 'Region', 536 | 'Province', 537 | 'Division', 538 | 'Republic', 539 | 'Nation', 540 | 'City', 541 | 'Town', 542 | 'Village', 543 | 'State', 544 | } for w in ws], 545 | 'deu': lambda ws: [any([ 546 | re.search('[rR]epublik$', w), 547 | re.search('land$', w), 548 | re.search('stan$', w), 549 | re.search('[sS]tadt$', w), 550 | re.search('heim$', w), 551 | re.search('dorf$', w), 552 | re.search('hausen$', w), 553 | re.search('burg$', w), 554 | re.search('berg$', w), 555 | re.search('gau$', w), 556 | re.search('[pP]rovinz$', w) 557 | ]) for w in ws], 558 | 'amh': lambda ws: [w in { 559 | 'ከተማ', 560 | 'መንደር', 561 | 'ቀበሌ', 562 | 'ወረዳ', 563 | 'ዞን', 564 | 'ክልል', 565 | 'አውራጃ', 566 | 'መንግስት', 567 | 'ክፍላት', 
568 | 'ጦር', 569 | 'ዙሪያ', 570 | 'ላይ', 571 | 'ተከማ', # town 572 | } for w in ws], 573 | 'orm': lambda ws: [w.lower() in { 574 | 'koonyaa', # District_1 575 | 'konya', 576 | 'aanaa', # District_2 577 | 'ana', 578 | 'goltaa', # Zone_1 579 | 'golta', 580 | 'godina', # Zone_2 581 | 'naannoo', # Region 582 | 'nano', 583 | 'jamuriyaa', # Republic_1 584 | 'jamuriya', 585 | 'republika', # Republic_2 586 | 'magaalaa', # City 587 | 'magala', 588 | 'magaalaan', 589 | 'magalan', 590 | 'daabbaa', # Town 591 | 'daba', 592 | 'dira', # Big Town 593 | 'gandaa', # Village 594 | 'ganda', 595 | 'mootummaa', 596 | 'motuma', 597 | } for w in ws], 598 | 'tir': lambda ws: [w in { 599 | 'ወረዳ', # District 600 | 'ዞባ', # Zone 601 | 'ከተማ', # City 602 | 'ዞና', # Region 603 | 'መንግስቲ', # State 604 | 'ኣውራጃ', # Prefecture/Province 605 | 'ረፑብሊክ', # Republic 606 | 'ከተማ', # City 607 | 'ገጠር', # Village_1 608 | 'ቁሸት', # Village_2 609 | 'ዓዲ', # Village_3 610 | } for w in ws], 611 | 'som': lambda ws: [w.lower() in { 612 | 'dalka', # country 613 | 'dalalka', # country 614 | 'gobolka', # province, state 615 | 'magaalada', # city 616 | 'tuulo', # village 617 | 'jamhuuriyadda', # republic 618 | } for w in ws], 619 | } 620 | 621 | 622 | ex_prep_from = { 623 | 'eng': lambda ws: [w.lower() == 'from' for w in ws], 624 | 'deu': lambda ws: [w.lower() in {'von', 'vom'} for w in ws], 625 | 'amh': lambda ws: [bool(re.match('ከ', w)) for w in ws], 626 | 'orm': lambda ws: [w.lower() in {'irraa', 'ira'} for w in ws], 627 | 'tir': lambda ws: [w in {'ካብ'} for w in ws], 628 | 'som': lambda ws: [w in {'ilaa'} for w in ws], 629 | } 630 | 631 | 632 | ex_prep_in = { 633 | 'eng': lambda ws: [w.lower() == 'in' for w in ws], 634 | 'deu': lambda ws: [w.lower() in {'in', 'im'} for w in ws], 635 | 'amh': lambda ws: [bool(re.match('በ', w)) for w in ws], 636 | 'orm': lambda ws: [w.lower() in {'keessa', 'kesa', 'itti', 'iti'} for w in ws], 637 | 'tir': lambda ws: [w in {'ኣብ'} for w in ws], 638 | 'som': lambda ws: [w in {'ee'} for w in ws], 639 | } 640 | 641 | 642 | extractors = [ 643 | lambda lang: ex_capitalized, 644 | lambda lang: ex_all_uppercased, 645 | lambda lang: ex_mixed_case, 646 | lambda lang: ex_internal_period, 647 | lambda lang: ex_non_letter, 648 | lambda lang: ex_digits, 649 | lambda lang: ex_long_token, 650 | lambda lang: ex_contains_latin, 651 | lambda lang: ex_contains_ethiopic, 652 | lambda lang: ex_title[lang], 653 | lambda lang: ex_head_org[lang], 654 | lambda lang: ex_head_loc[lang], 655 | lambda lang: ex_head_gpe[lang], 656 | lambda lang: ex_prep_from[lang], 657 | lambda lang: ex_prep_in[lang], 658 | lambda lang: partial(ex_b_gaz, language=lang, label='GPE'), 659 | lambda lang: partial(ex_b_gaz, language=lang, label='LOC'), 660 | lambda lang: partial(ex_b_gaz, language=lang, label='ORG'), 661 | lambda lang: partial(ex_b_gaz, language=lang, label='PER'), 662 | lambda lang: partial(ex_i_gaz, language=lang, label='GPE'), 663 | lambda lang: partial(ex_i_gaz, language=lang, label='LOC'), 664 | lambda lang: partial(ex_i_gaz, language=lang, label='ORG'), 665 | lambda lang: partial(ex_i_gaz, language=lang, label='PER'), 666 | lambda lang: partial(ex_o_gaz, language=lang), 667 | ] 668 | 669 | 670 | TYPE_START, TYPE_END = 0, 9 671 | TOKEN_START, TOKEN_END = 9, 15 672 | GAZ_START, GAZ_END = 15, 24 673 | 674 | 675 | def fake_extract(lang, seg): 676 | fts = [ex(lang)(seg) for ex in extractors] 677 | return fts 678 | 679 | 680 | def extract(lang, seg): 681 | fts = zip(*[ex(lang)(seg) for ex in extractors]) 682 | return [list(map(int, f)) for f in fts] 683 | 
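# --- Editor's note (illustrative sketch, not part of the original module): with the 24 extractors listed
# above, extract(lang, seg) returns one 0/1 vector of length 24 per token, laid out as
#   [0:9)   type-level features  (TYPE_START:TYPE_END)
#   [9:15)  token-level features (TOKEN_START:TOKEN_END)
#   [15:24) gazetteer features   (GAZ_START:GAZ_END)
# so, e.g., [v[GAZ_START:GAZ_END] for v in extract('tir', seg)] gives the same result as the
# extract_gaz_features('tir', seg) helper defined below. ---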
684 | 685 | def extract_type_level(lang, seg): 686 | fts = extract(lang, seg) 687 | return [v[TYPE_START:TYPE_END] for v in fts] 688 | 689 | 690 | def extract_token_level(lang, seg): 691 | fts = extract(lang, seg) 692 | return [v[TOKEN_START:TOKEN_END] for v in fts] 693 | 694 | 695 | def extract_gaz_features(lang, seg): 696 | fts = extract(lang, seg) 697 | return [v[GAZ_START:GAZ_END] for v in fts] 698 | 699 | 700 | def extract_type_token_level(lang, seg): 701 | fts = extract(lang, seg) 702 | return [v[TYPE_START:TOKEN_END] for v in fts] 703 | 704 | if __name__ == "__main__": 705 | seg = [u'\u121d\u12dd\u1263\u12d5', u'\u12a3\u12e8\u122d', u'-', u'\u12f6\u1265', u'\u12a3\u120d\u1266', u'\u12c8\u1325\u122a', u'\u12d3\u1208\u121d'] 706 | b = extract("tir", seg) 707 | print(b) 708 | # a = extract_gaz_features("tir", seg) 709 | # print(a) 710 | -------------------------------------------------------------------------------- /utils/split_train_ensemble.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import codecs 3 | from random import shuffle 4 | 5 | 6 | def split(path, write_to, split_num): 7 | tot_data = [] 8 | 9 | with codecs.open(path, "r", "utf-8") as fin: 10 | one_sent = [] 11 | for line in fin: 12 | if line.strip() == "": 13 | if len(one_sent) > 0: 14 | tot_data.append(one_sent) 15 | one_sent = [] 16 | else: 17 | one_sent.append(line.strip()) 18 | if len(one_sent) > 0: 19 | tot_data.append(one_sent) 20 | 21 | shuffle(tot_data) 22 | 23 | # Chunk size via integer division; the final boundary below absorbs any remainder. 24 | divs = len(tot_data) // split_num 25 | splits = [k * divs for k in range(split_num)] + [len(tot_data)] 26 | for i in range(split_num): 27 | with codecs.open(write_to + "cp3_train_ens_" + str(i) + ".conll", "w", "utf-8") as fout: 28 | for j in range(splits[i], splits[i+1]): 29 | for line in tot_data[j]: 30 | fout.write(line + "\n") 31 | fout.write("\n") 32 | 33 | if __name__ == "__main__": 34 | # Usage: python split_train_ensemble.py ../datasets/cp3/oromo/cp3_train.conll ../datasets/cp3/oromo/ 5 35 | fname = sys.argv[1] 36 | write_to_folder = sys.argv[2] 37 | split_num = int(sys.argv[3]) 38 | 39 | split(fname, write_to_folder, split_num) -------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | import dynet as dy 3 | import numpy as np 4 | from collections import defaultdict 5 | import gzip 6 | import cPickle as pkl 7 | import codecs 8 | import math 9 | import random 10 | from random import shuffle 11 | random.seed(448) 12 | np.random.seed(1) 13 | import operator 14 | import re 15 | MAX_CHAR_LENGTH = 45 16 | 17 | # Regular expressions used to normalize digits. 18 | DIGIT_RE = re.compile(br"\d") 19 | 20 | # word = utils.DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0] 21 | 22 | 23 | def iob2(tags): 24 | """ 25 | Check that tags have a valid IOB format. 26 | Tags in IOB1 format are converted to IOB2.
27 | """ 28 | for i, tag in enumerate(tags): 29 | if tag == 'O': 30 | continue 31 | split = tag.split('-') 32 | if len(split) != 2 or split[0] not in ['I', 'B']: 33 | return False 34 | if split[0] == 'B': 35 | continue 36 | elif i == 0 or tags[i - 1] == 'O': # conversion IOB1 to IOB2 37 | tags[i] = 'B' + tag[1:] 38 | elif tags[i - 1][1:] == tag[1:]: 39 | continue 40 | else: # conversion IOB1 to IOB2 41 | tags[i] = 'B' + tag[1:] 42 | return True 43 | 44 | 45 | def get_entity(label): 46 | entities = [] 47 | i = 0 48 | while i < len(label): 49 | if label[i] != 'O': 50 | e_type = label[i][2:] 51 | j = i + 1 52 | while j < len(label) and label[j] == 'I-' + e_type: 53 | j += 1 54 | entities.append((i, j, e_type)) 55 | i = j 56 | else: 57 | i += 1 58 | return entities 59 | 60 | 61 | def evaluate_ner(pred, gold): 62 | tp = 0 63 | fp = 0 64 | fn = 0 65 | for i in range(len(pred)): 66 | pred_entities = get_entity(pred[i]) 67 | gold_entities = get_entity(gold[i]) 68 | temp = 0 69 | for entity in pred_entities: 70 | if entity in gold_entities: 71 | tp += 1 72 | temp += 1 73 | else: 74 | fp += 1 75 | fn += len(gold_entities) - temp 76 | precision = 1.0 * tp / (tp + fp) 77 | recall = 1.0 * tp / (tp + fn) 78 | f1 = 2 * precision * recall / (precision + recall) 79 | return precision, recall, f1 80 | 81 | 82 | def fopen(filename, mode='r'): 83 | if filename.endswith('.gz'): 84 | return gzip.open(filename, mode) 85 | return open(filename, mode) 86 | 87 | 88 | def get_pretrained_emb(path_to_emb, word_to_id, dim): 89 | word_emb = [] 90 | print "Loading pretrained embeddings from %s." % (path_to_emb) 91 | for _ in range(len(word_to_id)): 92 | word_emb.append(np.random.uniform(-math.sqrt(3.0/dim), math.sqrt(3.0/dim), size=dim)) 93 | 94 | print "length of dict: ", len(word_to_id) 95 | pretrain_word_emb = {} 96 | for line in codecs.open(path_to_emb, "r", "utf-8", errors='replace'): 97 | items = line.strip().split() 98 | if len(items) == dim + 1: 99 | try: 100 | pretrain_word_emb[items[0]] = np.asarray(items[1:]).astype(np.float32) 101 | except ValueError: 102 | continue 103 | 104 | not_covered = 0 105 | for word, id in word_to_id.iteritems(): 106 | if word in pretrain_word_emb: 107 | word_emb[id] = pretrain_word_emb[word] 108 | elif word.lower() in pretrain_word_emb: 109 | word_emb[id] = pretrain_word_emb[word.lower()] 110 | else: 111 | not_covered += 1 112 | 113 | emb = np.array(word_emb, dtype=np.float32) 114 | 115 | print "Word number not covered in pretrain embedding: ", not_covered 116 | return emb, word_to_id 117 | 118 | 119 | def pkl_dump(obj, path): 120 | with open(path, "wb") as fout: 121 | pkl.dump(obj, fout) 122 | 123 | 124 | def pkl_load(path): 125 | with open(path, "rb") as fin: 126 | obj = pkl.load(fin) 127 | return obj 128 | 129 | 130 | def log_sum_exp_dim_0(x): 131 | # numerically stable log_sum_exp 132 | dims = x.dim() 133 | max_score = dy.max_dim(x, 0) # (dim_1, batch_size) 134 | if len(dims[0]) == 1: 135 | max_score_extend = max_score 136 | else: 137 | max_score_reshape = dy.reshape(max_score, (1, dims[0][1]), batch_size=dims[1]) 138 | max_score_extend = dy.concatenate([max_score_reshape] * dims[0][0]) 139 | x = x - max_score_extend 140 | exp_x = dy.exp(x) 141 | # (dim_1, batch_size), if no dim_1, return ((1,), batch_size) 142 | log_sum_exp_x = dy.log(dy.mean_dim(exp_x, d=[0], b=False) * dims[0][0]) 143 | return log_sum_exp_x + max_score 144 | 145 | 146 | def data_iterator(data_pair, batch_size): 147 | batches = make_bucket_batches(data_pair, batch_size) 148 | for batch in batches: 149 | yield 
batch 150 | 151 | 152 | def make_bucket_batches(data_collections, batch_size): 153 | # Data are bucketed according to the length of the first item in the data_collections. 154 | buckets = defaultdict(list) 155 | tot_items = len(data_collections[0]) 156 | for data_item in data_collections: 157 | src = data_item[0] 158 | buckets[len(src)].append(data_item) 159 | 160 | batches = [] 161 | # np.random.seed(2) 162 | for src_len in buckets: 163 | bucket = buckets[src_len] 164 | np.random.shuffle(bucket) 165 | 166 | num_batches = int(np.ceil(len(bucket) * 1.0 / batch_size)) 167 | for i in range(num_batches): 168 | cur_batch_size = batch_size if i < num_batches - 1 else len(bucket) - batch_size * i 169 | batches.append([[bucket[i * batch_size + j][k] for j in range(cur_batch_size)] for k in range(tot_items)]) 170 | np.random.shuffle(batches) 171 | return batches 172 | 173 | 174 | def transpose_input(seq, padding_token=0): 175 | # input seq: list of samples [[w1, w2, ..], [w1, w2, ..]] 176 | max_len = max([len(sent) for sent in seq]) 177 | seq_pad = [] 178 | seq_mask = [] 179 | for i in range(max_len): 180 | pad_temp = [sent[i] if i < len(sent) else padding_token for sent in seq] 181 | mask_temp = [1.0 if i < len(sent) else 0.0 for sent in seq] 182 | seq_pad.append(pad_temp) 183 | seq_mask.append(mask_temp) 184 | 185 | return seq_pad, seq_mask 186 | 187 | 188 | def transpose_discrete_features(feature_batch): 189 | # Discrete features are zero-one features 190 | # TODO: Other integer features, create lookup tables 191 | # tgt_batch: [[[feature of word 1 of sent 1], [feature of word 2 of sent 2], ]] 192 | # return: [(feature_num, batchsize)] 193 | max_sent_len = max([len(s) for s in feature_batch]) 194 | feature_num = len(feature_batch[0][0]) 195 | batch_size = len(feature_batch) 196 | features = [] # each: (feature_num, batch_size) 197 | for i in range(max_sent_len): 198 | w_i_feature = [dy.inputTensor(sent[i], batched=True) if i < len(sent) else dy.zeros(feature_num) for sent in feature_batch] 199 | w_i_feature = dy.reshape(dy.concatenate(w_i_feature, d=1), (feature_num,), batch_size=batch_size) 200 | features.append(w_i_feature) 201 | 202 | return features 203 | 204 | 205 | def transpose_and_batch_embs(input_embs, emb_size): 206 | # input_embs: [[w1_emb, w2_emb, ]], embs are dy.expressions 207 | max_len = max(len(sent) for sent in input_embs) 208 | batch_size = len(input_embs) 209 | padded_seq_emb = [] 210 | seq_masks = [] 211 | for i in range(max_len): 212 | w_i_emb = [sent[i] if i < len(sent) else dy.zeros(emb_size) for sent in input_embs] 213 | w_i_emb = dy.reshape(dy.concatenate(w_i_emb, d=1), (emb_size, ), batch_size=batch_size) 214 | w_i_mask = [1.0 if i < len(sent) else 0.0 for sent in input_embs] 215 | padded_seq_emb.append(w_i_emb) 216 | seq_masks.append(w_i_mask) 217 | 218 | return padded_seq_emb, seq_masks 219 | 220 | 221 | def transpose_char_input(tgt_batch, padding_token): 222 | # The tgt_batch may not be padded with and 223 | # tgt_batch: [[[, , ], [, s,h,e, ], 224 | # [, i,s, ], [, p,r,e,t,t,y, ], [, , ]], [[],[],[]]] 225 | max_sent_len = max([len(s) for s in tgt_batch]) 226 | sent_w_batch = [] # each is list of list: max_word_len, batch_size 227 | sent_mask_batch = [] # each is list of list: max_word_len, batch_size 228 | max_w_lens = [] 229 | SOW_PAD = 0 230 | EOW_PAD = 1 231 | EOS_PAD = 2 232 | for i in range(max_sent_len): 233 | max_len_w = max([len(sent[i]) for sent in tgt_batch if i < len(sent)]) 234 | max_w_lens.append(max_len_w) 235 | w_batch = [] 236 | mask_batch = [] 237 | 
for j in range(0, max_len_w): 238 | temp_j_w = [] 239 | for sent in tgt_batch: 240 | if i < len(sent) and j < len(sent[i]): 241 | temp_j_w.append(sent[i][j]) 242 | elif i >= len(sent): 243 | if j == 0: 244 | temp_j_w.append(SOW_PAD) 245 | elif j == max_len_w - 1: 246 | temp_j_w.append(EOW_PAD) 247 | else: 248 | temp_j_w.append(EOS_PAD) 249 | else: 250 | temp_j_w.append(EOW_PAD) 251 | # w_batch = [sent[i][j] if i < len(sent) and j < len(sent[i]) else self.EOW for sent in tgt_batch] 252 | # print "temp: ", temp_j_w 253 | w_batch.append(temp_j_w) 254 | mask_batch.append([1. if i < len(sent) and j < len(sent[i]) else 0.0 for sent in tgt_batch]) 255 | sent_w_batch.append(w_batch) 256 | sent_mask_batch.append(mask_batch) 257 | return sent_w_batch, sent_mask_batch, max_sent_len, max_w_lens 258 | 259 | 260 | if __name__ == "__main__": 261 | # from scipy.misc import logsumexp 262 | # import numpy as np 263 | # 264 | # # a = np.random.rand(3, 4, 2) 265 | # # b = logsumexp(a, axis=0) 266 | # # a_t = dy.inputTensor(a, batched=True) 267 | # # b_t = log_sum_exp_dim_0(a_t) 268 | # # print "numpy " 269 | # # print b 270 | # # print "dynet " 271 | # # print b_t.value(), b_t.dim() 272 | # # print dy.pick_batch_elem(b_t, 1).npvalue() 273 | # 274 | # a = np.random.rand(3, 2) 275 | # b = logsumexp(a, axis=0) 276 | # a_t = dy.inputTensor(a, batched=True) 277 | # b_t = log_sum_exp_dim_0(a_t) 278 | # print "numpy " 279 | # print b 280 | # print "dynet " 281 | # print b_t.value(), b_t.dim() 282 | # print dy.pick_batch_elem(b_t, 1).npvalue() 283 | dim = 100 284 | #9 1000 285 | path_to_emb = "/Users/zct/Downloads/tir1.emb" 286 | # path_to_emb = "../datasets/english/glove.6B/glove.6B.100d.txt" 287 | pretrain_word_emb = {} 288 | i = 1 289 | for line in codecs.open(path_to_emb, "r", 'utf-8', errors='replace'): 290 | items = line.strip().split() 291 | if len(items) == dim + 1: 292 | try: 293 | pretrain_word_emb[items[0]] = np.asarray(items[1:]).astype(np.float32) 294 | except ValueError: 295 | continue 296 | print items[0], i, pretrain_word_emb[items[0]][:3] 297 | i += 1 298 | 299 | # gradient clipping 300 | # turn off the dropout 301 | # use smaller initial lr 302 | # variational dropout --------------------------------------------------------------------------------
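Editor's note: the snippet below is a minimal usage sketch added for illustration and is not part of the repository. It assumes the repository root is on PYTHONPATH and that DyNet is installed (utils/util.py imports dynet at module load); the toy word/tag ids are made up.

# Minimal usage sketch for the batching and evaluation helpers in utils/util.py.
from utils.util import make_bucket_batches, transpose_input, evaluate_ner

# Toy dataset: each item is (word_id_sequence, tag_id_sequence).
data = [([1, 2, 3], [4, 4, 5]), ([6, 7], [5, 4]), ([8, 9, 10], [4, 5, 4])]

for word_seqs, tag_seqs in make_bucket_batches(data, batch_size=2):
    # Sentences are bucketed by length, shuffled, and grouped field-by-field.
    padded, mask = transpose_input(word_seqs, padding_token=0)
    # padded and mask are time-major nested lists of shape (max_len, batch_size).

# Span-level precision/recall/F1 over BIO label sequences.
pred = [['B-PER', 'I-PER', 'O']]
gold = [['B-PER', 'I-PER', 'O']]
print(evaluate_ner(pred, gold))  # -> (1.0, 1.0, 1.0)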