├── .gitignore ├── COPYING ├── README.md ├── commands ├── debug.sh ├── german.sh ├── german_cnn.sh └── random.sh ├── dataloaders ├── __init__.py ├── data_loader.py ├── data_loader_orig.py └── dataloader_unicode.py ├── datasets └── english │ ├── eng.dev.bio.conll │ ├── eng.test.bio.conll │ └── eng.train.bio.conll ├── eval ├── IO2BIO.py ├── IO2BIOES.py ├── conlleval ├── conlleval.v2 ├── eval.sh └── format.py ├── main.py ├── models ├── __init__.py ├── decoders.py ├── encoders.py └── model_builder.py └── utils ├── Convert_Output_Darpa.py ├── Convert_to_darpa_xml.py ├── __init__.py ├── extract_authors.py ├── features.py ├── gaz.csv ├── old_segnerfts.py ├── orm_morph.py ├── orm_norm ├── __init__.py ├── lexicon_supplement.txt ├── morpar_orm.py ├── orm_gaz.txt ├── orm_lexicon.txt ├── orm_lexicon_wikibooks.txt ├── orm_morph.py └── ormnorm.py ├── post_process.py ├── segnerfts ├── README.md ├── __init__.py ├── gaz.csv ├── morpar.py ├── morpar_orm.py ├── orm_morph.py ├── segnerfts.py └── tir_morph.py ├── segnerfts_2.py ├── split_train_ensemble.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # macOS files 2 | **/*.DS_Store 3 | *.pyc 4 | # JetBrains 5 | .idea/ 6 | *.iml 7 | *.txt 8 | datasets/embedding/ 9 | maxma/ 10 | eval/ 11 | 12 | !utils/orm_norm/*.txt 13 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright 2018 cmu-ner team. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cmu-ner 2 | 3 | by Chunting Zhou, Aditi Chaudhary, David Mortenson 4 | (in collaboration w/ Graham Neubig and Jaime Carbonell) 5 | 6 | CMU-NER is a suite of software to detect Named Entities, such as people, locations, geo-political entities, in text in different languages. This software suite is primarily based on deep learning methods. 
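## Quick start

A minimal English training run can be launched from the `commands/` directory. The sketch below simply mirrors `commands/debug.sh` (shown in full further down); it assumes a working DyNet installation and relies on `main.py`'s defaults for anything not listed here (the training/dev/test paths presumably default to the bundled English CoNLL data under `datasets/english`, so adjust them for your setup).

```bash
cd commands
python ../main.py \
    --dynet-seed 3278657 \
    --word_emb_dim 100 \
    --batch_size 10 \
    --model_name "eng" \
    --lang eng \
    --valid_freq 1300
```

`commands/german.sh` shows a fuller configuration (pretrained embeddings, dropout rates, and the character-level BiRNN sizes); it expects a log-file name as its first argument.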
7 | 8 | ## TODO 9 | 10 | * Cross-lingual transfer learning with cheap translation. 11 | * Reproduce the morphology tag features. 12 | * Add IPA transfer learning. 13 | 14 | ## Acknowledgements 15 | 16 | The development of this software has been sponsored by a DARPA-funded project at CMU called AIRES under DARPA's LORELEI initiative. 17 | 18 | ## License 19 | 20 | This software is available under the BSD license (see COPYING for details). 21 | -------------------------------------------------------------------------------- /commands/debug.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python ../main.py \ 3 | --dynet-seed 3278657 \ 4 | --word_emb_dim 100 \ 5 | --batch_size 10 \ 6 | --model_name "eng" \ 7 | --lang eng \ 8 | --valid_freq 1300 9 | 10 | # --pretrain_emb_path ../new_datasets/embs/glove.6B.100d.txt\ 11 | -------------------------------------------------------------------------------- /commands/german.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | MODEL_NAME=$1 3 | python ../main.py \ 4 | --dynet-seed 5783287 \ 5 | --word_emb_dim 64 \ 6 | --batch_size 10 \ 7 | --train_path ../datasets/german/deu.train.utf8.conll \ 8 | --dev_path ../datasets/german/deu.testa.utf8.conll \ 9 | --test_path ../datasets/german/deu.testb.utf8.conll \ 10 | --pretrain_emb_path ../datasets/embs/sskip/ger_emb.txt \ 11 | --emb_dropout_rate 0.0 \ 12 | --output_dropout_rate 0.5 \ 13 | --init_lr 0.01 \ 14 | --model_arc char_birnn \ 15 | --tag_emb_dim 100 \ 16 | --hidden_dim 100 \ 17 | --char_emb_dim 30\ 18 | --char_hidden_dim 25 \ 19 | --lang german \ 20 | --replace_unk_rate 0.5 \ 21 | --valid_freq 1300 2>&1 | tee ${MODEL_NAME} 22 | -------------------------------------------------------------------------------- /commands/german_cnn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash -------------------------------------------------------------------------------- /commands/random.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python ../main.py \ 3 | --dynet-seed 3278657 \ 4 | --word_emb_dim 100 \ 5 | --batch_size 10 \ 6 | --lang eng 7 | -------------------------------------------------------------------------------- /dataloaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/cmu-ner/d35d57fe453d81cc98e3ee55bac58f9ca618f59b/dataloaders/__init__.py -------------------------------------------------------------------------------- /dataloaders/data_loader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | import os 3 | from utils.util import * 4 | from utils.features import * 5 | from utils.segnerfts import orm_morph as ormnorm 6 | 7 | class NER_DataLoader(): 8 | def __init__(self, args, special_normal=False): 9 | # This is the data loader as well as the feature extractor. 10 | '''Data format: id word pos_tag syntactic_tag NER_tag''' 11 | ''' TODO: 1. normalizing all digits 12 | 2.
Using full vocabulary from GloVe, when testing, lower case first''' 13 | self.args = args 14 | if args.train_ensemble: 15 | self.train_path = args.full_data_path 16 | else: 17 | self.train_path = args.train_path 18 | self.test_path = args.test_path 19 | self.dev_path = args.dev_path 20 | self.args = args 21 | 22 | self.tag_vocab_path = self.train_path + ".tag_vocab" 23 | self.word_vocab_path = self.train_path + ".word_vocab" 24 | self.char_vocab_path = self.train_path + ".char_vocab" 25 | 26 | self.pretrained_embedding_path = args.pretrain_emb_path 27 | self.use_discrete_feature = args.use_discrete_features 28 | self.use_brown_cluster = args.use_brown_cluster 29 | self.orm_norm = args.oromo_normalize 30 | self.orm_lower = args.train_lowercase_oromo 31 | 32 | if special_normal: 33 | self.orm_norm = False 34 | self.orm_lower = False 35 | 36 | if self.use_brown_cluster: 37 | self.brown_cluster_dicts = get_brown_cluster(args.brown_cluster_path) 38 | self.brown_cluster_dicts[''] = 499 39 | else: 40 | self.brown_cluster_dicts = None 41 | 42 | if False and os.path.exists(self.tag_vocab_path) and os.path.exists(self.word_vocab_path) and os.path.exists(self.char_vocab_path): 43 | # TODO: encoding? 44 | print("Load vocabs from file ....") 45 | self.tag_to_id = pkl_load(self.tag_vocab_path) 46 | self.word_to_id = pkl_load(self.word_vocab_path) 47 | self.char_to_id = pkl_load(self.char_vocab_path) 48 | print("Done!") 49 | else: 50 | print("Generating vocabs from training file ....") 51 | if not self.args.isLr: 52 | paths_to_read = [self.train_path, self.test_path, self.dev_path] 53 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files(paths_to_read) 54 | else: 55 | paths_to_read = [self.train_path] 56 | setEpaths = [self.dev_path, self.test_path] 57 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files_lr(paths_to_read, setEpaths) 58 | # FIXME: Remember dictionary value for char and word has been shifted by 1 59 | print "Size of vocab before: ", len(self.word_to_id) 60 | self.word_to_id[''] = len(self.word_to_id) + 1 61 | self.char_to_id[''] = len(self.char_to_id) + 1 62 | 63 | self.word_to_id['<\s>'] = 0 64 | self.char_to_id[''] = 0 65 | print "Size of vocab after: ", len(self.word_to_id) 66 | pkl_dump(self.tag_to_id, self.tag_vocab_path) 67 | pkl_dump(self.char_to_id, self.char_vocab_path) 68 | pkl_dump(self.word_to_id, self.word_vocab_path) 69 | 70 | self.word_padding_token = 0 71 | self.char_padding_token = 0 72 | 73 | if self.pretrained_embedding_path is not None: 74 | self.pretrain_word_emb, self.word_to_id = get_pretrained_emb(self.pretrained_embedding_path, 75 | self.word_to_id, args.word_emb_dim) 76 | # for char vocab and word vocab, we reserve id 0 for the eos padding, and len(vocab)-1 for the 77 | self.id_to_tag = {v: k for k, v in self.tag_to_id.iteritems()} 78 | self.id_to_word = {v: k for k, v in self.word_to_id.iteritems()} 79 | self.id_to_char = {v: k for k, v in self.char_to_id.iteritems()} 80 | 81 | self.ner_vocab_size = len(self.id_to_tag) 82 | self.word_vocab_size = len(self.id_to_word) 83 | self.char_vocab_size = len(self.id_to_char) 84 | 85 | print "Size of vocab after: ", len(self.word_to_id) 86 | print("NER tag num=%d, Word vocab size=%d, Char Vocab size=%d" % (self.ner_vocab_size, self.word_vocab_size, self.char_vocab_size)) 87 | 88 | @staticmethod 89 | def exists(path): 90 | return os.path.exists(path) 91 | 92 | def read_one_line(self, line, tag_set, word_dict, char_set): 93 | for w in line: 94 | fields = w.split() 95 | word = fields[0] 96 | 
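# Column convention used by this loader: the first whitespace-separated field of a
# CoNLL row is taken as the surface token and the last field as the NER tag; any
# columns in between (e.g. POS or chunk tags) are ignored.
# e.g. "EU NNP B-NP B-ORG" -> token "EU", NER tag "B-ORG"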
ner_tag = fields[-1] 97 | for c in word: 98 | char_set.add(c) 99 | tag_set.add(ner_tag) 100 | if self.orm_lower: 101 | word = word.lower() 102 | if self.orm_norm: 103 | #word = orm_morph.best_parse(word) 104 | word = ormnorm.normalize(word) 105 | word_dict[word] += 1 106 | 107 | def get_vocab_from_set(self, a_set, shift=0): 108 | vocab = {} 109 | for i, elem in enumerate(a_set): 110 | vocab[elem] = i + shift 111 | 112 | return vocab 113 | 114 | def get_vocab_from_dict(self, a_dict, shift=0, remove_singleton=False): 115 | vocab = {} 116 | i = 0 117 | self.singleton_words = set() 118 | for k, v in a_dict.iteritems(): 119 | if v == 1: 120 | self.singleton_words.add(i + shift) 121 | if remove_singleton: 122 | if v > 1: 123 | # print k, v 124 | vocab[k] = i + shift 125 | i += 1 126 | else: 127 | vocab[k] = i + shift 128 | i += 1 129 | print "Singleton words number: ", len(self.singleton_words) 130 | return vocab 131 | 132 | def read_files(self, paths): 133 | # word_list = [] 134 | # char_list = [] 135 | # tag_list = [] 136 | word_dict = defaultdict(lambda: 0) 137 | char_set = set() 138 | tag_set = set() 139 | 140 | def _read_a_file(path): 141 | with codecs.open(path, "r", "utf-8") as fin: 142 | to_read_line = [] 143 | for line in fin: 144 | if line.strip() == "": 145 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 146 | to_read_line = [] 147 | else: 148 | to_read_line.append(line.strip()) 149 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 150 | 151 | for path in paths: 152 | _read_a_file(path) 153 | 154 | tag_vocab = self.get_vocab_from_set(tag_set) 155 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 156 | char_vocab = self.get_vocab_from_set(char_set, 1) 157 | 158 | return tag_vocab, word_vocab, char_vocab 159 | 160 | def read_files_lr(self, paths, setEpaths): 161 | # word_list = [] 162 | # char_list = [] 163 | # tag_list = [] 164 | word_dict = defaultdict(lambda: 0) 165 | char_set = set() 166 | tag_set = set() 167 | 168 | def _read_a_file(path): 169 | with codecs.open(path, "r", "utf-8") as fin: 170 | to_read_line = [] 171 | for line in fin: 172 | if line.strip() == "": 173 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 174 | to_read_line = [] 175 | else: 176 | to_read_line.append(line.strip()) 177 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 178 | 179 | for path in paths: 180 | _read_a_file(path) 181 | 182 | #reading from SetE 183 | for path in setEpaths: 184 | with codecs.open(path, "r", "utf-8") as fin: 185 | for line in fin: 186 | fields = line.strip().split() 187 | for word in fields: 188 | for c in word: 189 | char_set.add(c) 190 | if self.orm_lower: 191 | word = word.lower() 192 | if self.orm_norm: 193 | #word = orm_morph.best_parse(word) 194 | word = ormnorm.normalize(word) 195 | word_dict[word] += 1 196 | 197 | tag_vocab = self.get_vocab_from_set(tag_set) 198 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 199 | char_vocab = self.get_vocab_from_set(char_set, 1) 200 | 201 | return tag_vocab, word_vocab, char_vocab 202 | 203 | def get_data_set(self, path, lang): 204 | sents = [] 205 | char_sents = [] 206 | tgt_tags = [] 207 | discrete_features = [] 208 | bc_features = [] 209 | 210 | def add_sent(one_sent): 211 | temp_sent = [] 212 | temp_ner = [] 213 | temp_char = [] 214 | temp_bc = [] 215 | for w in one_sent: 216 | fields = w.split() 217 | word = fields[0] 218 | ner_tag = fields[-1] 219 | if self.use_brown_cluster: 220 | 
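# Brown-cluster feature: look up the cluster id of the raw surface token (i.e. before
# the optional Oromo lowercasing/normalization applied below); tokens not covered by
# the cluster file fall back to the placeholder cluster registered in __init__.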
temp_bc.append(self.brown_cluster_dicts[word] if word in self.brown_cluster_dicts else self.brown_cluster_dicts[""]) 221 | 222 | if self.orm_lower: 223 | word = word.lower() 224 | 225 | if self.orm_norm: 226 | #word = orm_morph.best_parse(word) # Not sure whether it would be better adding this line behind or after temp_char 227 | word = ormnorm.normalize(word) 228 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 229 | temp_ner.append(self.tag_to_id[ner_tag]) 230 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 231 | 232 | sents.append(temp_sent) 233 | char_sents.append(temp_char) 234 | tgt_tags.append(temp_ner) 235 | bc_features.append(temp_bc) 236 | if not self.args.isLr: 237 | discrete_features.append([]) 238 | else: 239 | discrete_features.append(get_feature_sent(lang, one_sent, self.args) if self.use_discrete_feature else []) 240 | 241 | # print len(discrete_features[-1]) 242 | 243 | with codecs.open(path, "r", "utf-8") as fin: 244 | i = 0 245 | one_sent = [] 246 | for line in fin: 247 | if line.strip() == "": 248 | if len(one_sent) > 0: 249 | add_sent(one_sent) 250 | i += 1 251 | if i % 1000 == 0: 252 | print("Processed %d training data." % (i,)) 253 | one_sent = [] 254 | else: 255 | one_sent.append(line.strip()) 256 | 257 | if len(one_sent) > 0: 258 | add_sent(one_sent) 259 | 260 | if self.use_discrete_feature: 261 | self.num_feats = len(discrete_features[0][0]) 262 | else: 263 | self.num_feats = 0 264 | return sents, char_sents, tgt_tags, discrete_features, bc_features 265 | 266 | def get_lr_test(self, path, lang): 267 | # setE.txt 268 | sents = [] 269 | char_sents = [] 270 | discrete_features = [] 271 | bc_features = [] 272 | 273 | def add_sent(one_sent): 274 | temp_sent = [] 275 | temp_char = [] 276 | temp_bc = [] 277 | for word in one_sent: 278 | if self.use_brown_cluster: 279 | temp_bc.append(self.brown_cluster_dicts[word] if word in self.brown_cluster_dicts else self.brown_cluster_dicts[""]) 280 | if self.orm_lower: 281 | word = word.lower() 282 | if self.orm_norm: 283 | #word = orm_morph.best_parse(word) # Not sure whether it would be better adding this line behind or after temp_char 284 | word = ormnorm.normalize(word) 285 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 286 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 287 | 288 | sents.append(temp_sent) 289 | char_sents.append(temp_char) 290 | discrete_features.append(get_feature_sent(lang, one_sent, self.args) if self.use_discrete_feature else []) 291 | bc_features.append(temp_bc) 292 | 293 | original_sents = [] 294 | with codecs.open(path, "r", "utf-8") as fin: 295 | i = 0 296 | for line in fin: 297 | one_sent = line.rstrip().split() 298 | if line: 299 | add_sent(one_sent) 300 | original_sents.append(one_sent) 301 | i += 1 302 | if i % 1000 == 0: 303 | print("Processed %d testing data." 
% (i,)) 304 | 305 | if self.use_discrete_feature: 306 | self.num_feats = len(discrete_features[0][0]) 307 | else: 308 | self.num_feats = 0 309 | 310 | return sents, char_sents, discrete_features, original_sents, bc_features 311 | 312 | def get_lr_test_setE(self, path, lang): 313 | # setE.conll 314 | sents = [] 315 | char_sents = [] 316 | discrete_features = [] 317 | bc_features = [] 318 | doc_ids = [] 319 | original_sents = [] 320 | 321 | def add_sent(one_sent): 322 | temp_sent = [] 323 | temp_char = [] 324 | temp_bc = [] 325 | temp_ori_sent = [] 326 | for w in one_sent: 327 | tokens = w.split('\t') 328 | word = tokens[0] 329 | temp_ori_sent.append(word) 330 | docfile = tokens[3] 331 | doc_type = docfile.split('_')[1] 332 | if self.use_brown_cluster: 333 | temp_bc.append(self.brown_cluster_dicts[word] if word in self.brown_cluster_dicts else self.brown_cluster_dicts[""]) 334 | 335 | if self.orm_lower: 336 | word = word.lower() 337 | 338 | if self.orm_norm: 339 | #word = orm_morph.best_parse(word) 340 | word = ormnorm.normalize(word) 341 | 342 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 343 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 344 | 345 | doc_ids.append(docfile.split('_')[1]) 346 | sents.append(temp_sent) 347 | char_sents.append(temp_char) 348 | bc_features.append(temp_bc) 349 | discrete_features.append(get_feature_sent(lang, one_sent, self.args) if self.use_discrete_feature else []) 350 | original_sents.append(temp_ori_sent) 351 | # print len(discrete_features[-1]) 352 | 353 | with codecs.open(path, "r", "utf-8") as fin: 354 | i = 0 355 | one_sent = [] 356 | for line in fin: 357 | if line.strip() == "": 358 | if len(one_sent) > 0: 359 | add_sent(one_sent) 360 | one_sent = [] 361 | else: 362 | one_sent.append(line.strip()) 363 | i += 1 364 | if i % 1000 == 0: 365 | print("Processed %d testing data." 
% (i,)) 366 | 367 | if len(one_sent) > 0: 368 | add_sent(one_sent) 369 | 370 | if self.use_discrete_feature: 371 | self.num_feats = len(discrete_features[0][0]) 372 | else: 373 | self.num_feats = 0 374 | 375 | return sents, char_sents, discrete_features, bc_features, original_sents, doc_ids 376 | 377 | 378 | class Dataloader_Combine(): 379 | def __init__(self, args, normal_vocab, lower_vocab, char_to_id, brown_cluster_dicts=None, lower_brown_dicts=None): 380 | self.word_to_id = normal_vocab 381 | self.lower_word_to_id = lower_vocab 382 | self.args = args 383 | 384 | self.char_to_id = char_to_id 385 | self.brown_cluster_dicts = brown_cluster_dicts 386 | self.lower_brown_dicts = lower_brown_dicts 387 | 388 | self.use_discrete_feature = args.use_discrete_features 389 | self.use_brown_cluster = args.use_brown_cluster 390 | self.orm_norm = args.oromo_normalize 391 | self.orm_lower = args.train_lowercase_oromo 392 | 393 | def get_lr_test_setE(self, path, lang): 394 | # setE.conll 395 | sents = [] 396 | char_sents = [] 397 | discrete_features = [] 398 | bc_features = [] 399 | doc_ids = [] 400 | original_sents = [] 401 | 402 | def add_sent(one_sent): 403 | temp_sent = [] 404 | temp_char = [] 405 | temp_bc = [] 406 | temp_ori_sent = [] 407 | for w in one_sent: 408 | tokens = w.split('\t') 409 | word = tokens[0] 410 | temp_ori_sent.append(word) 411 | docfile = tokens[3] 412 | doc_type = docfile.split('_')[1] 413 | if self.use_brown_cluster: 414 | if doc_type == "SN": 415 | temp_bc.append(self.lower_brown_dicts[word] if word in self.lower_brown_dicts else self.lower_brown_dicts[""]) 416 | else: 417 | temp_bc.append(self.brown_cluster_dicts[word] if word in self.brown_cluster_dicts else self.brown_cluster_dicts[""]) 418 | 419 | if doc_type == "SN": 420 | if self.orm_lower: 421 | word = word.lower() 422 | 423 | if self.orm_norm: 424 | #word = orm_morph.best_parse(word) # Not sure whether it would be better adding this line behind or after temp_char 425 | word = ormnorm.normalize(word) 426 | temp_sent.append(self.lower_word_to_id[word] if word in self.lower_word_to_id else self.lower_word_to_id[""]) 427 | else: 428 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 429 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 430 | 431 | doc_ids.append(docfile.split('_')[1]) 432 | sents.append(temp_sent) 433 | char_sents.append(temp_char) 434 | bc_features.append(temp_bc) 435 | discrete_features.append(get_feature_sent(lang, one_sent, self.args) if self.use_discrete_feature else []) 436 | original_sents.append(temp_ori_sent) 437 | # print len(discrete_features[-1]) 438 | 439 | with codecs.open(path, "r", "utf-8") as fin: 440 | i = 0 441 | one_sent = [] 442 | for line in fin: 443 | if line.strip() == "": 444 | if len(one_sent) > 0: 445 | add_sent(one_sent) 446 | one_sent = [] 447 | else: 448 | one_sent.append(line.strip()) 449 | i += 1 450 | if i % 1000 == 0: 451 | print("Processed %d testing data." 
% (i,)) 452 | 453 | if len(one_sent) > 0: 454 | add_sent(one_sent) 455 | 456 | if self.use_discrete_feature: 457 | self.num_feats = len(discrete_features[0][0]) 458 | else: 459 | self.num_feats = 0 460 | 461 | return sents, char_sents, discrete_features, bc_features, original_sents, doc_ids 462 | -------------------------------------------------------------------------------- /dataloaders/data_loader_orig.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | from models.utils import * 3 | import codecs 4 | import os 5 | from models.features import * 6 | 7 | class NER_DataLoader(): 8 | def __init__(self, args): 9 | '''Data format: id word pos_tag syntactic_tag NER_tag''' 10 | ''' TODO: 1. normalizing all digits 11 | 2. Using full vocabulary from GloVe, when testing, lower case first''' 12 | self.train_path = args.train_path 13 | self.test_path = args.test_path 14 | self.dev_path = args.dev_path 15 | self.args = args 16 | 17 | self.tag_vocab_path = self.train_path + ".tag_vocab" 18 | self.word_vocab_path = self.train_path + ".word_vocab" 19 | self.char_vocab_path = self.train_path + ".char_vocab" 20 | 21 | self.pretrained_embedding_path = args.pretrain_emb_path 22 | self.use_discrete_feature = args.use_discrete_features 23 | 24 | if False and os.path.exists(self.tag_vocab_path) and os.path.exists(self.word_vocab_path) and os.path.exists(self.char_vocab_path): 25 | # TODO: encoding? 26 | print("Load vocabs from file ....") 27 | self.tag_to_id = pkl_load(self.tag_vocab_path) 28 | self.word_to_id = pkl_load(self.word_vocab_path) 29 | self.char_to_id = pkl_load(self.char_vocab_path) 30 | print("Done!") 31 | else: 32 | print("Generating vocabs from training file ....") 33 | paths_to_read = [self.train_path, self.test_path, self.dev_path] 34 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files(paths_to_read) 35 | # FIXME: Remember dictionary value for char and word has been shifted by 1 36 | print "Size of vocab before: ", len(self.word_to_id) 37 | self.word_to_id[''] = len(self.word_to_id) + 1 38 | self.char_to_id[''] = len(self.char_to_id) + 1 39 | 40 | self.word_to_id[''] = 0 41 | self.char_to_id[''] = 0 42 | print "Size of vocab after: ", len(self.word_to_id) 43 | pkl_dump(self.tag_to_id, self.tag_vocab_path) 44 | pkl_dump(self.char_to_id, self.char_vocab_path) 45 | pkl_dump(self.word_to_id, self.word_vocab_path) 46 | 47 | self.word_padding_token = 0 48 | self.char_padding_token = 0 49 | 50 | if self.pretrained_embedding_path is not None: 51 | self.pretrain_word_emb, self.word_to_id = get_pretrained_emb(self.pretrained_embedding_path, 52 | self.word_to_id, args.word_emb_dim) 53 | # for char vocab and word vocab, we reserve id 0 for the eos padding, and len(vocab)-1 for the 54 | self.id_to_tag = {v: k for k, v in self.tag_to_id.iteritems()} 55 | self.id_to_word = {v: k for k, v in self.word_to_id.iteritems()} 56 | self.id_to_char = {v: k for k, v in self.char_to_id.iteritems()} 57 | 58 | self.ner_vocab_size = len(self.id_to_tag) 59 | self.word_vocab_size = len(self.id_to_word) 60 | self.char_vocab_size = len(self.id_to_char) 61 | 62 | print "Size of vocab after: ", len(self.word_to_id) 63 | print("NER tag num=%d, Word vocab size=%d, Char Vocab size=%d" % (self.ner_vocab_size, self.word_vocab_size, self.char_vocab_size)) 64 | 65 | @staticmethod 66 | def exists(path): 67 | return os.path.exists(path) 68 | 69 | def read_one_line(self, line, tag_set, word_dict, char_set): 70 | for w in line: 71 | fields = w.split() 
72 | word = fields[0] 73 | ner_tag = fields[-1] 74 | for c in word: 75 | char_set.add(c) 76 | tag_set.add(ner_tag) 77 | word_dict[word] += 1 78 | 79 | def get_vocab_from_set(self, a_set, shift=0): 80 | vocab = {} 81 | for i, elem in enumerate(a_set): 82 | vocab[elem] = i + shift 83 | 84 | return vocab 85 | 86 | def get_vocab_from_dict(self, a_dict, shift=0, remove_singleton=False): 87 | vocab = {} 88 | i = 0 89 | self.singleton_words = set() 90 | for k, v in a_dict.iteritems(): 91 | if v == 1: 92 | self.singleton_words.add(i + shift) 93 | if remove_singleton: 94 | if v > 1: 95 | # print k, v 96 | vocab[k] = i + shift 97 | i += 1 98 | else: 99 | vocab[k] = i + shift 100 | i += 1 101 | return vocab 102 | 103 | def read_files(self, paths): 104 | # word_list = [] 105 | # char_list = [] 106 | # tag_list = [] 107 | word_dict = defaultdict(lambda: 0) 108 | char_set = set() 109 | tag_set = set() 110 | 111 | def _read_a_file(path): 112 | with codecs.open(path, "r", "utf-8") as fin: 113 | to_read_line = [] 114 | for line in fin: 115 | if line.strip() == "": 116 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 117 | to_read_line = [] 118 | else: 119 | to_read_line.append(line.strip()) 120 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 121 | 122 | for path in paths: 123 | _read_a_file(path) 124 | 125 | tag_vocab = self.get_vocab_from_set(tag_set) 126 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 127 | char_vocab = self.get_vocab_from_set(char_set, 1) 128 | 129 | return tag_vocab, word_vocab, char_vocab 130 | 131 | def get_data_set(self, path, lang, training=True): 132 | sents = [] 133 | char_sents = [] 134 | tgt_tags = [] 135 | discrete_features = [] 136 | 137 | def add_sent(one_sent): 138 | temp_sent = [] 139 | temp_ner = [] 140 | temp_char = [] 141 | 142 | for w in one_sent: 143 | fields = w.split() 144 | word = fields[0] 145 | ner_tag = fields[-1] 146 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 147 | temp_ner.append(self.tag_to_id[ner_tag]) 148 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 149 | sents.append(temp_sent) 150 | char_sents.append(temp_char) 151 | tgt_tags.append(temp_ner) 152 | discrete_features.append(get_feature_w(lang, one_sent)[0] if self.use_discrete_feature else []) 153 | 154 | with codecs.open(path, "r", "utf-8") as fin: 155 | one_sent = [] 156 | for line in fin: 157 | if line.strip() == "": 158 | if len(one_sent) > 0: 159 | add_sent(one_sent) 160 | one_sent = [] 161 | else: 162 | one_sent.append(line.strip()) 163 | if len(one_sent) > 0: 164 | add_sent(one_sent) 165 | 166 | if self.use_discrete_feature: 167 | self.num_feats = len(discrete_features[0][0]) 168 | else: 169 | self.num_feats = 0 170 | return sents, char_sents, tgt_tags, discrete_features 171 | 172 | def get_lr_test(self, path, lang): 173 | sents = [] 174 | char_sents = [] 175 | discrete_features = [] 176 | 177 | def add_sent(one_sent): 178 | temp_sent = [] 179 | temp_char = [] 180 | temp_discrete = [] 181 | for word in one_sent: 182 | if self.use_discrete_feature: 183 | temp_discrete.append(get_feature_w(lang, word)) 184 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 185 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 186 | sents.append(temp_sent) 187 | char_sents.append(temp_char) 188 | discrete_features.append(temp_discrete) 189 | 190 | 
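# Unlabeled (setE-style) test input: one whitespace-tokenized sentence per line with
# no gold tags, so only word ids, character ids and the optional discrete features
# are produced here.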
original_sents = [] 191 | with codecs.open(path, "r", "utf-8") as fin: 192 | for line in fin: 193 | one_sent = line.rstrip().split() 194 | if line: 195 | add_sent(one_sent) 196 | original_sents.append(one_sent) 197 | 198 | if self.use_discrete_feature: 199 | self.num_feats = len(discrete_features[0][0]) 200 | else: 201 | self.num_feats = 0 202 | 203 | return sents, char_sents, discrete_features, original_sents -------------------------------------------------------------------------------- /dataloaders/dataloader_unicode.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | import os 3 | 4 | from utils.features import * 5 | from utils.util import * 6 | 7 | 8 | class NER_DataLoader(): 9 | def __init__(self, args): 10 | '''Data format: id word pos_tag syntactic_tag NER_tag''' 11 | ''' TODO: 1. normalizing all digits 12 | 2. Using full vocabulary from GloVe, when testing, lower case first''' 13 | self.train_path = args.train_path 14 | self.test_path = args.test_path 15 | self.dev_path = args.dev_path 16 | self.args = args 17 | 18 | self.tag_vocab_path = self.train_path + ".tag_vocab" 19 | self.word_vocab_path = self.train_path + ".word_vocab" 20 | self.char_vocab_path = self.train_path + ".char_vocab" 21 | 22 | self.pretrained_embedding_path = args.pretrain_emb_path 23 | self.use_discrete_feature = args.use_discrete_features 24 | 25 | if False and os.path.exists(self.tag_vocab_path) and os.path.exists(self.word_vocab_path) and os.path.exists(self.char_vocab_path): 26 | # TODO: encoding? 27 | print("Load vocabs from file ....") 28 | self.tag_to_id = pkl_load(self.tag_vocab_path) 29 | self.word_to_id = pkl_load(self.word_vocab_path) 30 | self.char_to_id = pkl_load(self.char_vocab_path) 31 | print("Done!") 32 | else: 33 | print("Generating vocabs from training file ....") 34 | if not self.args.isLr: 35 | paths_to_read = [self.train_path, self.test_path, self.dev_path] 36 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files(paths_to_read) 37 | else: 38 | paths_to_read = [self.train_path, self.dev_path] 39 | self.tag_to_id, self.word_to_id, self.char_to_id = self.read_files_lr(paths_to_read,self.test_path) 40 | # FIXME: Remember dictionary value for char and word has been shifted by 1 41 | print "Size of vocab before: ", len(self.word_to_id) 42 | self.word_to_id[''] = len(self.word_to_id) + 1 43 | self.char_to_id[''] = len(self.char_to_id) + 1 44 | 45 | self.word_to_id['<\s>'] = 0 46 | self.char_to_id[''] = 0 47 | print "Size of vocab after: ", len(self.word_to_id) 48 | pkl_dump(self.tag_to_id, self.tag_vocab_path) 49 | pkl_dump(self.char_to_id, self.char_vocab_path) 50 | pkl_dump(self.word_to_id, self.word_vocab_path) 51 | 52 | self.word_padding_token = 0 53 | self.char_padding_token = 0 54 | 55 | if self.pretrained_embedding_path is not None: 56 | self.pretrain_word_emb, self.word_to_id = get_pretrained_emb(self.pretrained_embedding_path, 57 | self.word_to_id, args.word_emb_dim) 58 | # for char vocab and word vocab, we reserve id 0 for the eos padding, and len(vocab)-1 for the 59 | self.id_to_tag = {v: k for k, v in self.tag_to_id.iteritems()} 60 | self.id_to_word = {v: k for k, v in self.word_to_id.iteritems()} 61 | self.id_to_char = {v: k for k, v in self.char_to_id.iteritems()} 62 | 63 | self.ner_vocab_size = len(self.id_to_tag) 64 | self.word_vocab_size = len(self.id_to_word) 65 | self.char_vocab_size = len(self.id_to_char) 66 | 67 | print "Size of vocab after: ", len(self.word_to_id) 68 | print("NER tag 
num=%d, Word vocab size=%d, Char Vocab size=%d" % (self.ner_vocab_size, self.word_vocab_size, self.char_vocab_size)) 69 | 70 | @staticmethod 71 | def exists(path): 72 | return os.path.exists(path) 73 | 74 | def read_one_line(self, line, tag_set, word_dict, char_set): 75 | for w in line: 76 | fields = w.split() 77 | word = fields[0] 78 | ner_tag = fields[-1] 79 | for c in word: 80 | char_set.add(c) 81 | tag_set.add(ner_tag) 82 | word_dict[word] += 1 83 | 84 | def get_vocab_from_set(self, a_set, shift=0): 85 | vocab = {} 86 | for i, elem in enumerate(a_set): 87 | vocab[elem] = i + shift 88 | 89 | return vocab 90 | 91 | def get_vocab_from_dict(self, a_dict, shift=0, remove_singleton=False): 92 | vocab = {} 93 | i = 0 94 | self.singleton_words = set() 95 | for k, v in a_dict.iteritems(): 96 | if v == 1: 97 | self.singleton_words.add(i + shift) 98 | if remove_singleton: 99 | if v > 1: 100 | # print k, v 101 | vocab[k] = i + shift 102 | i += 1 103 | else: 104 | vocab[k] = i + shift 105 | i += 1 106 | return vocab 107 | 108 | def read_files(self, paths): 109 | # word_list = [] 110 | # char_list = [] 111 | # tag_list = [] 112 | word_dict = defaultdict(lambda: 0) 113 | char_set = set() 114 | tag_set = set() 115 | 116 | def _read_a_file(path): 117 | with codecs.open(path, "r") as fin: 118 | to_read_line = [] 119 | for line in fin: 120 | if line.strip() == "": 121 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 122 | to_read_line = [] 123 | else: 124 | to_read_line.append(line.strip()) 125 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 126 | 127 | for path in paths: 128 | _read_a_file(path) 129 | 130 | tag_vocab = self.get_vocab_from_set(tag_set) 131 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 132 | char_vocab = self.get_vocab_from_set(char_set, 1) 133 | 134 | return tag_vocab, word_vocab, char_vocab 135 | 136 | def read_files_lr(self, paths, test_path): 137 | # word_list = [] 138 | # char_list = [] 139 | # tag_list = [] 140 | word_dict = defaultdict(lambda: 0) 141 | char_set = set() 142 | tag_set = set() 143 | 144 | def _read_a_file(path): 145 | with codecs.open(path, "r") as fin: 146 | to_read_line = [] 147 | for line in fin: 148 | if line.strip() == "": 149 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 150 | to_read_line = [] 151 | else: 152 | to_read_line.append(line.strip()) 153 | self.read_one_line(to_read_line, tag_set, word_dict, char_set) 154 | 155 | for path in paths: 156 | _read_a_file(path) 157 | 158 | #reading from SetE 159 | with codecs.open(test_path, "r") as fin: 160 | for line in fin: 161 | fields = line.strip().split() 162 | for word in fields: 163 | for c in word: 164 | char_set.add(c) 165 | word_dict[word] += 1 166 | 167 | tag_vocab = self.get_vocab_from_set(tag_set) 168 | word_vocab = self.get_vocab_from_dict(word_dict, 1, self.args.remove_singleton) 169 | char_vocab = self.get_vocab_from_set(char_set, 1) 170 | 171 | return tag_vocab, word_vocab, char_vocab 172 | 173 | def get_data_set(self, path, lang): 174 | sents = [] 175 | char_sents = [] 176 | tgt_tags = [] 177 | discrete_features = [] 178 | 179 | def add_sent(one_sent): 180 | temp_sent = [] 181 | temp_ner = [] 182 | temp_char = [] 183 | 184 | for w in one_sent: 185 | fields = w.split() 186 | word = fields[0] 187 | ner_tag = fields[-1] 188 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 189 | temp_ner.append(self.tag_to_id[ner_tag]) 190 | temp_char.append([self.char_to_id[c] if c in 
self.char_to_id else self.char_to_id[""] for c in word]) 191 | sents.append(temp_sent) 192 | char_sents.append(temp_char) 193 | tgt_tags.append(temp_ner) 194 | discrete_features.append(get_feature_w(lang, one_sent) if self.use_discrete_feature else []) 195 | 196 | # print len(discrete_features[-1]) 197 | 198 | with codecs.open(path, "r") as fin: 199 | one_sent = [] 200 | for line in fin: 201 | if line.strip() == "": 202 | if len(one_sent) > 0: 203 | add_sent(one_sent) 204 | one_sent = [] 205 | else: 206 | one_sent.append(line.strip()) 207 | if len(one_sent) > 0: 208 | add_sent(one_sent) 209 | 210 | if self.use_discrete_feature: 211 | self.num_feats = len(discrete_features[0][0]) 212 | else: 213 | self.num_feats = 0 214 | return sents, char_sents, tgt_tags, discrete_features 215 | 216 | def get_lr_test(self, path, lang): 217 | sents = [] 218 | char_sents = [] 219 | discrete_features = [] 220 | 221 | def add_sent(one_sent): 222 | temp_sent = [] 223 | temp_char = [] 224 | for word in one_sent: 225 | temp_sent.append(self.word_to_id[word] if word in self.word_to_id else self.word_to_id[""]) 226 | temp_char.append([self.char_to_id[c] if c in self.char_to_id else self.char_to_id[""] for c in word]) 227 | sents.append(temp_sent) 228 | char_sents.append(temp_char) 229 | discrete_features.append(get_feature_w(lang, one_sent) if self.use_discrete_feature else []) 230 | 231 | original_sents = [] 232 | with codecs.open(path, "r") as fin: 233 | for line in fin: 234 | one_sent = line.rstrip().split() 235 | if line: 236 | add_sent(one_sent) 237 | original_sents.append(one_sent) 238 | 239 | if self.use_discrete_feature: 240 | self.num_feats = len(discrete_features[0][0]) 241 | else: 242 | self.num_feats = 0 243 | 244 | return sents, char_sents, discrete_features, original_sents 245 | -------------------------------------------------------------------------------- /eval/IO2BIO.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def transform(ifile, ofile): 4 | with open(ifile, 'r') as reader, open(ofile, 'w') as writer: 5 | prev = 'O' 6 | for line in reader: 7 | line = line.strip() 8 | if len(line) == 0: 9 | prev = 'O' 10 | writer.write('\n') 11 | continue 12 | 13 | tokens = line.split() 14 | # print tokens 15 | label = tokens[-1] 16 | if label != 'O' and label != prev: 17 | if prev == 'O': 18 | label = 'B-' + label[2:] 19 | elif label[2:] != prev[2:]: 20 | label = 'B-' + label[2:] 21 | else: 22 | label = label 23 | writer.write(" ".join(tokens[:-1]) + " " + label) 24 | writer.write('\n') 25 | prev = tokens[-1] 26 | 27 | if __name__ == '__main__': 28 | transform('eng.train.conll', 'eng.train.bio.conll') 29 | transform('eng.dev.conll', 'eng.dev.bio.conll') 30 | transform('eng.test.conll', 'eng.test.bio.conll') 31 | -------------------------------------------------------------------------------- /eval/IO2BIOES.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def transform(ifile, ofile): 4 | with open(ifile, 'r') as reader, open(ofile, 'w') as writer: 5 | sents = [] 6 | sent = [] 7 | for line in reader: 8 | line = line.strip() 9 | if len(line) == 0: 10 | sents.append(sent) 11 | sent = [] 12 | continue 13 | 14 | sent.append(line) 15 | if len(sent) > 0: 16 | sents.append(sent) 17 | 18 | for sent in sents: 19 | length = len(sent) 20 | labels = [] 21 | for line in sent: 22 | tokens = line.split() 23 | label = tokens[-1] 24 | labels.append(label) 25 | 26 | # print "%d %d" % (length, len(labels)) 27 | 
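# BIO -> BIOES relabelling (implemented by the loop below): a B- tag that is
# sentence-final or not followed by an I- tag becomes S-; an I- tag in the same
# position becomes E-; all other tags are written out unchanged.
# e.g. [B-PER, I-PER, O, B-LOC] -> [B-PER, E-PER, O, S-LOC]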
28 | for i in range(length): 29 | tokens = sent[i].split() 30 | label = labels[i] 31 | new_label = label 32 | if label != 'O': 33 | if label.startswith('B-'): 34 | if i + 1 == length or not labels[i + 1].startswith('I-'): 35 | new_label = 'S-' + label[2:] 36 | elif label.startswith('I-'): 37 | if i + 1 == length or not labels[i + 1].startswith('I-'): 38 | new_label = 'E-' + label[2:] 39 | writer.write(" ".join(tokens[:-1]) + " " + new_label) 40 | writer.write('\n') 41 | writer.write('\n') 42 | 43 | 44 | if __name__ == '__main__': 45 | transform('../datasets/english/eng.train.bio.conll', '../datasets/english/eng.train.bioes.conll') 46 | transform('../datasets/english/eng.dev.bio.conll', '../datasets/english/eng.dev.bioes.conll') 47 | transform('../datasets/english/eng.test.bio.conll', '../datasets/english/eng.test.bioes.conll') 48 | -------------------------------------------------------------------------------- /eval/conlleval: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 
32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while () { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if ($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = 
$features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 265 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 266 | 267 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 269 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 270 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 271 | 272 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 273 | $chunkEnd = $true; 274 | } 275 | 276 | # corrected 1998-12-22: these chunks are assumed to have length 1 277 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 279 | 280 | return($chunkEnd); 281 | } 282 | 283 | # startOfChunk: checks if a chunk started between the previous and current word 284 | # arguments: previous and current chunk tags, previous and current types 285 | # note: this code is capable of handling other chunk representations 286 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 287 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 288 | 289 | sub startOfChunk { 290 | my $prevTag = shift(@_); 291 | my $tag = shift(@_); 292 | my $prevType = shift(@_); 293 | my $type = shift(@_); 294 | my $chunkStart = $false; 295 | 296 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 297 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 298 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 299 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 300 | 301 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 302 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 303 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 304 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 305 | 306 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 307 | $chunkStart = $true; 308 | } 309 | 310 | # corrected 1998-12-22: these chunks are assumed to have length 1 311 | if ( $tag eq "[" ) { $chunkStart = $true; } 312 | if ( $tag eq "]" ) { $chunkStart = $true; } 313 | 314 | return($chunkStart); 315 | } 316 | -------------------------------------------------------------------------------- /eval/conlleval.v2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 
32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while () { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if ($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = 
$features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkEnd = $true; } 265 | 266 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 267 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 269 | 270 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 271 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 272 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 273 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkEnd = $true; } 274 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkEnd = $true; } 275 | 276 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkEnd = $true; } 277 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "S" and $tag eq "O" ) { $chunkEnd = $true; } 279 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkEnd = $true; } 280 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkEnd = $true; } 281 | 282 | 283 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 284 | $chunkEnd = $true; 285 | } 286 | 287 | # corrected 1998-12-22: these chunks are assumed to have length 1 288 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 289 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 290 | 291 | return($chunkEnd); 292 | } 293 | 294 | # startOfChunk: checks if a chunk started between the previous and current word 295 | # arguments: previous and current chunk tags, previous and current types 296 | # note: this code is capable of handling other chunk representations 297 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 298 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 299 | 300 | sub startOfChunk { 301 | my $prevTag = shift(@_); 302 | my $tag = shift(@_); 303 | my $prevType = shift(@_); 304 | my $type = shift(@_); 305 | my $chunkStart = $false; 306 | 307 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 308 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 309 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 310 | if ( $prevTag eq "S" and $tag eq "B" ) { $chunkStart = $true; } 311 | if ( $prevTag eq "E" and $tag eq "B" ) { $chunkStart = $true; } 312 | 313 | if ( $prevTag eq "B" and $tag eq "S" ) { $chunkStart = $true; } 314 | if ( $prevTag eq "I" and $tag eq "S" ) { $chunkStart = $true; } 315 | if ( $prevTag eq "O" and $tag eq "S" ) { $chunkStart = $true; } 316 | if ( $prevTag eq "S" and $tag eq "S" ) { $chunkStart = $true; } 317 | if ( $prevTag eq "E" and $tag eq "S" ) { $chunkStart = $true; } 318 | 319 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 320 | if ( $prevTag eq "S" and $tag eq "I" ) { $chunkStart = $true; } 321 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 322 | 323 | if ( $prevTag eq "S" and $tag eq "E" ) { $chunkStart = $true; } 324 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 325 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 326 | 327 | if ($tag ne "O" and $tag ne "." 
and $prevType ne $type) { 328 | $chunkStart = $true; 329 | } 330 | 331 | # corrected 1998-12-22: these chunks are assumed to have length 1 332 | if ( $tag eq "[" ) { $chunkStart = $true; } 333 | if ( $tag eq "]" ) { $chunkStart = $true; } 334 | 335 | return($chunkStart); 336 | } 337 | -------------------------------------------------------------------------------- /eval/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for file in $(ls $1) 3 | do 4 | echo evaluating $file 5 | ./conlleval < $1$file 6 | done 7 | -------------------------------------------------------------------------------- /eval/format.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def format(ifile, ofile): 5 | with open(ifile, 'r') as reader, open(ofile, 'w') as writer: 6 | i = 1 7 | for line in reader: 8 | line = line.strip() 9 | if len(line) == 0: 10 | i = 1 11 | writer.write('\n') 12 | else: 13 | writer.write('%d %s\n' % (i, line)) 14 | i += 1 15 | 16 | 17 | if __name__ == '__main__': 18 | format('eng.train', 'eng.train.conll') 19 | format('eng.dev', 'eng.dev.conll') 20 | format('eng.test', 'eng.test.conll') 21 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/cmu-ner/d35d57fe453d81cc98e3ee55bac58f9ca618f59b/models/__init__.py -------------------------------------------------------------------------------- /models/decoders.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | from utils.util import * 3 | 4 | 5 | class Decoder(): 6 | def __init__(self, tag_size): 7 | # type: () -> object 8 | pass 9 | 10 | def decode_loss(self): 11 | raise NotImplementedError 12 | 13 | def decoding(self): 14 | raise NotImplementedError 15 | 16 | 17 | def constrained_transition_init(transition_matrix, contraints): 18 | ''' 19 | :param transition_matrix: numpy array, (from, to) 20 | :param contraints: [[from_indexes], [to_indexes]] 21 | :return: newly initialized transition matrix 22 | ''' 23 | for cons in contraints: 24 | transition_matrix[cons[0], cons[1]] = -1000.0 25 | return transition_matrix 26 | 27 | 28 | class chain_CRF_decoder(Decoder): 29 | ''' For NER and POS Tagging. 
''' 30 | 31 | def __init__(self, args, model, src_output_dim, tag_emb_dim, tag_size, constraints=None): 32 | Decoder.__init__(self, tag_size) 33 | self.model = model 34 | self.start_id = tag_size 35 | self.end_id = tag_size + 1 36 | self.tag_size = tag_size + 2 37 | tag_size = tag_size + 2 38 | 39 | # optional: transform the hidden space of src encodings into the tag embedding space 40 | self.W_src2tag_readout = model.add_parameters((tag_emb_dim, src_output_dim)) 41 | self.b_src2tag_readout = model.add_parameters((tag_emb_dim)) 42 | self.b_src2tag_readout.zero() 43 | 44 | self.W_scores_readout2tag = model.add_parameters((tag_size, tag_emb_dim)) 45 | self.b_scores_readout2tag = model.add_parameters((tag_size)) 46 | self.b_scores_readout2tag.zero() 47 | 48 | # (to, from), trans[i] is the transition score to i 49 | init_transition_matrix = np.random.randn(tag_size, tag_size) # from, to 50 | # init_transition_matrix[self.start_id, :] = -1000.0 51 | # init_transition_matrix[:, self.end_id] = -1000.0 52 | init_transition_matrix[self.end_id, :] = -1000.0 53 | init_transition_matrix[:, self.start_id] = -1000.0 54 | if constraints is not None: 55 | init_transition_matrix = constrained_transition_init(init_transition_matrix, constraints) 56 | # print init_transition_matrix 57 | self.transition_matrix = model.add_lookup_parameters((tag_size, tag_size), 58 | init=dy.NumpyInitializer(init_transition_matrix)) 59 | 60 | self.interpolation = args.interp_crf_score 61 | if self.interpolation: 62 | self.W_weight_transition = model.add_parameters((1, tag_emb_dim)) 63 | self.b_weight_transition = model.add_parameters((1)) 64 | self.b_weight_transition.zero() 65 | 66 | def forward_alg(self, tag_scores): 67 | ''' Forward DP for CRF. 68 | tag_scores (list of batched dy.Tensor): (tag_size, batchsize) 69 | ''' 70 | # Be aware: if a is lookup_parameter with 2 dimension, then a[i] returns one row; 71 | # if b = dy.parameter(a), then b[i] returns one column; which means dy.parameter(a) already transpose a 72 | transpose_transition_score = dy.parameter(self.transition_matrix) 73 | # transpose_transition_score = dy.transpose(transition_score) 74 | # alpha(t', s) = the score of sequence from t=0 to t=t' in log space 75 | # np_init_alphas = -100.0 * np.ones((self.tag_size, batch_size)) 76 | # np_init_alphas[self.start_id, :] = 0.0 77 | # alpha_tm1 = dy.inputTensor(np_init_alphas, batched=True) 78 | 79 | alpha_tm1 = transpose_transition_score[self.start_id] + tag_scores[0] 80 | # self.transition_matrix[i]: from i, column 81 | # transpose_score[i]: to i, row 82 | # transpose_score: to, from 83 | 84 | for tag_score in tag_scores[1:]: 85 | # extend for each transit 86 | alpha_tm1 = dy.concatenate_cols([alpha_tm1] * self.tag_size) # (from, to, batch_size) 87 | # each column i of tag_score will be the repeated emission score to tag i 88 | tag_score = dy.transpose(dy.concatenate_cols([tag_score] * self.tag_size)) 89 | alpha_t = alpha_tm1 + transpose_transition_score + tag_score 90 | alpha_tm1 = log_sum_exp_dim_0(alpha_t) # (tag_size, batch_size) 91 | 92 | terminal_alpha = log_sum_exp_dim_0(alpha_tm1 + self.transition_matrix[self.end_id]) # (1, batch_size) 93 | return terminal_alpha 94 | 95 | def score_one_sequence(self, tag_scores, tags, batch_size): 96 | ''' tags: list of tag ids at each time step ''' 97 | # print tags, batch_size 98 | # print batch_size 99 | # print "scoring one sentence" 100 | tags = [[self.start_id] * batch_size] + tags # len(tag_scores) = len(tags) - 1 101 | score = dy.inputTensor(np.zeros(batch_size), 
batched=True) 102 | # tag_scores = dy.concatenate_cols(tag_scores) # tot_tags, sent_len, batch_size 103 | # print "tag dim: ", tag_scores.dim() 104 | for i in range(len(tags) - 1): 105 | score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, tags[i + 1]), tags[i]) \ 106 | + dy.pick_batch(tag_scores[i], tags[i + 1]) 107 | score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, [self.end_id]*batch_size), tags[-1]) 108 | return score 109 | 110 | def decode_loss(self, src_encodings, tgt_tags): 111 | # This is the batched version which requires bucketed batch input with the same length. 112 | ''' 113 | The length of src_encodings and tgt_tags are time_steps. 114 | src_encodings: list of dynet.Tensor (src_output_dim, batch_size) 115 | tgt_tags: list of tag ids [(1, batch_size)] 116 | return: average of negative log likelihood 117 | ''' 118 | # TODO: transpose tgt tags first 119 | batch_size = len(tgt_tags) 120 | tgt_tags, tgt_mask = transpose_input(tgt_tags, 0) 121 | W_src2tag_readout = dy.parameter(self.W_src2tag_readout) 122 | b_src2tag_readout = dy.parameter(self.b_src2tag_readout) 123 | W_score_tag = dy.parameter(self.W_scores_readout2tag) 124 | b_score_tag = dy.parameter(self.b_scores_readout2tag) 125 | 126 | tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) for src_encoding 127 | in src_encodings] 128 | if self.interpolation: 129 | W_transit = dy.parameter(self.W_weight_transition) 130 | b_transit = dy.parameter(self.b_weight_transition) 131 | step_weight_on_transit = [dy.logistic(dy.affine_transform([b_transit, W_transit, tag_emb])) for tag_emb in tag_embs] 132 | 133 | tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs] 134 | 135 | # scores over all paths, all scores are in log-space 136 | forward_scores = self.forward_alg(tag_scores) 137 | gold_score = self.score_one_sequence(tag_scores, tgt_tags, batch_size) 138 | # negative log likelihood 139 | loss = dy.sum_batches(forward_scores - gold_score) / batch_size 140 | return loss #, dy.sum_batches(forward_scores)/batch_size, dy.sum_batches(gold_score) / batch_size 141 | 142 | def get_crf_scores(self, src_encodings): 143 | W_src2tag_readout = dy.parameter(self.W_src2tag_readout) 144 | b_src2tag_readout = dy.parameter(self.b_src2tag_readout) 145 | W_score_tag = dy.parameter(self.W_scores_readout2tag) 146 | b_score_tag = dy.parameter(self.b_scores_readout2tag) 147 | 148 | tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) 149 | for src_encoding in src_encodings] 150 | tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs] 151 | 152 | transpose_transition_score = dy.parameter(self.transition_matrix) # (to, from) 153 | 154 | return transpose_transition_score.npvalue(), [ts.npvalue() for ts in tag_scores] 155 | 156 | def decoding(self, src_encodings): 157 | ''' Viterbi decoding for a single sequence. 
''' 158 | W_src2tag_readout = dy.parameter(self.W_src2tag_readout) 159 | b_src2tag_readout = dy.parameter(self.b_src2tag_readout) 160 | W_score_tag = dy.parameter(self.W_scores_readout2tag) 161 | b_score_tag = dy.parameter(self.b_scores_readout2tag) 162 | 163 | tag_embs = [dy.tanh(dy.affine_transform([b_src2tag_readout, W_src2tag_readout, src_encoding])) 164 | for src_encoding in src_encodings] 165 | tag_scores = [dy.affine_transform([b_score_tag, W_score_tag, tag_emb]) for tag_emb in tag_embs] 166 | 167 | back_trace_tags = [] 168 | np_init_alpha = np.ones(self.tag_size) * -2000.0 169 | np_init_alpha[self.start_id] = 0.0 170 | max_tm1 = dy.inputTensor(np_init_alpha) 171 | transpose_transition_score = dy.parameter(self.transition_matrix) # (to, from) 172 | 173 | for i, tag_score in enumerate(tag_scores): 174 | max_tm1 = dy.concatenate_cols([max_tm1] * self.tag_size) 175 | max_t = max_tm1 + transpose_transition_score 176 | if i != 0: 177 | eval_score = max_t.npvalue()[:-2, :] 178 | else: 179 | eval_score = max_t.npvalue() 180 | best_tag = np.argmax(eval_score, axis=0) 181 | back_trace_tags.append(best_tag) 182 | max_tm1 = dy.inputTensor(eval_score[best_tag, range(self.tag_size)]) + tag_score 183 | 184 | terminal_max_T = max_tm1 + self.transition_matrix[self.end_id] 185 | eval_terminal = terminal_max_T.npvalue()[:-2] 186 | best_tag = np.argmax(eval_terminal, axis=0) 187 | best_path_score = eval_terminal[best_tag] 188 | 189 | best_path = [best_tag] 190 | for btpoint in reversed(back_trace_tags): 191 | best_tag = btpoint[best_tag] 192 | best_path.append(best_tag) 193 | start = best_path.pop() 194 | assert start == self.start_id 195 | best_path.reverse() 196 | return best_path_score, best_path 197 | 198 | def cal_accuracy(self, pred_path, true_path): 199 | return np.sum(np.equal(pred_path, true_path).astype(np.float32)) / len(pred_path) 200 | 201 | 202 | def ensemble_viterbi_decoding(l_tag_scores, l_transit_score, tag_size): 203 | back_trace_tags = [] 204 | tag_size = tag_size + 2 205 | start_id = tag_size - 2 206 | end_id = tag_size - 1 207 | max_tm1 = np.ones(tag_size) * -2000.0 208 | max_tm1[start_id] = 0.0 209 | 210 | tag_scores = [] 211 | for i in range(len(l_tag_scores[0])): 212 | tag_scores.append(sum([ts[i] for ts in l_tag_scores]) / len(l_tag_scores)) 213 | transpose_transition_score = sum(l_transit_score) / len(l_transit_score) # (from, to) 214 | 215 | for i, tag_score in enumerate(tag_scores): 216 | max_tm1 = np.tile(np.expand_dims(max_tm1, axis=1), (1, tag_size)) 217 | max_t = max_tm1 + transpose_transition_score 218 | if i != 0: 219 | eval_score = max_t[:-2, :] 220 | else: 221 | eval_score = max_t 222 | best_tag = np.argmax(eval_score, axis=0) 223 | back_trace_tags.append(best_tag) 224 | max_tm1 = eval_score[best_tag, range(tag_size)] + tag_score 225 | 226 | terminal_max_T = max_tm1 + transpose_transition_score[:, end_id] 227 | eval_terminal = terminal_max_T[:-2] 228 | best_tag = np.argmax(eval_terminal, axis=0) 229 | best_path_score = eval_terminal[best_tag] 230 | 231 | best_path = [best_tag] 232 | for btpoint in reversed(back_trace_tags): 233 | best_tag = btpoint[best_tag] 234 | best_path.append(best_tag) 235 | start = best_path.pop() 236 | assert start == start_id 237 | best_path.reverse() 238 | return best_path_score, best_path 239 | 240 | 241 | class classifier(Decoder): 242 | def __init__(self, model, input_dim, tag_size): 243 | self.W_softmax = model.add_parameters((tag_size, input_dim)) 244 | self.b_softmax = model.add_parameters((tag_size)) 245 | 246 | def decode_loss(self, 
src_encoding, tgt_tags): 247 | batch_size = len(tgt_tags) 248 | tgt_tags, tgt_mask = transpose_input(tgt_tags, 0) 249 | 250 | assert len(src_encoding) == len(tgt_tags) 251 | 252 | W_softmax = dy.parameter(self.W_softmax) 253 | b_softmax = dy.parameter(self.b_softmax) 254 | 255 | predictions = [dy.affine_transform([b_softmax, W_softmax, src_emb]) for src_emb in src_encoding] 256 | 257 | losses = [dy.pickneglogsoftmax_batch(pred, tgt) for pred, tgt in zip(predictions, tgt_tags)] 258 | 259 | loss = dy.sum_batches(dy.esum(losses)) / (batch_size * len(src_encoding)) 260 | 261 | return loss 262 | 263 | def decoding(self, src_encoding): 264 | W_softmax = dy.parameter(self.W_softmax) 265 | b_softmax = dy.parameter(self.b_softmax) 266 | predictions = [dy.affine_transform([b_softmax, W_softmax, src_emb]) for src_emb in src_encoding] 267 | 268 | predictions = [np.argmax(pred.npvalue()) for pred in predictions] 269 | 270 | return None, predictions 271 | -------------------------------------------------------------------------------- /models/encoders.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | from utils.util import * 3 | 4 | ''' Designing idea: the encoder should be agnostic to the input, it can be either 5 | arbitrary spans, characters, or words, or even raw feature. However, user has to specify 6 | whether to have the lookup table for any input. 7 | 8 | There are also two ways to feed in multiple input features: 9 | (a) First concatenate all features for each position, and then use them as features for one encoder, e.g. bilstm 10 | (b) Use multiple encoders for multiple features then combine outputs from multiple encoders, either concat them 11 | or feed them to another encoder.''' 12 | 13 | 14 | class Encoder(): 15 | def __init__(self): 16 | pass 17 | 18 | def encode(self): 19 | raise NotImplementedError 20 | 21 | # class concat_input_encoder(encoder): 22 | # def __init__(self, model, lookups, lookup_table_dims): 23 | # # length of elements in lookup_table_dims == number of elements in lookups which are true 24 | # self.num_inputs = len(lookups) 25 | # self.lookups = lookups 26 | # self.lookup_params = [] 27 | # for i, lookup in enumerate(lookups): 28 | # if lookup == 1: 29 | # # add loop up parameters 30 | # self.lookup_params.append(model.add_lookup_parameters((lookup_table_dims[i][0], lookup_table_dims[i][1]))) 31 | # elif lookup == 2: 32 | # # add normal transformation parameters 33 | # # dims: discrete_feature_num, continuous_emb_dim 34 | # # the input should concatenate all the discrete features together first 35 | # self.lookup_params.append(model.add_parameters((lookup_table_dims[i][0], lookup_table_dims[i][1]))) 36 | # else: 37 | # self.lookup_params.append(0) 38 | # 39 | # def prepare_inputs(self, inputs): 40 | # # inputs: (a) 41 | # input_features = [] 42 | # for i, lookup in enumerate(self.lookups): 43 | # if lookup == 1: 44 | 45 | 46 | class Lookup_Encoder(Encoder): 47 | def __init__(self, model, args, vocab_size, emb_size, padding_token=None, pretrain_embedding=None, isFeatureEmb=False): 48 | Encoder.__init__(self) 49 | self.padding_token = padding_token 50 | self.map_pretrain = args.map_pretrain 51 | self.pretrain_fix = args.pretrain_fix 52 | self.isFeatureEmb = isFeatureEmb 53 | if args.map_pretrain: 54 | self.W_map = model.add_parameters((args.map_dim, emb_size)) 55 | self.b_map = model.add_parameters(args.map_dim) 56 | self.b_map.zero() 57 | if pretrain_embedding is not None: 58 | self.lookup_table = 
model.lookup_parameters_from_numpy(pretrain_embedding) 59 | else: 60 | self.lookup_table = model.add_lookup_parameters((vocab_size, emb_size)) 61 | 62 | def encode(self, input_seqs): 63 | transpose_inputs, _ = transpose_input(input_seqs, self.padding_token) 64 | embs = [dy.lookup_batch(self.lookup_table, wids) for wids in transpose_inputs] 65 | if self.pretrain_fix and not self.isFeatureEmb: 66 | embs = [dy.nobackprop(emb) for emb in embs] 67 | # TODO: initialize with ones vector, initialize W_map with identity matrix 68 | if self.map_pretrain and not self.isFeatureEmb: 69 | if not self.pretrain_fix: 70 | embs = [dy.nobackprop(emb) for emb in embs] 71 | W_map = dy.parameter(self.W_map) 72 | b_map = dy.parameter(self.b_map) 73 | embs = [dy.affine_transform([b_map, W_map, emb]) for emb in embs] 74 | return embs 75 | 76 | 77 | class Discrete_Feature_Encoder(Encoder): 78 | def __init__(self, model, num_feats, to_dim): 79 | Encoder.__init__(self) 80 | self.num_feats = num_feats 81 | self.to_dim = to_dim 82 | self.W_feat_emb = model.add_parameters((to_dim, num_feats)) 83 | 84 | def encode(self, input_feats): 85 | batch_size = len(input_feats) 86 | # after transpose: input_feats: [(num_feats, batch_size)] 87 | input_feats = transpose_discrete_features(input_feats) 88 | W_feat_emb = dy.parameter(self.W_feat_emb) 89 | output_emb = [] 90 | for wif in input_feats: 91 | extend_wif = dy.transpose(dy.concatenate_cols([wif for _ in range(self.to_dim)])) 92 | feature_emb = dy.cmult(extend_wif, W_feat_emb) 93 | output_emb.append(dy.reshape(feature_emb, (self.to_dim * self.num_feats, ), batch_size=batch_size)) 94 | return output_emb 95 | 96 | 97 | class CNN_Encoder(Encoder): 98 | def __init__(self, model, emb_size, win_size=3, filter_size=64, dropout=0.5, vocab_size=0, padding_token=0, lookup_emb=None): 99 | Encoder.__init__(self) 100 | self.vocab_size = vocab_size # if 0, no lookup tables 101 | self.win_size = win_size 102 | self.filter_size = filter_size 103 | self.emb_size = emb_size 104 | self.dropout_rate = dropout 105 | self.paddding_token = padding_token 106 | if vocab_size != 0: 107 | print "In CNN encoder: creating lookup embedding!" 108 | self.lookup_emb = model.add_lookup_parameters((vocab_size, 1, 1, emb_size)) 109 | else: 110 | assert lookup_emb is not None 111 | print "In CNN encoder: reusing lookup embedding!" 
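            # Note: the externally supplied table is assumed to share the
            # (vocab_size, 1, 1, emb_size) layout created in the branch above,
            # so that per-character dy.lookup results can be concatenated along
            # d=1 and fed directly into dy.conv2d_bias in _cnn_emb.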
112 | self.lookup_emb = lookup_emb 113 | 114 | self.W_cnn = model.add_parameters((1, win_size, emb_size, filter_size)) 115 | self.b_cnn = model.add_parameters((filter_size)) 116 | self.b_cnn.zero() 117 | 118 | def _cnn_emb(self, input_embs, training): 119 | # input_embs: (h, time_step, dim, batch_size), h=1 120 | if self.dropout_rate > 0 and training: 121 | input_embs = dy.dropout(input_embs, self.dropout_rate) 122 | W_cnn = dy.parameter(self.W_cnn) 123 | b_cnn = dy.parameter(self.b_cnn) 124 | 125 | cnn_encs = dy.conv2d_bias(input_embs, W_cnn, b_cnn, stride=(1, 1), is_valid=False) 126 | tanh_cnn_encs = dy.tanh(cnn_encs) 127 | max_pool_out = dy.reshape(dy.max_dim(tanh_cnn_encs, d=1), (self.filter_size,)) 128 | # rec_pool_out = dy.rectify(max_pool_out) 129 | return max_pool_out 130 | 131 | def encode(self, input_seqs, training=True, char=True): 132 | batch_size = len(input_seqs) 133 | sents_embs = [] 134 | if char: 135 | # we don't batch at first, we batch after cnn 136 | for sent in input_seqs: 137 | sent_emb = [] 138 | for w in sent: 139 | if len(w) < self.win_size: 140 | w += [self.paddding_token] * (self.win_size - len(w)) 141 | input_embs = dy.concatenate([dy.lookup(self.lookup_emb, c) for c in w], d=1) 142 | w_emb = self._cnn_emb(input_embs, training) # (filter_size, 1) 143 | sent_emb.append(w_emb) 144 | sents_embs.append(sent_emb) 145 | sents_embs, sents_mask = transpose_and_batch_embs(sents_embs, self.filter_size) # [(filter_size, batch_size)] 146 | else: 147 | for sent in input_seqs: 148 | if self.vocab_size != 0: 149 | if len(sent) < self.win_size: 150 | sent += [0] * (self.win_size - len(sent)) 151 | input_embs = dy.concatenate([dy.lookup(self.lookup_emb, w) for w in sent], d=1) 152 | else: 153 | # input_seqs: [(emb_size, batch_size)] 154 | if len(sent) < self.win_size: 155 | sent += [dy.zeros(self.emb_size)] * (self.win_size - len(sent)) 156 | input_embs = dy.transpose(dy.concatenate_cols(sent)) # (time_step, emb_size, bs) 157 | input_embs = dy.reshape(input_embs, (1, len(sent), self.emb_size), ) 158 | 159 | sent_emb = self._cnn_emb(input_embs, training) # (filter_size, 1) 160 | sents_embs.append(sent_emb) 161 | sents_embs = dy.reshape(dy.concatenate(sents_embs, d=1), (self.filter_size,), batch_size =batch_size) # (filter_size, batch_size) 162 | 163 | return sents_embs 164 | 165 | 166 | class BiRNN_Encoder(Encoder): 167 | def __init__(self, 168 | model, 169 | input_dim, 170 | hidden_dim, 171 | emb_dropout_rate=0.3, 172 | output_dropout_rate=0.5, 173 | padding_token=None, 174 | vocab_size=0, 175 | emb_size=0, 176 | layer=1, 177 | rnn="lstm", 178 | vocab_emb=None): 179 | Encoder.__init__(self) 180 | # self.birnn = dy.BiRNNBuilder(layer, input_dim, hidden_dim, model, dy.LSTMBuilder if rnn == "lstm" else dy.GRUBuilder) 181 | self.fwd_RNN = dy.LSTMBuilder(layer, input_dim, hidden_dim, model) if rnn == "lstm" else dy.GRUBuilder(layer, input_dim, hidden_dim, model) 182 | self.bwd_RNN = dy.LSTMBuilder(layer, input_dim, hidden_dim, model) if rnn == "lstm" else dy.GRUBuilder(layer, input_dim, hidden_dim, model) 183 | 184 | self.input_dim = input_dim 185 | self.vocab_size = vocab_size 186 | self.padding_token = padding_token 187 | self.drop_out_rate = output_dropout_rate 188 | self.emb_drop_rate = emb_dropout_rate 189 | self.hidden_dim = hidden_dim 190 | if vocab_size > 0: 191 | print "In BiRNN, creating lookup table!" 
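            # When vocab_size > 0 the encoder owns its own (vocab_size, emb_size)
            # lookup table; otherwise a table built elsewhere (e.g. the CNN char
            # lookup) may be passed in as vocab_emb, and encode_seq reshapes it
            # with dy.reshape before applying dy.pick_batch.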
192 | self.vocab_emb = model.add_lookup_parameters((vocab_size, emb_size)) 193 | else: 194 | if vocab_emb is not None: 195 | # assert vocab_emb is not None 196 | self.vocab_emb = vocab_emb 197 | else: 198 | self.vocab_emb = None 199 | 200 | def encode(self, input_seqs, training=True, char=False): 201 | if char: 202 | return self.encode_word(input_seqs, training=training) 203 | else: 204 | return self.encode_seq(input_seqs, training=training) 205 | 206 | def encode_seq(self, input_seqs, training=True, char=False): 207 | if self.vocab_emb is not None: 208 | # input_seqs = [[w1, w2],[]] 209 | transpose_inputs, _ = transpose_input(input_seqs, self.padding_token) 210 | if self.vocab_size != 0: 211 | w_embs = [dy.dropout(dy.lookup_batch(self.vocab_emb, wids), 212 | self.emb_drop_rate) if self.emb_drop_rate > 0. and training 213 | else dy.lookup_batch(self.vocab_emb, wids) 214 | for wids in transpose_inputs] 215 | else: 216 | # print "In BiRNN, reusing lookup table!" 217 | # print "In our case, use parameters shared by CNN char encoder, need conversion!" 218 | vocab_emb = dy.parameter(self.vocab_emb) 219 | vocab_size = vocab_emb.dim()[0][-1] 220 | # print "In BiRNN Char vocab size: ", vocab_size 221 | vocab_emb = dy.reshape(vocab_emb, (self.input_dim, vocab_size)) # expression, not lookup_parameters 222 | 223 | # for wids in transpose_inputs: 224 | # print wids 225 | # print vocab_emb.dim() 226 | # a = dy.pick_batch(vocab_emb, wids, dim=1) 227 | # print a.value() 228 | # Special case handler: use pick_batch 229 | w_embs = [dy.dropout(dy.pick_batch(vocab_emb, wids, dim=1), 230 | self.emb_drop_rate) if self.emb_drop_rate > 0. and training 231 | else dy.pick_batch(vocab_emb, wids, dim=1) 232 | for wids in transpose_inputs] 233 | # print "In BiRNN char: ", w_embs[0].dim() 234 | else: 235 | w_embs = [dy.dropout(emb, self.emb_drop_rate) if self.emb_drop_rate > 0. and training else emb for emb in input_seqs] 236 | # if vocab_size = 0: input_seqs = [(input_dim, batch_size)] 237 | 238 | w_embs_r = w_embs[::-1] 239 | # birnn_outputs = [dy.dropout(emb, self.drop_out_rate) if self.drop_out_rate > 0. 
else emb for emb in self.birnn.transduce(w_embs)] 240 | fwd_vectors = self.fwd_RNN.initial_state().transduce(w_embs) 241 | bwd_vectors = self.bwd_RNN.initial_state().transduce(w_embs_r)[::-1] 242 | 243 | if char: 244 | return dy.concatenate([fwd_vectors[-1], bwd_vectors[0]]) 245 | 246 | birnn_outputs = [dy.dropout(dy.concatenate([fwd_v, bwd_v]), self.drop_out_rate) if self.drop_out_rate > 0.0 and training 247 | else dy.concatenate([fwd_v, bwd_v]) 248 | for (fwd_v, bwd_v) in zip(fwd_vectors, bwd_vectors)] 249 | return birnn_outputs 250 | 251 | def encode_word(self, input_seqs, training=True): 252 | # embedding dropout rate is 0.0, because we dropout at the later stage of RNN 253 | sents_embs = [] 254 | 255 | for sent in input_seqs: 256 | sent_emb = [] 257 | for w in sent: 258 | w_emb = self.encode_seq([w], training=training, char=True) 259 | sent_emb.append(w_emb) 260 | sents_embs.append(sent_emb) 261 | sents_embs, sents_mask = transpose_and_batch_embs(sents_embs, self.hidden_dim*2) # [(hidden_dim*2, batch_size)] 262 | return sents_embs -------------------------------------------------------------------------------- /utils/Convert_Output_Darpa.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import codecs 4 | 5 | 6 | def run_program(input, output, setEconll): 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') 9 | if input is not None and setEconll is not None: 10 | with codecs.open(input, 'r', encoding='utf-8', errors='ignore') as input_file: 11 | lines = input_file.readlines() 12 | tags = [] 13 | for i, line in enumerate(lines): 14 | if len(line) >= 2: 15 | line_split = line.strip().split() 16 | # sys.stderr.write('line: ' + line.strip() + '\n') 17 | # sys.stderr.flush() 18 | assert len(line_split) == 4 19 | tags.append(line_split[-1]) 20 | 21 | output_lines = lines 22 | 23 | with codecs.open(setEconll, 'r',encoding='utf-8', errors='ignore') as input_file: 24 | lines = input_file.readlines() 25 | assert len(output_lines) == len(lines) 26 | with codecs.open(output,'w',encoding='utf-8') as output_file: 27 | ctr = -1 28 | for line in lines: 29 | if len(line) > 2: 30 | ctr += 1 31 | line_split = line.strip().split() 32 | assert len(line_split) == 10 33 | # print '\t'.join(line_split) + '\t' + tags[ctr] 34 | output_file.write('\t'.join(line_split) + '\t' + tags[ctr] +"\n") 35 | else: 36 | # print "" 37 | output_file.write("\n") 38 | assert ctr + 1 == len(tags) 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--input", type=str, default=None) 43 | parser.add_argument("--setEconll", type=str, default=None) 44 | parser.add_argument("--output", type=str, default=None) 45 | args = parser.parse_args() 46 | run_program(args.input, args.output, args.setEconll) 47 | -------------------------------------------------------------------------------- /utils/Convert_to_darpa_xml.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import codecs 3 | import argparse 4 | 5 | 6 | def print_entities(fout,entities, curr_docum, curr_anot): 7 | # print 'CMU_NER_LOREAL_CP1_TB_GS' + '\t' + curr_docum + '-ann-' + str(curr_anot) + '\t' + ' '.join( 8 | # entities[0]) + '\t' + curr_docum + ':' + str(entities[2]) + '-' + str(entities[3]) + '\t' + 'NIL' + '\t' + \ 9 | # entities[1] + '\t' + 'NAM' + '\t' + '1.0' 10 | fout.write('CMU_NER_LOREAL_CP1_TB_GS' + '\t' + curr_docum + '-ann-' + str(curr_anot) + '\t' + ' '.join( 11 | entities[0]) + '\t' + 
curr_docum + ':' + str(entities[2]) + '-' + str(entities[3]) + '\t' + 'NIL' + '\t' + \ 12 | entities[1] + '\t' + 'NAM' + '\t' + '1.0' + "\n") 13 | 14 | 15 | def run_program_darpa(input, output): 16 | reload(sys) 17 | sys.setdefaultencoding('utf-8') 18 | if input is not None and output is not None: 19 | with codecs.open(input, encoding='utf-8', mode='r') as input_file: 20 | lines = input_file.readlines() 21 | 22 | entities = [[], None, -1, -1] 23 | in_entity = False 24 | curr_docum = None 25 | curr_anot = 1 26 | fout = codecs.open(output,'w',encoding='utf-8') 27 | for i, line in enumerate(lines): 28 | if len(line) > 2: 29 | # print 'Line number: ' + str(i + 1) + '\n' 30 | #sys.stderr.flush() 31 | line_split = line.strip().split() 32 | if curr_docum != line_split[3]: 33 | curr_docum = line_split[3] 34 | curr_anot = 1 35 | # print '' 36 | if len(line_split) != 11: 37 | print line 38 | print 'Error in line: ' + str(i + 1) + '\n' 39 | assert len(line_split) == 11 40 | if line_split[-1][0] == 'B': 41 | if in_entity: 42 | print_entities(fout, entities, curr_docum, curr_anot) 43 | # restart 44 | entities[0] = [] 45 | entities[1] = None 46 | entities[2] = -1 47 | entities[3] = -1 48 | curr_anot += 1 49 | in_entity = False 50 | else: 51 | assert len(entities[0]) == 0 and entities[1] is None and entities[2] == -1 and entities[3] == -1 52 | assert not (in_entity) 53 | in_entity = True 54 | assert line_split[-1][1] == '-' 55 | entities[0].append(line_split[0]) 56 | entities[1] = ''.join(line_split[-1][2:]) 57 | entities[2] = int(line_split[-5]) 58 | entities[3] = int(line_split[-4]) 59 | elif line_split[-1][0] == 'I': 60 | # print 'line num: ' + str(i + 1) + '\n' 61 | 62 | if not in_entity or (len(entities[0]) > 0 and line_split[-1][2:] != entities[1]):# when first tag is I-PER treat it as B-PER 63 | in_entity = True 64 | entities[0].append(line_split[0]) 65 | entities[1] = ''.join(line_split[-1][2:]) 66 | entities[2] = int(line_split[-5]) 67 | entities[3] = int(line_split[-4]) 68 | 69 | else: 70 | assert in_entity and len(entities[0]) > 0 and not (entities[0] is None) and ''.join( 71 | line_split[-1][2:]) == entities[1] and entities[2] >= 0 and entities[3] >= 0 72 | entities[0].append(line_split[0]) 73 | assert entities[2] >= 0 74 | assert int(line_split[-4]) > entities[3] 75 | entities[3] = int(line_split[-4]) 76 | elif line_split[-1][0] == 'O': 77 | if in_entity: 78 | print_entities(fout,entities, curr_docum, curr_anot) 79 | entities[0] = [] 80 | entities[1] = None 81 | entities[2] = -1 82 | entities[3] = -1 83 | curr_anot += 1 84 | in_entity = False 85 | else: 86 | if in_entity: 87 | # print 'We are in an entity and met sentence boundary, line: ' + str(i + 1) + '\n' 88 | print_entities(fout, entities, curr_docum, curr_anot) 89 | entities[0] = [] 90 | entities[1] = None 91 | entities[2] = -1 92 | entities[3] = -1 93 | curr_anot += 1 94 | in_entity = False 95 | 96 | 97 | if __name__ == "__main__": 98 | parser = argparse.ArgumentParser() 99 | parser.add_argument("--input", type=str, default=None) 100 | parser.add_argument("--output", type=str, default=None) 101 | args = parser.parse_args() 102 | run_program_darpa(args.input, args.output) 103 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/cmu-ner/d35d57fe453d81cc98e3ee55bac58f9ca618f59b/utils/__init__.py 
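For orientation, Convert_to_darpa_xml.py above writes one tab-separated mention row per B-/I- span. A minimal sketch of that row layout (the document ID, offsets, and mention text below are hypothetical; only the field order mirrors print_entities):

    fields = [
        'CMU_NER_LOREAL_CP1_TB_GS',  # system ID, as in print_entities
        'DOC_001-ann-1',             # hypothetical document ID + running annotation counter
        'Abebe Girma',               # mention tokens joined by spaces
        'DOC_001:17-27',             # hypothetical start-end character offsets
        'NIL',                       # no KB link
        'PER',                       # entity type taken from the B-/I- tag
        'NAM',                       # named mention
        '1.0',                       # confidence
    ]
    print('\t'.join(fields))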
-------------------------------------------------------------------------------- /utils/extract_authors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import codecs 3 | import xml.etree.ElementTree as ET 4 | import sys 5 | 6 | def extract_authors(dir_name, output_fname): 7 | author_set = set() 8 | for fname in os.listdir(dir_name): 9 | fin_name = os.path.join(dir_name, fname) 10 | if os.path.isfile(fin_name): 11 | fs = fname.split('_') 12 | if fs[1] != "WL": 13 | continue 14 | print fname 15 | tree = ET.parse(fin_name) 16 | root = tree.getroot() 17 | # elems = root.findall(".//*[@type='post']/[@name='author']") 18 | elems = root.findall(".//*[@type='post']/attribute") 19 | for elem in elems: 20 | if elem.get('name') == u'author': 21 | author = elem.get(u'value') 22 | author_set.add(author) 23 | 24 | with codecs.open(output_fname, "w", "utf-8") as fout: 25 | for elem in author_set: 26 | fout.write(elem + '\n') 27 | 28 | if __name__ == "__main__": 29 | dname = sys.argv[1] 30 | fout_name = sys.argv[2] 31 | extract_authors(dname, fout_name) -------------------------------------------------------------------------------- /utils/features.py: -------------------------------------------------------------------------------- 1 | from utils.segnerfts import segnerfts 2 | import codecs 3 | 4 | 5 | def get_feature_sent(lang, sent, args): 6 | if args.use_gazatter and args.use_morph: 7 | return segnerfts.extract(lang, sent) 8 | elif args.use_gazatter: 9 | return segnerfts.extract_type_token_gaz(lang, sent) 10 | elif args.use_morph: 11 | return segnerfts.extract_type_token_morph(lang, sent) 12 | else: 13 | return segnerfts.extract_type_token_level(lang, sent) 14 | 15 | 16 | def get_brown_cluster(path): 17 | bc_dict = dict() 18 | linear_map = dict() 19 | with codecs.open(path, "r", "utf-8") as fin: 20 | for line in fin: 21 | fields = line.strip().split('\t') 22 | if len(fields) == 3: 23 | word = fields[1] 24 | binary_string = fields[0] 25 | bid = int(binary_string, 2) 26 | if bid not in linear_map: 27 | linear_map[bid] = len(linear_map) 28 | bc_dict[word] = linear_map[bid] 29 | return bc_dict -------------------------------------------------------------------------------- /utils/old_segnerfts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | import regex as re 6 | 7 | 8 | LONG_TOKEN_THRESH = 8 9 | 10 | 11 | def ex_capitalized(ws): 12 | return [w[0].isupper() for w in ws] 13 | 14 | 15 | def ex_all_uppercased(ws): 16 | return [all(x.isupper() for x in w) for w in ws] 17 | 18 | 19 | def ex_mixed_case(ws): 20 | def mixed_case(w): 21 | noninit = [x.isupper() for x in w[1:]] 22 | return True in noninit and False in noninit 23 | return map(mixed_case, ws) 24 | 25 | 26 | def ex_internal_period(ws): 27 | return [len(w) > 2 and '.' 
in w[1:-1] for w in ws] 28 | 29 | 30 | def ex_non_letter(ws): 31 | return [bool(re.search(r'[^\p{Letter}\p{Mark}]', w)) for w in ws] 32 | 33 | 34 | def ex_digits(ws): 35 | return [bool(re.search(r'[\p{Number}]', w)) for w in ws] 36 | 37 | 38 | def ex_long_token(ws): 39 | return [len(w) > LONG_TOKEN_THRESH for w in ws] 40 | 41 | 42 | def ex_contains_latin(ws): 43 | return [bool(re.search(r'\p{Latin}', w)) for w in ws] 44 | 45 | 46 | def ex_contains_ethiopic(ws): 47 | return [bool(re.search(r'\p{Ethiopic}', w)) for w in ws] 48 | 49 | 50 | ex_title = { 51 | 'eng': lambda ws: [False] + [w in { 52 | 'Mister', 53 | 'Mr.', 54 | 'Mr', 55 | 'Misses', 56 | 'Mrs.', 57 | 'Mrs', 58 | 'Miss', 59 | 'Ms.', 60 | 'Ms', 61 | 'Doctor', 62 | 'Dr.', 63 | 'Dr', 64 | 'Professor', 65 | 'Prof.', 66 | 'Prof', 67 | 'Father', 68 | 'Fr.', 69 | 'Fr', 70 | 'Reverend', 71 | 'Rev.', 72 | 'Rev', 73 | 'Revd', 74 | 'Pastor', 75 | 'Bishop', 76 | 'Bp.', 77 | 'Bp', 78 | 'President', 79 | 'Pres.', 80 | 'Representative', 81 | 'Rep.', 82 | 'Rep', 83 | 'Congressman', 84 | 'Congresswoman', 85 | 'Congressperson', 86 | 'Senator', 87 | 'Sen.', 88 | 'Sen', 89 | 'Secretary', 90 | 'Sec.', 91 | 'Sec', 92 | 'Lord', 93 | 'Lady', 94 | 'Justice', 95 | 'Sheriff', 96 | 'Principal', 97 | 'Mayor', 98 | } for w in ws[:-1]], 99 | 'deu': lambda ws: [False] + [w in { 100 | 'Herr', 101 | 'Hr.', 102 | 'Frau', 103 | 'Fr.', 104 | 'Fraulein', 105 | 'Frl.', 106 | 'Doktor', 107 | 'Dr.', 108 | 'Dr.med.', 109 | 'Dr.phil.', 110 | 'Dr.rer.nat.', 111 | 'Dr.jur.', 112 | 'Dr.theol.', 113 | 'Professor', 114 | 'Prof.', 115 | 'a.o.Prof.', 116 | 'o.Pr.', 117 | 'Dozent', 118 | 'Doz.', 119 | 'Richter', 120 | 'Senator', 121 | 'Sen.', 122 | 'Ministerpräsident', 123 | 'Ministerpräsidentin', 124 | 'Bürgermeister', 125 | 'Abgeordenete', 126 | 'Abg.', 127 | 'Bundeskanzler', 128 | 'Landeshauptmann', 129 | 'Kaiser', 130 | 'Kaiserin', 131 | 'König', 132 | 'Königin', 133 | 'Kurfürst', 134 | 'Kurfürstin', 135 | 'Erzherzog', 136 | 'Erzherzogin', 137 | 'Großherzog', 138 | 'Großherzogin', 139 | 'Großfürst', 140 | 'Großfürstin', 141 | 'Herzog', 142 | 'Herzogin', 143 | 'Pfalzgraf', 144 | 'Pfalzgräfin', 145 | 'Markgraf', 146 | 'Markgräfin', 147 | 'Landgraf', 148 | 'Landgräfin', 149 | 'Reichsfürst', 150 | 'Reichsfürstin', 151 | 'Reichsgraf', 152 | 'Reichsgräfin', 153 | 'Burggraf', 154 | 'Burggräfin', 155 | 'Altgraf', 156 | 'Altgräfin', 157 | 'Reichsfreiherr', 158 | 'Reichsfreifrau', 159 | 'Reichsfreiin', 160 | 'Reichsritter', 161 | 'Ritter', 162 | 'Graf', 163 | 'Gräfin', 164 | 'Edler', 165 | 'Edle', 166 | 'Freifrau', 167 | 'Frfr.', 168 | 'Freiherr', 169 | 'Frhr.', 170 | 'Hochwürden', 171 | 'Pater', 172 | 'Pfarrer', 173 | 'Pastor', 174 | 'P.', 175 | 'Pfarrhelfer', 176 | 'Kaplan', 177 | 'Vikar', 178 | 'Dekan', 179 | 'Bischof', 180 | 'Kapitän', 181 | 'Kpt.', 182 | 'Leutnant', 183 | 'Lt.', 184 | 'Vorsitzender', 185 | 'Vors.', 186 | } for w in ws[:-1]], 187 | 'amh': lambda ws: [False] + [w in { 188 | 'አቶ', # Mr. 
189 | 'ወይዘሮ', 190 | 'ወይዘሪት', 191 | 'ፕሮፌሰር', 192 | 'ፕሬዚዳንት', 193 | 'ፐሬዝዳንት', 194 | 'ፕሬዝዳንት', 195 | 'ኮለኔል', 196 | 'ጄኔራል', 197 | 'አቡነ', 198 | 'ቀስ', 199 | 'ሰላም', 200 | 'ሼኽ', 201 | 'ራስ', 202 | 'ቢትወደድ', 203 | 'ወ/ሮ', 204 | 'ወ/ሪት', 205 | 'ድ/ር', 206 | 'ፕ/ር', 207 | 'ፕ/ት', 208 | 'ኮ/ል', 209 | 'ጄ/ል', 210 | 'ሼኽ', 211 | 'ራስ', 212 | 'ቢትወደድ', 213 | 'አዛዥና', 214 | 'ልዑል', 215 | 'ሚኒስቴር', 216 | 'ዕድሜው', 217 | 'ወታደር', 218 | 'ም/ል', 219 | 'ጸሃፊ', 220 | 'ረዳት', 221 | 'ጸሐፊ', 222 | 'አምባሳደር', 223 | 'አስተዳዳሪ', 224 | 'ሪፖርተራችን', 225 | } for w in ws[:-1]], 226 | 'orm': lambda ws: [False] + [w.lower() in { 227 | 'obbo', # Mister 228 | 'obboo', # Mister 229 | 'obo', # Mister 230 | 'abbaa', # Father 231 | 'aba', 232 | 'ministeeraa', # Minister 233 | } for w in ws[:-1]], 234 | 'tir': lambda ws: [False] + [w in { 235 | 'ኣቶ', # Mister_1 236 | 'ጐይታይ', # Mister_2 237 | 'ሓላፊ', # President_1 238 | 'ሓለቓ', # President_2 239 | 'ወዘተ', # President_3 240 | 'ፕረሲደንት', # President_4 241 | 'ፕሬዝዳንት', # President_5 242 | 'ኣቦ', # Father 243 | } for w in ws[:-1]], 244 | 'som': lambda ws: [w in {} for w in ws], 245 | } 246 | 247 | 248 | ex_head_org = { 249 | 'eng': lambda ws: [w in { 250 | 'Ministry', 251 | 'Department', 252 | 'Agency', 253 | 'Bureau', 254 | 'Company', 255 | 'Corporation', 256 | 'Inc.', 257 | 'Inc', 258 | 'Corp.', 259 | 'Corp', 260 | 'Authority', 261 | 'Organization', 262 | 'Organisation', 263 | 'Committee', 264 | 'Bank', 265 | } for w in ws], 266 | 'deu': lambda ws: [w in { 267 | 'Amt', 268 | 'Ministerium', 269 | 'Agentur', 270 | 'Büro', 271 | 'Organisation', 272 | 'Abteilung', 273 | 'Abt.', 274 | 'Aktiengesellschaft', 275 | 'AG', 276 | 'Union', 277 | 'Genossenschaft', 278 | 'Gen.', 279 | 'Gesellschaft', 280 | 'GmbH', 281 | 'HTL', 282 | 'Regierung', 283 | 'Verband', 284 | 'Kommission', 285 | 'Bank', 286 | } for w in ws], 287 | 'amh': lambda ws: [w in { 288 | 'ሚኒስቴር', 289 | 'ኤጀንሲ', 290 | 'ኮሚሽን', 291 | 'ኮርፖሬሽን', # corporation 292 | 'ድርጅት', 293 | 'ባለሥልጣን', 294 | 'ባንክ', 295 | 'ቢሮ', 296 | 'ኮሚቴ', 297 | 'ኮርፖሬሽን', 298 | 'ምንጮች', 299 | 'ፓርቲ', # party 300 | 'ፓርቲን', # party_2 301 | 'ጋዜጣ', # newpaper 302 | } for w in ws], 303 | 'orm': lambda ws: [w.lower() in { 304 | 'ministirii', # Ministry 305 | 'ministiri', 306 | 'damiyyaa', # Department 307 | 'damiyya', 308 | 'wakkiila', # Agency 309 | 'wakila', 310 | 'dhaabbata', # Organization 311 | 'dhabata', 312 | 'koree', # Committee 313 | 'kore', 314 | 'baankii', # Bank 315 | 'banki', 316 | 'waldaa', # Society 317 | 'walda', 318 | 'waraanni', # Front 319 | 'warnani', 320 | } for w in ws], 321 | 'tir': lambda ws: [w in { 322 | 'ክፍሊ', # Department_1 323 | 'ጨንፈር', # Department_2 324 | 'ዋኒን', # Agency_1 325 | 'ተግባር', # Agency_2 326 | 'ስርሒት', # Agency_3 327 | 'ኤጄንሲ', # Agency_4 328 | 'ሰደቓ', # Bureau 329 | 'ኮርፖረሽን', # Corporation 330 | 'ውድብ', # Organization_1 331 | 'ኣወዳድባ', # Organization_2 332 | 'ኣመሰራርታ', # Organization_3 333 | 'ኮመት', # Committee_1 334 | 'ሽማግለ', # Committee_2 335 | 'ሰራዊት', # Army 336 | 'ስርዓት', # Regime 337 | } for w in ws], 338 | 'som': lambda ws: [w.lower() in { 339 | 'dowladda', # government 340 | 'maamulka', # administration 341 | 'xafiiska', # office 342 | 'wasaaradda', # ministry 343 | 'hay\'adda', # agency 344 | 'shirkadda', # corporation 345 | 'saacadaha', # organization 346 | 'guddi', # board 347 | 'bankiga', # bank 348 | 'ciidamada', # army 349 | 'kooxda', # faction 350 | 'shabakada', # network 351 | } for w in ws], 352 | } 353 | 354 | 355 | ex_head_loc = { 356 | 'eng': lambda ws: [w in { 357 | 'Island', 358 | 'Lake', 359 | 'River', 360 | 'Sea', 361 | 'Ocean', 362 | 'Mountain', 363 
| 'Mountains', 364 | 'Valley', 365 | 'Bay', 366 | 'Mosque', 367 | 'Cathedral', 368 | 'Church', 369 | } for w in ws], 370 | 'deu': lambda ws: [any([ 371 | re.search('[Bb]erg$', w), 372 | re.search('[Gg]ebirge$', w), 373 | re.search('[Ss]ee$', w), 374 | re.search('[Mm]eer$', w), 375 | re.search('[Oo]zean$', w), 376 | re.search('[Tt]al$', w), 377 | re.search('wald$', w), 378 | re.search('[Bb]ucht$', w), 379 | re.search('[Kk]irche$', w), 380 | re.search('[Mm]oschee$', w), 381 | ]) for w in ws], 382 | 'amh': lambda ws: [w in { 383 | 'ደሴት', 384 | 'ሐይክ', 385 | 'ወንዝ', 386 | 'ባህር', 387 | 'ወቅያኖስ', 388 | 'ተራራ', 389 | 'ሸለቆ', 390 | 'ሰፈር', 391 | 'ወሽመጥ', 392 | 'መስጊድ', 393 | 'ሀገር', 394 | 'ሆስፒታል', # hospital 395 | } for w in ws], 396 | 'orm': lambda ws: [w.lower() in { 397 | 'odoola', # Island 398 | 'odola', 399 | 'odoolota', # Islands 400 | 'odolota', 401 | 'calalaqa', # Lake_1 402 | 'dabbal', # Lake_2 403 | 'dabal', 404 | 'hara', # Lake_3 405 | 'laaqii', # Lake_4 406 | 'laqi', 407 | 'lagaa', # River 408 | 'laga', 409 | 'garba', # Sea 410 | 'maanya', # Ocean 411 | 'manya', 412 | 'gooroo', # Mountains 413 | 'goro', 414 | 'gaara', # Mountain 415 | 'sulula', # Valley 416 | 'bataskaana', # Church 417 | 'masqiida', # Mosque 418 | } for w in ws], 419 | 'tir': lambda ws: [w in { 420 | 'ደሴት', # Island_1 421 | 'ግሉል', # Island_2 422 | 'ብሕቱው', # Island_3 423 | 'ቀላይ', # Lake_1 424 | 'ወይናይ', # Lake_2 425 | 'ፈለግ', # River 426 | 'ባሕሪ', # Sea 427 | 'ሰፊሕ', # Ocean 428 | 'ጎቦ', # Mountain_1 429 | 'እምባ', # Mountain_2 430 | 'ሩባ', # Valley_1 431 | 'ለሰ', # Valley_2 432 | 'ሕሉም', # Valley_3 433 | 'ስንጭሮ', # Valley_4 434 | 'በተኽስያን', # Church 435 | 'መስጊድ', # Mosque 436 | } for w in ws], 437 | 'som': lambda ws: [w.lower() in { 438 | 'jasiirad', # island 439 | 'harada', # lake 440 | 'buurta', # mountain 441 | 'dooxada', # valley 442 | 'badweynta', # ocean 443 | 'webiga', # river 444 | 'masaajid', # mosque 445 | 'hoteel', # hotel 446 | 'hotelka', # hotel 447 | 'hotel', # hotel 448 | 'degmada', # district 449 | 'deegaanka', # district 450 | } for w in ws], 451 | } 452 | 453 | 454 | ex_head_gpe = { 455 | 'eng': lambda ws: [w in { 456 | 'District', 457 | 'Zone', 458 | 'Region', 459 | 'Province', 460 | 'Division', 461 | 'Republic', 462 | 'Nation', 463 | 'City', 464 | 'Town', 465 | 'Village', 466 | 'State', 467 | } for w in ws], 468 | 'deu': lambda ws: [any([ 469 | re.search('[rR]epublik$', w), 470 | re.search('land$', w), 471 | re.search('stan$', w), 472 | re.search('[sS]tadt$', w), 473 | re.search('heim$', w), 474 | re.search('dorf$', w), 475 | re.search('hausen$', w), 476 | re.search('burg$', w), 477 | re.search('berg$', w), 478 | re.search('gau$', w), 479 | re.search('[pP]rovinz$', w) 480 | ]) for w in ws], 481 | 'amh': lambda ws: [w in { 482 | 'ከተማ', 483 | 'መንደር', 484 | 'ቀበሌ', 485 | 'ወረዳ', 486 | 'ዞን', 487 | 'ክልል', 488 | 'አውራጃ', 489 | 'መንግስት', 490 | 'ክፍላት', 491 | 'ጦር', 492 | 'ዙሪያ', 493 | 'ላይ', 494 | 'ተከማ', # town 495 | } for w in ws], 496 | 'orm': lambda ws: [w.lower() in { 497 | 'koonyaa', # District_1 498 | 'konya', 499 | 'aanaa', # District_2 500 | 'ana', 501 | 'goltaa', # Zone_1 502 | 'golta', 503 | 'godina', # Zone_2 504 | 'naannoo', # Region 505 | 'nano', 506 | 'jamuriyaa', # Republic_1 507 | 'jamuriya', 508 | 'republika', # Republic_2 509 | 'magaalaa', # City 510 | 'magala', 511 | 'magaalaan', 512 | 'magalan', 513 | 'daabbaa', # Town 514 | 'daba', 515 | 'dira', # Big Town 516 | 'gandaa', # Village 517 | 'ganda', 518 | 'mootummaa', 519 | 'motuma', 520 | } for w in ws], 521 | 'tir': lambda ws: [w in { 522 | 'ወረዳ', # District 523 | 
'ዞባ', # Zone 524 | 'ከተማ', # City 525 | 'ዞና', # Region 526 | 'መንግስቲ', # State 527 | 'ኣውራጃ', # Prefecture/Province 528 | 'ረፑብሊክ', # Republic 529 | 'ከተማ', # City 530 | 'ገጠር', # Village_1 531 | 'ቁሸት', # Village_2 532 | 'ዓዲ', # Village_3 533 | } for w in ws], 534 | 'som': lambda ws: [w.lower() in { 535 | 'dalka', # country 536 | 'dalalka', # country 537 | 'gobolka', # province, state 538 | 'magaalada', # city 539 | 'tuulo', # village 540 | 'jamhuuriyadda', # republic 541 | } for w in ws], 542 | } 543 | 544 | 545 | ex_prep_from = { 546 | 'eng': lambda ws: [w.lower() == 'from' for w in ws], 547 | 'deu': lambda ws: [w.lower() in {'von', 'vom'} for w in ws], 548 | 'amh': lambda ws: [bool(re.match('ከ', w)) for w in ws], 549 | 'orm': lambda ws: [w.lower() in {'irraa', 'ira'} for w in ws], 550 | 'tir': lambda ws: [w in {'ካብ'} for w in ws], 551 | 'som': lambda ws: [w in {'ilaa'} for w in ws], 552 | } 553 | 554 | 555 | ex_prep_in = { 556 | 'eng': lambda ws: [w.lower() == 'in' for w in ws], 557 | 'deu': lambda ws: [w.lower() in {'in', 'im'} for w in ws], 558 | 'amh': lambda ws: [bool(re.match('በ', w)) for w in ws], 559 | 'orm': lambda ws: [w.lower() in {'keessa', 'kesa', 'itti', 'iti'} for w in ws], 560 | 'tir': lambda ws: [w in {'ኣብ'} for w in ws], 561 | 'som': lambda ws: [w in {'ee'} for w in ws], 562 | } 563 | 564 | 565 | extractors = [ 566 | lambda lang: ex_capitalized, 567 | lambda lang: ex_all_uppercased, 568 | lambda lang: ex_mixed_case, 569 | lambda lang: ex_internal_period, 570 | lambda lang: ex_non_letter, 571 | lambda lang: ex_digits, 572 | lambda lang: ex_long_token, 573 | lambda lang: ex_contains_latin, 574 | lambda lang: ex_contains_ethiopic, 575 | lambda lang: ex_title[lang], 576 | lambda lang: ex_head_org[lang], 577 | lambda lang: ex_head_loc[lang], 578 | lambda lang: ex_head_gpe[lang], 579 | lambda lang: ex_prep_from[lang], 580 | lambda lang: ex_prep_in[lang], 581 | ] 582 | 583 | 584 | TYPE_START, TYPE_END = 0, 9 585 | TOKEN_START, TOKEN_END = 9, 15 586 | 587 | 588 | def extract(lang, seg): 589 | fts = zip(*[ex(lang)(seg) for ex in extractors]) 590 | return [map(int, f) for f in fts] 591 | 592 | 593 | def extract_type_level(lang, seg): 594 | fts = extract(lang, seg) 595 | return [v[TYPE_START:TYPE_END] for v in fts] 596 | 597 | 598 | def extract_token_level(lang, seg): 599 | fts = extract(lang, seg) 600 | return [v[TOKEN_START:TOKEN_END] for v in fts] 601 | 602 | 603 | def extractIndicatorFeatures(lang, seg): 604 | fts = extract(lang, seg) 605 | return fts 606 | 607 | if __name__ == "__main__": 608 | seg = [u'\u121d\u12dd\u1263\u12d5', u'\u12a3\u12e8\u122d', u'-', u'\u12f6\u1265', u'\u12a3\u120d\u1266', u'\u12c8\u1325\u122a', u'\u12d3\u1208\u121d'] 609 | b = extract("tir", seg) 610 | print(b) -------------------------------------------------------------------------------- /utils/orm_morph.py: -------------------------------------------------------------------------------- 1 | def best_parse(a): 2 | return "www" -------------------------------------------------------------------------------- /utils/orm_norm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/cmu-ner/d35d57fe453d81cc98e3ee55bac58f9ca618f59b/utils/orm_norm/__init__.py -------------------------------------------------------------------------------- /utils/orm_norm/orm_gaz.txt: -------------------------------------------------------------------------------- 1 | Abadula Gemeda Abbaaduulaa Gammadaa 2 | Abalti Mountains Gooroo Abbaltii 3 | Abbalti 
Abbaltii 4 | Abbaya Abbaya 5 | Abbaya Abbaayaa 6 | Abomsa Abboomsaa 7 | Abyssinia Habashaa 8 | Adabba Adaabbaa 9 | Adama Adaamaa 10 | Adama Hadaamaa 11 | Adami Tullu Adaamii Tulluu 12 | Addalle Addallee 13 | Addelle Addellee 14 | Addis Ababa Finfinne 15 | Addis Ababa Shaggar 16 | Adoolaa Adoolaa 17 | Adulala Adulaala 18 | Afan Oromo (language) Afaan Oromoo 19 | Afar Region Naannoo Affaar 20 | Afghanistan Afgaanistaan 21 | Africa Afrikaa 22 | Africa Aafrikaa 23 | Africa Afrikaa 24 | Agafra Agafraa 25 | Aggaro Aggaaroo 26 | Aji Ajjee 27 | Ajje Ajjee 28 | Akaki Aqaaqii 29 | Akaki Basaka Aqaaqii Basaqaa 30 | Alaltu Alaltuu 31 | Albania Albaaniyaa 32 | Alexandria Iskindiriyaa 33 | Algeria Aljeeriyaa 34 | Ambo Amboo 35 | Ambo Ambo 36 | Ambo University Yuunibarsiitii Amboo 37 | Amhara Region Naannoo Amaaraa 38 | Amigna Amiinyaa 39 | Amnesty International Amnistii Internaashinaal 40 | Andorra Andooraa 41 | Angar Angar 42 | Angettu Angeettuu 43 | Angola Angoolaa 44 | Antarctic Antaartikaa 45 | Antarctic Ocean Maanya Kibbacabbii 46 | Antigua and Barbuda Antiguwaa fi Barbuda 47 | Arba Bordode Arba Bordodee 48 | Arba Gugu Mountains Gooroo Arba Guguu 49 | Arboyye Arbooyyee 50 | Arctic Ocean Maanya Kaabacabbii 51 | Arero Areeroo 52 | Argentina Arjantiinaa 53 | Arjo Arjoo 54 | Armenia Armeeniyaa 55 | Arsi Arsii 56 | Arsi Zone Arsii 57 | Arsi Zone Godina Arsii 58 | Asabot Asaboot 59 | Asalla Asallaa 60 | Asandabo Asandaaboo 61 | Asasa Asaasa 62 | Aseko Asako 63 | Asgori Abebe Asgorii Abeebee 64 | Asgori Bacho Asgorii Bachoo 65 | Asha Ashaa 66 | Asia Eshiyaatti 67 | Asia Asiyaa 68 | Asosa Asaasaa 69 | Atlanta Atlaantaa 70 | Australia Awustiraaliyaa 71 | Australia Awustaraaliyaa 72 | Austria Ostiriyaa 73 | Awaash Malkasa Awaash Malkaasaa 74 | Awadai Awaday 75 | Awash Awaash 76 | Awash Baldho Awaash Baldhoo 77 | Awash Kunture Awaash Qunxuree 78 | Ayira Ayiraa 79 | Azerbaijan Azarbajaan 80 | Baabbile Baabbilee 81 | Babbicha Baabbicha 82 | Babylon Baabiloon 83 | Badda Kerro Baddaa Qeerroo 84 | Badda Rogge Baddaa Roggee 85 | Baddalle Baddallee 86 | Baddanno Baddannoo 87 | Baddessa Baddeessaa 88 | Bahamas Bahaamaa 89 | Bahir Dar Baahir Daar 90 | Bahrain Baahireen 91 | Bakke Bakkee 92 | Bakko Baakkoo 93 | Bako Tibe Baakkoo Tibbee 94 | Bale Baale 95 | Bale (zone) Baale 96 | Bale (zone) Godina Baale 97 | Bambasi Baambasii 98 | Bangladesh Baangiladeesh 99 | Bantu Baantuu 100 | Barbados Barbaadoos 101 | Barcelona Barseloonaa 102 | Baro Baaroo 103 | Bata (town in Equatorial Guinea) Bataa 104 | Bati Baatii 105 | Batu Baatuu 106 | Becho Bacho 107 | Begi Beegii 108 | Belarus Belarus 109 | Belgium Beeljigii 110 | Belize Beliz 111 | Bench Maji Beenchii Maajii 112 | Benin Bineen 113 | Benishangul-Gumuz (region) Benishangul-Gumuzii 114 | Bhutan Butaan 115 | Bijata Abijaataa 116 | Bila Biilaa 117 | Birbir Birbir 118 | Bishoftu Bishooftuu 119 | Bodda Bodda 120 | Bofa Bofaa 121 | Boke Tikko Bookee Xiqqoo 122 | Bokojjii Boqojjii 123 | Bokona Borkanaa 124 | Bole Bulbula Boolee Bulbulaa 125 | Bolivia Boliibiyaa 126 | Bombay Bombee 127 | Bora Booraa 128 | Borana Boorana 129 | Bore Booree 130 | Borena Zone Godina Booranaa 131 | Borana Zone Godina Booranaa 132 | Boroda Borodaa 133 | Boru Jawwi Boruu Jaawwii 134 | Bosnia and Herzegovina Bosniyaa fi Hersigobenaa 135 | Botswana Botiswaanaa 136 | Brazil Biraazil 137 | Brunei Birunaay 138 | Bulbuloo Bulbuloo 139 | Bulgaria Bulgaariyaa 140 | Burayu Buraayyuu 141 | Bure Buree 142 | Burka Burqaa 143 | Burkina Faso Burkiinaa Faasoo 144 | Burkina Faso Burkinaa Faasoo 145 | Burma Barmaa 146 | 
Burundi Burundii 147 | Busa Buusaa 148 | Calcutta Kaalikutaa 149 | Cambodia Kamboodiyaa 150 | Cameroon Kaameruun 151 | Cameroon Kamero 152 | Canada Kanaadaa 153 | Cape Town Keep Taawon 154 | Cape Verde Keppe Verdee 155 | Casablanca Kaasabilaankaa 156 | Central African Republic Republika Afrikaa Jiddugalee 157 | Chacha Caaca 158 | Chad Chaad 159 | Chaffa Robi Caffaa Roobii 160 | Chaffe Donsa Caffee Doonsaa 161 | Chaffo Caffo 162 | Chalalaka Calalqaa 163 | Chalanko Calanqoo 164 | Chalbi Calbii 165 | Chamo Caamoo 166 | Chancho Caancoo 167 | Chanka Caanqaa 168 | Cheliya Calliyaa 169 | Chile Chiilee 170 | China Chaayinaa 171 | Chira Ciraa 172 | Chiro Ciro 173 | Chirracha Cirracha 174 | Chittu Cittuu 175 | Chukkala Hara Cuqqaalaa 176 | Chulta Gurre Birki Cuultaa Gurree Birqii 177 | Colombia Kolombiyaa 178 | Comoros Komooroo 179 | Congo (idk if it's DRC or Republic of the Congo) Koongoo 180 | Congo Brazzaville Kongoo Biraazabil 181 | Congo Kinshasa Koongoo Kinshaasaa 182 | Costa Rica Kostaa Rikaa 183 | Cote D'ivoire Kotee Dibiwaar 184 | Croatia Kirowaatiyaa 185 | Cuba Kuubaa 186 | Cyprus Sippiras 187 | Czech Republic Republika Cheekii 188 | Dabana Daabanaa 189 | Dabbus Daabbus 190 | Dadar Dadar 191 | Dagaga Dagaagaa 192 | Dagam Dagam 193 | Dalatti Daalattii 194 | Dallo Sarbo Dalloo Sarboo 195 | Dambi Dambii 196 | Dambi Dollo Dambi Dolloo 197 | Dandi Mountains Gooroo Dandii 198 | Dannaba Dannaba 199 | Dano Daannoo 200 | Dase Dasee 201 | Dawwa Dawwaa 202 | Dawwe Daawwee 203 | Dembidolo Dambi Dooloo 204 | Denmark Denmaarkii 205 | Dhera Dheeraa 206 | Diddessa Dhiddheessa 207 | Diksis Diksiis 208 | Dilalla Diilallaa 209 | Dima Diimaa 210 | Dimtuu Diimtuu 211 | Dinsho Diinsho 212 | Dire Dawa Dirre Dhawaa 213 | Dire Dawa Dirre Dawaa 214 | Dire Dawa Dirree Dhawaa 215 | Dirre Dhawa Dirre Dhawaa 216 | Dirre Dhawa Dirre Dawaa 217 | Dirre Dhawa Dirree Dhawaa 218 | Djibouti Jibuutii 219 | Djibouti Jabuutii 220 | Dobba Doobbaa 221 | Dodola woreda Dodola 222 | Dollo Noonnoo 223 | Dominica Dominikaa 224 | Dominican Republic Republika Dominikaa 225 | Dongoro Dongoroo 226 | Doni Doonii 227 | Doriya Dooriyaa 228 | Dubar Dubar 229 | Dukam Duukam 230 | East Hararge Harargee Bahaa 231 | East Hararghe Zone Harargee Bahaa 232 | East Hararghe Zone Godina Harargee Bahaa 233 | East Shawa Shawaa Bahaa 234 | East Shewa Zone Baha Shawaa 235 | East Shewa Zone Godina Baha Shawaa 236 | East Welega Zone Wallagga Bahaa 237 | East Welega Zone Godina Wallagga Bahaa 238 | East Wellega Zone Wallagga Bahaa 239 | East Wellega Zone Godina Wallagga Bahaa 240 | East Wollega Zone Wallagga Bahaa 241 | East Wollega Zone Godina Wallagga Bahaa 242 | East Wallagga Wallagga Bahaa 243 | East Wallagga Godina Wallagga Bahaa 244 | Ebibiyin (city in Equatorial Guinea) Ebeebiyiin 245 | Ecuador Ekuwadoor 246 | Eddo Eddoo 247 | Egypt Gibxii 248 | Ejere Ejeree 249 | Ejersa Ejersa 250 | Ejersa Goro Ejersa Gooroo 251 | El Salvador Elsalbadoor 252 | Equatorial Guinea Ekuwaatooriyaal Giinii 253 | Equatorial Guinea Giinii Mundhilafaa 254 | Erer Erer 255 | Erer Gota Erer Gootaa 256 | Eritrea Eeritiraa, Eertiraa 257 | Eritrea Ertiraa 258 | Estonia Istooniyaa 259 | Ethiopia Itiyoophiyaa 260 | Ethiopia Itiyoopiyaa 261 | Ethiopia Itoophiyaa 262 | Ethiopia Itoopiyaa 263 | Ethiopia Toophiyaa 264 | Europe Yuurooppi 265 | Europe Oroppaa 266 | Fafan Faafan 267 | Feyisa Lilesa Fayyisaa Leellisaa 268 | Fiche Fiche 269 | Fiji Fiijii 270 | Filtu Filtuu 271 | Fincha'a Fincaa'aa 272 | Fincha Fincaa'aa 273 | Finchawa Fincaawa 274 | Finland Finlaandii 275 | France Firaansi 276 
| France Faransaa 277 | Frankfurt Firaankifurtii 278 | Funyan Bira Funyaan Biraa 279 | Gabba Gabba 280 | Gabon Gaabon 281 | Gabon Gaaboon 282 | Gachi Gachii 283 | Gafarsa Hara Gafarsa 284 | Galamso Galamso 285 | Gambela Region Naannoo Gaambeellaa 286 | Gambia Gaambiyaa 287 | Gamo Gofa (zone) Gamuu-Gofaa 288 | Ganji Ganjii 289 | Gannale Gannaalee 290 | Garba Garbaa 291 | Garba Gurracha Garba Gurraacha 292 | Gasara Gasaraa 293 | Gattira Gaattiraa 294 | Gedo Geedoo 295 | Gembe Gembee 296 | Geneva Jenebaa 297 | Georgia Joorjiyaa 298 | Germany Biyya Jarmanii 299 | Ghana Gaanaa 300 | Gibe Gibe 301 | Gibe Kalla Gibe Qalldhaa 302 | Gidami Gidaamii 303 | Gidda Ayyana Giddaa Ayyaanaa 304 | Gimbi Gimbii 305 | Ginchi Giincii 306 | Gindo Gindo 307 | Ginde Beret Gindabarat 308 | Ginnir Gindhir 309 | Girma Seifu Girmaa Sayifuu 310 | Gobba Gobba 311 | Gobessa Gobeessa 312 | Gojjo Goojjoo 313 | Gojjota Goojjota 314 | Gore Goree 315 | Gori Gorii 316 | Goro Bale Gooroo Baalee 317 | Goro Sole Gooroo Soolee 318 | Goro Waliso Gooroo Walisoo 319 | Greece Giriik 320 | Greece Biyaa Giriikii 321 | Grenada Girenaadaa 322 | Guatemala Guwatimaalaa 323 | Gudar Gudar 324 | Guinea Giinii 325 | Guinea Bissau Giinii Bisaawu 326 | Guji Zone Godina Gujii 327 | Gullallee (district of Addis Ababa) Gullallee 328 | Gulliso Gullisoo 329 | Guraferda Gura Ferdaa 330 | Gurage Guraagee 331 | Guraghe Guraagee 332 | Gurra Dhamole Gurra Dhaamolee 333 | Gurura Gur'uraa 334 | Guruwa Guruwaa 335 | Guyana Guyaanaa 336 | Haile Selassie Hayila Sillaasee 337 | Haiti Hayitii 338 | Hamburg Hamburgii 339 | Hara Maya Hara Maayaa 340 | Haramaya Hara Maayaa 341 | Harar Harar 342 | Harari (region) Naannoo Hararii 343 | Harato Haratoo 344 | Harawacha Harawaaca 345 | Harbu Harbuu 346 | Harbu Chululle Harbuu Culullee 347 | Haro Dibbe Haroo Dibbee 348 | Haro Dumal Haroo Dumaal 349 | Haro Hara Liban Haroo Hara Liiban 350 | Harsadi Harsadii 351 | Hatayye Haxaayyee 352 | Herero Hereero 353 | Hiddi Lola Hiddii Lolaa 354 | Hirna Hirnaa 355 | Ho Chi Minh City Magaalaa Hochi Minii 356 | Holota Hoolota 357 | Holota Holota 358 | Honduras Honduraas 359 | Honqolloo Honqolloo 360 | Horn of Africa Gaanfi Afrikaa, Gaafa Afrikaa 361 | Horo Guduru Welega Zone Horroo Guduruu 362 | Horo Guduru Welega Zone Godina Horroo Guduruu 363 | Hulluka Hulluuqaa 364 | Human Rights Watch Hiyumaan Raaytis Waach 365 | Hungary Hungaariyaa 366 | Hurrumu Hurruumuu 367 | Hursa Huursaa 368 | Huruta Hurutaa 369 | Ibadan Ibaadan 370 | Ibsa (name) Ibsaa 371 | Iceland Islaandii 372 | Ijajji Ijaajjii 373 | Illubabor Zone Illuu Abbaa Booraa 374 | Illubabor Zone Illuu Abbaaboor 375 | Ilubba Bora Ilubbaa Booraa 376 | Inango Inaangoo 377 | Inchinni Metta Incinnii Meettaa 378 | Inchinnii Liban Incinnii Liiban 379 | India Biyya Hindii 380 | Indian Ocean Garba Indiyaa 381 | Indian Ocean Maanya Hindii 382 | Indonesia Indoneesiyaa 383 | Iran Iraan 384 | Iraq Iraaq 385 | Ireland Irlaandii 386 | Israel Israa'el 387 | Istanbul Istaanbul 388 | Italy Ixaaliyaa 389 | Italy Biyya Xaaliyaanii 390 | Itayya Itayyaa 391 | Ivory Coast Ayvorii Koost, Iyvoorikoost 392 | Jaja Jaaja 393 | Jajjabe Jajjabee 394 | Jalliyan Jalliyaan 395 | Jamaica Jamaayikaa 396 | Jamma Jammaa 397 | Japan Jaappaan 398 | Jarra Jaarraa 399 | Jarso Jaarso 400 | Jerusalem Yerusaalem 401 | Jido Jidda 402 | Jimma Mountains Gooroo Jimmaa 403 | Jimma University Yuunivarsiitii Jimmaa 404 | Jimma Zone Godina Jimmaa 405 | Jimma Jimma 406 | Jimma Jimaan 407 | Jimma Jimmam 408 | Jima Jimma 409 | Jima Jimaan 410 | Jima Jimmam 411 | Johannesburg 
Johaannisburgii 412 | Jordan Joordaan 413 | Kachisi Kaachisii 414 | Kake Qaaqee 415 | Kakka Kaakkaa 416 | Kamise Kamisee 417 | Kara Mille Karaa Millee 418 | Karachi Karaachii 419 | Karra Kore Kaarra Qoree 420 | Karsa Qarsaa 421 | Kazakhstan Kazaakistaan 422 | Kebbe Qebbee 423 | Keffa Kafaa 424 | Kelam Welega Zone Qeellam Wallaggaa 425 | Kelam Welega Zone Godina Qeellam Wallaggaa 426 | Kelam Wellega Zone Qeellam Wallaggaa 427 | Kelam Wellega Zone Godina Qeellam Wallaggaa 428 | Kelam Wollega Zone Qeellam Wallaggaa 429 | Kelam Wollega Zone Godina Qeellam Wallaggaa 430 | Kenya Keniyaa,Keeniyaa 431 | Kerransa Qeerransa 432 | Kersa Qarsaa 433 | Kiribati Kiribatii 434 | Kobbo Barento Qobboo Barentoo 435 | Kobbo Rayya Qobboo Raayyaa 436 | Kofale Kofalee 437 | Koka Qooqaa 438 | Kokossa Kokossa 439 | Kolobo Koloboo 440 | Kombolcha Guduru Kombolcha Guduruu 441 | Kombolcha Nole Kombolcha Noolee 442 | Kore Qoree 443 | Kotoba Kotoba 444 | Kullubbi Qullubbii 445 | Kunni Qundhii 446 | Kurfa Challe Kurfaa Callee 447 | Kuwait Kuweet 448 | Kuyera Kuyeeraa 449 | Kyrgyzstan Kirgizistaan 450 | Laga Dadhi Laga Daadhii 451 | Lagos Laagos 452 | Lake Awasa Laga Awaash 453 | Lake Dadi Hara Daadhii 454 | Lake Dambal Laaqii Dambal 455 | Lake Dandi Dabbal dandii 456 | Lake Maya Hara Maayaa 457 | Lake Wanchi Calalaqa Wanci 458 | Laki Dambal Laaqii Dambal 459 | Langanno Laangannoo 460 | Laos Laa'os 461 | Latvia Laatbiyaa 462 | Lebanon Lebaanon 463 | Lemman Leemman 464 | Lesotho Lesootoo 465 | Liben Zone Liiban 466 | Liberia Libeeriyaa 467 | Libya Liibiyaa 468 | Liechtenstein Lishtenistaayin 469 | Limmu Shaye Limmu Shaayee 470 | Lithuania Lituweeniyaa 471 | Los Angeles Los Anjeles 472 | Luxembourg Luksamburgii 473 | Macedonia Maasedoniyaa 474 | Machara Machaaraa 475 | Madagascar Madagaaskaar 476 | Maddo Gashi Maddo Gaashii 477 | Maki Maqii 478 | Malabo (town in Equatorial Guinea) Malabo 479 | Malawi Malaawii 480 | Malaysia Maleesiyaa 481 | Maldives Maaldibis 482 | Mali Maalii 483 | Malka Rafu Malkaa Raafuu 484 | Malka Wakkanna Malkaa Waakkannaa 485 | Malta Maaltaa 486 | Manchester United Manchastar Yunaayitid, Manchastar Siitii 487 | Mandi Mandii 488 | Mandida Mandiidaa 489 | Manna Maannaa 490 | Mararo Mararoo 491 | Marsa Marsaa 492 | Marseilles Maarsee 493 | Marshal Islands Odoolota Maarshaal 494 | Marti Martii 495 | Mata Hara Mata Haaraa 496 | Mattu Mattuu 497 | Mauritania Mawurtaaniyaa 498 | Mauritius Mawurishees 499 | Mecca Makkaa 500 | Mecha and Tulama Self-Help Association Waldaa Wal-gargaarsa Maccaa fi Tuulamaa 501 | Medina Madiinaa 502 | Mediterranean Sea Galaana Mediteraaniyaanii 503 | Mediterranean Sea Garba Meditraaniyaa 504 | Megga Meeggaa 505 | Meles Zenawi Meles Zeenaawwii 506 | Menelik (Emperor) Minilik 507 | Meta (woreda) Meettaa 508 | Metta Gafarsa Meettaa Gafarsaa 509 | Metti Meexxii 510 | Mexico Meeksikoo 511 | Michata Miiccataa 512 | Micronesia Mikroneeisyaa 513 | Miesso Mi'eesso 514 | Milan Milaanoo 515 | Minnesota Minnesootaa 516 | Minnesota Minisootaa 517 | Mogadishu Moqaadishoo 518 | Mogor Mogor 519 | Mojo Mojo 520 | Moldova Moldoobaa 521 | Mombasa Mombaasaa 522 | Monaco Monaakoo 523 | Mongolia Mongooliyaa 524 | Mongomo (town in Equatorial Guinea) Mongoma 525 | Montreal Montireel 526 | Mormor Mormor 527 | Morocco Morokko 528 | Morocco Morokoo 529 | Moyale Mooyalee 530 | Mozambique Moozaambik 531 | Mozambique Mozaambik 532 | Mt. Abella Gaara Abeellaa 533 | Mt. Adi Gaara Adii 534 | Mt. Asabot Agaar Asaboot 535 | Mt. Asha Gaara Ashaa 536 | Mt. Batu Gaara Baatuu 537 | Mt. Bora Gaara Booraa 538 | Mt. 
Chilalo Gaara Cilaaloo 539 | Mt. Chukkala Tulluu Cuqqaalaa 540 | Mt. Dalota Tulluu Daalotaa 541 | Mt. Dannaba Gaara Dannabaa 542 | Mt. Dimtu Tullu Diimtuu 543 | Mt. Erer Gaara Erer 544 | Mt. Fantalle Gaara Fantaallee 545 | Mt. Furi Tulluu Furii 546 | Mt. Gorfo Gaara Gorfoo 547 | Mt. Gorte Gaara Gorxee 548 | Mt. Hatabella Gaara Haxabeellaa 549 | Mt. Hochocha Gaara Hococaa 550 | Mt. Jibat Tulluu Jibaat 551 | Mt. Jorgo Tulluu Joorgoo 552 | Mt. Kunduddo Gaara Qunduddoo 553 | Mt. Mao Gaara Maa'oo 554 | Mt. Mara Tulluu Maraa 555 | Mt. Mullata Gaara Muldhataa 556 | Mt. Salale Gaara Salaalee 557 | Mt. Walal Tulluu Walal 558 | Muggi Muggii 559 | Munich Muniik 560 | Nagalle Arsi Nageellee Arsii 561 | Nageellee Metama Nageellee Meexamaa 562 | Nagelle Borana Nageellee Booranaa 563 | Najjo Najjoo 564 | Nakamte Naqamte 565 | Namagna Amanya 566 | Namibia Namibiyaa 567 | Nauru Nawuruu 568 | Negele Arsi Arsii-Nageellee 569 | Nekemte Naqamte 570 | Nekemte Naqamtee 571 | Nakamti Naqamte 572 | Nakamti Naqamtee 573 | Nek’emtē Naqamte 574 | Nek’emtē Naqamtee 575 | Nek'emte Naqamte 576 | Nek'emte Naqamtee 577 | Nepal Neppaal 578 | Netherlands Neezarlaandii 579 | New York Niwu Yoorki 580 | New Zealand Niw Zilaadii 581 | Nicaragua Nikaraaguwaa 582 | Niger Nijeer 583 | Nigeria Naayijeeriyaa 584 | Nigeria Naajeriyaa 585 | Nono Noonoo 586 | North America Ameerikaa Kaabaa 587 | North Korea Koriyaa Kaabaa 588 | North Sea Garba Boroo 589 | North Shewa Zone Shawaa Kaabaa 590 | North Shawa Shawaa Kaabaa 591 | Norway Noorwee 592 | Obbi Obbi 593 | Oborra Oborraa 594 | Oborso Oborso 595 | Ogolcho Ogolchoo 596 | Olonkomi Olonkomii 597 | Oman Omaan 598 | Oromia Media Network (OMN) OMN 599 | Oromia Region Oromiyaa 600 | Oromiya Region Oromiyaa 601 | Oromia Regional State Oromiyaa 602 | Oromo (people) Oromoo 603 | Oromo Liberation Front (OLF) Adda Bilisummaa Oromoo 604 | Oromo People's Democratic Organization (OPDO) Dhaabbata Dimookraatawaa Ummata Oromoo 605 | Osaka Osaakaa 606 | Pacific Ocean Garba Paasifiik 607 | Pakistan Paakistaan 608 | Palau Palawuu 609 | Panama Panamaa 610 | Papua New Guinea Pappawaa Niw giinii 611 | Paraguay Paraguwaay 612 | Philippines Filippiin 613 | Poland Polandii 614 | Portugal Portugaal 615 | Qatar Kataar 616 | Rammis Raammis 617 | Ras Dejen Raash Daashin 618 | Ras Dashen Raash Daashin 619 | Ras Dashan Raash Daashin 620 | Red Sea Galaana Diimaa 621 | Red Sea Garba Diimaa 622 | Rejji Reejjii 623 | Rio de Janeiro Riiyoo Di Jeneroo 624 | Robe Arsi Roobe Arsii 625 | Robe Bale roobe Baalee 626 | Rogge Ammayya Roggee Ammayyaa 627 | Romania Rumaaniyaa 628 | Rotterdam Roterdaam 629 | Russia Rusiyaa 630 | Rwanda Ruwaandaa 631 | Sabbata Sabbata 632 | Sagan Saagan 633 | Sagure Saaguee 634 | Saint Kitts and Nevis Seenti Kitii fi Nebis 635 | Saint Lucia Seenti Luchiyaa 636 | Saint Vincent and Grenadines Seenti Binchentii fi Girenadiin 637 | Sakka Saqqaa 638 | Sakkata Saaqqata 639 | Saku Saaku 640 | Sambate Sambatee 641 | same as Garba Meditraaniyaa Garba Jiddugaleessa 642 | Samoa Samowaa 643 | San Marino Saan Mariinoo 644 | Sandafa Sandaafa 645 | Sao Tome and Principe Sawo Toomee fi Pirinsippii 646 | Sarbo Sarboo 647 | Sasigga Saasiggaa 648 | Saudi Arabia Saudi Arabiya 649 | Saudi Arabia Sa'uudi Arabiya 650 | Saudi Arabia Suud Arabiyaa 651 | Saudi Arabia Sawudii Arabiyaa 652 | Sayyo sayyoo 653 | Seden Sodo Sadan Sooddoo 654 | Senegal Sengaal 655 | Senegal Senegaal 656 | Serbia and Montenegro Sarbiyaa fi Montenegroo 657 | Serofta Seeroftaa 658 | Seru Seeruu 659 | Seychelles Sechiliis 660 | Shabbe Shabbee 661 | 
Shaggar Shaggar 662 | Shaki Shakii 663 | Shakkiso Shaakkisoo 664 | Shala Shaalaa 665 | Shambu Shaambu 666 | Shanan Shanan 667 | Shanghai Shaangaay 668 | Shano Shano 669 | Shashamanne Shaashamannee 670 | Shashemene Shashemane 671 | Shashemene Shaashemannee 672 | Shashamane Shashemane 673 | Shashamane Shaashemannee 674 | Shayya Shaayyaa 675 | Shirbo Shirboo 676 | Shire Arsi Shiree Arsii 677 | Shoa: Shewa Shawaa 678 | Shoboka Shoboka 679 | Sibu Sire Siibuu Siree 680 | Sidama Zone Sidaamoo 681 | Sidama Zone Sidaamaa 682 | Sierra Leone Seraliyoon 683 | Silt'e Zone Silxee 684 | Siltie Zone Silxee 685 | Singapore Singapoor 686 | Sire Arsi Siree Arsii 687 | Sire Robi Siree Roobii 688 | Slovakia Islobaakiyaa 689 | Slovakia Islobeeniyaa 690 | Sodare Soodaree 691 | Sokorru Sokorruu 692 | Solomo Soolomo 693 | Solomon Islands Odoolota Solomon 694 | Somali Somaalee 695 | Somalia Somaaliyaa 696 | Sor Soor 697 | South Africa Afrikaa Kibbaa 698 | South Africa Aafrikaa Kibbaa 699 | South America Ameerikaa Kibbaa 700 | South Korea Koriyaa Kibbaa 701 | South Sudan Sudaan Kibbaa 702 | Southern Nations, Nationalities, and Peoples' Region (SSNPR) Naannoo Sabaa fi Sab-lammoota Ummattoota Kibbaa 703 | Southwest Shewa Zone Shawaa Kibba-lixaa 704 | Southwest Shewa Zone Shawaa Kill Lixaa 705 | Soyyama Sooyyama 706 | Spain Ispaanyaa 707 | Sri Lanka Siri Laankaa 708 | St. Petersburg Seenti Peterburgii 709 | Strasbourg Istiraasburgii 710 | Sudan Sudaan 711 | Sululta Sululta 712 | Suriname Surinaam 713 | Swaiziland Iswaazilaandii 714 | Sweden Iswiidin 715 | Switzerland Biyya Iswiisii 716 | Sydney Sidinee 717 | Syria Sooriyaa 718 | Tafki Tafkii 719 | Taiwan Taayiwaan 720 | Taji Tajii 721 | Tajikistan Tajikistaan 722 | Taltalle Taltallee 723 | Tanzania Tanzaaniyaa 724 | Tanzania Tanzaaniyaa 725 | Thailand Taayilaandii 726 | The Hague Haag 727 | Tibbe Tibbee 728 | Tigray Region Naannoo Tigraay 729 | Timbuktu Tumbuktuu 730 | Tiyya Xiyyaa 731 | Tobba Toobbaa 732 | Togo Toogoo 733 | Toke Kutaye Kuutaayee 734 | Toke Kutaye Tokkee Kuutaayee 735 | Tokke Irressa Tokkee Irreessaa 736 | Tole Tole 737 | Tonga Tongaa 738 | Toronto Torontoo 739 | Trinidad and Tobago Tirindaadii fi Tobaagoo 740 | Tullu Milki Tulluu Milkii 741 | Tulu Bolo Tullu Boolloo 742 | Tunisia Tuniisiyaa 743 | Tunisia Tuniisiyaa 744 | Turkey Biyya Turkii 745 | Turkmenistan Turkemenistaan 746 | Tuvalu Tabuluu 747 | U.S.A. 
Ameerikaa 748 | Uganda Yugaandaa 749 | Uganda Ugaandaa 750 | Ukraine Ukraayin 751 | United Arab Emirates Tokkummaa Imiroota Arabaa 752 | United Kingdom Biritaaniyaa 753 | United Kingdom Yunaayitid Kingidem 754 | UK Yunaayitid Kingidem 755 | United Liberation Forces of Oromia Tokkummaa Humnoota Bilisummaa Oromiyaa 756 | United States Yunaayitid Isteesi 757 | United States of America Yunaayitid Isteesi 758 | US Yunaayitid Isteesi 759 | USA Yunaayitid Isteesi 760 | Urgessa Urgeessaa 761 | Uruguay Uruguwaay 762 | Uzbekistan Uzbeekistaan 763 | Vanatu Banuwaatuu 764 | Vatican Baatikaan 765 | Venezuela Benezuweelaa 766 | Venice Benesiyaa 767 | Vietnam Beetinaam 768 | Voice of America raadiyoon sagalee Amerikaa 769 | Wabe Waabee 770 | Wabe Gafarsa Waabee Gafarsaa 771 | Wachu Waaccuu 772 | Wadera Wadeera 773 | Walabu Haroo Walaabuu 774 | Walanchitti Walancittii 775 | Waldiya Waldiyaa 776 | Walga Waalgaa 777 | Waliso (wordea) Waliso 778 | Waliso (wordea) Walisoo 779 | Walmal Walmal 780 | Wama Waamaa 781 | Wanci Mountains Gooroo Wancii 782 | Wanji Wanjii 783 | Wanji Gafarsa Wanjii Gafarsaa 784 | Warka Warqa 785 | Watar Watar 786 | Wayane Wayyannee 787 | Wayane Wayyanne 788 | Wayane Wayyaane 789 | Tigrayan People's Liberation Front Wayyannee 790 | Tigrayan People's Liberation Front Wayyanne 791 | Tigrayan People's Liberation Front Wayyaane 792 | Weyane Wayyannee 793 | Weyane Wayyanne 794 | Weyane Wayyaane 795 | Second Weyane Wayyannee 796 | Second Weyane Wayyanne 797 | Second Weyane Wayyaane 798 | Wayyu Waayyuu 799 | Welega Wallagga 800 | Wellega Wallagga 801 | Wollega Wallagga 802 | West Arsi Zone Arsii Lixaa 803 | West Arsi Zone Arsii Dhihaa 804 | Mirab Arsi Zone Arsii Lixaa 805 | Mirab Arsi Zone Arsii Dhihaa 806 | West Gojjam Zone Goojjaam Dhihaa 807 | West Hararghe Zone Harargee Dhihaa 808 | West Hararghe Zone Godina Harargee Dhihaa 809 | West Hararghe Zone Harargee Lixaa 810 | West Shewa Zone Shawaa Lixaa 811 | West Shewa Zone Shawaa Dhihaa 812 | West Shewa Zone Godina Shawaa Lixaa 813 | West Shewa Zone Godina Shawaa Dhihaa 814 | West Welega Zone Wallagga Lixaa 815 | West Welega Zone Wallagga Dhihaa 816 | West Welega Zone Godina Wallagga Lixaa 817 | West Welega Zone Godina Wallagga Dhihaa 818 | West Wellega Zone Wallagga Lixaa 819 | West Wellega Zone Wallagga Dhihaa 820 | West Wellega Zone Godina Wallagga Lixaa 821 | West Wellega Zone Godina Wallagga Dhihaa 822 | West Wollega Zone Wallagga Lixaa 823 | West Wollega Zone Wallagga Dhihaa 824 | West Wollega Zone Godina Wallagga Lixaa 825 | West Wollega Zone Godina Wallagga Dhihaa 826 | Mirab Welega Wallagga Lixaa 827 | Mirab Welega Wallagga Dhihaa 828 | Mirab Welega Godina Wallagga Lixaa 829 | Mirab Welega Godina Wallagga Dhihaa 830 | West Wallagga Wallagga Lixaa 831 | West Wallagga Wallagga Dhihaa 832 | West Wallagga Godina Wallagga Lixaa 833 | West Wallagga Godina Wallagga Dhihaa 834 | Weyib Weeyib 835 | World Bank Baankii Addunyaa 836 | Yaballo Yaaballoo 837 | Yabbu Yabbuu 838 | Yadot Yaadot 839 | Yambaro Yambaroo 840 | Yayyu Yaayyoo 841 | Yemen Yaman 842 | Yirba Muda Yirbaa Muudaa 843 | Yubdo Yuubdoo 844 | Zambia Zaambiyaa 845 | Zimbabwe Zimbaabwee 846 | -------------------------------------------------------------------------------- /utils/orm_norm/ormnorm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | 6 | PATTERNS = [ 7 | (r'([aeiou])(\1)', r'\1'), 8 | (r'(b|c|ch|d|dh|f|g|h|j|k|l|m|n|ny|p|ph|q|r|s|sh|t|v|w|x|y|z)\1', r'\1'), 9 | 
(r'ph', r'p'), 10 | (r'q', r'k'), 11 | (r'x', r't'), 12 | (r'c([^h]|\b)', r'ch\1'), 13 | (r'ai', r'ayi'), 14 | (r's(b|c|ch|d|dh|f|g|h|j|k|l|m|n|ny|p|ph|q|r|s|sh|t|v|w|x|y|z)', r'f\1'), 15 | ] 16 | 17 | 18 | def normalize(text): 19 | if all([x.isupper() for x in text]): 20 | return text 21 | cap = True if text[0].isupper() and all([x.islower() for x in text[1:]]) else False 22 | text = text.lower() 23 | for pattern, repl in PATTERNS: 24 | text = re.sub(pattern, repl, text) 25 | if cap: 26 | return text.capitalize() 27 | else: 28 | return text 29 | -------------------------------------------------------------------------------- /utils/post_process.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | from collections import defaultdict 3 | # "GENERAL lookup table" 4 | tags = set(['GPE', 'PER', 'ORG', 'LOC']) 5 | 6 | 7 | def read_gold_file(gold_path): 8 | with codecs.open(gold_path, "r", "utf-8") as fin: 9 | doc_set = set() 10 | for line in fin: 11 | line = line.strip() 12 | if len(line) == 0: 13 | continue 14 | 15 | line = line.decode('utf-8') 16 | tokens = line.split('\t') 17 | 18 | doc_id = tokens[0] 19 | start = int(tokens[1]) 20 | end = int(tokens[2]) 21 | ner = tokens[5].split('/')[0] 22 | 23 | doc_set.add((doc_id, start, end)) 24 | 25 | print 'num of annotated doc: %d' % len(doc_set) 26 | return doc_set 27 | 28 | 29 | def make_darpa_format(span, curr_docum, curr_anot, start, end, tag): 30 | st = 'CMU_NER_LOREAL_CP1_TB_GS' + '\t' + curr_docum + '-ann-' + str(curr_anot) + '\t' + span\ 31 | + '\t' + curr_docum + ':' + str(start) + '-' + str(end) + '\t' + 'NIL' + '\t' + \ 32 | tag + '\t' + 'NAM' + '\t' + '1.0' + "\n" 33 | return st.split('\t') 34 | 35 | 36 | def combine_lookup_table(lookup_files): 37 | lookup_table = defaultdict(lambda: set()) 38 | 39 | for key, fname in lookup_files.iteritems(): 40 | if key in tags: 41 | with codecs.open(fname, "r", "utf-8") as fin: 42 | for line in fin: 43 | lookup_table[line.strip()].add(key) 44 | else: 45 | with codecs.open(fname, "r", "utf-8") as fin: 46 | for line in fin: 47 | fs = line.strip().split('\t') 48 | lookup_table[fs[0]].add(fs[1]) 49 | new_lookup_table = dict() 50 | 51 | # remove spans that are annotated with multiple entities 52 | for key, value in lookup_table.iteritems(): 53 | if len(value) == 1: 54 | new_lookup_table[key] = list(value)[0] 55 | return new_lookup_table 56 | 57 | 58 | def single_lookup_table(lookup_file, tag): 59 | lookup_table = dict() 60 | if tag in tags: 61 | with codecs.open(lookup_file, "r", "utf-8") as fin: 62 | for line in fin: 63 | lookup_table[line.strip()] = tag 64 | else: 65 | with codecs.open(lookup_file, "r", "utf-8") as fin: 66 | for line in fin: 67 | fs = line.strip().split('\t') 68 | lookup_table[fs[0]] = fs[1] 69 | return lookup_table 70 | 71 | 72 | def find_ngrams(sent, starts, ends, n): 73 | all_ngrams = [] 74 | all_starts = [] 75 | all_ends = [] 76 | for i in range(1, n+1): 77 | all_ngrams += zip(*[sent[j:] for j in range(i)]) 78 | all_starts += zip(*[starts[j:] for j in range(i)]) 79 | all_ends += zip(*[ends[j:] for j in range(i)]) 80 | return all_ngrams, all_starts, all_ends 81 | 82 | 83 | def post_processing(path_darpa_prediction, 84 | path_to_full_setE, 85 | path_to_author, 86 | output_file, 87 | lookup_files=None, 88 | label_propagate=True, 89 | conf_num=0, 90 | gold_file_path=None, 91 | most_freq_num=20, 92 | fout_conll_name=None): 93 | ''' 94 | 95 | :param path_darpa_prediction: Final output 96 | :param path_to_full_setE: setE.conll 97 | 
:param path_to_author: "path_to_author_list" 98 | :param output_file: 99 | :param lookup_files: {"GPE": "path_to_lexicon_1", "General": "path_to_lexicon_2"} 100 | :param label_propagate: BOOLEAN 101 | :return: 102 | ''' 103 | 104 | predicted_doc = defaultdict(lambda: dict()) # (doc_id: (span_token, start, end):NER) 105 | unpredicted_spans = defaultdict(lambda: list()) # (doc_id: [(ngram_token, start, end)]) 106 | MAX_NGRAM = 5 107 | prediction_list = [] 108 | predicted_spans = defaultdict(lambda: list()) 109 | 110 | if lookup_files is not None: 111 | lookup_table = combine_lookup_table(lookup_files) 112 | author_lookup = single_lookup_table(path_to_author, "PER") 113 | annot_id = defaultdict(lambda: 0) # doc_id:annotation num 114 | 115 | gold_spans = read_gold_file(gold_file_path) 116 | 117 | def _look_up(span, doc_attribute): 118 | if doc_attribute == "DF" and span in author_lookup: 119 | return 'PER' 120 | if lookup_files is not None and span in lookup_table: 121 | return lookup_table[span] 122 | return None 123 | 124 | def _is_overlap(s1, e1, s2, e2): 125 | # Condition: s1 < e1, s2 < e2 126 | return not(e1 < s2 or e2 < s1) 127 | 128 | def _check_cross_annotations(list_spans, target_start, target_end): 129 | flag = False 130 | for (s, e) in list_spans: 131 | if _is_overlap(s, e, target_start, target_end): 132 | flag = True 133 | break 134 | return flag 135 | 136 | add_labels = 0 # includes both fixed labels and added labels 137 | 138 | # First, use the lookup table to fix up the current predictions 139 | with codecs.open(path_darpa_prediction, "r", "utf-8") as fin: 140 | for line in fin: 141 | fields = line.strip().split('\t') 142 | span = fields[2] 143 | predict_tag = fields[5] 144 | doc_id_span = fields[3].split(":") 145 | doc_id = doc_id_span[0] 146 | doc_attribute = doc_id.split('_')[1] 147 | annot_id[doc_id] += 1 148 | span_id = [int(i.strip()) for i in doc_id_span[1].split('-')] 149 | start_id, end_id = span_id[0], span_id[1] 150 | 151 | lookup_tag = _look_up(span, doc_attribute) 152 | if lookup_tag is not None and lookup_tag != predict_tag and (doc_id, start_id, end_id) in gold_spans: 153 | add_labels += 1 154 | predict_tag = predict_tag if lookup_tag is None else lookup_tag 155 | 156 | predicted_doc[doc_id][(span, start_id, end_id)] = predict_tag 157 | prediction_list.append(make_darpa_format(span, doc_id, annot_id[doc_id], start_id, end_id, predict_tag)) 158 | predicted_spans[doc_id].append((start_id, end_id)) 159 | # Second, iterate over the full setE using the lookup tables to complete the predicted dict 160 | # In the meantime, give statistics of ngrams for label propagation.
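# --- Editor's note (illustrative only, not part of the original source): the second pass below assumes each
# non-blank row of setE.conll is tab-separated with tokens[0] = word, tokens[3] = doc_id, and
# tokens[6]/tokens[7] = span start/end offsets, and it enumerates every n-gram up to MAX_NGRAM as a
# candidate span to run through the same lookup tables. ---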
161 | ngram_freq = defaultdict(lambda: 0) 162 | full_setE_list = [] 163 | with codecs.open(path_to_full_setE, "r", "utf-8") as fin: 164 | one_sent = [] 165 | start_ids = [] 166 | end_ids = [] 167 | doc_attribute = "" 168 | for line in fin: 169 | tokens = line.split('\t') 170 | if len(tokens) == 0 or line == "" or line == "\n": 171 | one_sent_place_holder = [] 172 | for k, (w, s, e) in enumerate(zip(one_sent, start_ids, end_ids)): 173 | one_sent_place_holder.append((s, e, doc_id, w)) 174 | full_setE_list.append(one_sent_place_holder) 175 | 176 | ngrams, starts, ends = find_ngrams(one_sent, start_ids, end_ids, MAX_NGRAM) 177 | for ngram, s, e in zip(ngrams, starts, ends): 178 | ngram = " ".join(ngram) 179 | ngram_freq[ngram] += 1 180 | predict_tag = _look_up(ngram, doc_attribute) 181 | key = (ngram, s[0], e[-1]) 182 | if predict_tag is not None: 183 | if key not in predicted_doc[doc_id] and not _check_cross_annotations(predicted_spans[doc_id], s[0], s[-1]): 184 | predicted_doc[doc_id][key] = predict_tag 185 | annot_id[doc_id] += 1 186 | prediction_list.append(make_darpa_format(ngram, doc_id, annot_id[doc_id], s[0], e[-1], predict_tag)) 187 | 188 | predicted_spans[doc_id].append((s[0], e[-1])) 189 | if (doc_id, s[0], e[-1]) in gold_spans: 190 | add_labels += 1 191 | else: 192 | if key not in predicted_doc[doc_id]: 193 | unpredicted_spans[doc_id].append(key) 194 | one_sent = [] 195 | start_ids = [] 196 | end_ids = [] 197 | else: 198 | word = tokens[0] 199 | doc_id = tokens[3] 200 | doc_attribute = doc_id.split('_')[1] 201 | start = int(tokens[6]) 202 | end = int(tokens[7]) 203 | 204 | one_sent.append(word) 205 | start_ids.append(start) 206 | end_ids.append(end) 207 | 208 | if len(one_sent) != 0: 209 | one_sent_place_holder = [] 210 | for k, (w, s, e) in enumerate(zip(one_sent, start_ids, end_ids)): 211 | one_sent_place_holder.append((s, e, doc_id, w)) 212 | full_setE_list.append(one_sent_place_holder) 213 | 214 | ngrams, starts, ends = find_ngrams(one_sent, start_ids, end_ids, MAX_NGRAM) 215 | for ngram, s, e in zip(ngrams, starts, ends): 216 | ngram = " ".join(ngram) 217 | ngram_freq[ngram] += 1 218 | predict_tag = _look_up(ngram, doc_attribute) 219 | key = (ngram, s[0], e[-1]) 220 | if predict_tag is not None: 221 | if key not in predicted_doc[doc_id] and not _check_cross_annotations(predicted_spans[doc_id], s[0], 222 | s[-1]): 223 | predicted_doc[doc_id][key] = predict_tag 224 | annot_id[doc_id] += 1 225 | prediction_list.append( 226 | make_darpa_format(ngram, doc_id, annot_id[doc_id], s[0], e[-1], predict_tag)) 227 | 228 | predicted_spans[doc_id].append((s[0], e[-1])) 229 | if (doc_id, s[0], e[-1]) in gold_spans: 230 | add_labels += 1 231 | else: 232 | if key not in predicted_doc[doc_id]: 233 | unpredicted_spans[doc_id].append(key) 234 | 235 | print("Total %d labels in the gold spans get fixed by the lookup tables!" 
% (add_labels,)) 236 | 237 | def _print(dic): 238 | for k, v in dic.iteritems(): 239 | print k, v 240 | 241 | tot_prop_label = 0 242 | if label_propagate: 243 | # Label propagation 244 | # (a) Within document propagation 245 | for doc_id, span_infos in predicted_doc.iteritems(): 246 | vote_tag = defaultdict(lambda: defaultdict(list)) # span: tag:[(start, end)] 247 | for span_info, tag in span_infos.iteritems(): 248 | span = span_info[0] 249 | start = span_info[1] 250 | end = span_info[2] 251 | vote_tag[span][tag].append((start, end)) 252 | new_vote_tag = dict() 253 | for span, other in vote_tag.iteritems(): 254 | max_tag = "" 255 | max_vote = 0 256 | for tag in other.keys(): 257 | vote = len(other[tag]) 258 | if vote > max_vote: 259 | max_vote = vote 260 | max_tag = tag 261 | new_vote_tag[span] = (max_tag, vote_tag[span][max_tag], max_vote) 262 | 263 | add_label = 0 264 | for unpredict_span in unpredicted_spans[doc_id]: 265 | s2, e2 = unpredict_span[1], unpredict_span[2] 266 | uspan = unpredict_span[0] 267 | if uspan in new_vote_tag: 268 | # conservative propagation 269 | if new_vote_tag[uspan][2] <= conf_num: 270 | continue 271 | pred_tag = new_vote_tag[uspan][0] 272 | # check if there is an overlap between spans 273 | flag = True 274 | for s1, e1 in new_vote_tag[uspan][1]: 275 | if _is_overlap(s1, e1, s2, e2): 276 | print "There is overlap: ", (s1, e1), (s2, e2) 277 | flag = False 278 | break 279 | if flag and not _check_cross_annotations(predicted_spans[doc_id], s2, e2): 280 | # propagate the label 281 | if (doc_id, s2, e2) in gold_spans: 282 | add_label += 1 283 | annot_id[doc_id] += 1 284 | prediction_list.append(make_darpa_format(uspan, doc_id, annot_id[doc_id], s2, e2, pred_tag)) 285 | predicted_spans[doc_id].append((s2, e2)) 286 | unpredicted_spans[doc_id].remove(unpredict_span) 287 | if add_label > 0: 288 | tot_prop_label += add_label 289 | print("Within Document Label Propagation: Add %d labels for Doc %s. " % (add_label, doc_id)) 290 | 291 | print("Total %d labels get propagated within document for gold setE!" % (tot_prop_label, )) 292 | 293 | # (b) Cross document propagation 294 | freq_ngram_list = sorted(ngram_freq, key=ngram_freq.get)[-most_freq_num:] 295 | # for w in freq_ngram_list: 296 | # print w 297 | vote_tag = defaultdict(lambda: defaultdict(lambda :0)) 298 | for doc_id, span_infos in predicted_doc.iteritems(): 299 | for span_info, tag in span_infos.iteritems(): 300 | span = span_info[0] 301 | if span in freq_ngram_list: 302 | vote_tag[span][tag] += 1 303 | vote_out_ents = dict() 304 | vote_ent_freq = defaultdict(lambda: 0) 305 | for span, other in vote_tag.iteritems(): 306 | max_tag = "" 307 | max_vote = 0 308 | for tag, vote in other.iteritems(): 309 | vote_ent_freq[span] += vote 310 | if vote > max_vote: 311 | max_tag = tag 312 | max_vote = vote 313 | vote_out_ents[span] = max_tag 314 | print("###### Among %d most frequent ngram, %d of which are given labels by the model! ########### " 315 | "\n The original form and their voted labels are as follows: " % (most_freq_num, len(vote_out_ents))) 316 | print vote_out_ents 317 | print("#" * 6 + "More friendly format: " + "#" * 6) 318 | _print(vote_out_ents) 319 | print("######## Please do some correction or addition here if you are willing to! 
#########") 320 | vote_out_ents["#VOATigrigna"] = "ORG" 321 | vote_out_ents[u"\u12ad\u120d\u120d"] = "O" 322 | # vote_out_ents.__delitem__(u"\u12ad\u120d\u120d") 323 | print("#" * 6 + "After your correction, now they are: " + "#" * 6) 324 | _print(vote_out_ents) 325 | print("######## The model predictions are also fixed using the new dictionary! #########") 326 | fixed_pred = 0 327 | for i, items in enumerate(prediction_list): 328 | if items[2] in vote_out_ents: 329 | if vote_out_ents[items[2]] == "O": 330 | del prediction_list[i] 331 | fixed_pred += 1 332 | elif items[5] != vote_out_ents[items[2]]: 333 | prediction_list[i][5] = vote_out_ents[items[2]] 334 | fixed_pred += 1 335 | print("Total %d labels in previous predictions get fixed!" % (fixed_pred,)) 336 | add_label = 0 337 | vote_ent_add_freq = defaultdict(lambda :0) 338 | for doc_id, unpredict_span_list in unpredicted_spans.iteritems(): 339 | for unpredict_span in unpredict_span_list: 340 | start, end = unpredict_span[1], unpredict_span[2] 341 | uspan = unpredict_span[0] 342 | if uspan in vote_out_ents and not _check_cross_annotations(predicted_spans[doc_id], start, end) and vote_out_ents[uspan] != "O": 343 | # if (doc_id, start, end) in gold_spans: 344 | # add_label += 1 345 | add_label += 1 346 | vote_ent_add_freq[uspan] += 1 347 | annot_id[doc_id] += 1 348 | prediction_list.append( 349 | make_darpa_format(uspan, doc_id, annot_id[doc_id], start, end, vote_out_ents[uspan])) 350 | 351 | predicted_spans[doc_id].append((start, end)) 352 | unpredicted_spans[doc_id].remove(unpredict_span) 353 | print("\nTotal %d labels get propagated across document for gold setE!" % (add_label, )) 354 | print("\n####### Before label prop, the number of predictions have been assigned for each span: ########") 355 | _print(vote_ent_freq) 356 | print("####### Number of labels of each span ADDED in label prop: #########") 357 | _print(vote_ent_add_freq) 358 | with codecs.open(output_file, "w", encoding='utf-8') as fout: 359 | for item in prediction_list: 360 | one_sent = "\t".join(item) 361 | fout.write(one_sent) 362 | 363 | print "#" * 10 + "Starting converting to conll format! 
" + "#" * 10 364 | if fout_conll_name is not None: 365 | prediction_dict = dict() 366 | 367 | for items in prediction_list: 368 | doc_id = items[1].split('-')[0] 369 | s = int(items[3].split(":")[1].split("-")[0]) 370 | e = int(items[3].split(":")[1].split("-")[1]) 371 | word = items[2] 372 | tag = items[5] 373 | prediction_dict[(s, e, doc_id)] = (word, tag) 374 | 375 | def _check_predicted(word, s, e, doc_id, first_index, last_index): 376 | if (s, e, doc_id) in prediction_dict: 377 | pword, tag = prediction_dict[(s, e, doc_id)] 378 | if word == pword: 379 | return True, "B-" + tag 380 | else: 381 | for i in range(e+1, last_index+1): 382 | if (s, i, doc_id) in prediction_dict: 383 | pword, tag = prediction_dict[(s, i, doc_id)] 384 | if word == pword[0:len(word)]: 385 | return True, "B-" + tag 386 | for i in range(first_index, s): 387 | if (i, e, doc_id) in prediction_dict: 388 | pword, tag = prediction_dict[(i, e, doc_id)] 389 | if word == pword[len(pword)-len(word):]: 390 | return True, "I-" + tag 391 | for i in range(first_index, s): 392 | for j in range(e+1, last_index+1): 393 | if (i, j, doc_id) in prediction_dict: 394 | pword, tag = prediction_dict[(i, j, doc_id)] 395 | if word in pword: 396 | return True, "I-" + tag 397 | return False, "O" 398 | 399 | num_preded = 0 400 | lines = 0 401 | with codecs.open(fout_conll_name, "w", encoding="utf-8") as fout: 402 | for sent in full_setE_list: 403 | first_index = sent[0][1] 404 | last_index = sent[-1][1] 405 | for s, e, doc_id, w in sent: 406 | exist, tag = _check_predicted(w, s, e, doc_id, first_index, last_index) 407 | fout.write(w + "\tNNP\tNP\t" + tag + "\n") 408 | if exist: 409 | num_preded += 1 410 | fout.write("\n") 411 | lines += 1 412 | if lines % 1000 == 0: 413 | print("Converted %d lines to conll!" 
% lines) 414 | assert num_preded >= len(prediction_dict) 415 | 416 | # based on ngram frequency 417 | if __name__ == "__main__": 418 | author_list = "./debug/set012E_author.txt" 419 | author_list = "/home/chuntinz/LORELEI_NER/datasets/post_data/tig/set012E_author.txt" 420 | 421 | setE_conll = "../datasets/setE/tig/setE.conll" 422 | pred = "./debug/pred.conll" 423 | pred = "../eval/ensemble3_59df10_darpa_output.conll" 424 | # pred = "./post_test.txt" 425 | setE_conll = "../new_datasets/setE/tig/setE.conll" 426 | pred = "./debug/ensemble_67.conll" 427 | 428 | # lookup_file = {"Gen": "../eval/oromo/Oromo_Annotated.txt"} 429 | output_file = "post_output_67.txt" 430 | gold_file_path = "../ner_score/tir_setE_edl.tac" 431 | f_conll_out = "post_output_67.conll" 432 | 433 | post_processing(pred, setE_conll, author_list, output_file, lookup_files=None, label_propagate=True, 434 | gold_file_path=gold_file_path, conf_num=2, most_freq_num=100, fout_conll_name=f_conll_out) 435 | # post_process_lookup(pred, setE_conll, author_list, output_file, lookup_file) 436 | 437 | import os 438 | 439 | score_file = "../ner_score/score_tir.sh" 440 | fout_name_before = "./before_score.txt" 441 | fout_name = "./score.txt" 442 | os.system("bash %s %s %s" % (score_file, output_file, fout_name)) 443 | os.system("bash %s %s %s" % (score_file, pred, fout_name_before)) 444 | print open(fout_name).read() 445 | 446 | -------------------------------------------------------------------------------- /utils/segnerfts/README.md: -------------------------------------------------------------------------------- 1 | # Documentation for `segnerfts.py` 2 | 3 | The module `segnerfts` defines NER indicator feature extractors for the following languages: 4 | 5 | | Language | ISO 639-3 | 6 | |----------|-----------| 7 | | Amharic | amh | 8 | | English | eng | 9 | | German | deu | 10 | | Oromo | orm | 11 | | Somali | som | 12 | | Tigrinya | tir | 13 | 14 | ## Dependencies 15 | 16 | This code requires the `unicodecsv` package. 17 | 18 | ## Usage 19 | 20 | The function `extract` takes as arguments the ISO 639-3 code and a list of tokens (ideally, a sentence) and returns a list of feature-value lists, one for each token in the input. 21 | 22 | ```python 23 | >>> import segnerfts 24 | >>> segnerfts.extract('deu', u'Vereinigten Arabischen Republik'.split()) 25 | [[1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0]] 26 | ``` 27 | 28 | The functions `extract_type_level` and `extract_token_level` take arguments of the same types but return only type-level and token-level features, respectively. The function `extract_gaz_features` takes arguments of the same type and returns only the gazetteer features. 29 | 30 | ## Features 31 | 32 | The type-level feature extractors are functions. The token-level feature extractors are dictionaries mapping ISO 639-3 codes to functions. 33 | 34 | ### Type-Level Features 35 | 36 | * `ex_capitalized`: is the first character of the token upper-case? 37 | * `ex_all_uppercased`: are all characters of the token upper-case? 38 | * `ex_mixed_case`: among the non-initial characters, are there both upper case and lower case characters? 39 | * `ex_internal_period`: does the token include a period (full stop) that is non-initial and non-final? 40 | * `ex_non_letter`: does the token include a character that is not a letter and not a mark (according to Unicode definitions)?
41 | * `ex_digits`: does the token contain digits? 42 | * `ex_long_token`: is the token longer than a threshold (default=8 characters)? 43 | * `ex_contains_latin`: does the token include Latin characters? 44 | * `ex_contains_ethiopic`: does the token include Ethiopic characters? 45 | 46 | ### Token-Level Features 47 | 48 | * `ex_title`: is the preceding token a title? Note that in Somali, titles are not used before personal names. 49 | * `ex_head_org`: is the token a head word for an organization? 50 | * `ex_head_loc`: is the token a head word for a location or does it include such a word? 51 | * `ex_head_gpe`: is the token a head word for a geopolitical entity or does it include such a word? 52 | * `ex_prep_from`: is the token, or does the token include, a preposition meaning 'from'? 53 | * `ex_prep_in`: is the token, or does the token include, a preposition meaning 'in'? 54 | 55 | ### Gazetteer Features 56 | 57 | * `ex_b_gaz, LOC`: token is first token of LOC in gazetteer 58 | * `ex_b_gaz, GPE`: token is first token of GPE in gazetteer 59 | * `ex_b_gaz, ORG`: token is first token of ORG in gazetteer 60 | * `ex_b_gaz, PER`: token is first token of PER in gazetteer 61 | * `ex_i_gaz, LOC`: token is non-initial token of LOC in gazetteer 62 | * `ex_i_gaz, GPE`: token is non-initial token of GPE in gazetteer 63 | * `ex_i_gaz, ORG`: token is non-initial token of ORG in gazetteer 64 | * `ex_i_gaz, PER`: token is non-initial token of PER in gazetteer 65 | * `ex_o_gaz`: token is not in a gazetteer entry 66 | -------------------------------------------------------------------------------- /utils/segnerfts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/cmu-ner/d35d57fe453d81cc98e3ee55bac58f9ca618f59b/utils/segnerfts/__init__.py -------------------------------------------------------------------------------- /utils/segnerfts_2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import print_function 4 | 5 | import regex as re 6 | import unicodecsv as csv 7 | import copy 8 | from functools import partial 9 | 10 | 11 | def find_ngrams(input_list, n): 12 | return zip(*[input_list[i:] for i in range(n)]) 13 | 14 | 15 | def get_variants(raw): 16 | raw = raw.replace('; ', ';') 17 | return [tuple(v.split()) for v in raw.split(';')] 18 | 19 | 20 | def load_gaz(gaz_fn): 21 | template = {'GPE': [], 'LOC': [], 'ORG': [], 'PER': []} 22 | gaz = { 23 | 'amh': copy.copy(template), 24 | 'eng': copy.copy(template), 25 | 'deu': copy.copy(template), 26 | 'orm': copy.copy(template), 27 | 'som': copy.copy(template), 28 | 'tir': copy.copy(template), 29 | } 30 | with open(gaz_fn, 'rb') as f: 31 | reader = csv.reader(f, encoding='utf-8') 32 | next(reader) 33 | for fields in reader: 34 | eng, lab, tir, tir_ipa, orm, orm_ipa, wik, id_, _ = fields 35 | if not lab: 36 | if len(eng.split()) == 1: 37 | lab = 'GPE' 38 | if tir and lab: 39 | for v in get_variants(tir): 40 | gaz['tir'][lab].append(v) 41 | if orm and lab: 42 | for v in get_variants(orm): 43 | gaz['orm'][lab].append(v) 44 | return gaz 45 | 46 | 47 | gazetteer = load_gaz('../utils/gaz.csv') 48 | 49 | 50 | def ex_b_gaz(segment, language=None, label=None): 51 | fts = list(map(lambda x: False, segment)) 52 | for entry in gazetteer[language][label]: 53 | ngrams = find_ngrams(segment, len(entry)) 54 | for i, ngram in enumerate(ngrams): 55 | if entry == ngram: 56 | fts[i] = True 57 
| return fts 58 | 59 | 60 | def ex_i_gaz(segment, language=None, label=None): 61 | fts = list(map(lambda x: False, segment)) 62 | for entry in gazetteer[language][label]: 63 | ngrams = find_ngrams(segment, len(entry)) 64 | for i, ngram in enumerate(ngrams): 65 | if entry == ngram: 66 | for j in range(len(ngram) - 1): 67 | fts[i + j + 1] = True 68 | return fts 69 | 70 | 71 | def ex_o_gaz(segment, language=None): 72 | fts = list(map(lambda x: True, segment)) 73 | for label in gazetteer[language].keys(): 74 | for entry in gazetteer[language][label]: 75 | ngrams = find_ngrams(segment, len(entry)) 76 | for i, ngram in enumerate(ngrams): 77 | if entry == ngram: 78 | for j in range(len(ngram)): 79 | fts[i + j] = False 80 | return fts 81 | 82 | 83 | LONG_TOKEN_THRESH = 8 84 | 85 | 86 | def ex_capitalized(ws): 87 | return [w[0].isupper() for w in ws] 88 | 89 | 90 | def ex_all_uppercased(ws): 91 | return [all(x.isupper() for x in w) for w in ws] 92 | 93 | 94 | def ex_mixed_case(ws): 95 | def mixed_case(w): 96 | noninit = [x.isupper() for x in w[1:]] 97 | return True in noninit and False in noninit 98 | return map(mixed_case, ws) 99 | 100 | 101 | def ex_internal_period(ws): 102 | return [len(w) > 2 and '.' in w[1:-1] for w in ws] 103 | 104 | 105 | def ex_non_letter(ws): 106 | return [bool(re.search(r'[^\p{Letter}\p{Mark}]', w)) for w in ws] 107 | 108 | 109 | def ex_digits(ws): 110 | return [bool(re.search(r'[\p{Number}]', w)) for w in ws] 111 | 112 | 113 | def ex_long_token(ws): 114 | return [len(w) > LONG_TOKEN_THRESH for w in ws] 115 | 116 | 117 | def ex_contains_latin(ws): 118 | return [bool(re.search(r'\p{Latin}', w)) for w in ws] 119 | 120 | 121 | def ex_contains_ethiopic(ws): 122 | return [bool(re.search(r'\p{Ethiopic}', w)) for w in ws] 123 | 124 | 125 | ex_title = { 126 | 'eng': lambda ws: [w in { 127 | 'Mister', 128 | 'Mr.', 129 | 'Mr', 130 | 'Misses', 131 | 'Mrs.', 132 | 'Mrs', 133 | 'Miss', 134 | 'Ms.', 135 | 'Ms', 136 | 'Doctor', 137 | 'Dr.', 138 | 'Dr', 139 | 'Professor', 140 | 'Prof.', 141 | 'Prof', 142 | 'Father', 143 | 'Fr.', 144 | 'Fr', 145 | 'Reverend', 146 | 'Rev.', 147 | 'Rev', 148 | 'Revd', 149 | 'Pastor', 150 | 'Bishop', 151 | 'Bp.', 152 | 'Bp', 153 | 'President', 154 | 'Pres.', 155 | 'Representative', 156 | 'Rep.', 157 | 'Rep', 158 | 'Congressman', 159 | 'Congresswoman', 160 | 'Congressperson', 161 | 'Senator', 162 | 'Sen.', 163 | 'Sen', 164 | 'Secretary', 165 | 'Sec.', 166 | 'Sec', 167 | 'Lord', 168 | 'Lady', 169 | 'Justice', 170 | 'Sheriff', 171 | 'Principal', 172 | 'Mayor', 173 | } for w in ws], 174 | 'deu': lambda ws: [w in { 175 | 'Herr', 176 | 'Hr.', 177 | 'Frau', 178 | 'Fr.', 179 | 'Fraulein', 180 | 'Frl.', 181 | 'Doktor', 182 | 'Dr.', 183 | 'Dr.med.', 184 | 'Dr.phil.', 185 | 'Dr.rer.nat.', 186 | 'Dr.jur.', 187 | 'Dr.theol.', 188 | 'Professor', 189 | 'Prof.', 190 | 'a.o.Prof.', 191 | 'o.Pr.', 192 | 'Dozent', 193 | 'Doz.', 194 | 'Richter', 195 | 'Senator', 196 | 'Sen.', 197 | 'Ministerpräsident', 198 | 'Ministerpräsidentin', 199 | 'Bürgermeister', 200 | 'Abgeordenete', 201 | 'Abg.', 202 | 'Bundeskanzler', 203 | 'Landeshauptmann', 204 | 'Kaiser', 205 | 'Kaiserin', 206 | 'König', 207 | 'Königin', 208 | 'Kurfürst', 209 | 'Kurfürstin', 210 | 'Erzherzog', 211 | 'Erzherzogin', 212 | 'Großherzog', 213 | 'Großherzogin', 214 | 'Großfürst', 215 | 'Großfürstin', 216 | 'Herzog', 217 | 'Herzogin', 218 | 'Pfalzgraf', 219 | 'Pfalzgräfin', 220 | 'Markgraf', 221 | 'Markgräfin', 222 | 'Landgraf', 223 | 'Landgräfin', 224 | 'Reichsfürst', 225 | 'Reichsfürstin', 226 | 'Reichsgraf', 227 | 
'Reichsgräfin', 228 | 'Burggraf', 229 | 'Burggräfin', 230 | 'Altgraf', 231 | 'Altgräfin', 232 | 'Reichsfreiherr', 233 | 'Reichsfreifrau', 234 | 'Reichsfreiin', 235 | 'Reichsritter', 236 | 'Ritter', 237 | 'Graf', 238 | 'Gräfin', 239 | 'Edler', 240 | 'Edle', 241 | 'Freifrau', 242 | 'Frfr.', 243 | 'Freiherr', 244 | 'Frhr.', 245 | 'Hochwürden', 246 | 'Pater', 247 | 'Pfarrer', 248 | 'Pastor', 249 | 'P.', 250 | 'Pfarrhelfer', 251 | 'Kaplan', 252 | 'Vikar', 253 | 'Dekan', 254 | 'Bischof', 255 | 'Kapitän', 256 | 'Kpt.', 257 | 'Leutnant', 258 | 'Lt.', 259 | 'Vorsitzender', 260 | 'Vors.', 261 | } for w in ws], 262 | 'amh': lambda ws: [w in { 263 | 'አቶ', # Mr. 264 | 'ወይዘሮ', 265 | 'ወይዘሪት', 266 | 'ፕሮፌሰር', 267 | 'ፕሬዚዳንት', 268 | 'ፐሬዝዳንት', 269 | 'ፕሬዝዳንት', 270 | 'ኮለኔል', 271 | 'ጄኔራል', 272 | 'አቡነ', 273 | 'ቀስ', 274 | 'ሰላም', 275 | 'ሼኽ', 276 | 'ራስ', 277 | 'ቢትወደድ', 278 | 'ወ/ሮ', 279 | 'ወ/ሪት', 280 | 'ድ/ር', 281 | 'ፕ/ር', 282 | 'ፕ/ት', 283 | 'ኮ/ል', 284 | 'ጄ/ል', 285 | 'ሼኽ', 286 | 'ራስ', 287 | 'ቢትወደድ', 288 | 'አዛዥና', 289 | 'ልዑል', 290 | 'ሚኒስቴር', 291 | 'ዕድሜው', 292 | 'ወታደር', 293 | 'ም/ል', 294 | 'ጸሃፊ', 295 | 'ረዳት', 296 | 'ጸሐፊ', 297 | 'አምባሳደር', 298 | 'አስተዳዳሪ', 299 | 'ሪፖርተራችን', 300 | } for w in ws], 301 | 'orm': lambda ws: [w.lower() in { 302 | 'obbo', # Mister 303 | 'obboo', # Mister 304 | 'obo', # Mister 305 | 'abbaa', # Father 306 | 'aba', # Father 307 | 'ministeeraa', # Minister 308 | 'durataa\'aa', # President 309 | 'jeneraal', # General 310 | } for w in ws], 311 | 'tir': lambda ws: [w in { 312 | 'ኣቶ', # Mister_1 313 | 'ጐይታይ', # Mister_2 314 | 'ሓላፊ', # President_1 315 | 'ሓለቓ', # President_2 316 | 'ወዘተ', # President_3 317 | 'ፕረሲደንት', # President_4 318 | 'ፕሬዝዳንት', # President_5 319 | 'ኣቦ', # Father 320 | } for w in ws], 321 | 'som': lambda ws: [w in {} for w in ws], 322 | } 323 | 324 | 325 | ex_head_org = { 326 | 'eng': lambda ws: [w in { 327 | 'Ministry', 328 | 'Department', 329 | 'Agency', 330 | 'Bureau', 331 | 'Company', 332 | 'Corporation', 333 | 'Inc.', 334 | 'Inc', 335 | 'Corp.', 336 | 'Corp', 337 | 'Authority', 338 | 'Organization', 339 | 'Organisation', 340 | 'Committee', 341 | 'Bank', 342 | } for w in ws], 343 | 'deu': lambda ws: [w in { 344 | 'Amt', 345 | 'Ministerium', 346 | 'Agentur', 347 | 'Büro', 348 | 'Organisation', 349 | 'Abteilung', 350 | 'Abt.', 351 | 'Aktiengesellschaft', 352 | 'AG', 353 | 'Union', 354 | 'Genossenschaft', 355 | 'Gen.', 356 | 'Gesellschaft', 357 | 'GmbH', 358 | 'HTL', 359 | 'Regierung', 360 | 'Verband', 361 | 'Kommission', 362 | 'Bank', 363 | } for w in ws], 364 | 'amh': lambda ws: [w in { 365 | 'ሚኒስቴር', 366 | 'ኤጀንሲ', 367 | 'ኮሚሽን', 368 | 'ኮርፖሬሽን', # corporation 369 | 'ድርጅት', 370 | 'ባለሥልጣን', 371 | 'ባንክ', 372 | 'ቢሮ', 373 | 'ኮሚቴ', 374 | 'ኮርፖሬሽን', 375 | 'ምንጮች', 376 | 'ፓርቲ', # party 377 | 'ፓርቲን', # party_2 378 | 'ጋዜጣ', # newpaper 379 | } for w in ws], 380 | 'orm': lambda ws: [w.lower() in { 381 | 'ministirii', # Ministry 382 | 'ministiri', 383 | 'damiyyaa', # Department 384 | 'damiyya', 385 | 'wakkiila', # Agency 386 | 'wakila', 387 | 'dhaabbata', # Organization 388 | 'dhabata', 389 | 'koree', # Committee 390 | 'kore', 391 | 'baankii', # Bank 392 | 'banki', 393 | 'waldaa', # Society 394 | 'walda', 395 | 'waraanni', # Front 396 | 'warnani', 397 | } for w in ws], 398 | 'tir': lambda ws: [w in { 399 | 'ክፍሊ', # Department_1 400 | 'ጨንፈር', # Department_2 401 | 'ዋኒን', # Agency_1 402 | 'ተግባር', # Agency_2 403 | 'ስርሒት', # Agency_3 404 | 'ኤጄንሲ', # Agency_4 405 | 'ሰደቓ', # Bureau 406 | 'ኮርፖረሽን', # Corporation 407 | 'ውድብ', # Organization_1 408 | 'ኣወዳድባ', # Organization_2 409 | 'ኣመሰራርታ', # Organization_3 410 | 
'ኮመት', # Committee_1 411 | 'ሽማግለ', # Committee_2 412 | 'ሰራዊት', # Army 413 | 'ስርዓት', # Regime 414 | } for w in ws], 415 | 'som': lambda ws: [w.lower() in { 416 | 'dowladda', # government 417 | 'maamulka', # administration 418 | 'xafiiska', # office 419 | 'wasaaradda', # ministry 420 | 'hay\'adda', # agency 421 | 'shirkadda', # corporation 422 | 'saacadaha', # organization 423 | 'guddi', # board 424 | 'bankiga', # bank 425 | 'ciidamada', # army 426 | 'kooxda', # faction 427 | 'shabakada', # network 428 | } for w in ws], 429 | } 430 | 431 | 432 | ex_head_loc = { 433 | 'eng': lambda ws: [w in { 434 | 'Island', 435 | 'Lake', 436 | 'River', 437 | 'Sea', 438 | 'Ocean', 439 | 'Mountain', 440 | 'Mountains', 441 | 'Valley', 442 | 'Bay', 443 | 'Mosque', 444 | 'Cathedral', 445 | 'Church', 446 | } for w in ws], 447 | 'deu': lambda ws: [any([ 448 | re.search('[Bb]erg$', w), 449 | re.search('[Gg]ebirge$', w), 450 | re.search('[Ss]ee$', w), 451 | re.search('[Mm]eer$', w), 452 | re.search('[Oo]zean$', w), 453 | re.search('[Tt]al$', w), 454 | re.search('wald$', w), 455 | re.search('[Bb]ucht$', w), 456 | re.search('[Kk]irche$', w), 457 | re.search('[Mm]oschee$', w), 458 | ]) for w in ws], 459 | 'amh': lambda ws: [w in { 460 | 'ደሴት', 461 | 'ሐይክ', 462 | 'ወንዝ', 463 | 'ባህር', 464 | 'ወቅያኖስ', 465 | 'ተራራ', 466 | 'ሸለቆ', 467 | 'ሰፈር', 468 | 'ወሽመጥ', 469 | 'መስጊድ', 470 | 'ሀገር', 471 | 'ሆስፒታል', # hospital 472 | } for w in ws], 473 | 'orm': lambda ws: [w.lower() in { 474 | 'odoola', # Island 475 | 'odola', 476 | 'odoolota', # Islands 477 | 'odolota', 478 | 'calalaqa', # Lake_1 479 | 'dabbal', # Lake_2 480 | 'dabal', 481 | 'hara', # Lake_3 482 | 'laaqii', # Lake_4 483 | 'laqi', 484 | 'lagaa', # River 485 | 'laga', 486 | 'garba', # Sea 487 | 'maanya', # Ocean 488 | 'manya', 489 | 'gooroo', # Mountains 490 | 'goro', 491 | 'gaara', # Mountain 492 | 'sulula', # Valley 493 | 'bataskaana', # Church 494 | 'masqiida', # Mosque 495 | } for w in ws], 496 | 'tir': lambda ws: [w in { 497 | 'ደሴት', # Island_1 498 | 'ግሉል', # Island_2 499 | 'ብሕቱው', # Island_3 500 | 'ቀላይ', # Lake_1 501 | 'ወይናይ', # Lake_2 502 | 'ፈለግ', # River 503 | 'ባሕሪ', # Sea 504 | 'ሰፊሕ', # Ocean 505 | 'ጎቦ', # Mountain_1 506 | 'እምባ', # Mountain_2 507 | 'ሩባ', # Valley_1 508 | 'ለሰ', # Valley_2 509 | 'ሕሉም', # Valley_3 510 | 'ስንጭሮ', # Valley_4 511 | 'በተኽስያን', # Church 512 | 'መስጊድ', # Mosque 513 | } for w in ws], 514 | 'som': lambda ws: [w.lower() in { 515 | 'jasiirad', # island 516 | 'harada', # lake 517 | 'buurta', # mountain 518 | 'dooxada', # valley 519 | 'badweynta', # ocean 520 | 'webiga', # river 521 | 'masaajid', # mosque 522 | 'hoteel', # hotel 523 | 'hotelka', # hotel 524 | 'hotel', # hotel 525 | 'degmada', # district 526 | 'deegaanka', # district 527 | } for w in ws], 528 | } 529 | 530 | 531 | ex_head_gpe = { 532 | 'eng': lambda ws: [w in { 533 | 'District', 534 | 'Zone', 535 | 'Region', 536 | 'Province', 537 | 'Division', 538 | 'Republic', 539 | 'Nation', 540 | 'City', 541 | 'Town', 542 | 'Village', 543 | 'State', 544 | } for w in ws], 545 | 'deu': lambda ws: [any([ 546 | re.search('[rR]epublik$', w), 547 | re.search('land$', w), 548 | re.search('stan$', w), 549 | re.search('[sS]tadt$', w), 550 | re.search('heim$', w), 551 | re.search('dorf$', w), 552 | re.search('hausen$', w), 553 | re.search('burg$', w), 554 | re.search('berg$', w), 555 | re.search('gau$', w), 556 | re.search('[pP]rovinz$', w) 557 | ]) for w in ws], 558 | 'amh': lambda ws: [w in { 559 | 'ከተማ', 560 | 'መንደር', 561 | 'ቀበሌ', 562 | 'ወረዳ', 563 | 'ዞን', 564 | 'ክልል', 565 | 'አውራጃ', 566 | 'መንግስት', 567 | 'ክፍላት', 
568 | 'ጦር', 569 | 'ዙሪያ', 570 | 'ላይ', 571 | 'ተከማ', # town 572 | } for w in ws], 573 | 'orm': lambda ws: [w.lower() in { 574 | 'koonyaa', # District_1 575 | 'konya', 576 | 'aanaa', # District_2 577 | 'ana', 578 | 'goltaa', # Zone_1 579 | 'golta', 580 | 'godina', # Zone_2 581 | 'naannoo', # Region 582 | 'nano', 583 | 'jamuriyaa', # Republic_1 584 | 'jamuriya', 585 | 'republika', # Republic_2 586 | 'magaalaa', # City 587 | 'magala', 588 | 'magaalaan', 589 | 'magalan', 590 | 'daabbaa', # Town 591 | 'daba', 592 | 'dira', # Big Town 593 | 'gandaa', # Village 594 | 'ganda', 595 | 'mootummaa', 596 | 'motuma', 597 | } for w in ws], 598 | 'tir': lambda ws: [w in { 599 | 'ወረዳ', # District 600 | 'ዞባ', # Zone 601 | 'ከተማ', # City 602 | 'ዞና', # Region 603 | 'መንግስቲ', # State 604 | 'ኣውራጃ', # Prefecture/Province 605 | 'ረፑብሊክ', # Republic 606 | 'ከተማ', # City 607 | 'ገጠር', # Village_1 608 | 'ቁሸት', # Village_2 609 | 'ዓዲ', # Village_3 610 | } for w in ws], 611 | 'som': lambda ws: [w.lower() in { 612 | 'dalka', # country 613 | 'dalalka', # country 614 | 'gobolka', # province, state 615 | 'magaalada', # city 616 | 'tuulo', # village 617 | 'jamhuuriyadda', # republic 618 | } for w in ws], 619 | } 620 | 621 | 622 | ex_prep_from = { 623 | 'eng': lambda ws: [w.lower() == 'from' for w in ws], 624 | 'deu': lambda ws: [w.lower() in {'von', 'vom'} for w in ws], 625 | 'amh': lambda ws: [bool(re.match('ከ', w)) for w in ws], 626 | 'orm': lambda ws: [w.lower() in {'irraa', 'ira'} for w in ws], 627 | 'tir': lambda ws: [w in {'ካብ'} for w in ws], 628 | 'som': lambda ws: [w in {'ilaa'} for w in ws], 629 | } 630 | 631 | 632 | ex_prep_in = { 633 | 'eng': lambda ws: [w.lower() == 'in' for w in ws], 634 | 'deu': lambda ws: [w.lower() in {'in', 'im'} for w in ws], 635 | 'amh': lambda ws: [bool(re.match('በ', w)) for w in ws], 636 | 'orm': lambda ws: [w.lower() in {'keessa', 'kesa', 'itti', 'iti'} for w in ws], 637 | 'tir': lambda ws: [w in {'ኣብ'} for w in ws], 638 | 'som': lambda ws: [w in {'ee'} for w in ws], 639 | } 640 | 641 | 642 | extractors = [ 643 | lambda lang: ex_capitalized, 644 | lambda lang: ex_all_uppercased, 645 | lambda lang: ex_mixed_case, 646 | lambda lang: ex_internal_period, 647 | lambda lang: ex_non_letter, 648 | lambda lang: ex_digits, 649 | lambda lang: ex_long_token, 650 | lambda lang: ex_contains_latin, 651 | lambda lang: ex_contains_ethiopic, 652 | lambda lang: ex_title[lang], 653 | lambda lang: ex_head_org[lang], 654 | lambda lang: ex_head_loc[lang], 655 | lambda lang: ex_head_gpe[lang], 656 | lambda lang: ex_prep_from[lang], 657 | lambda lang: ex_prep_in[lang], 658 | lambda lang: partial(ex_b_gaz, language=lang, label='GPE'), 659 | lambda lang: partial(ex_b_gaz, language=lang, label='LOC'), 660 | lambda lang: partial(ex_b_gaz, language=lang, label='ORG'), 661 | lambda lang: partial(ex_b_gaz, language=lang, label='PER'), 662 | lambda lang: partial(ex_i_gaz, language=lang, label='GPE'), 663 | lambda lang: partial(ex_i_gaz, language=lang, label='LOC'), 664 | lambda lang: partial(ex_i_gaz, language=lang, label='ORG'), 665 | lambda lang: partial(ex_i_gaz, language=lang, label='PER'), 666 | lambda lang: partial(ex_o_gaz, language=lang), 667 | ] 668 | 669 | 670 | TYPE_START, TYPE_END = 0, 9 671 | TOKEN_START, TOKEN_END = 9, 15 672 | GAZ_START, GAZ_END = 15, 24 673 | 674 | 675 | def fake_extract(lang, seg): 676 | fts = [ex(lang)(seg) for ex in extractors] 677 | return fts 678 | 679 | 680 | def extract(lang, seg): 681 | fts = zip(*[ex(lang)(seg) for ex in extractors]) 682 | return [list(map(int, f)) for f in fts] 683 | 
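# --- Editor's note (illustrative sketch, not part of the original module): with the 24 extractors listed
# above, extract(lang, seg) returns one 0/1 vector of length 24 per token, laid out as
#   [0:9)   type-level features  (TYPE_START:TYPE_END)
#   [9:15)  token-level features (TOKEN_START:TOKEN_END)
#   [15:24) gazetteer features   (GAZ_START:GAZ_END)
# so, e.g., [v[GAZ_START:GAZ_END] for v in extract('tir', seg)] gives the same result as the
# extract_gaz_features('tir', seg) helper defined below. ---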
684 | 685 | def extract_type_level(lang, seg): 686 | fts = extract(lang, seg) 687 | return [v[TYPE_START:TYPE_END] for v in fts] 688 | 689 | 690 | def extract_token_level(lang, seg): 691 | fts = extract(lang, seg) 692 | return [v[TOKEN_START:TOKEN_END] for v in fts] 693 | 694 | 695 | def extract_gaz_features(lang, seg): 696 | fts = extract(lang, seg) 697 | return [v[GAZ_START:GAZ_END] for v in fts] 698 | 699 | 700 | def extract_type_token_level(lang, seg): 701 | fts = extract(lang, seg) 702 | return [v[TYPE_START:TOKEN_END] for v in fts] 703 | 704 | if __name__ == "__main__": 705 | seg = [u'\u121d\u12dd\u1263\u12d5', u'\u12a3\u12e8\u122d', u'-', u'\u12f6\u1265', u'\u12a3\u120d\u1266', u'\u12c8\u1325\u122a', u'\u12d3\u1208\u121d'] 706 | b = extract("tir", seg) 707 | print(b) 708 | # a = extract_gaz_features("tir", seg) 709 | # print(a) 710 | -------------------------------------------------------------------------------- /utils/split_train_ensemble.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import codecs 3 | from random import shuffle 4 | 5 | 6 | def split(path, write_to, split_num): 7 | tot_data = [] 8 | 9 | with codecs.open(path, "r", "utf-8") as fin: 10 | one_sent = [] 11 | for line in fin: 12 | if line.strip() == "": 13 | if len(one_sent) > 0: 14 | tot_data.append(one_sent) 15 | one_sent = [] 16 | else: 17 | one_sent.append(line.strip()) 18 | if len(one_sent) > 0: 19 | tot_data.append(one_sent) 20 | 21 | shuffle(tot_data) 22 | 23 | # Chunk size via integer division; the final boundary below absorbs any remainder. 24 | divs = len(tot_data) // split_num 25 | splits = [k * divs for k in range(split_num)] + [len(tot_data)] 26 | for i in range(split_num): 27 | with codecs.open(write_to + "cp3_train_ens_" + str(i) + ".conll", "w", "utf-8") as fout: 28 | for j in range(splits[i], splits[i+1]): 29 | for line in tot_data[j]: 30 | fout.write(line + "\n") 31 | fout.write("\n") 32 | 33 | if __name__ == "__main__": 34 | # Usage: python split_train_ensemble.py ../datasets/cp3/oromo/cp3_train.conll ../datasets/cp3/oromo/ 5 35 | fname = sys.argv[1] 36 | write_to_folder = sys.argv[2] 37 | split_num = int(sys.argv[3]) 38 | 39 | split(fname, write_to_folder, split_num) -------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | __author__ = 'chuntingzhou' 2 | import dynet as dy 3 | import numpy as np 4 | from collections import defaultdict 5 | import gzip 6 | import cPickle as pkl 7 | import codecs 8 | import math 9 | import random 10 | from random import shuffle 11 | random.seed(448) 12 | np.random.seed(1) 13 | import operator 14 | import re 15 | MAX_CHAR_LENGTH = 45 16 | 17 | # Regular expressions used to normalize digits. 18 | DIGIT_RE = re.compile(br"\d") 19 | 20 | # word = utils.DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0] 21 | 22 | 23 | def iob2(tags): 24 | """ 25 | Check that tags have a valid IOB format. 26 | Tags in IOB1 format are converted to IOB2.
27 | """ 28 | for i, tag in enumerate(tags): 29 | if tag == 'O': 30 | continue 31 | split = tag.split('-') 32 | if len(split) != 2 or split[0] not in ['I', 'B']: 33 | return False 34 | if split[0] == 'B': 35 | continue 36 | elif i == 0 or tags[i - 1] == 'O': # conversion IOB1 to IOB2 37 | tags[i] = 'B' + tag[1:] 38 | elif tags[i - 1][1:] == tag[1:]: 39 | continue 40 | else: # conversion IOB1 to IOB2 41 | tags[i] = 'B' + tag[1:] 42 | return True 43 | 44 | 45 | def get_entity(label): 46 | entities = [] 47 | i = 0 48 | while i < len(label): 49 | if label[i] != 'O': 50 | e_type = label[i][2:] 51 | j = i + 1 52 | while j < len(label) and label[j] == 'I-' + e_type: 53 | j += 1 54 | entities.append((i, j, e_type)) 55 | i = j 56 | else: 57 | i += 1 58 | return entities 59 | 60 | 61 | def evaluate_ner(pred, gold): 62 | tp = 0 63 | fp = 0 64 | fn = 0 65 | for i in range(len(pred)): 66 | pred_entities = get_entity(pred[i]) 67 | gold_entities = get_entity(gold[i]) 68 | temp = 0 69 | for entity in pred_entities: 70 | if entity in gold_entities: 71 | tp += 1 72 | temp += 1 73 | else: 74 | fp += 1 75 | fn += len(gold_entities) - temp 76 | precision = 1.0 * tp / (tp + fp) 77 | recall = 1.0 * tp / (tp + fn) 78 | f1 = 2 * precision * recall / (precision + recall) 79 | return precision, recall, f1 80 | 81 | 82 | def fopen(filename, mode='r'): 83 | if filename.endswith('.gz'): 84 | return gzip.open(filename, mode) 85 | return open(filename, mode) 86 | 87 | 88 | def get_pretrained_emb(path_to_emb, word_to_id, dim): 89 | word_emb = [] 90 | print "Loading pretrained embeddings from %s." % (path_to_emb) 91 | for _ in range(len(word_to_id)): 92 | word_emb.append(np.random.uniform(-math.sqrt(3.0/dim), math.sqrt(3.0/dim), size=dim)) 93 | 94 | print "length of dict: ", len(word_to_id) 95 | pretrain_word_emb = {} 96 | for line in codecs.open(path_to_emb, "r", "utf-8", errors='replace'): 97 | items = line.strip().split() 98 | if len(items) == dim + 1: 99 | try: 100 | pretrain_word_emb[items[0]] = np.asarray(items[1:]).astype(np.float32) 101 | except ValueError: 102 | continue 103 | 104 | not_covered = 0 105 | for word, id in word_to_id.iteritems(): 106 | if word in pretrain_word_emb: 107 | word_emb[id] = pretrain_word_emb[word] 108 | elif word.lower() in pretrain_word_emb: 109 | word_emb[id] = pretrain_word_emb[word.lower()] 110 | else: 111 | not_covered += 1 112 | 113 | emb = np.array(word_emb, dtype=np.float32) 114 | 115 | print "Word number not covered in pretrain embedding: ", not_covered 116 | return emb, word_to_id 117 | 118 | 119 | def pkl_dump(obj, path): 120 | with open(path, "wb") as fout: 121 | pkl.dump(obj, fout) 122 | 123 | 124 | def pkl_load(path): 125 | with open(path, "rb") as fin: 126 | obj = pkl.load(fin) 127 | return obj 128 | 129 | 130 | def log_sum_exp_dim_0(x): 131 | # numerically stable log_sum_exp 132 | dims = x.dim() 133 | max_score = dy.max_dim(x, 0) # (dim_1, batch_size) 134 | if len(dims[0]) == 1: 135 | max_score_extend = max_score 136 | else: 137 | max_score_reshape = dy.reshape(max_score, (1, dims[0][1]), batch_size=dims[1]) 138 | max_score_extend = dy.concatenate([max_score_reshape] * dims[0][0]) 139 | x = x - max_score_extend 140 | exp_x = dy.exp(x) 141 | # (dim_1, batch_size), if no dim_1, return ((1,), batch_size) 142 | log_sum_exp_x = dy.log(dy.mean_dim(exp_x, d=[0], b=False) * dims[0][0]) 143 | return log_sum_exp_x + max_score 144 | 145 | 146 | def data_iterator(data_pair, batch_size): 147 | batches = make_bucket_batches(data_pair, batch_size) 148 | for batch in batches: 149 | yield 
batch 150 | 151 | 152 | def make_bucket_batches(data_collections, batch_size): 153 | # Data are bucketed according to the length of the first item in the data_collections. 154 | buckets = defaultdict(list) 155 | tot_items = len(data_collections[0]) 156 | for data_item in data_collections: 157 | src = data_item[0] 158 | buckets[len(src)].append(data_item) 159 | 160 | batches = [] 161 | # np.random.seed(2) 162 | for src_len in buckets: 163 | bucket = buckets[src_len] 164 | np.random.shuffle(bucket) 165 | 166 | num_batches = int(np.ceil(len(bucket) * 1.0 / batch_size)) 167 | for i in range(num_batches): 168 | cur_batch_size = batch_size if i < num_batches - 1 else len(bucket) - batch_size * i 169 | batches.append([[bucket[i * batch_size + j][k] for j in range(cur_batch_size)] for k in range(tot_items)]) 170 | np.random.shuffle(batches) 171 | return batches 172 | 173 | 174 | def transpose_input(seq, padding_token=0): 175 | # input seq: list of samples [[w1, w2, ..], [w1, w2, ..]] 176 | max_len = max([len(sent) for sent in seq]) 177 | seq_pad = [] 178 | seq_mask = [] 179 | for i in range(max_len): 180 | pad_temp = [sent[i] if i < len(sent) else padding_token for sent in seq] 181 | mask_temp = [1.0 if i < len(sent) else 0.0 for sent in seq] 182 | seq_pad.append(pad_temp) 183 | seq_mask.append(mask_temp) 184 | 185 | return seq_pad, seq_mask 186 | 187 | 188 | def transpose_discrete_features(feature_batch): 189 | # Discrete features are zero-one features 190 | # TODO: Other integer features, create lookup tables 191 | # tgt_batch: [[[feature of word 1 of sent 1], [feature of word 2 of sent 2], ]] 192 | # return: [(feature_num, batchsize)] 193 | max_sent_len = max([len(s) for s in feature_batch]) 194 | feature_num = len(feature_batch[0][0]) 195 | batch_size = len(feature_batch) 196 | features = [] # each: (feature_num, batch_size) 197 | for i in range(max_sent_len): 198 | w_i_feature = [dy.inputTensor(sent[i], batched=True) if i < len(sent) else dy.zeros(feature_num) for sent in feature_batch] 199 | w_i_feature = dy.reshape(dy.concatenate(w_i_feature, d=1), (feature_num,), batch_size=batch_size) 200 | features.append(w_i_feature) 201 | 202 | return features 203 | 204 | 205 | def transpose_and_batch_embs(input_embs, emb_size): 206 | # input_embs: [[w1_emb, w2_emb, ]], embs are dy.expressions 207 | max_len = max(len(sent) for sent in input_embs) 208 | batch_size = len(input_embs) 209 | padded_seq_emb = [] 210 | seq_masks = [] 211 | for i in range(max_len): 212 | w_i_emb = [sent[i] if i < len(sent) else dy.zeros(emb_size) for sent in input_embs] 213 | w_i_emb = dy.reshape(dy.concatenate(w_i_emb, d=1), (emb_size, ), batch_size=batch_size) 214 | w_i_mask = [1.0 if i < len(sent) else 0.0 for sent in input_embs] 215 | padded_seq_emb.append(w_i_emb) 216 | seq_masks.append(w_i_mask) 217 | 218 | return padded_seq_emb, seq_masks 219 | 220 | 221 | def transpose_char_input(tgt_batch, padding_token): 222 | # The tgt_batch may not be padded with and 223 | # tgt_batch: [[[, , ], [, s,h,e, ], 224 | # [, i,s, ], [, p,r,e,t,t,y, ], [, , ]], [[],[],[]]] 225 | max_sent_len = max([len(s) for s in tgt_batch]) 226 | sent_w_batch = [] # each is list of list: max_word_len, batch_size 227 | sent_mask_batch = [] # each is list of list: max_word_len, batch_size 228 | max_w_lens = [] 229 | SOW_PAD = 0 230 | EOW_PAD = 1 231 | EOS_PAD = 2 232 | for i in range(max_sent_len): 233 | max_len_w = max([len(sent[i]) for sent in tgt_batch if i < len(sent)]) 234 | max_w_lens.append(max_len_w) 235 | w_batch = [] 236 | mask_batch = [] 237 | 
for j in range(0, max_len_w): 238 | temp_j_w = [] 239 | for sent in tgt_batch: 240 | if i < len(sent) and j < len(sent[i]): 241 | temp_j_w.append(sent[i][j]) 242 | elif i >= len(sent): 243 | if j == 0: 244 | temp_j_w.append(SOW_PAD) 245 | elif j == max_len_w - 1: 246 | temp_j_w.append(EOW_PAD) 247 | else: 248 | temp_j_w.append(EOS_PAD) 249 | else: 250 | temp_j_w.append(EOW_PAD) 251 | # w_batch = [sent[i][j] if i < len(sent) and j < len(sent[i]) else self.EOW for sent in tgt_batch] 252 | # print "temp: ", temp_j_w 253 | w_batch.append(temp_j_w) 254 | mask_batch.append([1. if i < len(sent) and j < len(sent[i]) else 0.0 for sent in tgt_batch]) 255 | sent_w_batch.append(w_batch) 256 | sent_mask_batch.append(mask_batch) 257 | return sent_w_batch, sent_mask_batch, max_sent_len, max_w_lens 258 | 259 | 260 | if __name__ == "__main__": 261 | # from scipy.misc import logsumexp 262 | # import numpy as np 263 | # 264 | # # a = np.random.rand(3, 4, 2) 265 | # # b = logsumexp(a, axis=0) 266 | # # a_t = dy.inputTensor(a, batched=True) 267 | # # b_t = log_sum_exp_dim_0(a_t) 268 | # # print "numpy " 269 | # # print b 270 | # # print "dynet " 271 | # # print b_t.value(), b_t.dim() 272 | # # print dy.pick_batch_elem(b_t, 1).npvalue() 273 | # 274 | # a = np.random.rand(3, 2) 275 | # b = logsumexp(a, axis=0) 276 | # a_t = dy.inputTensor(a, batched=True) 277 | # b_t = log_sum_exp_dim_0(a_t) 278 | # print "numpy " 279 | # print b 280 | # print "dynet " 281 | # print b_t.value(), b_t.dim() 282 | # print dy.pick_batch_elem(b_t, 1).npvalue() 283 | dim = 100 284 | #9 1000 285 | path_to_emb = "/Users/zct/Downloads/tir1.emb" 286 | # path_to_emb = "../datasets/english/glove.6B/glove.6B.100d.txt" 287 | pretrain_word_emb = {} 288 | i = 1 289 | for line in codecs.open(path_to_emb, "r", 'utf-8', errors='replace'): 290 | items = line.strip().split() 291 | if len(items) == dim + 1: 292 | try: 293 | pretrain_word_emb[items[0]] = np.asarray(items[1:]).astype(np.float32) 294 | except ValueError: 295 | continue 296 | print items[0], i, pretrain_word_emb[items[0]][:3] 297 | i += 1 298 | 299 | # gradient clipping 300 | # turn off the dropout 301 | # use smaller initial lr 302 | # variational dropout --------------------------------------------------------------------------------
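Editor's note: the snippet below is a minimal usage sketch added for illustration and is not part of the repository. It assumes the repository root is on PYTHONPATH and that DyNet is installed (utils/util.py imports dynet at module load); the toy word/tag ids are made up.

# Minimal usage sketch for the batching and evaluation helpers in utils/util.py.
from utils.util import make_bucket_batches, transpose_input, evaluate_ner

# Toy dataset: each item is (word_id_sequence, tag_id_sequence).
data = [([1, 2, 3], [4, 4, 5]), ([6, 7], [5, 4]), ([8, 9, 10], [4, 5, 4])]

for word_seqs, tag_seqs in make_bucket_batches(data, batch_size=2):
    # Sentences are bucketed by length, shuffled, and grouped field-by-field.
    padded, mask = transpose_input(word_seqs, padding_token=0)
    # padded and mask are time-major nested lists of shape (max_len, batch_size).

# Span-level precision/recall/F1 over BIO label sequences.
pred = [['B-PER', 'I-PER', 'O']]
gold = [['B-PER', 'I-PER', 'O']]
print(evaluate_ner(pred, gold))  # -> (1.0, 1.0, 1.0)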