├── arch.png
├── example.png
├── weights.hd5f
├── model.py
├── README.md
├── test.py
└── train.py

/arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassanAzzam/Arabic-NER/HEAD/arch.png
--------------------------------------------------------------------------------
/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassanAzzam/Arabic-NER/HEAD/example.png
--------------------------------------------------------------------------------
/weights.hd5f:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassanAzzam/Arabic-NER/HEAD/weights.hd5f
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
# coding: utf-8

from keras.layers import Dense, Input, GRU, Masking
from keras.layers import Bidirectional, TimeDistributed
from keras.models import Model
from keras_contrib.layers import CRF


def build_model():
    """Build the Bi-GRU + CRF tagger.

    Input: sequences of 300-dim fastText word vectors.
    Output: per-timestep scores over the 9 ANERCorp tag classes.
    """
    crf_layer = CRF(9)  # 9 tags: B-/I- x {LOC, MISC, ORG, PERS} plus O
    input_layer = Input(shape=(None, 300))
    # all-zero timesteps are padding; Masking tells downstream layers to skip them
    mask_layer = Masking(mask_value=0.)(input_layer)
    bi_gru = Bidirectional(GRU(10, return_sequences=True))(mask_layer)
    bi_gru = TimeDistributed(Dense(10, activation="relu"))(bi_gru)
    output_layer = crf_layer(bi_gru)
    # the CRF layer is returned too: its loss and accuracy are needed at compile time
    return Model(input_layer, output_layer), crf_layer
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Arabic-NER

A Named Entity Recognition (NER) model that tags words in Arabic text.

## Architecture

The model consists of a bidirectional GRU layer (one forward pass and one backward pass) followed by a CRF layer.
![Architecture](https://raw.githubusercontent.com/HassanAzzam/Arabic-NER/master/arch.png)
The architecture is adapted from the Bi-LSTM-CRF model of https://arxiv.org/pdf/1508.01991v1.pdf, with GRU units in place of LSTM.
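`model.build_model()` returns both the Keras model and the CRF layer, whose loss and accuracy functions are needed at compile time. A minimal sketch, mirroring what `train.py` does:

    import model

    tagger, crf = model.build_model()
    # keras_contrib's CRF supplies its own loss and metric
    tagger.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
    tagger.summary()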
## Specifications

The model is trained on the [ANERCorp dataset](http://users.dsic.upv.es/~ybenajiba/downloads.html) ([more details](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp)) and uses fastText's pre-trained Arabic word vectors (`wiki.ar.vec`) as embeddings.

Number of epochs: 20

Accuracy: 94.2%

Classification report:

                 precision    recall  f1-score   support

           LOC        0.99      0.99      0.99     11055
          PERS        0.74      0.65      0.69       824
           ORG        0.64      0.46      0.54       503
          MISC        0.63      0.38      0.47       237

    avg / total       0.95      0.94      0.94     12619

F1 score: 95.0%

## Sample
#### Input
ماذا يفعل طلال عبد الهادي في دبي بعد ما رجع من برلين؟ كان يعمل هناك في شركة فولكسفاجن، صحيح؟

(English: "What is Talal Abdul Hadi doing in Dubai after coming back from Berlin? He used to work there at Volkswagen, right?")
#### Output
![Example](https://raw.githubusercontent.com/HassanAzzam/Arabic-NER/master/example.png)
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
# coding: utf-8

import time
import re

import numpy as np
import gensim.models.keyedvectors as word2vec
from terminaltables import AsciiTable

import model

print('Loading word embedding model...')

start_time = time.time()
embedding = word2vec.KeyedVectors.load_word2vec_format('wiki.ar.vec', binary=False)
load_time = time.time() - start_time

print('Loaded model in ' + str(load_time) + ' seconds')

# punctuation and other junk characters stripped from every token
dump_chars = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’،ـ؟؛«» '

def clean_word(word):
    word = word.translate(str.maketrans({key: None for key in dump_chars}))

    # remove tashkeel (Arabic diacritics)
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    word = re.sub(p_tashkeel, "", word)

    return word

raw_sent = input()
raw_sent = raw_sent.split()
sent = []
X = []
for i, word in enumerate(raw_sent):
    word = clean_word(word)
    if len(word) > 0:
        sent.append(word)
        try:
            X.append(embedding[word])
        except KeyError:
            # out-of-vocabulary words fall back to the 'unk' vector
            X.append(embedding['unk'])

# zero-pad to the fixed length (212) used at training time;
# the model's Masking layer skips these timesteps
X += [[0] * 300] * (212 - len(sent))

test_model, _ = model.build_model()
test_model.load_weights('weights.hd5f')
pred = test_model.predict(np.array([X]))

tag_classes = ['B-LOC', 'B-MISC', 'B-ORG', 'B-PERS', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PERS', 'O']

table_data = [['word', 'prediction']]
for i, word in enumerate(sent):
    table_data.append([word, tag_classes[np.argmax(pred[0][i])]])
table = AsciiTable(table_data)
print(table.table)
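
# --- Illustrative helper (not part of the original script) ---
# The steps above can be wrapped into a reusable function; a minimal sketch,
# assuming `embedding`, `clean_word`, `tag_classes`, and a built model as above.
# `tag_sentence` is a hypothetical name introduced here for illustration.
def tag_sentence(raw, embedding, tagger, max_len=212, dim=300):
    words = [w for w in (clean_word(t) for t in raw.split()) if w]
    vecs = [embedding[w] if w in embedding else embedding['unk'] for w in words]
    vecs += [[0] * dim] * (max_len - len(vecs))  # zero-pad for the Masking layer
    pred = tagger.predict(np.array([vecs]))[0]
    return [(w, tag_classes[np.argmax(pred[i])]) for i, w in enumerate(words)]

# e.g. tag_sentence('ذهب خالد إلى دبي', embedding, test_model)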
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import time
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim.models.keyedvectors as word2vec
from keras.utils import to_categorical
from seqeval.metrics import classification_report, f1_score

import model

print('Loading word embedding model...')

start_time = time.time()
embedding = word2vec.KeyedVectors.load_word2vec_format('wiki.ar.vec', binary=False)
load_time = time.time() - start_time

print('Loaded model in ' + str(load_time) + ' seconds')

# punctuation and other junk characters stripped from every token
dump_chars = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’،ـ؟؛«» '

def clean_word(word):
    word = word.translate(str.maketrans({key: None for key in dump_chars}))

    # remove tashkeel (Arabic diacritics)
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    word = re.sub(p_tashkeel, "", word)

    return word

def load_data():
    """Read ANERCorp: one 'word TAG' pair per line; sentences end with '. O'."""
    path = 'ANERCorp'
    with open(path, 'r') as f:
        sents = f.read().split('\n. O\n')

    # tokenize words and collect their tags
    words = [None] * len(sents)
    tokens = [None] * len(sents)
    for i, sent in enumerate(sents):
        sent = sent.split('\n')
        words[i] = []
        tokens[i] = []
        for word in sent:
            line = word.rsplit(' ', 1)
            line[0] = clean_word(line[0])
            if len(line[0]) > 0:
                words[i].append(line[0])
                tokens[i].append(line[1])

    # drop sentences emptied out by cleaning
    return [d for d in words if len(d) > 0], [d for d in tokens if len(d) > 0]

# load data
sents, labels = load_data()

tag_classes = ['B-LOC', 'B-MISC', 'B-ORG', 'B-PERS', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PERS', 'O']

# embed words; out-of-vocabulary words fall back to the 'unk' vector
for i, sent in enumerate(sents):
    for j, word in enumerate(sent):
        try:
            sents[i][j] = embedding[word]
        except KeyError:
            sents[i][j] = embedding['unk']

# one-hot encode labels
for i, tokens in enumerate(labels):
    labels[i] = [to_categorical(tag_classes.index(tag), num_classes=len(tag_classes)) for tag in tokens]

################################
# No. sentences: 4898
# No. all words: 135717
# No. 3/4 all words: 101787
# Index of 3/4 sentences: 3569
################################

# pad all sequences to a fixed length of 212
max_sent_length = 212
sents_lengths = []
for i, sent in enumerate(sents):
    sents_lengths.append(len(sent))
    l = max_sent_length - len(sent)
    sents[i] += [[0] * 300] * l

for i, label in enumerate(labels):
    l = max_sent_length - len(label)
    labels[i] += [[0] * 9] * l  # all-zero label vectors for padded timesteps

# split data (roughly 3/4 train, 1/4 test)
split = 3673
train_x, train_y = sents[:split], labels[:split]
test_x, test_y = sents[split:], labels[split:]

# build model
train_model, crf_layer = model.build_model()
train_model.compile(optimizer="rmsprop", loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
train_model.summary()

# train model
history = train_model.fit(np.array(train_x, dtype='float64'), np.array(train_y, dtype='float64'),
                          epochs=20, verbose=1,
                          validation_data=(np.array(test_x, dtype='float64'), np.array(test_y, dtype='float64')))

# save weights
train_model.save_weights('weights.hd5f')

# plot validation accuracy and loss
# (the history key may be 'val_acc' or 'val_accuracy' depending on the Keras version)
hist = pd.DataFrame(history.history)
plt.style.use("ggplot")
plt.figure(figsize=(6, 6))
plt.plot(hist["val_acc"])
plt.plot(hist["val_loss"])
plt.show()

# testing: strip the padding before scoring, using each test sentence's true length
pred = train_model.predict(np.array(test_x, dtype='float64'))
pred_x = []
pred_y = []
for i in range(len(pred)):
    length = sents_lengths[split + i]  # true length of the i-th *test* sentence
    pred_x.append([tag_classes[np.argmax(w)] for w in pred[i][:length]])
    pred_y.append([tag_classes[np.argmax(w)] for w in test_y[i][:length]])

print(classification_report(pred_y, pred_x))
print('f1_score: ')
print(f1_score(pred_y, pred_x))
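
# --- Illustrative follow-up (not part of the original script) ---
# The saved weights can be reloaded later without retraining; a minimal
# sketch, assuming test_x was prepared and padded exactly as above:
eval_model, _ = model.build_model()
eval_model.load_weights('weights.hd5f')
eval_pred = eval_model.predict(np.array(test_x, dtype='float64'))
# eval_pred now matches `pred` above. Since only the weights are saved
# (save_weights), the architecture is always rebuilt via model.build_model().
--------------------------------------------------------------------------------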