├── arch.png
├── example.png
├── weights.hd5f
├── model.py
├── README.md
├── test.py
└── train.py

/arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassanAzzam/Arabic-NER/HEAD/arch.png
--------------------------------------------------------------------------------
/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassanAzzam/Arabic-NER/HEAD/example.png
--------------------------------------------------------------------------------
/weights.hd5f:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassanAzzam/Arabic-NER/HEAD/weights.hd5f
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
# coding: utf-8

from keras.layers import Dense, Input, GRU, Masking
from keras.layers import Bidirectional, TimeDistributed
from keras.models import Model
from keras_contrib.layers import CRF


def build_model():
    """Build the Bi-GRU + CRF tagger.

    Input: sequences of 300-dim fastText word vectors.
    Output: per-timestep scores over the 9 ANERCorp tag classes.
    """
    crf_layer = CRF(9)  # 9 tags: B-/I- x {LOC, MISC, ORG, PERS} plus O
    input_layer = Input(shape=(None, 300))
    # all-zero timesteps are padding; Masking tells downstream layers to skip them
    mask_layer = Masking(mask_value=0.)(input_layer)
    bi_gru = Bidirectional(GRU(10, return_sequences=True))(mask_layer)
    bi_gru = TimeDistributed(Dense(10, activation="relu"))(bi_gru)
    output_layer = crf_layer(bi_gru)
    # the CRF layer is returned too: its loss and accuracy are needed at compile time
    return Model(input_layer, output_layer), crf_layer
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Arabic-NER

A Named Entity Recognition (NER) model that tags words in Arabic text.

## Architecture

The model consists of a bidirectional GRU layer (one forward pass and one backward pass) followed by a CRF layer.
![Architecture](https://raw.githubusercontent.com/HassanAzzam/Arabic-NER/master/arch.png)
The architecture is adapted from the Bi-LSTM-CRF model of https://arxiv.org/pdf/1508.01991v1.pdf, with GRU units in place of LSTM.
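`model.build_model()` returns both the Keras model and the CRF layer, whose loss and accuracy functions are needed at compile time. A minimal sketch, mirroring what `train.py` does:

    import model

    tagger, crf = model.build_model()
    # keras_contrib's CRF supplies its own loss and metric
    tagger.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
    tagger.summary()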
## Specifications

The model is trained on the [ANERCorp dataset](http://users.dsic.upv.es/~ybenajiba/downloads.html) ([more details](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp)) and uses fastText's pre-trained Arabic word vectors (`wiki.ar.vec`) as embeddings.

Number of epochs: 20

Accuracy: 94.2%

Classification report:

                 precision    recall  f1-score   support

           LOC        0.99      0.99      0.99     11055
          PERS        0.74      0.65      0.69       824
           ORG        0.64      0.46      0.54       503
          MISC        0.63      0.38      0.47       237

    avg / total       0.95      0.94      0.94     12619

F1 score: 95.0%

## Sample
#### Input
ماذا يفعل طلال عبد الهادي في دبي بعد ما رجع من برلين؟ كان يعمل هناك في شركة فولكسفاجن، صحيح؟

(English: "What is Talal Abdul Hadi doing in Dubai after coming back from Berlin? He used to work there at Volkswagen, right?")
#### Output
![Example](https://raw.githubusercontent.com/HassanAzzam/Arabic-NER/master/example.png)
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
# coding: utf-8

import time
import re

import numpy as np
import gensim.models.keyedvectors as word2vec
from terminaltables import AsciiTable

import model

print('Loading word embedding model...')

start_time = time.time()
embedding = word2vec.KeyedVectors.load_word2vec_format('wiki.ar.vec', binary=False)
load_time = time.time() - start_time

print('Loaded model in ' + str(load_time) + ' seconds')

# punctuation and other junk characters stripped from every token
dump_chars = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’،ـ؟؛«» '

def clean_word(word):
    word = word.translate(str.maketrans({key: None for key in dump_chars}))

    # remove tashkeel (Arabic diacritics)
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    word = re.sub(p_tashkeel, "", word)

    return word

raw_sent = input()
raw_sent = raw_sent.split()
sent = []
X = []
for i, word in enumerate(raw_sent):
    word = clean_word(word)
    if len(word) > 0:
        sent.append(word)
        try:
            X.append(embedding[word])
        except KeyError:
            # out-of-vocabulary words fall back to the 'unk' vector
            X.append(embedding['unk'])

# zero-pad to the fixed length (212) used at training time;
# the model's Masking layer skips these timesteps
X += [[0] * 300] * (212 - len(sent))

test_model, _ = model.build_model()
test_model.load_weights('weights.hd5f')
pred = test_model.predict(np.array([X]))

tag_classes = ['B-LOC', 'B-MISC', 'B-ORG', 'B-PERS', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PERS', 'O']

table_data = [['word', 'prediction']]
for i, word in enumerate(sent):
    table_data.append([word, tag_classes[np.argmax(pred[0][i])]])
table = AsciiTable(table_data)
print(table.table)
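
# --- Illustrative helper (not part of the original script) ---
# The steps above can be wrapped into a reusable function; a minimal sketch,
# assuming `embedding`, `clean_word`, `tag_classes`, and a built model as above.
# `tag_sentence` is a hypothetical name introduced here for illustration.
def tag_sentence(raw, embedding, tagger, max_len=212, dim=300):
    words = [w for w in (clean_word(t) for t in raw.split()) if w]
    vecs = [embedding[w] if w in embedding else embedding['unk'] for w in words]
    vecs += [[0] * dim] * (max_len - len(vecs))  # zero-pad for the Masking layer
    pred = tagger.predict(np.array([vecs]))[0]
    return [(w, tag_classes[np.argmax(pred[i])]) for i, w in enumerate(words)]

# e.g. tag_sentence('ذهب خالد إلى دبي', embedding, test_model)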
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import time
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim.models.keyedvectors as word2vec
from keras.utils import to_categorical
from seqeval.metrics import classification_report, f1_score

import model

print('Loading word embedding model...')

start_time = time.time()
embedding = word2vec.KeyedVectors.load_word2vec_format('wiki.ar.vec', binary=False)
load_time = time.time() - start_time

print('Loaded model in ' + str(load_time) + ' seconds')

# punctuation and other junk characters stripped from every token
dump_chars = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’،ـ؟؛«» '

def clean_word(word):
    word = word.translate(str.maketrans({key: None for key in dump_chars}))

    # remove tashkeel (Arabic diacritics)
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    word = re.sub(p_tashkeel, "", word)

    return word

def load_data():
    """Read ANERCorp: one 'word TAG' pair per line; sentences end with '. O'."""
    path = 'ANERCorp'
    with open(path, 'r') as f:
        sents = f.read().split('\n. O\n')

    # tokenize words and collect their tags
    words = [None] * len(sents)
    tokens = [None] * len(sents)
    for i, sent in enumerate(sents):
        sent = sent.split('\n')
        words[i] = []
        tokens[i] = []
        for word in sent:
            line = word.rsplit(' ', 1)
            line[0] = clean_word(line[0])
            if len(line[0]) > 0:
                words[i].append(line[0])
                tokens[i].append(line[1])

    # drop sentences emptied out by cleaning
    return [d for d in words if len(d) > 0], [d for d in tokens if len(d) > 0]

# load data
sents, labels = load_data()

tag_classes = ['B-LOC', 'B-MISC', 'B-ORG', 'B-PERS', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PERS', 'O']

# embed words; out-of-vocabulary words fall back to the 'unk' vector
for i, sent in enumerate(sents):
    for j, word in enumerate(sent):
        try:
            sents[i][j] = embedding[word]
        except KeyError:
            sents[i][j] = embedding['unk']

# one-hot encode labels
for i, tokens in enumerate(labels):
    labels[i] = [to_categorical(tag_classes.index(tag), num_classes=len(tag_classes)) for tag in tokens]

################################
# No. sentences: 4898
# No. all words: 135717
# No. 3/4 all words: 101787
# Index of 3/4 sentences: 3569
################################

# pad all sequences to a fixed length of 212
max_sent_length = 212
sents_lengths = []
for i, sent in enumerate(sents):
    sents_lengths.append(len(sent))
    l = max_sent_length - len(sent)
    sents[i] += [[0] * 300] * l

for i, label in enumerate(labels):
    l = max_sent_length - len(label)
    labels[i] += [[0] * 9] * l  # all-zero label vectors for padded timesteps

# split data (roughly 3/4 train, 1/4 test)
split = 3673
train_x, train_y = sents[:split], labels[:split]
test_x, test_y = sents[split:], labels[split:]

# build model
train_model, crf_layer = model.build_model()
train_model.compile(optimizer="rmsprop", loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
train_model.summary()

# train model
history = train_model.fit(np.array(train_x, dtype='float64'), np.array(train_y, dtype='float64'),
                          epochs=20, verbose=1,
                          validation_data=(np.array(test_x, dtype='float64'), np.array(test_y, dtype='float64')))

# save weights
train_model.save_weights('weights.hd5f')

# plot validation accuracy and loss
# (the history key may be 'val_acc' or 'val_accuracy' depending on the Keras version)
hist = pd.DataFrame(history.history)
plt.style.use("ggplot")
plt.figure(figsize=(6, 6))
plt.plot(hist["val_acc"])
plt.plot(hist["val_loss"])
plt.show()

# testing: strip the padding before scoring, using each test sentence's true length
pred = train_model.predict(np.array(test_x, dtype='float64'))
pred_x = []
pred_y = []
for i in range(len(pred)):
    length = sents_lengths[split + i]  # true length of the i-th *test* sentence
    pred_x.append([tag_classes[np.argmax(w)] for w in pred[i][:length]])
    pred_y.append([tag_classes[np.argmax(w)] for w in test_y[i][:length]])

print(classification_report(pred_y, pred_x))
print('f1_score: ')
print(f1_score(pred_y, pred_x))
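
# --- Illustrative follow-up (not part of the original script) ---
# The saved weights can be reloaded later without retraining; a minimal
# sketch, assuming test_x was prepared and padded exactly as above:
eval_model, _ = model.build_model()
eval_model.load_weights('weights.hd5f')
eval_pred = eval_model.predict(np.array(test_x, dtype='float64'))
# eval_pred now matches `pred` above. Since only the weights are saved
# (save_weights), the architecture is always rebuilt via model.build_model().
--------------------------------------------------------------------------------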