├── src
    ├── __init__.py
    ├── config.py
    ├── train.py
    ├── alphabet.py
    ├── dataset.py
    └── model.py
└── README.md


/src/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
 1 | class Config:
 2 |     sentence_max_length = 200
 3 |     input_dropout = 0.3
 4 |     output_dropout = 0.5
 5 |     recurrent_stack_depth = 5
 6 |     batch_size = 32
 7 |     max_epochs = 100
 8 |     learning_rate = 0.001
 9 |     embed_size = 256
10 |     num_lstm_units = 128
11 |     early_stopping = 2


--------------------------------------------------------------------------------
/src/train.py:
--------------------------------------------------------------------------------
 1 | from config import Config
 2 | from dataset import CharBasedNERDataset
 3 | from model import CharacterBasedLSTMModel
 4 | 
 5 | if __name__ == '__main__':
 6 |     config = Config()
 7 |     dataset = CharBasedNERDataset()
 8 |     model = CharacterBasedLSTMModel(config, dataset)
 9 | 
10 |     model.fit()
11 |     model.evaluate()
12 |     print(model.predict_str('La nostalgie n’a rien d’un sentiment esthétique, elle n’est même pas liée non plus au souvenir d’un bonheur, on est nostalgique d’un endroit simplement parce qu’on y a vécu, bien ou mal peu importe, le passé est toujours beau, et le futur aussi d’ailleurs, il n’y a que le présent qui fasse mal, qu’on transporte avec soi comme un abcès de souffrance qui vous accompagne entre deux infinis de bonheur paisible'))
13 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Character-Based Named Entity Recognition in Keras
 2 | ## Using a Bi-Directional LSTM Recurrent Neural Network
 3 | 
 4 | #### Keras implementation based on models by:
 5 | 
 6 |  * Kuru, Onur, Ozan Arkan Can, and Deniz Yuret. [*CharNER: Character-Level Named Entity Recognition.*](http://www.aclweb.org/anthology/C/C16/C16-1087.pdf)
 7 |  
 8 |  * Klein, D., Smarr, J., Nguyen, H., & Manning, C. D. (2003, May). [*Named entity recognition with character-level models. In Proceedings of the seventh conference on Natural language learning at HLT-NAACL 2003-Volume 4 (pp. 180-183). Association for Computational Linguistics.*](http://nlp.stanford.edu/manning/papers/conll-ner.pdf)
 9 |   
10 | ## Usage
11 | 
 12 | - In `src/dataset.py`, implement `get_texts()`, `get_labels()`, and either `get_x_y()` or `get_x_y_generator()` with your own data source.
13 | 
14 |     - `x` is a tensor of shape: `(batch_size, max_length)`.
15 |         Entries in dimension 1 are alphabet indices, index 0 is the padding symbol.
16 |         
17 |     - `y` is a tensor of shape: `(batch_size, max_length, number_of_labels)`.
 18 |         Dimension 2 is a one-hot encoding of each character's label; index 0 is the null label.
19 | 
20 | - Tweak the model hyper-parameters in `config.py` 
21 |      
22 | - Run `train.py`


--------------------------------------------------------------------------------
/src/alphabet.py:
--------------------------------------------------------------------------------
 1 | class CharBasedNERAlphabet:
 2 |     PADDING_SYMBOL = '<PAD>'
 3 |     UNKNOWN_CHAR_SYMBOL = '<UNK>'
 4 |     BASE_ALPHABET = [PADDING_SYMBOL, UNKNOWN_CHAR_SYMBOL]
 5 | 
 6 |     def __init__(self, texts):
 7 |         self.characters = self.BASE_ALPHABET + self.get_alphabet_from_texts(texts)
 8 |         self.char_to_num = None
 9 |         self.num_to_char = None
10 |         self.init_mappings()
11 | 
12 |     def get_alphabet_from_texts(self, texts):
13 |         all_characters = set()
14 | 
15 |         for t in texts:
16 |             text_characters = set(t)
17 |             all_characters |= text_characters
18 | 
19 |         alphabet = sorted(list(all_characters))
20 |         return alphabet
21 | 
22 |     def init_mappings(self):
23 |         self.char_to_num = self.get_char_to_num()
24 |         self.num_to_char = self.get_num_to_char()
25 | 
26 |     def get_char_to_num(self):
27 |         return {char: c for c, char in enumerate(self.characters)}
28 | 
29 |     def get_num_to_char(self):
30 |         return {c: char for c, char in enumerate(self.characters)}
31 | 
32 |     def get_char_index(self, char):
33 |         try:
34 |             num = self.char_to_num[char]
35 |         except KeyError:
36 |             num = self.char_to_num[self.UNKNOWN_CHAR_SYMBOL]
37 |         return num
38 | 
39 |     def __str__(self):
40 |         return str(self.characters)
41 | 
42 |     def __len__(self):
43 |         return len(self.characters)
44 | 
45 |     def __iter__(self):
46 |         return self.characters.__iter__()
47 | 


--------------------------------------------------------------------------------
/src/dataset.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from alphabet import CharBasedNERAlphabet
 3 | 
 4 | 
 5 | class CharBasedNERDataset:
 6 |     NULL_LABEL = '0'
 7 |     BASE_LABELS = [NULL_LABEL]
 8 | 
 9 |     def __init__(self):
10 |         self.texts = self.get_texts()
11 |         self.alphabet = CharBasedNERAlphabet(self.texts)
12 |         self.labels = self.BASE_LABELS + self.get_labels()
13 |         self.num_labels = len(self.labels)
14 |         self.num_to_label = {}
15 |         self.label_to_num = {}
16 | 
17 |         self.init_mappings()
18 | 
19 |     def get_texts(self):
20 |         """ Implement with own data source. """
21 |         raise NotImplementedError
22 | 
23 |     def get_x_y(self, sentence_maxlen, dataset_name='all'):
24 |         """ Implement with own data source.
25 | 
26 |         :param sentence_maxlen: maximum number of characters per sample
27 |         :param dataset_name: 'all', 'train', 'dev' or 'test'
28 |         :return: Tuple (x, y)
29 |                 x: Array of shape (batch_size, sentence_maxlen). Entries in dimension 1 are alphabet indices, index 0 is the padding symbol
30 |                 y: Array of shape (batch_size, sentence_maxlen, self.num_labels). Entries in dimension 2 are label indices, index 0 is the null label
31 |         """
32 |         raise NotImplementedError
33 | 
34 |     def get_x_y_generator(self, sentence_maxlen, dataset_name='all'):
35 |         """ Implement with own data source.
36 | 
37 |         :return: Generator object that yields tuples (x, y), same as in get_x_y()
38 |         """
39 |         raise NotImplementedError
40 | 
41 |     def get_labels(self):
42 |         """ Implement with own data source.
43 | 
44 |         :return: List of labels (classes) to predict, e.g. 'PER', 'LOC', not including the null label '0'.
45 |         """
46 |         raise NotImplementedError
47 | 
48 |     def str_to_x(self, s, maxlen):
49 |         x = np.zeros(maxlen)
50 |         for c, char in enumerate(s[:maxlen]):
51 |             x[c] = self.alphabet.get_char_index(char)
52 |         return x.reshape((-1, maxlen))
53 | 
54 |     def x_to_str(self, x):
55 |         return [[self.alphabet.num_to_char[i] for i in row] for row in x]
56 | 
57 |     def y_to_labels(self, y):
58 |         Y = []
59 |         for row in y:
60 |             Y.append([self.num_to_label[np.argmax(one_hot_labels)] for one_hot_labels in row])
61 |         return Y
62 | 
63 |     def init_mappings(self):
64 |         for num, label in enumerate(self.labels):
65 |             self.num_to_label[num] = label
66 |             self.label_to_num[label] = num
67 | 


--------------------------------------------------------------------------------
/src/model.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | from keras import backend as K
  3 | from keras.optimizers import Adam
  4 | from keras.models import Sequential
  5 | from keras.layers.wrappers import TimeDistributed
  6 | from keras.callbacks import EarlyStopping, ModelCheckpoint
  7 | from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
  8 | 
  9 | 
 10 | class CharacterBasedLSTMModel:
 11 |     """ Character-based stacked bi-directional LSTM model
 12 |     Based on: `Kuru, Onur, Ozan Arkan Can, and Deniz Yuret. "CharNER: Character-Level Named Entity Recognition.`
 13 |     """
 14 | 
 15 |     def __init__(self, config, dataset):
 16 |         self.config = config
 17 |         self.dataset = dataset
 18 |         self.model = self.get_model()
 19 | 
 20 |     def get_model(self):
 21 |         num_words = len(self.dataset.alphabet)
 22 |         num_labels = len(self.dataset.labels)
 23 | 
 24 |         model = Sequential()
 25 | 
 26 |         model.add(Embedding(num_words,
 27 |                             self.config.embed_size,
 28 |                             mask_zero=True))
 29 |         model.add(Dropout(self.config.input_dropout))
 30 | 
 31 |         for _ in range(self.config.recurrent_stack_depth):
 32 |             model.add(Bidirectional(LSTM(self.config.num_lstm_units, return_sequences=True)))
 33 | 
 34 |         model.add(Dropout(self.config.output_dropout))
 35 |         model.add(TimeDistributed(Dense(num_labels, activation='softmax')))
 36 | 
 37 |         # TODO Add Viterbi decoder here, see Kuru et al.
 38 | 
 39 |         optimizer = Adam(lr=self.config.learning_rate,
 40 |                          clipnorm=1.0)
 41 | 
 42 |         model.compile(optimizer=optimizer, loss='categorical_crossentropy',
 43 |                       metrics=['categorical_accuracy', self.non_null_label_accuracy])
 44 |         return model
 45 | 
 46 |     def fit(self):
 47 |         x_train, y_train = self.dataset.get_x_y(self.config.sentence_max_length, dataset_name='train')
 48 |         x_dev, y_dev = self.dataset.get_x_y(self.config.sentence_max_length, dataset_name='dev')
 49 | 
 50 |         early_stopping = EarlyStopping(patience=self.config.early_stopping,
 51 |                                        verbose=1)
 52 |         checkpointer = ModelCheckpoint(filepath="/tmp/model.weights.hdf5",
 53 |                                        verbose=1,
 54 |                                        save_best_only=True)
 55 | 
 56 |         self.model.fit(x_train,
 57 |                        y_train,
 58 |                        batch_size=self.config.batch_size,
 59 |                        epochs=self.config.max_epochs,
 60 |                        validation_data=(x_dev, y_dev),
 61 |                        shuffle=True,
 62 |                        callbacks=[early_stopping, checkpointer])
 63 | 
 64 |     def fit_generator(self):
 65 |         train_data_generator = self.dataset.get_x_y_generator(dataset_name='train',
 66 |                                                               maxlen=self.config.sentence_max_length,
 67 |                                                               batch_size=self.config.batch_size)
 68 |         dev_data_generator = self.dataset.get_x_y_generator(dataset_name='dev',
 69 |                                                             maxlen=self.config.sentence_max_length,
 70 |                                                             batch_size=self.config.batch_size)
 71 |         early_stopping = EarlyStopping(patience=self.config.early_stopping,
 72 |                                        verbose=1)
 73 | 
 74 |         self.model.fit_generator(train_data_generator,
 75 |                                  steps_per_epoch=self.dataset.num_train_docs / self.config.batch_size,
 76 |                                  epochs=self.config.max_epochs,
 77 |                                  validation_data=dev_data_generator,
 78 |                                  validation_steps=self.dataset.num_dev_docs / self.config.batch_size,
 79 |                                  callbacks=[early_stopping]
 80 |                                  )
 81 | 
 82 |     def evaluate(self):
 83 |         x_test, y_test = self.dataset.get_x_y(self.config.sentence_max_length, dataset_name='test')
 84 |         self.model.evaluate(x_test, y_test, batch_size=self.config.batch_size)
 85 | 
 86 |     def evaluate_generator(self):
 87 |         test_data_generator = self.dataset.get_x_y_generator(dataset_name='test',
 88 |                                                              maxlen=self.config.sentence_max_length,
 89 |                                                              batch_size=self.config.batch_size)
 90 | 
 91 |         self.model.evaluate_generator(test_data_generator, steps=self.dataset.num_test_docs / self.config.batch_size)
 92 | 
 93 |     def predict_str(self, s):
 94 |         """ Get model prediction for a string
 95 |         :param s: string to get named entities for
 96 |         :return: a list of len(s) tuples: [(character, predicted-label for character), ...]
 97 |         """
 98 |         x = self.dataset.str_to_x(s, self.config.sentence_max_length)
 99 |         predicted_classes = self.predict_x(x)
100 |         chars = self.dataset.x_to_str(x)[0]
101 |         labels = self.dataset.y_to_labels(predicted_classes)[0]
102 | 
103 |         return list(zip(chars, labels))
104 | 
105 |     def predict_x(self, x):
106 |         return self.model.predict(x, batch_size=1)
107 | 
108 |     @staticmethod
109 |     def non_null_label_accuracy(y_true, y_pred):
110 |         """Calculate accuracy excluding null-label targets (index 0).
111 |         Useful when the null label is over-represented in the data, like in Named Entity Recognition tasks.
112 | 
113 |         typical y shape: (batch_size, sentence_length, num_labels)
114 |         """
115 | 
116 |         y_true_argmax = K.argmax(y_true, -1)  # ==> (batch_size, sentence_length, 1)
117 |         y_pred_argmax = K.argmax(y_pred, -1)  # ==> (batch_size, sentence_length, 1)
118 | 
119 |         y_true_argmax_flat = tf.reshape(y_true_argmax, [-1])
120 |         y_pred_argmax_flat = tf.reshape(y_pred_argmax, [-1])
121 | 
122 |         non_null_targets_bool = K.not_equal(y_true_argmax_flat, K.zeros_like(y_true_argmax_flat))
123 |         non_null_target_idx = K.flatten(K.cast(tf.where(non_null_targets_bool), 'int32'))
124 | 
125 |         y_true_without_null = K.gather(y_true_argmax_flat, non_null_target_idx)
126 |         y_pred_without_null = K.gather(y_pred_argmax_flat, non_null_target_idx)
127 | 
128 |         mean = K.mean(K.cast(K.equal(y_pred_without_null,
129 |                                      y_true_without_null),
130 |                              K.floatx()))
131 | 
132 |         # If the model contains a masked layer, Keras forces metric output to have same shape as y:
133 |         fake_shape_mean = K.ones_like(y_true_argmax, K.floatx()) * mean
134 |         return fake_shape_mean
135 | 
136 |     def get_custom_objects(self):
137 |         return {'non_null_label_accuracy': self.non_null_label_accuracy}
138 | 


--------------------------------------------------------------------------------