├── src ├── __init__.py ├── config.py ├── train.py ├── alphabet.py ├── dataset.py └── model.py └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | sentence_max_length = 200 3 | input_dropout = 0.3 4 | output_dropout = 0.5 5 | recurrent_stack_depth = 5 6 | batch_size = 32 7 | max_epochs = 100 8 | learning_rate = 0.001 9 | embed_size = 256 10 | num_lstm_units = 128 11 | early_stopping = 2 -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | from config import Config 2 | from dataset import CharBasedNERDataset 3 | from model import CharacterBasedLSTMModel 4 | 5 | if __name__ == '__main__': 6 | config = Config() 7 | dataset = CharBasedNERDataset() 8 | model = CharacterBasedLSTMModel(config, dataset) 9 | 10 | model.fit() 11 | model.evaluate() 12 | print(model.predict_str('La nostalgie n’a rien d’un sentiment esthétique, elle n’est même pas liée non plus au souvenir d’un bonheur, on est nostalgique d’un endroit simplement parce qu’on y a vécu, bien ou mal peu importe, le passé est toujours beau, et le futur aussi d’ailleurs, il n’y a que le présent qui fasse mal, qu’on transporte avec soi comme un abcès de souffrance qui vous accompagne entre deux infinis de bonheur paisible')) 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Character-Based Named Entity Recognition in Keras 2 | ## Using a Bi-Directional LSTM Recurrent Neural Network 3 | 4 | #### Keras implementation based on models by: 5 | 6 | * Kuru, Onur, Ozan Arkan Can, and Deniz Yuret. [*CharNER: Character-Level Named Entity Recognition.*](http://www.aclweb.org/anthology/C/C16/C16-1087.pdf) 7 | 8 | * Klein, D., Smarr, J., Nguyen, H., & Manning, C. D. (2003, May). [*Named entity recognition with character-level models. In Proceedings of the seventh conference on Natural language learning at HLT-NAACL 2003-Volume 4 (pp. 180-183). Association for Computational Linguistics.*](http://nlp.stanford.edu/manning/papers/conll-ner.pdf) 9 | 10 | ## Usage 11 | 12 | - Implement `get_texts()`, `get_labels()` and `get_x_y()` or `get_x_y_generator()` with your own data source. 13 | 14 | - `x` is a tensor of shape: `(batch_size, max_length)`. 15 | Entries in dimension 1 are alphabet indices, index 0 is the padding symbol. 16 | 17 | - `y` is a tensor of shape: `(batch_size, max_length, number_of_labels)`. 18 | Entries in dimension 2 are label indices, index 0 is the null label. 19 | 20 | - Tweak the model hyper-parameters in `config.py` 21 | 22 | - Run `train.py` -------------------------------------------------------------------------------- /src/alphabet.py: -------------------------------------------------------------------------------- 1 | class CharBasedNERAlphabet: 2 | PADDING_SYMBOL = '' 3 | UNKNOWN_CHAR_SYMBOL = '' 4 | BASE_ALPHABET = [PADDING_SYMBOL, UNKNOWN_CHAR_SYMBOL] 5 | 6 | def __init__(self, texts): 7 | self.characters = self.BASE_ALPHABET + self.get_alphabet_from_texts(texts) 8 | self.char_to_num = None 9 | self.num_to_char = None 10 | self.init_mappings() 11 | 12 | def get_alphabet_from_texts(self, texts): 13 | all_characters = set() 14 | 15 | for t in texts: 16 | text_characters = set(t) 17 | all_characters |= text_characters 18 | 19 | alphabet = sorted(list(all_characters)) 20 | return alphabet 21 | 22 | def init_mappings(self): 23 | self.char_to_num = self.get_char_to_num() 24 | self.num_to_char = self.get_num_to_char() 25 | 26 | def get_char_to_num(self): 27 | return {char: c for c, char in enumerate(self.characters)} 28 | 29 | def get_num_to_char(self): 30 | return {c: char for c, char in enumerate(self.characters)} 31 | 32 | def get_char_index(self, char): 33 | try: 34 | num = self.char_to_num[char] 35 | except KeyError: 36 | num = self.char_to_num[self.UNKNOWN_CHAR_SYMBOL] 37 | return num 38 | 39 | def __str__(self): 40 | return str(self.characters) 41 | 42 | def __len__(self): 43 | return len(self.characters) 44 | 45 | def __iter__(self): 46 | return self.characters.__iter__() 47 | -------------------------------------------------------------------------------- /src/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from alphabet import CharBasedNERAlphabet 3 | 4 | 5 | class CharBasedNERDataset: 6 | NULL_LABEL = '0' 7 | BASE_LABELS = [NULL_LABEL] 8 | 9 | def __init__(self): 10 | self.texts = self.get_texts() 11 | self.alphabet = CharBasedNERAlphabet(self.texts) 12 | self.labels = self.BASE_LABELS + self.get_labels() 13 | self.num_labels = len(self.labels) 14 | self.num_to_label = {} 15 | self.label_to_num = {} 16 | 17 | self.init_mappings() 18 | 19 | def get_texts(self): 20 | """ Implement with own data source. """ 21 | raise NotImplementedError 22 | 23 | def get_x_y(self, sentence_maxlen, dataset_name='all'): 24 | """ Implement with own data source. 25 | 26 | :param sentence_maxlen: maximum number of characters per sample 27 | :param dataset_name: 'all', 'train', 'dev' or 'test' 28 | :return: Tuple (x, y) 29 | x: Array of shape (batch_size, sentence_maxlen). Entries in dimension 1 are alphabet indices, index 0 is the padding symbol 30 | y: Array of shape (batch_size, sentence_maxlen, self.num_labels). Entries in dimension 2 are label indices, index 0 is the null label 31 | """ 32 | raise NotImplementedError 33 | 34 | def get_x_y_generator(self, sentence_maxlen, dataset_name='all'): 35 | """ Implement with own data source. 36 | 37 | :return: Generator object that yields tuples (x, y), same as in get_x_y() 38 | """ 39 | raise NotImplementedError 40 | 41 | def get_labels(self): 42 | """ Implement with own data source. 43 | 44 | :return: List of labels (classes) to predict, e.g. 'PER', 'LOC', not including the null label '0'. 45 | """ 46 | raise NotImplementedError 47 | 48 | def str_to_x(self, s, maxlen): 49 | x = np.zeros(maxlen) 50 | for c, char in enumerate(s[:maxlen]): 51 | x[c] = self.alphabet.get_char_index(char) 52 | return x.reshape((-1, maxlen)) 53 | 54 | def x_to_str(self, x): 55 | return [[self.alphabet.num_to_char[i] for i in row] for row in x] 56 | 57 | def y_to_labels(self, y): 58 | Y = [] 59 | for row in y: 60 | Y.append([self.num_to_label[np.argmax(one_hot_labels)] for one_hot_labels in row]) 61 | return Y 62 | 63 | def init_mappings(self): 64 | for num, label in enumerate(self.labels): 65 | self.num_to_label[num] = label 66 | self.label_to_num[label] = num 67 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from keras import backend as K 3 | from keras.optimizers import Adam 4 | from keras.models import Sequential 5 | from keras.layers.wrappers import TimeDistributed 6 | from keras.callbacks import EarlyStopping, ModelCheckpoint 7 | from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional 8 | 9 | 10 | class CharacterBasedLSTMModel: 11 | """ Character-based stacked bi-directional LSTM model 12 | Based on: `Kuru, Onur, Ozan Arkan Can, and Deniz Yuret. "CharNER: Character-Level Named Entity Recognition.` 13 | """ 14 | 15 | def __init__(self, config, dataset): 16 | self.config = config 17 | self.dataset = dataset 18 | self.model = self.get_model() 19 | 20 | def get_model(self): 21 | num_words = len(self.dataset.alphabet) 22 | num_labels = len(self.dataset.labels) 23 | 24 | model = Sequential() 25 | 26 | model.add(Embedding(num_words, 27 | self.config.embed_size, 28 | mask_zero=True)) 29 | model.add(Dropout(self.config.input_dropout)) 30 | 31 | for _ in range(self.config.recurrent_stack_depth): 32 | model.add(Bidirectional(LSTM(self.config.num_lstm_units, return_sequences=True))) 33 | 34 | model.add(Dropout(self.config.output_dropout)) 35 | model.add(TimeDistributed(Dense(num_labels, activation='softmax'))) 36 | 37 | # TODO Add Viterbi decoder here, see Kuru et al. 38 | 39 | optimizer = Adam(lr=self.config.learning_rate, 40 | clipnorm=1.0) 41 | 42 | model.compile(optimizer=optimizer, loss='categorical_crossentropy', 43 | metrics=['categorical_accuracy', self.non_null_label_accuracy]) 44 | return model 45 | 46 | def fit(self): 47 | x_train, y_train = self.dataset.get_x_y(self.config.sentence_max_length, dataset_name='train') 48 | x_dev, y_dev = self.dataset.get_x_y(self.config.sentence_max_length, dataset_name='dev') 49 | 50 | early_stopping = EarlyStopping(patience=self.config.early_stopping, 51 | verbose=1) 52 | checkpointer = ModelCheckpoint(filepath="/tmp/model.weights.hdf5", 53 | verbose=1, 54 | save_best_only=True) 55 | 56 | self.model.fit(x_train, 57 | y_train, 58 | batch_size=self.config.batch_size, 59 | epochs=self.config.max_epochs, 60 | validation_data=(x_dev, y_dev), 61 | shuffle=True, 62 | callbacks=[early_stopping, checkpointer]) 63 | 64 | def fit_generator(self): 65 | train_data_generator = self.dataset.get_x_y_generator(dataset_name='train', 66 | maxlen=self.config.sentence_max_length, 67 | batch_size=self.config.batch_size) 68 | dev_data_generator = self.dataset.get_x_y_generator(dataset_name='dev', 69 | maxlen=self.config.sentence_max_length, 70 | batch_size=self.config.batch_size) 71 | early_stopping = EarlyStopping(patience=self.config.early_stopping, 72 | verbose=1) 73 | 74 | self.model.fit_generator(train_data_generator, 75 | steps_per_epoch=self.dataset.num_train_docs / self.config.batch_size, 76 | epochs=self.config.max_epochs, 77 | validation_data=dev_data_generator, 78 | validation_steps=self.dataset.num_dev_docs / self.config.batch_size, 79 | callbacks=[early_stopping] 80 | ) 81 | 82 | def evaluate(self): 83 | x_test, y_test = self.dataset.get_x_y(self.config.sentence_max_length, dataset_name='test') 84 | self.model.evaluate(x_test, y_test, batch_size=self.config.batch_size) 85 | 86 | def evaluate_generator(self): 87 | test_data_generator = self.dataset.get_x_y_generator(dataset_name='test', 88 | maxlen=self.config.sentence_max_length, 89 | batch_size=self.config.batch_size) 90 | 91 | self.model.evaluate_generator(test_data_generator, steps=self.dataset.num_test_docs / self.config.batch_size) 92 | 93 | def predict_str(self, s): 94 | """ Get model prediction for a string 95 | :param s: string to get named entities for 96 | :return: a list of len(s) tuples: [(character, predicted-label for character), ...] 97 | """ 98 | x = self.dataset.str_to_x(s, self.config.sentence_max_length) 99 | predicted_classes = self.predict_x(x) 100 | chars = self.dataset.x_to_str(x)[0] 101 | labels = self.dataset.y_to_labels(predicted_classes)[0] 102 | 103 | return list(zip(chars, labels)) 104 | 105 | def predict_x(self, x): 106 | return self.model.predict(x, batch_size=1) 107 | 108 | @staticmethod 109 | def non_null_label_accuracy(y_true, y_pred): 110 | """Calculate accuracy excluding null-label targets (index 0). 111 | Useful when the null label is over-represented in the data, like in Named Entity Recognition tasks. 112 | 113 | typical y shape: (batch_size, sentence_length, num_labels) 114 | """ 115 | 116 | y_true_argmax = K.argmax(y_true, -1) # ==> (batch_size, sentence_length, 1) 117 | y_pred_argmax = K.argmax(y_pred, -1) # ==> (batch_size, sentence_length, 1) 118 | 119 | y_true_argmax_flat = tf.reshape(y_true_argmax, [-1]) 120 | y_pred_argmax_flat = tf.reshape(y_pred_argmax, [-1]) 121 | 122 | non_null_targets_bool = K.not_equal(y_true_argmax_flat, K.zeros_like(y_true_argmax_flat)) 123 | non_null_target_idx = K.flatten(K.cast(tf.where(non_null_targets_bool), 'int32')) 124 | 125 | y_true_without_null = K.gather(y_true_argmax_flat, non_null_target_idx) 126 | y_pred_without_null = K.gather(y_pred_argmax_flat, non_null_target_idx) 127 | 128 | mean = K.mean(K.cast(K.equal(y_pred_without_null, 129 | y_true_without_null), 130 | K.floatx())) 131 | 132 | # If the model contains a masked layer, Keras forces metric output to have same shape as y: 133 | fake_shape_mean = K.ones_like(y_true_argmax, K.floatx()) * mean 134 | return fake_shape_mean 135 | 136 | def get_custom_objects(self): 137 | return {'non_null_label_accuracy': self.non_null_label_accuracy} 138 | --------------------------------------------------------------------------------