├── README.md ├── loader.py ├── model.py ├── nn.py ├── tagger.py ├── train.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # tf-lstm-crf-batch 2 | 3 | The tf-lstm-crf-batch tool is an implementation of a named entity recognizer that combines a bidirectional LSTM with a CRF, built on TensorFlow. Details about the model can be found at: https://arxiv.org/pdf/1603.01360.pdf 4 | 5 | 6 | # Initial setup 7 | To use the tool, you need Python 2.7, with NumPy and TensorFlow installed. 8 | 9 | 10 | # Tag sentences 11 | 12 | The fastest way to use the tool is with one of the pretrained models: 13 | 14 | ``` 15 | ./tagger.py --model models/your_model_name/ --saver models/saver/ --input input.txt --output output.txt 16 | ``` 17 | 18 | The input file should contain one sentence per line, and the sentences have to be tokenized. 19 | 20 | 21 | # Train a model 22 | 23 | To train your own model, you need to use the train.py script and provide the locations of the training, development and test sets: 24 | 25 | ``` 26 | ./train.py --train train.txt --dev dev.txt --test test.txt 27 | ``` 28 | 29 | The training script will automatically give a name to the model and store it in ./models/. There are many parameters you can tune (CRF, dropout rate, embedding dimensions, LSTM hidden layer size, batch size, GPU, etc.). To see all parameters, simply run: 30 | ``` 31 | ./train.py --help 32 | ``` 33 | 34 | Input files for the training script: each word has to be on a separate line, and there must be an empty line after each sentence. A line must contain at least 2 columns, the first one being the word itself and the last one being its named entity tag. It does not matter if there are extra columns that contain tags or chunks in between. 35 | -------------------------------------------------------------------------------- /loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | import codecs 5 | from utils import create_dico, create_mapping, zero_digits 6 | 7 | def load_sentences(path, lower, zeros): 8 | """ 9 | Load sentences. A line must contain at least a word and its tag. 10 | Sentences are separated by empty lines. 11 | """ 12 | sentences = [] 13 | sentence = [] 14 | for line in codecs.open(path, 'r', 'utf8'): 15 | line = zero_digits(line.rstrip()) if zeros else line.rstrip() 16 | if not line: 17 | if len(sentence) > 0: 18 | if 'DOCSTART' not in sentence[0][0]: 19 | sentences.append(sentence) 20 | sentence = [] 21 | else: 22 | word = line.split() 23 | assert len(word) >= 2 24 | sentence.append(word) 25 | if len(sentence) > 0: 26 | if 'DOCSTART' not in sentence[0][0]: 27 | sentences.append(sentence) 28 | return sentences 29 | 30 | 31 | def word_mapping(sentences, lower): 32 | """ 33 | Create a dictionary and a mapping of words, sorted by frequency. 34 | """ 35 | words = [[x[0].lower() if lower else x[0] for x in s] for s in sentences] 36 | dico = create_dico(words) 37 | dico[''] = sys.maxint 38 | word_to_id, id_to_word = create_mapping(dico) 39 | print "Found %i unique words (%i in total)" % ( 40 | len(dico), sum(len(x) for x in words) 41 | ) 42 | return dico, word_to_id, id_to_word 43 | 44 | 45 | def char_mapping(sentences): 46 | """ 47 | Create a dictionary and mapping of characters, sorted by frequency.
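For reference, an illustrative CoNLL-style snippet (not taken from this repository) of the input format that load_sentences above and the training script expect -- word in the first column, NER tag in the last, one token per line, blank line between sentences:

    EU B-ORG
    rejects O
    German B-MISC
    call O
    to O
    boycott O
    British B-MISC
    lamb O
    . O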
48 | """ 49 | chars = ["".join([w[0] for w in s]) for s in sentences] 50 | dico = create_dico(chars) 51 | char_to_id, id_to_char = create_mapping(dico) 52 | print "Found %i unique characters" % len(dico) 53 | return dico, char_to_id, id_to_char 54 | 55 | 56 | def tag_mapping(sentences): 57 | """ 58 | Create a dictionary and a mapping of tags, sorted by frequency. 59 | """ 60 | tags = [[word[-1] for word in s] for s in sentences] 61 | dico = create_dico(tags) 62 | tag_to_id, id_to_tag = create_mapping(dico) 63 | print "Found %i unique named entity tags" % len(dico) 64 | return dico, tag_to_id, id_to_tag 65 | 66 | 67 | def cap_feature(s): 68 | """ 69 | Capitalization feature: 70 | 0 = low caps 71 | 1 = all caps 72 | 2 = first letter caps 73 | 3 = one capital (not first letter) 74 | """ 75 | if s.lower() == s: 76 | return 0 77 | elif s.upper() == s: 78 | return 1 79 | elif s[0].upper() == s[0]: 80 | return 2 81 | else: 82 | return 3 83 | 84 | 85 | def prepare_sentence_(str_words, word_to_id, lower=False): 86 | """ 87 | Prepare a sentence for evaluation. 88 | """ 89 | def f(x): return x.lower() if lower else x 90 | words = [word_to_id[f(w) if f(w) in word_to_id else ''] 91 | for w in str_words] 92 | return { 93 | 'str_words': str_words, 94 | 'words': words, 95 | } 96 | 97 | 98 | def prepare_sentence(str_words, word_to_id, char_to_id, lower=False): 99 | """ 100 | Prepare a sentence for evaluation. 101 | """ 102 | def f(x): return x.lower() if lower else x 103 | words = [word_to_id[f(w) if f(w) in word_to_id else ''] 104 | for w in str_words] 105 | chars = [[char_to_id[c] for c in w if c in char_to_id] 106 | for w in str_words] 107 | caps = [cap_feature(w) for w in str_words] 108 | return { 109 | 'str_words': str_words, 110 | 'words': words, 111 | 'chars': chars, 112 | 'caps': caps 113 | } 114 | 115 | 116 | def prepare_dataset_(sentences, word_to_id, tag_to_id, lower=False): 117 | """ 118 | Prepare the dataset. Return a list of lists of dictionaries containing: 119 | - word indexes 120 | - tag indexes 121 | """ 122 | def f(x): return x.lower() if lower else x 123 | data = [] 124 | for s in sentences: 125 | str_words = [w[0] for w in s] 126 | words = [word_to_id[f(w) if f(w) in word_to_id else ''] 127 | for w in str_words] 128 | tags = [tag_to_id[w[-1]] for w in s] 129 | assert len(words) == len(tags) 130 | data.append({ 131 | 'str_words': str_words, 132 | 'words': words, 133 | 'tags': tags, 134 | }) 135 | return data 136 | 137 | 138 | def prepare_dataset(sentences, word_to_id, char_to_id, tag_to_id, lower=False): 139 | """ 140 | Prepare the dataset. 
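(This is the character-aware variant used by train.py when char_dim > 0; prepare_dataset_ above is the word-only variant.)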
Return a list of lists of dictionaries containing: 141 | - word indexes 142 | - word char indexes 143 | - tag indexes 144 | """ 145 | def f(x): return x.lower() if lower else x 146 | data = [] 147 | for s in sentences: 148 | str_words = [w[0] for w in s] 149 | words = [word_to_id[f(w) if f(w) in word_to_id else ''] 150 | for w in str_words] 151 | # Skip characters that are not in the training set 152 | chars = [[char_to_id[c] for c in w if c in char_to_id] 153 | for w in str_words] 154 | caps = [cap_feature(w) for w in str_words] 155 | tags = [tag_to_id[w[-1]] for w in s] 156 | assert len(words) == len(tags) 157 | data.append({ 158 | 'str_words': str_words, 159 | 'words': words, 160 | 'chars': chars, 161 | 'caps': caps, 162 | 'tags': tags, 163 | }) 164 | return data 165 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | import cPickle 5 | 6 | from utils import shared, get_name 7 | from nn import HiddenLayer, EmbeddingLayer, LSTM, forward 8 | 9 | 10 | class Model(object): 11 | """ 12 | Network architecture. 13 | """ 14 | def __init__(self, parameters=None, models_path=None, model_path=None): 15 | """ 16 | Initialize the model. We either provide the parameters and a path where 17 | we store the models, or the location of a trained model. 18 | """ 19 | if model_path is None: 20 | assert parameters and models_path 21 | # Create a name based on the parameters 22 | self.parameters = parameters 23 | self.name = get_name(parameters) 24 | # Model location 25 | model_path = os.path.join(models_path, self.name) 26 | self.model_path = model_path 27 | self.mappings_path = os.path.join(model_path, 'mappings.pkl') 28 | self.parameters_path = os.path.join(model_path, 'parameters.pkl') 29 | # Create directory for the model if it does not exist 30 | if not os.path.exists(self.model_path): 31 | os.makedirs(self.model_path) 32 | # Save the parameters to disk 33 | with open(self.parameters_path, 'wb') as f: 34 | cPickle.dump(parameters, f) 35 | else: 36 | assert parameters is None and models_path is None 37 | # Model location 38 | self.model_path = model_path 39 | self.mappings_path = os.path.join(model_path, 'mappings.pkl') 40 | self.parameters_path = os.path.join(model_path, 'parameters.pkl') 41 | # Load the parameters and the mappings from disk 42 | with open(self.parameters_path, 'rb') as f: 43 | self.parameters = cPickle.load(f) 44 | self.reload_mappings() 45 | 46 | def save_mappings(self, id_to_word, id_to_char, id_to_tag): 47 | """ 48 | We need to save the mappings if we want to use the model later. 49 | """ 50 | self.id_to_word = id_to_word 51 | self.id_to_char = id_to_char 52 | self.id_to_tag = id_to_tag 53 | with open(self.mappings_path, 'wb') as f: 54 | mappings = { 55 | 'id_to_word': self.id_to_word, 56 | 'id_to_char': self.id_to_char, 57 | 'id_to_tag': self.id_to_tag, 58 | } 59 | cPickle.dump(mappings, f) 60 | 61 | def reload_mappings(self): 62 | """ 63 | Load mappings from disk. 
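Note: Model(parameters=..., models_path=...) creates a new model directory and pickles the parameters to disk, while Model(model_path=...) reads the pickled parameters back and then calls this method to restore the word/char/tag mappings saved by save_mappings.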
64 | """ 65 | with open(self.mappings_path, 'rb') as f: 66 | mappings = cPickle.load(f) 67 | self.id_to_word = mappings['id_to_word'] 68 | self.id_to_char = mappings['id_to_char'] 69 | self.id_to_tag = mappings['id_to_tag'] 70 | 71 | def build(self, 72 | dropout, 73 | char_dim, 74 | char_lstm_dim, 75 | char_bidirect, 76 | word_dim, 77 | word_lstm_dim, 78 | word_bidirect, 79 | lr_method, 80 | lr_rate, 81 | clip_norm, 82 | crf, 83 | is_train, 84 | **kwargs 85 | ): 86 | """ 87 | Build the network. 88 | """ 89 | # Training parameters 90 | n_words = len(self.id_to_word) 91 | n_chars = len(self.id_to_char) 92 | n_tags = len(self.id_to_tag) 93 | 94 | # Network variables 95 | self.word_ids = tf.placeholder(tf.int32, shape=[None, None], name='word_ids') # shape:[batch_size, max_word_len] 96 | self.word_pos_ids = tf.placeholder(tf.int32, shape=[None], name='word_pos_ids') # shape: [batch_size] 97 | self.char_for_ids = tf.placeholder(tf.int32, shape=[None, None, None], name='char_for_ids') # shape: [batch_size, word_max_len, char_max_len] 98 | self.char_rev_ids = tf.placeholder(tf.int32, shape=[None, None, None], name='char_rev_ids') # shape: [batch_size, word_max_len, char_max_len] 99 | self.char_pos_ids = tf.placeholder(tf.int32, shape=[None, None], name='char_pos_ids') # shape: [batch_size*word_max_len, char_max_len] 100 | self.tag_ids = tf.placeholder(tf.int32, shape=[None, None], name='tag_ids') # shape: [batch_size,word_max_len] 101 | self.tag_id_trans = tf.placeholder(tf.int32, shape=[None, None, None], name='tag_id_trans') # shape: [batch_size,word_max_len+1,2] 102 | self.tag_id_index = tf.placeholder(tf.int32, shape=[None, None, None], name='tag_id_index') # shape: [batch_size,word_max_len,2] 103 | # Final input (all word features) 104 | input_dim = 0 105 | inputs = [] 106 | # 107 | # Word inputs 108 | # 109 | if word_dim: 110 | input_dim += word_dim 111 | with tf.device("/cpu:0"): 112 | word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') 113 | word_input = word_layer.link(self.word_ids) 114 | inputs.append(word_input) 115 | 116 | # 117 | # Phars inputs 118 | # 119 | if char_dim: 120 | input_dim += char_lstm_dim 121 | char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') 122 | 123 | char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, 124 | name='char_lstm_for') 125 | char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, 126 | name='char_lstm_rev') 127 | 128 | with tf.device("/cpu:0"): 129 | char_for_embedding_batch = char_layer.link(self.char_for_ids) 130 | char_rev_embedding_batch = char_layer.link(self.char_rev_ids) 131 | shape_for = tf.shape(char_for_embedding_batch) 132 | # reshape from [batch_size, word_max_len, char_max_len, char_dim] to [batch_size*word_max_len, char_max_len, char_dim] 133 | char_for_embedding = tf.reshape(char_for_embedding_batch, 134 | (shape_for[0]*shape_for[1], shape_for[2], shape_for[3])) 135 | shape_rev = tf.shape(char_rev_embedding_batch) 136 | char_rev_embedding = tf.reshape(char_rev_embedding_batch, 137 | (shape_rev[0] * shape_rev[1], shape_rev[2], shape_rev[3])) 138 | char_lstm_for_states = char_lstm_for.link(char_for_embedding) 139 | char_lstm_rev_states = char_lstm_rev.link(char_rev_embedding) 140 | char_lstm_for_h_trans = tf.transpose(char_lstm_for_states[1], (1, 0, 2), name='char_lstm_for_h_trans') 141 | char_lstm_rev_h_trans = tf.transpose(char_lstm_rev_states[1], (1, 0, 2), name='char_lstm_rev_h_trans') 142 | char_for_output = tf.gather_nd(char_lstm_for_h_trans, self.char_pos_ids, name='char_for_output') 
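# char_pos_ids holds (sequence index, last-character index) pairs, so these gather_nd
# calls select the character-LSTM hidden state at each word's final character,
# giving one fixed-size character-level vector per word.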
143 | char_rev_output = tf.gather_nd(char_lstm_rev_h_trans, self.char_pos_ids, name='char_rev_output') 144 | char_for_output_batch = tf.reshape(char_for_output, (shape_for[0], shape_for[1], char_lstm_dim)) 145 | char_rev_output_batch = tf.reshape(char_rev_output, (shape_rev[0], shape_rev[1], char_lstm_dim)) 146 | inputs.append(char_for_output_batch) 147 | if char_bidirect: 148 | inputs.append(char_rev_output_batch) 149 | input_dim += char_lstm_dim 150 | inputs = tf.concat(inputs, axis=-1) 151 | # Dropout on final input 152 | assert dropout < 1 and 0.0 <= dropout 153 | if dropout: 154 | input_train = tf.nn.dropout(inputs, 1 - dropout) 155 | if is_train: 156 | inputs = input_train 157 | # LSTM for words 158 | word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=True, 159 | name='word_lstm_for') 160 | word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=True, 161 | name='word_lstm_rev') 162 | # fordword hidden output 163 | word_states_for = word_lstm_for.link(inputs) 164 | word_lstm_for_output = tf.transpose(word_states_for[1], (1, 0, 2), name='word_lstm_for_h_trans') 165 | 166 | # reverse hidden ouput 167 | inputs_rev = tf.reverse_sequence(inputs, self.word_pos_ids, seq_dim=1, batch_dim=0) 168 | word_states_rev = word_lstm_rev.link(inputs_rev) 169 | word_lstm_rev_h_trans = tf.transpose(word_states_rev[1], (1, 0, 2), name='word_lstm_rev_h_trans') 170 | word_lstm_rev_output = tf.reverse_sequence(word_lstm_rev_h_trans, self.word_pos_ids, seq_dim=1, batch_dim=0) 171 | if word_bidirect: 172 | final_output = tf.concat([word_lstm_for_output, word_lstm_rev_output],axis=-1) 173 | tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') 174 | final_output = tanh_layer.link(final_output) 175 | else: 176 | final_output = word_lstm_for_output 177 | final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer') 178 | tags_scores = final_layer.link(final_output) 179 | # No CRF 180 | if not crf: 181 | cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.tag_ids, logits=tags_scores, name='xentropy') 182 | cost = tf.reduce_mean(cross_entropy, name='xentropy_mean') 183 | else: 184 | transitions = shared((n_tags + 2, n_tags + 2), 'transitions') 185 | small = -1000 186 | b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) 187 | e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) 188 | 189 | # for batch observation 190 | #def recurrence(prev, obs): 191 | # s_len = tf.shape(obs)[0] 192 | # obvs = tf.concat([obs, small * tf.ones((s_len, 2))], axis=1) 193 | # observations = tf.concat([b_s, obvs, e_s], axis=0) 194 | # return observations 195 | #tags_scores_shape = tf.shape(tags_scores) 196 | #obs_initial = tf.ones((tags_scores_shape[1] + 2, n_tags + 2)) 197 | #obs_batch = tf.scan(fn=recurrence, elems=tags_scores, initializer=obs_initial) 198 | 199 | # Score from tags 200 | def recurrence_real_score(prev,obs): 201 | tags_score = obs[0] 202 | tag_id_index_ = obs[1] 203 | tag_id_trans_= obs[2] 204 | word_pos_ = obs[3] + 1 205 | tags_score_slice = tags_score[0:word_pos_,:] 206 | tag_id_index_slice = tag_id_index_[0:word_pos_,:] 207 | tag_id_trans_slice = tag_id_trans_[0:(word_pos_+1),:] 208 | real_path_score = tf.reduce_sum(tf.gather_nd(tags_score_slice, tag_id_index_slice)) 209 | real_path_score += tf.reduce_sum(tf.gather_nd(transitions, tag_id_trans_slice)) 210 | return tf.reshape(real_path_score,[]) 211 | real_path_score_list = tf.scan(fn=recurrence_real_score, elems=[tags_scores, self.tag_id_index, 
self.tag_id_trans, self.word_pos_ids], initializer=0.0) 212 | 213 | def recurrence_all_path(prev, obs): 214 | tags_score = obs[0] 215 | word_pos_ = obs[1] + 1 216 | tags_score_slice = tags_score[0:word_pos_,:] 217 | s_len = tf.shape(tags_score_slice)[0] 218 | obvs = tf.concat([tags_score_slice, small * tf.ones((s_len, 2))], axis=1) 219 | observations = tf.concat([b_s, obvs, e_s], axis=0) 220 | all_paths_scores = forward(observations, transitions) 221 | return tf.reshape(all_paths_scores,[]) 222 | all_paths_scores_list = tf.scan(fn=recurrence_all_path, elems=[tags_scores, self.word_pos_ids], initializer=0.0) 223 | cost = - tf.reduce_mean(real_path_score_list - all_paths_scores_list) 224 | # Network parameters 225 | if not crf: 226 | f_score = tf.nn.softmax(tags_scores) 227 | else: 228 | def recurrence_predict(prev, obs): 229 | tags_score = obs[0] 230 | word_pos_ = obs[1] + 1 231 | tags_score_slice = tags_score[0:word_pos_,:] 232 | s_len = tf.shape(tags_score_slice)[0] 233 | obvs = tf.concat([tags_score_slice, small * tf.ones((s_len, 2))], axis=1) 234 | observations = tf.concat([b_s, obvs, e_s], axis=0) 235 | all_paths_scores = forward(observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True) 236 | all_paths_scores = tf.concat([all_paths_scores, tf.zeros([tf.shape(tags_score)[0]-s_len], tf.int32)], axis=0) 237 | return all_paths_scores 238 | f_score = tf.scan(fn=recurrence_predict, elems=[tags_scores, self.word_pos_ids], initializer=tf.zeros([tf.shape(tags_scores)[1]+2], tf.int32)) 239 | # Optimization 240 | tvars = tf.trainable_variables() 241 | grads = tf.gradients(cost, tvars) 242 | if clip_norm > 0: 243 | grads, _ = tf.clip_by_global_norm(grads, clip_norm) 244 | 245 | if lr_method == 'sgd': 246 | optimizer = tf.train.GradientDescentOptimizer(lr_rate) 247 | elif lr_method == 'adagrad': 248 | optimizer = tf.train.AdagradOptimizer(lr_rate) 249 | elif lr_method == 'adadelta': 250 | optimizer = tf.train.AdadeltaOptimizer(lr_rate) 251 | elif lr_method == 'adam': 252 | optimizer = tf.train.AdamOptimizer(lr_rate) 253 | elif lr_method == 'rmsprop': 254 | optimizer = tf.train.RMSPropOptimizer(lr_rate) 255 | else: 256 | raise("Not implemented learning method: %s" % lr_method) 257 | 258 | train_op = optimizer.apply_gradients(zip(grads, tvars)) 259 | 260 | return cost, f_score, train_op 261 | -------------------------------------------------------------------------------- /nn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from utils import shared 3 | 4 | 5 | class HiddenLayer(object): 6 | """ 7 | Hidden layer with or without bias. 
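Computes activation(matmul(input, weights) + bias); link() flattens the leading input dimensions before the matmul and restores them afterwards.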
8 | Input: tensor of dimension (dims*, input_dim) 9 | Output: tensor of dimension (dims*, output_dim) 10 | """ 11 | def __init__(self, input_dim, output_dim, bias=True, activation=None, 12 | name='hidden_layer'): 13 | self.input_dim = input_dim 14 | self.output_dim = output_dim 15 | self.bias = bias 16 | self.name = name 17 | if activation is None: 18 | self.activation = None 19 | elif activation == 'tanh': 20 | self.activation = tf.nn.tanh 21 | elif activation == 'sigmoid': 22 | self.activation = tf.nn.sigmoid 23 | elif activation == 'softmax': 24 | self.activation = tf.nn.softmax 25 | else: 26 | raise Exception("Unknown activation function: " % activation) 27 | 28 | # Initialize weights and bias 29 | self.weights = shared((input_dim, output_dim), name + '__weights') 30 | self.bias = shared((output_dim,), name + '__bias') 31 | 32 | def link(self, input): 33 | """ 34 | The input has to be a tensor with the right 35 | most dimension equal to input_dim. 36 | """ 37 | input_shape = tf.shape(input) 38 | self.input = tf.reshape(input, (input_shape[0]*input_shape[1], input_shape[-1])) 39 | self.linear_output = tf.matmul(self.input, self.weights) 40 | if self.bias: 41 | self.linear_output = self.linear_output + self.bias 42 | if self.activation is None: 43 | self.output = self.linear_output 44 | else: 45 | self.output = self.activation(self.linear_output) 46 | self.output = tf.reshape(self.output, (input_shape[0], input_shape[1], self.output_dim)) 47 | return self.output 48 | 49 | 50 | class EmbeddingLayer(object): 51 | """ 52 | Embedding layer: word embeddings representations 53 | Input: tensor of dimension (dim*) with values in range(0, input_dim) 54 | Output: tensor of dimension (dim*, output_dim) 55 | """ 56 | 57 | def __init__(self, input_dim, output_dim, name='embedding_layer'): 58 | """ 59 | Typically, input_dim is the vocabulary size, 60 | and output_dim the embedding dimension. 61 | """ 62 | self.input_dim = input_dim 63 | self.output_dim = output_dim 64 | self.name = name 65 | 66 | # Randomly generate weights 67 | self.embeddings = shared((input_dim, output_dim), 68 | self.name + '__embeddings') 69 | 70 | def link(self, input): 71 | """ 72 | Return the embeddings of the given indexes. 73 | Input: tensor of shape (dim*) 74 | Output: tensor of shape (dim*, output_dim) 75 | """ 76 | self.input = input 77 | self.output = tf.gather(self.embeddings, input) 78 | return self.output 79 | 80 | 81 | class DropoutLayer(object): 82 | """ 83 | Dropout layer. Randomly set to 0 values of the input 84 | with probability p. 85 | """ 86 | def __init__(self, p=0.5, name='dropout_layer'): 87 | """ 88 | p has to be between 0 and 1 (1 excluded). 89 | p is the probability of dropping out a unit, so 90 | setting p to 0 is equivalent to have an identity layer. 91 | """ 92 | assert 0. <= p < 1. 93 | self.p = p 94 | self.name = name 95 | 96 | def link(self, input): 97 | """ 98 | Dropout link: we just apply mask to the input. 99 | """ 100 | if self.p > 0: 101 | self.output = tf.nn.dropout(input, 1 - self.p) 102 | else: 103 | self.output = input 104 | 105 | return self.output 106 | 107 | 108 | class LSTM(object): 109 | """ 110 | Long short-term memory (LSTM). Can be used with or without batches. 
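Note: this is a simplified LSTM whose forget gate is tied to the input gate, i.e. c_t = (1 - i_t) * c_{t-1} + i_t * candidate, so no separate forget-gate parameters are trained (their definitions are commented out below).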
111 | Without batches: 112 | Input: matrix of dimension (sequence_length, input_dim) 113 | Output: vector of dimension (output_dim) 114 | With batches: 115 | Input: tensor3 of dimension (batch_size, sequence_length, input_dim) 116 | Output: matrix of dimension (batch_size, output_dim) 117 | """ 118 | def __init__(self, input_dim, hidden_dim, with_batch=True, name='LSTM'): 119 | """ 120 | Initialize neural network. 121 | """ 122 | self.input_dim = input_dim 123 | self.hidden_dim = hidden_dim 124 | self.with_batch = with_batch 125 | self.name = name 126 | 127 | # Input gate weights 128 | self.w_xi = shared((input_dim, hidden_dim), name + '__w_xi') 129 | self.w_hi = shared((hidden_dim, hidden_dim), name + '__w_hi') 130 | self.w_ci = shared((hidden_dim, hidden_dim), name + '__w_ci') 131 | 132 | # Forget gate weights 133 | # self.w_xf = shared((input_dim, hidden_dim), name + '__w_xf') 134 | # self.w_hf = shared((hidden_dim, hidden_dim), name + '__w_hf') 135 | # self.w_cf = shared((hidden_dim, hidden_dim), name + '__w_cf') 136 | 137 | # Output gate weights 138 | self.w_xo = shared((input_dim, hidden_dim), name + '__w_xo') 139 | self.w_ho = shared((hidden_dim, hidden_dim), name + '__w_ho') 140 | self.w_co = shared((hidden_dim, hidden_dim), name + '__w_co') 141 | 142 | # Cell weights 143 | self.w_xc = shared((input_dim, hidden_dim), name + '__w_xc') 144 | self.w_hc = shared((hidden_dim, hidden_dim), name + '__w_hc') 145 | 146 | # Initialize the bias vectors, c_0 and h_0 to zero vectors 147 | self.b_i = shared((hidden_dim,), name + '__b_i') 148 | # self.b_f = shared((hidden_dim,), name + '__b_f') 149 | self.b_c = shared((hidden_dim,), name + '__b_c') 150 | self.b_o = shared((hidden_dim,), name + '__b_o') 151 | self.c_0 = shared((hidden_dim,), name + '__c_0') 152 | self.h_0 = shared((hidden_dim,), name + '__h_0') 153 | 154 | def link(self, input): 155 | """ 156 | Propagate the input through the network and return the last hidden 157 | vector. The whole sequence is also accessible via self.h, but 158 | where self.h of shape (sequence_length, batch_size, output_dim) 159 | """ 160 | def recurrence(prev, x_t): 161 | c_tm1 = prev[0] 162 | h_tm1 = prev[1] 163 | if len(x_t.shape) == 1: 164 | x_t = tf.reshape(x_t, [1, self.input_dim]) 165 | if len(c_tm1.shape) == 1: 166 | c_tm1 = tf.reshape(c_tm1, [1, self.hidden_dim]) 167 | if len(h_tm1.shape) == 1: 168 | h_tm1 = tf.reshape(h_tm1, [1, self.hidden_dim]) 169 | i_t = tf.nn.sigmoid(tf.matmul(x_t, self.w_xi) + 170 | tf.matmul(h_tm1, self.w_hi) + 171 | tf.matmul(c_tm1, self.w_ci) + 172 | self.b_i) 173 | # f_t = T.nnet.sigmoid(T.dot(x_t, self.w_xf) + 174 | # T.dot(h_tm1, self.w_hf) + 175 | # T.dot(c_tm1, self.w_cf) + 176 | # self.b_f) 177 | c_t = ((1 - i_t) * c_tm1 + i_t * tf.nn.tanh(tf.matmul(x_t, self.w_xc) + 178 | tf.matmul(h_tm1, self.w_hc) + self.b_c)) 179 | o_t = tf.nn.sigmoid(tf.matmul(x_t, self.w_xo) + 180 | tf.matmul(h_tm1, self.w_ho) + 181 | tf.matmul(c_t, self.w_co) + 182 | self.b_o) 183 | h_t = o_t * tf.nn.tanh(c_t) 184 | if self.with_batch == False: 185 | c_t = tf.squeeze(c_t, axis=[0]) 186 | h_t = tf.squeeze(h_t, axis=[0]) 187 | return [c_t, h_t] 188 | # If we use batches, we have to permute the first and second dimension. 
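# The scan over a dummy vector of length batch_size below only tiles the initial state
# (c_0, h_0) across the batch; the input is then made time-major so that the outer
# scan iterates over time steps, and states[1] ends up with shape
# (sequence_length, batch_size, hidden_dim).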
189 | if self.with_batch: 190 | batch_size = tf.shape(input)[0] 191 | zeros = tf.ones([batch_size]) 192 | def alloc(prev, x): 193 | return [self.c_0, self.h_0] 194 | out_info = [self.c_0, self.h_0] 195 | outputs_info = tf.scan(fn=alloc, elems=zeros, initializer=out_info, name='batch_init') 196 | self.input = tf.transpose(input, (1, 0, 2)) 197 | 198 | else: 199 | self.input = input 200 | outputs_info = [self.c_0, self.h_0] 201 | 202 | states = tf.scan( 203 | fn=recurrence, 204 | elems=self.input, 205 | initializer=outputs_info, 206 | name='state' 207 | ) 208 | return states 209 | 210 | 211 | class GRU(object): 212 | """ 213 | Gated recurrent unit (GRU). Can be used with or without batches. 214 | Without batches: 215 | Input: matrix of dimension (sequence_length, input_dim) 216 | Output: vector of dimension (output_dim) 217 | With batches: 218 | Input: tensor3 of dimension (batch_size, sequence_length, input_dim) 219 | Output: matrix of dimension (batch_size, output_dim) 220 | """ 221 | def __init__(self, input_dim, hidden_dim, with_batch=True, name='GRU'): 222 | """ 223 | Initialize neural network. 224 | """ 225 | self.input_dim = input_dim 226 | self.hidden_dim = hidden_dim 227 | self.with_batch = with_batch 228 | self.name = name 229 | 230 | # Input weight tensor 231 | self.w_x = shared((input_dim, hidden_dim), name + '__w_x') 232 | 233 | # Reset weight tensor 234 | self.w_xr = shared((input_dim, hidden_dim), name + '__w_xr') 235 | self.w_hr = shared((hidden_dim, hidden_dim), name + '__w_hr') 236 | 237 | # Update weight tensor 238 | self.w_xz = shared((input_dim, hidden_dim), name + '__w_xz') 239 | self.w_hz = shared((hidden_dim, hidden_dim), name + '__w_hz') 240 | 241 | # Hidden weight tensor 242 | self.w_h = shared((hidden_dim, hidden_dim), name + '__w_h') 243 | 244 | # Initialize the bias vectors, h_0 to zero vectors 245 | self.b_r = tf.Variable(tf.truncated_normal((hidden_dim,), mean=1), name=(name + '__b_r')) 246 | self.b_z = tf.Variable(tf.truncated_normal((hidden_dim,), mean=1), name=(name + '__b_z')) 247 | self.h_0 = shared((hidden_dim,), name + '__h_0') 248 | 249 | def link(self, input): 250 | """ 251 | Propagate the input through the network and return the last hidden 252 | vector. The whole sequence is also accessible via self.h, but 253 | where self.h of shape (sequence_length, batch_size, output_dim) 254 | """ 255 | 256 | def recurrence(previous_hidden_state, x_t): 257 | if len(x_t.shape) == 1: 258 | x_t = tf.reshape(x_t, [1, self.input_dim]) 259 | if len(previous_hidden_state.shape) == 1: 260 | previous_hidden_state = tf.reshape(previous_hidden_state, [1, self.hidden_dim]) 261 | 262 | # update gate 263 | z_t = tf.sigmoid(tf.matmul(x_t, self.w_xz) + tf.matmul(previous_hidden_state, self.w_hz) + self.b_z) 264 | # reset gate 265 | r_t = tf.sigmoid(tf.matmul(x_t, self.w_xr) + tf.matmul(previous_hidden_state, self.w_hr) + self.b_r) 266 | # candidate activation 267 | h_ = tf.tanh(tf.matmul(x_t, self.w_x) + tf.matmul(tf.multiply(previous_hidden_state, r_t), self.w_h)) 268 | 269 | h_t = tf.multiply((1 - z_t), previous_hidden_state) + tf.multiply(z_t, h_) 270 | 271 | if self.with_batch == False: 272 | h_t = tf.squeeze(h_t, axis=[0]) 273 | return h_t 274 | 275 | # If we use batches, we have to permute the first and second dimension. 
276 | if self.with_batch: 277 | batch_size = tf.shape(input)[0] 278 | zeros = tf.ones([batch_size]) 279 | 280 | def alloc(prev, x): 281 | return self.h_0 282 | 283 | out_info = self.h_0 284 | outputs_info = tf.scan(fn=alloc, elems=zeros, initializer=out_info, name='batch_init') 285 | self.input = tf.transpose(input, (1, 0, 2)) 286 | 287 | else: 288 | self.input = input 289 | outputs_info = self.h_0 290 | 291 | states = tf.scan( 292 | fn=recurrence, 293 | elems=self.input, 294 | initializer=outputs_info, 295 | name='state' 296 | ) 297 | return states 298 | 299 | 300 | def log_sum_exp(x, axis=None): 301 | """ 302 | Sum probabilities in the log-space. 303 | """ 304 | xmax = tf.reduce_max(x, axis=axis, keep_dims=True) 305 | xmax_ = tf.reduce_max(x, axis=axis) 306 | return xmax_ + tf.log(tf.reduce_sum(tf.exp(x - xmax), axis=axis)) 307 | 308 | 309 | def get_array_arg_max_coordinate(x): 310 | """ 311 | Get the coodinate of the max score in each row of the matrix x 312 | :param x: matrix 313 | :return: coodinate 314 | """ 315 | shape_x = tf.shape(x) 316 | row_size = shape_x[0] 317 | row_size_range = tf.range(0, row_size) 318 | row_size_range_reshape = tf.reshape(row_size_range, (row_size, 1)) 319 | x_reshape = tf.reshape(x, (row_size, 1)) 320 | row_argmax_coodinate = tf.concat([row_size_range_reshape, x_reshape], axis=1) 321 | return row_argmax_coodinate 322 | 323 | 324 | def forward_batch(observations, transitions, viterbi=False, 325 | return_alpha=False, return_best_sequence=False): 326 | """ 327 | Takes as input: 328 | - observations, sequence of shape (batch_size, n_steps, n_classes) 329 | - transitions, sequence of shape (n_classes, n_classes) 330 | Probabilities must be given in the log space. 331 | Compute alpha, matrix of size (n_steps, batch_size n_classes), such that 332 | alpha[i, j] represents one of these 2 values: 333 | - the probability that the real path at node i ends in j 334 | - the maximum probability of a path finishing in j at node i (Viterbi) 335 | Returns one of these 2 values: 336 | - alpha 337 | - the final probability, which can be: 338 | - the sum of the probabilities of all paths 339 | - the probability of the best path (Viterbi) 340 | """ 341 | assert not return_best_sequence or (viterbi and not return_alpha) 342 | 343 | shape_t = transitions.get_shape().dims 344 | transitions_ = tf.reshape(transitions, (1, shape_t[0].value, shape_t[1].value)) 345 | 346 | def recurrence(prev, obs): 347 | previous = prev 348 | if return_best_sequence: 349 | previous = prev[0] 350 | shape_ = tf.shape(previous) 351 | previous = tf.reshape(previous, (shape_[0], shape_t[0].value, 1)) 352 | obs = tf.reshape(obs, (shape_[0], 1, shape_t[0].value)) 353 | if viterbi: 354 | scores = previous + obs + transitions_ 355 | out = tf.reduce_max(scores, axis=1) 356 | if return_best_sequence: 357 | out2 = tf.argmax(scores, axis=1) 358 | return [out, out2] 359 | else: 360 | return out 361 | else: 362 | return log_sum_exp(previous + obs + transitions, axis=1) 363 | 364 | obs = tf.transpose(observations, (1, 0, 2)) 365 | initial = obs[0] 366 | ones = tf.ones(tf.shape(initial), dtype=tf.int64) 367 | if return_best_sequence: 368 | initial = [initial, ones] 369 | alpha = tf.scan( 370 | fn=recurrence, 371 | elems=obs[1:], 372 | initializer=initial 373 | ) 374 | if return_alpha: 375 | return alpha 376 | elif return_best_sequence: 377 | output_info = get_array_arg_max_coordinate(tf.cast(tf.argmax(alpha[0][-1], axis=1), tf.int32)) 378 | 379 | def recurrence_cal(prev, x): 380 | sequ = tf.gather_nd(x, prev) 381 | 
return get_array_arg_max_coordinate(sequ) 382 | sequence = tf.scan( 383 | fn=recurrence_cal, 384 | elems=tf.cast(alpha[1][::-1], tf.int32), 385 | initializer=output_info 386 | ) 387 | sequence = sequence[:, :, -1] 388 | sequence = tf.concat([sequence[::-1], [tf.cast(tf.argmax(alpha[0][-1], axis=1), tf.int32)]], axis=0) 389 | return tf.transpose(sequence) 390 | else: 391 | if viterbi: 392 | return tf.reduce_max(alpha[-1], axis=1) 393 | else: 394 | return log_sum_exp(alpha[-1], axis=1) 395 | 396 | 397 | def forward(observations, transitions, viterbi=False, 398 | return_alpha=False, return_best_sequence=False): 399 | """ 400 | Takes as input: 401 | - observations, sequence of shape (n_steps, n_classes) 402 | - transitions, sequence of shape (n_classes, n_classes) 403 | Probabilities must be given in the log space. 404 | Compute alpha, matrix of size (n_steps, n_classes), such that 405 | alpha[i, j] represents one of these 2 values: 406 | - the probability that the real path at node i ends in j 407 | - the maximum probability of a path finishing in j at node i (Viterbi) 408 | Returns one of these 2 values: 409 | - alpha 410 | - the final probability, which can be: 411 | - the sum of the probabilities of all paths 412 | - the probability of the best path (Viterbi) 413 | """ 414 | assert not return_best_sequence or (viterbi and not return_alpha) 415 | 416 | def recurrence(prev, obs): 417 | previous = prev 418 | if return_best_sequence: 419 | previous = prev[0] 420 | previous = tf.expand_dims(previous, 1) 421 | obs = tf.expand_dims(obs, 0) 422 | if viterbi: 423 | scores = previous + obs + transitions 424 | out = tf.reduce_max(scores, axis=0) 425 | if return_best_sequence: 426 | out2 = tf.argmax(scores, axis=0) 427 | return [out, out2] 428 | else: 429 | return out 430 | else: 431 | return log_sum_exp(previous + obs + transitions, axis=0) 432 | 433 | 434 | initial = observations[0] 435 | ones = tf.ones(tf.shape(initial), dtype=tf.int64) 436 | if return_best_sequence: 437 | initial = [initial, ones] 438 | alpha = tf.scan( 439 | fn=recurrence, 440 | elems=observations[1:], 441 | initializer=initial 442 | ) 443 | if return_alpha: 444 | return alpha 445 | elif return_best_sequence: 446 | output_info = tf.cast(tf.argmax(alpha[0][-1], axis=0), tf.int32) 447 | sequence = tf.scan( 448 | fn=lambda previous, beta_i: beta_i[previous], 449 | elems=tf.cast(alpha[1][::-1], tf.int32), 450 | initializer=output_info 451 | ) 452 | sequence = tf.concat([sequence[::-1], [tf.cast(tf.argmax(alpha[0][-1], axis=0), tf.int32)]], axis=0) 453 | return sequence 454 | else: 455 | if viterbi: 456 | return tf.reduce_max(alpha[-1], axis=0) 457 | else: 458 | return log_sum_exp(alpha[-1], axis=0) 459 | -------------------------------------------------------------------------------- /tagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import time 5 | import codecs 6 | import optparse 7 | from loader import prepare_sentence, prepare_sentence_ 8 | from utils import create_input_batch, zero_digits 9 | from model import Model 10 | import tensorflow as tf 11 | 12 | optparser = optparse.OptionParser() 13 | optparser.add_option( 14 | "-m", "--model", default="", 15 | help="Model location" 16 | ) 17 | optparser.add_option( 18 | "-s", "--saver", default="", 19 | help="tf checkpoint location" 20 | ) 21 | optparser.add_option( 22 | "-i", "--input", default="", 23 | help="Input file location" 24 | ) 25 | optparser.add_option( 26 | "-o", "--output", 
default="", 27 | help="Output file location" 28 | ) 29 | optparser.add_option( 30 | "-d", "--delimiter", default="__", 31 | help="Delimiter to separate words from their tags" 32 | ) 33 | 34 | opts = optparser.parse_args()[0] 35 | 36 | # Check parameters validity 37 | assert opts.delimiter 38 | assert os.path.isdir(opts.model) 39 | assert os.path.isdir(opts.saver) 40 | assert os.path.isfile(opts.input) 41 | 42 | 43 | # Load existing model 44 | print "Loading model..." 45 | model = Model(model_path=opts.model) 46 | parameters = model.parameters 47 | parameters['is_train'] = 0 48 | parameters['dropout'] = 0 49 | batch_size = parameters['batch_size'] 50 | # Load reverse mappings 51 | word_to_id, char_to_id, tag_to_id = [ 52 | {v: k for k, v in x.items()} 53 | for x in [model.id_to_word, model.id_to_char, model.id_to_tag] 54 | ] 55 | tag_count = len(tag_to_id) 56 | # Load the model 57 | cost, f_eval, _ = model.build(**parameters) 58 | 59 | f_output = codecs.open(opts.output, 'w', 'utf-8') 60 | start = time.time() 61 | saver = tf.train.Saver() 62 | print 'Tagging...' 63 | with tf.Session() as sess: 64 | ckpt = tf.train.get_checkpoint_state(opts.saver) 65 | if ckpt and ckpt.model_checkpoint_path: 66 | saver.restore(sess, ckpt.model_checkpoint_path) 67 | test_data = [] 68 | word_data = [] 69 | with codecs.open(opts.input, 'r', 'utf-8') as f_input: 70 | for line in f_input: 71 | words = line.rstrip().split() 72 | if line: 73 | # Lowercase sentence 74 | if parameters['lower']: 75 | line = line.lower() 76 | # Replace all digits with zeros 77 | if parameters['zeros']: 78 | line = zero_digits(line) 79 | # Prepare input 80 | if parameters['char_dim']: 81 | sentence = prepare_sentence(words, word_to_id, char_to_id, 82 | lower=parameters['lower']) 83 | else: 84 | sentence = prepare_sentence_(words, word_to_id,lower=parameters['lower']) 85 | test_data.append(sentence) 86 | word_data.append(words) 87 | else: 88 | continue 89 | count = 0 90 | assert len(test_data) == len(word_data) 91 | while count < len(test_data): 92 | batch_data = [] 93 | batch_words = [] 94 | for i in xrange(batch_size): 95 | index = i + count 96 | if index >= len(test_data): 97 | break 98 | data = test_data[index] 99 | batch_data.append(test_data[index]) 100 | batch_words.append(word_data[index]) 101 | if len(batch_data) <= 0: 102 | break 103 | input_ = create_input_batch(batch_data, parameters) 104 | feed_dict_ = {} 105 | if parameters['char_dim']: 106 | feed_dict_[model.word_ids] = input_[0] 107 | feed_dict_[model.word_pos_ids] = input_[1] 108 | feed_dict_[model.char_for_ids] = input_[2] 109 | feed_dict_[model.char_rev_ids] = input_[3] 110 | feed_dict_[model.char_pos_ids] = input_[4] 111 | else: 112 | feed_dict_[model.word_ids] = input_[0] 113 | feed_dict_[model.word_pos_ids] = input_[1] 114 | f_scores = sess.run(f_eval, feed_dict=feed_dict_) 115 | # Decoding 116 | if parameters['crf']: 117 | for x in xrange(len(batch_data)): 118 | f_score = f_scores[x] 119 | word_pos = input_[1][x] + 2 120 | y_pred = f_score[1:word_pos] 121 | words = batch_words[x] 122 | y_preds = [model.id_to_tag[pred] for pred in y_pred] 123 | assert len(words) == len(y_preds) 124 | # Write tags 125 | f_output.write('%s\n' % ' '.join('%s%s%s' % (w, opts.delimiter, y) for w, y in zip(words, y_preds))) 126 | else: 127 | f_score = f_scores.argmax(axis=-1) 128 | for x in xrange(len(batch_data)): 129 | word_pos = input_[1][x] + 1 130 | y_pred = f_score[x][0:word_pos] 131 | words = batch_words[x] 132 | y_preds = [model.id_to_tag[pred] for pred in y_pred] 133 | assert 
len(words) == len(y_preds) 134 | # Write tags 135 | f_output.write('%s\n' % ' '.join('%s%s%s' % (w, opts.delimiter, y) for w, y in zip(words, y_preds))) 136 | count += len(batch_data) 137 | print '---- %i lines tagged in %.4fs ----' % (count, time.time() - start) 138 | f_output.close() 139 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import datetime 5 | import numpy as np 6 | import optparse 7 | from collections import OrderedDict 8 | from utils import create_input_batch 9 | import loader 10 | 11 | from utils import models_path, models_saver_path, evaluate 12 | from loader import word_mapping, char_mapping, tag_mapping 13 | from loader import prepare_dataset, prepare_dataset_ 14 | from model import Model 15 | import tensorflow as tf 16 | 17 | # Read parameters from command line 18 | optparser = optparse.OptionParser() 19 | optparser.add_option( 20 | "-T", "--train", default="", 21 | help="Train set location" 22 | ) 23 | optparser.add_option( 24 | "-d", "--dev", default="", 25 | help="Dev set location" 26 | ) 27 | optparser.add_option( 28 | "-t", "--test", default="", 29 | help="Test set location" 30 | ) 31 | optparser.add_option( 32 | "-l", "--lower", default="0", 33 | type='int', help="Lowercase words (this will not affect character inputs)" 34 | ) 35 | optparser.add_option( 36 | "-z", "--zeros", default="0", 37 | type='int', help="Replace digits with 0" 38 | ) 39 | optparser.add_option( 40 | "-c", "--char_dim", default="25", 41 | type='int', help="Char embedding dimension" 42 | ) 43 | optparser.add_option( 44 | "-C", "--char_lstm_dim", default="25", 45 | type='int', help="Char LSTM hidden layer size" 46 | ) 47 | optparser.add_option( 48 | "-b", "--char_bidirect", default="1", 49 | type='int', help="Use a bidirectional LSTM for chars" 50 | ) 51 | optparser.add_option( 52 | "-w", "--word_dim", default="100", 53 | type='int', help="Token embedding dimension" 54 | ) 55 | optparser.add_option( 56 | "-W", "--word_lstm_dim", default="100", 57 | type='int', help="Token LSTM hidden layer size" 58 | ) 59 | optparser.add_option( 60 | "-B", "--word_bidirect", default="1", 61 | type='int', help="Use a bidirectional LSTM for words" 62 | ) 63 | optparser.add_option( 64 | "-f", "--crf", default="1", 65 | type='int', help="Use CRF (0 to disable)" 66 | ) 67 | optparser.add_option( 68 | "-D", "--dropout", default="0", 69 | type='float', help="Droupout on the input (0 = no dropout)" 70 | ) 71 | optparser.add_option( 72 | "-L", "--lr_method", default="sgd", 73 | help="Learning method (SGD, Adadelta, Adam..)" 74 | ) 75 | optparser.add_option( 76 | "-R", "--lr_rate", default="0.005", 77 | type='float', help="learning rate" 78 | ) 79 | optparser.add_option( 80 | "-p", "--clip_norm", default="0", 81 | type='float', help="The clipping ratio" 82 | ) 83 | optparser.add_option( 84 | "-r", "--mode", default="1", 85 | type='int', help="1 for Train and 0 for Test" 86 | ) 87 | optparser.add_option( 88 | "-G", "--batch_size", default="20", 89 | type='int', help="batch size" 90 | ) 91 | optparser.add_option( 92 | "-g", "--singleton", default="0", 93 | type='float', help=" whether it needs to replace singletons by the unknown word or not" 94 | ) 95 | optparser.add_option( 96 | "-E", "--epoch", default="50", 97 | type='int', help="number of epochs over the training set" 98 | ) 99 | optparser.add_option( 100 | "-F", "--freq", default="5000", 101 | 
type='int', help="evaluate on dev every freq_eval steps" 102 | ) 103 | optparser.add_option( 104 | "-Z", "--gpu_no", default="-1", 105 | type='int', help="whether using the cpu or gpu" 106 | ) 107 | opts = optparser.parse_args()[0] 108 | 109 | # Parse parameters 110 | parameters = OrderedDict() 111 | parameters['lower'] = opts.lower == 1 112 | parameters['zeros'] = opts.zeros == 1 113 | parameters['char_dim'] = opts.char_dim 114 | parameters['char_lstm_dim'] = opts.char_lstm_dim 115 | parameters['char_bidirect'] = opts.char_bidirect == 1 116 | parameters['word_dim'] = opts.word_dim 117 | parameters['word_lstm_dim'] = opts.word_lstm_dim 118 | parameters['word_bidirect'] = opts.word_bidirect == 1 119 | parameters['crf'] = opts.crf == 1 120 | parameters['dropout'] = opts.dropout 121 | parameters['lr_method'] = opts.lr_method 122 | parameters['lr_rate'] = opts.lr_rate 123 | parameters['clip_norm'] = opts.clip_norm 124 | parameters['is_train'] = opts.mode 125 | parameters['update'] = opts.update_scheme 126 | parameters['batch_size'] = opts.batch_size 127 | 128 | # Check parameters validity 129 | assert os.path.isfile(opts.train) 130 | assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0 131 | assert 0. <= parameters['dropout'] < 1.0 132 | 133 | if not os.path.exists(models_path): 134 | os.makedirs(models_path) 135 | if not os.path.exists(models_saver_path): 136 | os.makedirs(models_saver_path) 137 | 138 | # Initialize model 139 | model = Model(parameters=parameters, models_path=models_path) 140 | print "Model location: %s" % model.model_path 141 | 142 | # Data parameters 143 | lower = parameters['lower'] 144 | zeros = parameters['zeros'] 145 | batch_size = parameters['batch_size'] 146 | 147 | # Load sentences 148 | train_sentences = loader.load_sentences(opts.train, lower, zeros) 149 | dev_sentences = loader.load_sentences(opts.dev, lower, zeros) 150 | test_sentences = loader.load_sentences(opts.test, lower, zeros) 151 | 152 | # Create a dictionary / mapping of words 153 | dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower) 154 | dico_words_train = dico_words 155 | 156 | # Create a dictionary and a mapping for words / POS tags / tags 157 | id_to_char = {} 158 | if opts.char_dim: 159 | dico_chars, char_to_id, id_to_char = char_mapping(train_sentences) 160 | dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences) 161 | 162 | n_tag = len(id_to_tag) 163 | 164 | # Index data 165 | if opts.char_dim: 166 | train_data = prepare_dataset( 167 | train_sentences, word_to_id, char_to_id, tag_to_id, lower 168 | ) 169 | dev_data = prepare_dataset( 170 | dev_sentences, word_to_id, char_to_id, tag_to_id, lower 171 | ) 172 | test_data = prepare_dataset( 173 | test_sentences, word_to_id, char_to_id, tag_to_id, lower 174 | ) 175 | else: 176 | train_data = prepare_dataset_( 177 | train_sentences, word_to_id, tag_to_id, lower 178 | ) 179 | dev_data = prepare_dataset_( 180 | dev_sentences, word_to_id, tag_to_id, lower 181 | ) 182 | test_data = prepare_dataset_( 183 | test_sentences, word_to_id, tag_to_id, lower 184 | ) 185 | print "%i / %i / %i sentences in train / dev / test." % ( 186 | len(train_data), len(dev_data), len(test_data)) 187 | 188 | # Save the mappings to disk 189 | print 'Saving the mappings to disk...' 
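# The id_to_word / id_to_char / id_to_tag mappings are pickled next to the model so that
# tagger.py can rebuild exactly the same vocabulary indices at inference time.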
190 | model.save_mappings(id_to_word, id_to_char, id_to_tag) 191 | 192 | # Build the model 193 | if opts.gpu_no < 0: 194 | with tf.device("/cpu:0"): 195 | cost, tags_scores, train_op = model.build(**parameters) 196 | else: 197 | with tf.device("/gpu:" + str(opts.gpu_no)): 198 | cost, tags_scores, train_op = model.build(**parameters) 199 | # 200 | # Train network 201 | # 202 | singletons = None 203 | if opts.singleton: 204 | singletons = set([word_to_id[k] for k, v in dico_words_train.items() if v == 1]) 205 | 206 | n_epochs = opts.epoch # number of epochs over the training set 207 | freq_eval = opts.freq # evaluate on dev every freq_eval steps 208 | count = 0 209 | best_dev = -np.inf 210 | best_test = -np.inf 211 | saver = tf.train.Saver() 212 | start_time_all = datetime.datetime.now() 213 | config = tf.ConfigProto() 214 | config.gpu_options.allow_growth = True 215 | config.allow_soft_placement = True 216 | with tf.Session(config=config) as sess: 217 | sess.run(tf.global_variables_initializer()) 218 | for epoch in xrange(n_epochs): 219 | epoch_costs = [] 220 | epoch_accus = [] 221 | epoch_sentence = [] 222 | print "Starting epoch %i..." % epoch 223 | permutation_index = np.random.permutation(len(train_data)) 224 | train_data_count = 0 225 | start_time_epoch = datetime.datetime.now() 226 | token_count = 0.0 227 | while train_data_count <= len(permutation_index): 228 | batch_data = [] 229 | start_time = datetime.datetime.now() 230 | for i in xrange(batch_size): 231 | count += 1 232 | index = i + train_data_count 233 | if index >= len(permutation_index): 234 | index %= len(permutation_index) 235 | batch_data.append(train_data[permutation_index[index]]) 236 | input_ = create_input_batch(batch_data, parameters, n_tag, True, singletons) 237 | feed_dict_ = {} 238 | if parameters['char_dim']: 239 | assert len(input_) == 8 240 | feed_dict_[model.word_ids] = input_[0] 241 | feed_dict_[model.word_pos_ids] = input_[1] 242 | feed_dict_[model.char_for_ids] = input_[2] 243 | feed_dict_[model.char_rev_ids] = input_[3] 244 | feed_dict_[model.char_pos_ids] = input_[4] 245 | feed_dict_[model.tag_ids] = input_tag = input_[5] 246 | feed_dict_[model.tag_id_trans] = input_[6] 247 | feed_dict_[model.tag_id_index] = input_[7] 248 | else: 249 | assert len(input_) == 5 250 | feed_dict_[model.word_ids] = input_[0] 251 | feed_dict_[model.word_pos_ids] = input_[1] 252 | feed_dict_[model.tag_ids] = input_tag = input_[2] 253 | feed_dict_[model.tag_id_trans] = input_[3] 254 | feed_dict_[model.tag_id_index] = input_[4] 255 | new_cost, f_scores, _ = sess.run([cost, tags_scores, train_op], feed_dict=feed_dict_) 256 | accus_batch = [] 257 | sentence_batch = [] 258 | if parameters['crf']: 259 | for x in xrange(batch_size): 260 | f_score = f_scores[x] 261 | word_pos = input_[1][x] + 2 262 | y_pred = f_score[1:word_pos] 263 | y_real = input_tag[x][0:(word_pos-1)] 264 | correct_prediction = np.equal(y_pred, y_real) 265 | accus = np.array(correct_prediction).astype(float).sum() 266 | accus_mean = np.array(correct_prediction).astype(float).mean() 267 | accus_batch.append(accus) 268 | if accus_mean < 1.0: 269 | sentence_batch.append(0.0) 270 | else: 271 | sentence_batch.append(1.0) 272 | token_count += (input_[1][x] + 1) 273 | sentence_val = np.array(sentence_batch).astype(float).mean() 274 | else: 275 | y_preds = f_scores.argmax(axis=-1) 276 | y_reals = np.array(input_tag).astype(np.int32) 277 | for x in xrange(batch_size): 278 | word_pos = input_[1][x] + 1 279 | y_pred = y_preds[x][0:word_pos] 280 | y_real = y_reals[x][0:word_pos] 
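# Only the first word_pos positions are real tokens; the padded tail of each
# sequence is excluded from the accuracy counts.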
281 | correct_prediction = np.equal(y_pred, y_real) 282 | accus = np.array(correct_prediction).astype(float).sum() 283 | accus_mean = np.array(correct_prediction).astype(float).mean() 284 | accus_batch.append(accus) 285 | if accus_mean < 1.0: 286 | sentence_batch.append(0.0) 287 | else: 288 | sentence_batch.append(1.0) 289 | token_count += word_pos 290 | sentence_val = np.array(sentence_batch).astype(float).mean() 291 | epoch_costs.append(new_cost) 292 | epoch_accus.extend(accus_batch) 293 | epoch_sentence.append(sentence_val) 294 | end_time = datetime.datetime.now() 295 | cost_time = (end_time - start_time).seconds 296 | if train_data_count % freq_eval == 0 and train_data_count > 0: 297 | assert token_count != 0.0 298 | token_accus_freq = np.sum(epoch_accus) / token_count 299 | print "%i, cost average: %f, accuracy average: %f, sentence accuracy avg: %f, cost time: %i" % (train_data_count, np.mean(epoch_costs), token_accus_freq, np.mean(epoch_sentence), cost_time) 300 | if train_data_count % freq_eval == 0 and train_data_count > 0: 301 | dev_score, dev_sentence_score = evaluate(sess, tags_scores, model, parameters, dev_data, n_tag) 302 | test_score, test_sentence_score = evaluate(sess, tags_scores, model, parameters, test_data, n_tag) 303 | print "Score on dev: %.5f" % dev_score 304 | print "Score on test: %.5f" % test_score 305 | if dev_score > best_dev: 306 | best_dev = dev_score 307 | print "New best score on dev." 308 | print "Saving model to disk..." 309 | saver.save(sess, os.path.join(models_saver_path, 'model.ckpt'), global_step=count) 310 | if test_score > best_test: 311 | best_test = test_score 312 | print "New best score on test." 313 | train_data_count += batch_size 314 | assert token_count != 0.0 315 | token_accus_epoch = np.sum(epoch_accus) / token_count 316 | end_time_epoch = datetime.datetime.now() 317 | cost_time_epoch = (end_time_epoch - start_time_epoch).seconds 318 | print "Epoch %i done. Average cost: %f, Average accuracy: %f, Average sentence: %f, Cost time: %i" % (epoch, np.mean(epoch_costs), token_accus_epoch, np.mean(epoch_sentence), cost_time_epoch) 319 | end_time_all = datetime.datetime.now() 320 | cost_time_a = (end_time_all - start_time_all).seconds 321 | print "Epoch %i done. Cost time: %i" % (n_epochs, cost_time_a) 322 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import datetime 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | 7 | models_path = "./models" 8 | models_saver_path = "./models/saver" 9 | 10 | 11 | def get_name(parameters): 12 | """ 13 | Generate a model name from its parameters. 14 | """ 15 | l = [] 16 | for k, v in parameters.items(): 17 | if type(v) is str and "/" in v: 18 | l.append((k, v[::-1][:v[::-1].index('/')][::-1])) 19 | else: 20 | l.append((k, v)) 21 | name = ",".join(["%s=%s" % (k, str(v).replace(',', '')) for k, v in l]) 22 | return "".join(i for i in name if i not in "\/:*?<>|") 23 | 24 | 25 | def shared(shape, name): 26 | """ 27 | Create a shared object of a numpy array. 28 | """ 29 | if len(shape) == 1: 30 | # bias are initialized with zeros 31 | return tf.get_variable(name, shape, tf.float32, tf.constant_initializer(0)) 32 | else: 33 | drange = np.sqrt(6. 
/ (np.sum(shape))) 34 | return tf.get_variable(name, shape, tf.float32, tf.random_uniform_initializer(-drange, drange)) 35 | 36 | def create_dico(item_list): 37 | """ 38 | Create a dictionary of items from a list of list of items. 39 | """ 40 | assert type(item_list) is list 41 | dico = {} 42 | for items in item_list: 43 | for item in items: 44 | if item not in dico: 45 | dico[item] = 1 46 | else: 47 | dico[item] += 1 48 | return dico 49 | 50 | 51 | def create_mapping(dico): 52 | """ 53 | Create a mapping (item to ID / ID to item) from a dictionary. 54 | Items are ordered by decreasing frequency. 55 | """ 56 | sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0])) 57 | id_to_item = {i: v[0] for i, v in enumerate(sorted_items)} 58 | item_to_id = {v: k for k, v in id_to_item.items()} 59 | return item_to_id, id_to_item 60 | 61 | 62 | def zero_digits(s): 63 | """ 64 | Replace every digit in a string by a zero. 65 | """ 66 | return re.sub('\d', '0', s) 67 | 68 | 69 | def insert_singletons(words, singletons, p=0.5): 70 | """ 71 | Replace singletons by the unknown word with a probability p. 72 | """ 73 | new_words = [] 74 | for word in words: 75 | if word in singletons and np.random.uniform() < p: 76 | new_words.append(0) 77 | else: 78 | new_words.append(word) 79 | return new_words 80 | 81 | 82 | def pad_word_chars(words): 83 | """ 84 | Pad the characters of the words in a sentence. 85 | Input: 86 | - list of lists of ints (list of words, a word being a list of char indexes) 87 | Output: 88 | - padded list of lists of ints 89 | - padded list of lists of ints (where chars are reversed) 90 | - list of ints corresponding to the index of the last character of each word 91 | """ 92 | max_length = max([len(word) for word in words]) 93 | char_for = [] 94 | char_rev = [] 95 | char_pos = [] 96 | for word in words: 97 | padding = [0] * (max_length - len(word)) 98 | char_for.append(word + padding) 99 | char_rev.append(word[::-1] + padding) 100 | char_pos.append(len(word) - 1) 101 | return char_for, char_rev, char_pos 102 | 103 | 104 | def pad_word_chars(words, max_length): 105 | """ 106 | Pad the characters of the words in a sentence. 107 | Input: 108 | - list of lists of ints (list of words, a word being a list of char indexes) 109 | - the max length of word 110 | Output: 111 | - padded list of lists of ints 112 | - padded list of lists of ints (where chars are reversed) 113 | - list of ints corresponding to the index of the last character of each word 114 | """ 115 | char_for = [] 116 | char_rev = [] 117 | char_pos = [] 118 | for word in words: 119 | padding = [0] * (max_length - len(word)) 120 | char_for.append(word + padding) 121 | char_rev.append(word[::-1] + padding) 122 | char_pos.append(len(word) - 1) 123 | return char_for, char_rev, char_pos 124 | 125 | 126 | def pad_sentence_words(sentences): 127 | """ 128 | Pad the words of the sentence in the batch_sentence. 
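Example (illustrative): pad_sentence_words([[4, 9, 2], [6]]) returns ([[4, 9, 2], [6, 0, 0]], [2, 0]).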
129 | Input: 130 | - list of lists of ints (list of sentence, a sentence being a list of word indexes) 131 | Output: 132 | - padded list of lists of ints 133 | - padded list of lists of ints (where chars are reversed) 134 | - list of ints corresponding to the index of the last character of each word 135 | """ 136 | max_length = max([len(sentence) for sentence in sentences]) 137 | word_for = [] 138 | word_pos = [] 139 | for words in sentences: 140 | padding = [0] * (max_length - len(words)) 141 | word_for.append(words + padding) 142 | word_pos.append(len(words) - 1) 143 | return word_for, word_pos 144 | 145 | 146 | def create_input(data, parameters, add_label, singletons=None): 147 | """ 148 | Take sentence data and return an input for 149 | the training or the evaluation function. 150 | """ 151 | words = data['words'] 152 | chars = data['chars'] 153 | if singletons is not None: 154 | words = insert_singletons(words, singletons) 155 | if parameters['cap_dim']: 156 | caps = data['caps'] 157 | char_for, char_rev, char_pos = pad_word_chars(chars) 158 | input = [] 159 | if parameters['word_dim']: 160 | input.append(words) 161 | if parameters['char_dim']: 162 | input.append(char_for) 163 | if parameters['char_bidirect']: 164 | input.append(char_rev) 165 | input.append(char_pos) 166 | if parameters['cap_dim']: 167 | input.append(caps) 168 | if add_label: 169 | input.append(data['tags']) 170 | return input 171 | 172 | 173 | def create_input_(data, n_tags, parameters, add_label, singletons=None): 174 | """ 175 | Take sentence data and return an input for 176 | the training or the evaluation function. 177 | """ 178 | words = data['words'] 179 | chars = data['chars'] 180 | if singletons is not None: 181 | words = insert_singletons(words, singletons) 182 | char_for, char_rev, char_pos = pad_word_chars(chars) 183 | input = [] 184 | input.append(words) 185 | input.append(char_for) 186 | input.append(char_rev) 187 | char_pos_array = [] 188 | for i in xrange(len(char_pos)): 189 | temp = [] 190 | temp.append(i) 191 | temp.append(char_pos[i]) 192 | char_pos_array.append(temp) 193 | input.append(char_pos_array) 194 | if add_label: 195 | input.append(data['tags']) 196 | tag_id_trans_array = [] 197 | tag_len = len(data['tags']) 198 | for i in xrange(tag_len): 199 | temp = [] 200 | if i == 0: 201 | temp.append(n_tags) 202 | temp.append(data['tags'][i]) 203 | tag_id_trans_array.append(temp) 204 | temp = [] 205 | if i < (tag_len - 1): 206 | temp.append(data['tags'][i]) 207 | temp.append(data['tags'][i+1]) 208 | tag_id_trans_array.append(temp) 209 | else: 210 | temp.append(data['tags'][i]) 211 | temp.append(n_tags+1) 212 | tag_id_trans_array.append(temp) 213 | input.append(tag_id_trans_array) 214 | tag_index_array = [] 215 | for i in xrange(tag_len): 216 | temp = [] 217 | temp.append(i) 218 | temp.append(data['tags'][i]) 219 | tag_index_array.append(temp) 220 | input.append(tag_index_array) 221 | return input 222 | 223 | 224 | def create_input_batch(sentences, parameters, n_tags=0, add_label=False, singletons=None): 225 | """ 226 | Take batch_sentence data and return a batch_input for 227 | the training or the evaluation function. 228 | """ 229 | batch_size = len(sentences) 230 | max_length = max([len(sentence['words']) for sentence in sentences]) 231 | #print 'max_length..................................' 
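# The returned list is ordered as [word ids, word positions], then
# [char_for ids, char_rev ids, char positions] when parameters['char_dim'] is set,
# then [tag ids, tag transition pairs, tag index pairs] when add_label is True,
# matching the feed_dict construction in train.py and tagger.py.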
232 | #print max_length 233 | input = [] 234 | words_batch = [] 235 | chars_batch = [] 236 | chars_max_batch = [] 237 | input_tag = [] 238 | input_tag_id_trans = [] 239 | input_tag_id_index = [] 240 | for k in xrange(batch_size): 241 | data = sentences[k] 242 | words = data['words'] 243 | words_len = len(words) 244 | #print 'words........................' 245 | #print words 246 | if singletons is not None: 247 | words = insert_singletons(words, singletons) 248 | words_batch.append(words) 249 | if parameters['char_dim']: 250 | chars = data['chars'] 251 | max_word_length = max([len(word) for word in chars]) 252 | chars_max_batch.append(max_word_length) 253 | if len(chars) > words_len: 254 | chars = chars[0:words_len] 255 | assert len(chars) == words_len 256 | if len(chars) < max_length: 257 | padding = [[0]] * (max_length - len(chars)) 258 | chars.extend(padding) 259 | chars_batch.append(chars) 260 | if add_label: 261 | #print 'tags............................' 262 | #print data['tags'] 263 | tag_len = len(data['tags']) 264 | if tag_len > words_len: 265 | data['tags'] = data['tags'][0:words_len] 266 | tag_len = len(data['tags']) 267 | assert words_len == tag_len 268 | tag_array = data['tags'] 269 | #print tag_array 270 | if max_length > tag_len: 271 | for i in xrange(max_length - tag_len): 272 | tag_array.append(0) 273 | input_tag.append(tag_array) 274 | tag_id_trans_array = [] 275 | #print 'tag_len.........................' 276 | #print tag_len 277 | for i in xrange(tag_len): 278 | temp = [] 279 | if i == 0: 280 | temp.append(n_tags) 281 | temp.append(data['tags'][i]) 282 | tag_id_trans_array.append(temp) 283 | temp = [] 284 | if i < (tag_len - 1): 285 | temp.append(data['tags'][i]) 286 | temp.append(data['tags'][i+1]) 287 | tag_id_trans_array.append(temp) 288 | else: 289 | temp.append(data['tags'][i]) 290 | temp.append(n_tags+1) 291 | tag_id_trans_array.append(temp) 292 | if max_length > tag_len: 293 | for i in xrange(max_length - tag_len): 294 | temp = [] 295 | if i == 0: 296 | temp.append(n_tags+1) 297 | temp.append(0) 298 | tag_id_trans_array.append(temp) 299 | else: 300 | temp.append(0) 301 | temp.append(0) 302 | tag_id_trans_array.append(temp) 303 | input_tag_id_trans.append(tag_id_trans_array) 304 | tag_index_array = [] 305 | for i in xrange(tag_len): 306 | temp = [] 307 | temp.append(i) 308 | temp.append(data['tags'][i]) 309 | tag_index_array.append(temp) 310 | if max_length > tag_len: 311 | for i in xrange(max_length - tag_len): 312 | temp = [] 313 | temp.append(i+tag_len) 314 | temp.append(0) 315 | tag_index_array.append(temp) 316 | input_tag_id_index.append(tag_index_array) 317 | # words 318 | words_for, words_pos = pad_sentence_words(words_batch) 319 | input.append(words_for) 320 | input.append(words_pos) 321 | input_char_for = [] 322 | input_char_rev = [] 323 | if parameters['char_dim']: 324 | chars_max_batch_all = max(chars_max_batch) 325 | count = 0 326 | char_pos_array = [] 327 | for i in xrange(len(chars_batch)): 328 | chars = chars_batch[i] 329 | char_for, char_rev, char_pos = pad_word_chars(chars, chars_max_batch_all) 330 | input_char_for.append(char_for) 331 | input_char_rev.append(char_rev) 332 | for pos in char_pos: 333 | temp = [] 334 | temp.append(count) 335 | temp.append(pos) 336 | char_pos_array.append(temp) 337 | count += 1 338 | input.append(input_char_for) 339 | input.append(input_char_rev) 340 | input.append(char_pos_array) 341 | if add_label: 342 | batch_input_tag_id_trans = np.vstack([np.expand_dims(x, 0) for x in input_tag_id_trans]) 343 | 
batch_input_tag_id_index = np.vstack([np.expand_dims(x, 0) for x in input_tag_id_index]) 344 | input.append(input_tag) 345 | input.append(batch_input_tag_id_trans) 346 | input.append(batch_input_tag_id_index) 347 | return input 348 | 349 | 350 | def evaluate(sess, f_eval, model, parameters, parsed_sentences, n_tags=0): 351 | count = 0 352 | token_accus_all = [] 353 | sentence_accus_all = [] 354 | batch_size = parameters['batch_size'] 355 | start_time = datetime.datetime.now() 356 | token_count = 0.0 357 | while count < len(parsed_sentences): 358 | batch_data = [] 359 | for i in xrange(batch_size): 360 | index = i + count 361 | if index >= len(parsed_sentences): 362 | index %= len(parsed_sentences) 363 | data = parsed_sentences[index] 364 | batch_data.append(parsed_sentences[index]) 365 | input_ = create_input_batch(batch_data, parameters, n_tags, True) 366 | feed_dict_ = {} 367 | if parameters['char_dim']: 368 | feed_dict_[model.word_ids] = input_[0] 369 | feed_dict_[model.word_pos_ids] = input_[1] 370 | feed_dict_[model.char_for_ids] = input_[2] 371 | feed_dict_[model.char_rev_ids] = input_[3] 372 | feed_dict_[model.char_pos_ids] = input_[4] 373 | input_tag = input_[5] 374 | else: 375 | feed_dict_[model.word_ids] = input_[0] 376 | feed_dict_[model.word_pos_ids] = input_[1] 377 | input_tag = input_[2] 378 | f_scores = sess.run(f_eval, feed_dict=feed_dict_) 379 | accus_batch = [] 380 | sentence_batch = [] 381 | if parameters['crf']: 382 | for x in xrange(len(batch_data)): 383 | f_score = f_scores[x] 384 | word_pos = input_[1][x] + 2 385 | y_pred = f_score[1:word_pos] 386 | y_real = input_tag[x][0:(word_pos-1)] 387 | correct_prediction = np.equal(y_pred, y_real) 388 | accus = np.array(correct_prediction).astype(float).sum() 389 | accus_mean = np.array(correct_prediction).astype(float).mean() 390 | accus_batch.append(accus) 391 | if accus_mean < 1.0: 392 | sentence_batch.append(0.0) 393 | else: 394 | sentence_batch.append(1.0) 395 | token_count += (input_[1][x] + 1) 396 | accus_val = accus_batch 397 | sentence_val = np.array(sentence_batch).astype(float).mean() 398 | else: 399 | y_preds = f_scores.argmax(axis=-1) 400 | y_reals = np.array(input_tag).astype(np.int32) 401 | for x in xrange(batch_size): 402 | word_pos = input_[1][x] + 1 403 | y_pred = y_preds[x][0:word_pos] 404 | y_real = y_reals[x][0:word_pos] 405 | correct_prediction = np.equal(y_pred, y_real) 406 | accus = np.array(correct_prediction).astype(float).sum() 407 | accus_mean = np.array(correct_prediction).astype(float).mean() 408 | accus_batch.append(accus) 409 | if accus_mean < 1.0: 410 | sentence_batch.append(0.0) 411 | else: 412 | sentence_batch.append(1.0) 413 | token_count += word_pos 414 | accus_val = accus_batch 415 | sentence_val = np.array(sentence_batch).astype(float).mean() 416 | count += batch_size 417 | token_accus_all.extend(accus_val) 418 | sentence_accus_all.append(sentence_val) 419 | token_accuracy = np.sum(token_accus_all) / (token_count + 0.000001) 420 | sentence_accuracy = np.mean(sentence_accus_all) 421 | end_time = datetime.datetime.now() 422 | cost_time = (end_time - start_time).seconds 423 | print "token accuracy: %f, sentence accuracy: %f, cost time: %i" % (token_accuracy, sentence_accuracy, cost_time) 424 | return token_accuracy, sentence_accuracy 425 | --------------------------------------------------------------------------------
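
The (from_tag, to_tag) and (position, tag) index pairs that create_input_ and create_input_batch attach to each labelled sentence are easiest to see on a toy example. A minimal sketch, assuming only that utils.py is importable; the word, char, and tag ids below are made up for illustration:

```
from utils import create_input_

# A made-up 3-token sentence: word ids, per-word character ids, gold tag ids.
data = {
    'words': [5, 9, 2],
    'chars': [[3, 1], [4, 4], [7]],
    'tags':  [2, 0, 1],
}
n_tags = 3  # size of the tag inventory

inputs = create_input_(data, n_tags, {}, add_label=True)  # parameters dict is unused here

print inputs[4]  # gold tags:                 [2, 0, 1]
print inputs[5]  # (from_tag, to_tag) pairs:  [[3, 2], [2, 0], [0, 1], [1, 4]]
print inputs[6]  # (position, tag) pairs:     [[0, 2], [1, 0], [2, 1]]
```

Index 3 (= n_tags) plays the role of the artificial start tag and index 4 (= n_tags + 1) the stop tag, so the pairs enumerate the transitions start->2, 2->0, 0->1, 1->stop, which the CRF layer can use to look up the transition scores of the gold path.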