├── config.conf ├── sequence_labeling_evaluator.py ├── crf.py ├── punctuator.py ├── recurrence.py ├── error_calculator.py ├── README.md ├── punctuation_data_converter.py ├── sequence_labeler.py ├── conlleval.py └── sequence_labeling_experiment.py /config.conf: -------------------------------------------------------------------------------- 1 | [config] 2 | path_train = ./data/train 3 | path_dev = ./data/dev 4 | path_test = ./data/test 5 | main_label = .PERIOD 6 | conll_eval = False 7 | preload_vectors = /home/ottokar/old/large_files/GoogleNews-vectors-negative300.txt 8 | word_embedding_size = 300 9 | char_embedding_size = 50 10 | word_recurrent_size = 200 11 | char_recurrent_size = 200 12 | narrow_layer_size = 50 13 | best_model_selector = dev_f:high 14 | epochs = 20 15 | stop_if_no_improvement_for_epochs = 7 16 | learningrate = 1.0 17 | min_word_freq = 2 18 | max_batch_size = 64 19 | save = punctuator.model 20 | load = 21 | random_seed = 1001 22 | crf_on_top = True 23 | char_integration_method = attention 24 | -------------------------------------------------------------------------------- /sequence_labeling_evaluator.py: -------------------------------------------------------------------------------- 1 | import time 2 | import collections 3 | import numpy 4 | 5 | import conlleval 6 | 7 | class SequenceLabelingEvaluator(object): 8 | def __init__(self, main_label_id, label2id=None, conll_eval=False): 9 | self.main_label_id = main_label_id 10 | self.label2id = label2id 11 | self.conll_eval = conll_eval 12 | 13 | self.cost_sum = 0.0 14 | self.correct_sum = 0.0 15 | self.main_predicted_count = 0 16 | self.main_total_count = 0 17 | self.main_correct_count = 0 18 | self.token_count = 0 19 | self.start_time = time.time() 20 | 21 | if self.label2id is not None: 22 | self.id2label = collections.OrderedDict() 23 | for label in self.label2id: 24 | self.id2label[self.label2id[label]] = label 25 | 26 | self.conll_format = [] 27 | 28 | def append_data(self, cost, predicted_labels, word_ids, label_ids): 29 | self.cost_sum += cost 30 | self.token_count += label_ids.size 31 | self.correct_sum += numpy.equal(predicted_labels, label_ids).sum() 32 | self.main_predicted_count += (predicted_labels == self.main_label_id).sum() 33 | self.main_total_count += (label_ids == self.main_label_id).sum() 34 | self.main_correct_count += ((predicted_labels == self.main_label_id)*(label_ids == self.main_label_id)).sum() 35 | 36 | for i in range(word_ids.shape[0]): 37 | for j in range(word_ids.shape[1]-2): 38 | try: 39 | self.conll_format.append(str(word_ids[i][j+1]) + "\t" + str(self.id2label[label_ids[i][j]]) + "\t" + str(self.id2label[predicted_labels[i][j]])) 40 | except KeyError: 41 | print("Unexpected label id in predictions.") # Probably means the CRF decided to predict a start/end label, which it shouldn't 42 | self.conll_format.append("") 43 | 44 | 45 | def get_results(self, name): 46 | p = (float(self.main_correct_count) / float(self.main_predicted_count)) if (self.main_predicted_count > 0) else 0.0 47 | r = (float(self.main_correct_count) / float(self.main_total_count)) if (self.main_total_count > 0) else 0.0 48 | f = (2.0 * p * r / (p + r)) if (p+r > 0.0) else 0.0 49 | f05 = ((1.0 + 0.5*0.5) * p * r / ((0.5*0.5 * p) + r)) if (p+r > 0.0) else 0.0 50 | 51 | results = collections.OrderedDict() 52 | results[name + "_cost_avg"] = self.cost_sum / float(self.token_count) 53 | results[name + "_cost_sum"] = self.cost_sum 54 | results[name + "_main_predicted_count"] = self.main_predicted_count 55 | results[name + 
"_main_total_count"] = self.main_total_count 56 | results[name + "_main_correct_count"] = self.main_correct_count 57 | results[name + "_p"] = p 58 | results[name + "_r"] = r 59 | results[name + "_f"] = f 60 | results[name + "_f05"] = f05 61 | results[name + "_accuracy"] = self.correct_sum / float(self.token_count) 62 | results[name + "_token_count"] = self.token_count 63 | results[name + "_time"] = float(time.time()) - float(self.start_time) 64 | 65 | if self.label2id is not None and self.conll_eval == True: 66 | conll_counts = conlleval.evaluate(self.conll_format) 67 | conll_metrics_overall, conll_metrics_by_type = conlleval.metrics(conll_counts) 68 | results[name + "_conll_accuracy"] = float(conll_counts.correct_tags) / float(conll_counts.token_counter) 69 | results[name + "_conll_p"] = conll_metrics_overall.prec 70 | results[name + "_conll_r"] = conll_metrics_overall.rec 71 | results[name + "_conll_f"] = conll_metrics_overall.fscore 72 | # for i, m in sorted(conll_metrics_by_type.items()): 73 | # results[name + "_conll_p_" + str(i)] = m.prec 74 | # results[name + "_conll_r_" + str(i)] = m.rec 75 | # results[name + "_conll_f_" + str(i)] = m.fscore #str(m.fscore) + " " + str(conll_counts.t_found_guessed[i]) 76 | 77 | return results 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /crf.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import numpy 3 | 4 | # CRF implementation based on Lample et al. 5 | # "Neural Architectures for Named Entity Recognition" 6 | 7 | floatX=theano.config.floatX 8 | 9 | def log_sum(x, axis=None): 10 | x_max_value = x.max(axis=axis) 11 | x_max_tensor = x.max(axis=axis, keepdims=True) 12 | return x_max_value + theano.tensor.log(theano.tensor.exp(x - x_max_tensor).sum(axis=axis)) 13 | 14 | 15 | def forward(observation_weights, transition_weights, return_best_sequence=False): 16 | def recurrence(observation_weights, previous_scores, transition_weights): 17 | previous_scores = previous_scores.dimshuffle(0, 1, 'x') 18 | observation_weights = observation_weights.dimshuffle(0, 'x', 1) 19 | scores = previous_scores + observation_weights + transition_weights.dimshuffle('x', 0, 1) 20 | if return_best_sequence: 21 | best_scores = scores.max(axis=1) 22 | best_states = scores.argmax(axis=1) 23 | return best_scores, best_states 24 | else: 25 | return log_sum(scores, axis=1) 26 | 27 | initial = observation_weights[0] 28 | crf_states, _ = theano.scan( 29 | fn=recurrence, 30 | outputs_info=(initial, None) if return_best_sequence else initial, 31 | sequences=[observation_weights[1:],], 32 | non_sequences=transition_weights 33 | ) 34 | 35 | if return_best_sequence: 36 | sequence, _ = theano.scan( 37 | fn=lambda beta_i, previous: beta_i[theano.tensor.arange(previous.shape[0]), previous], 38 | outputs_info=theano.tensor.cast(theano.tensor.argmax(crf_states[0][-1], axis=1), 'int32'), 39 | sequences=theano.tensor.cast(crf_states[1][::-1], 'int32') 40 | ) 41 | sequence = theano.tensor.concatenate([sequence[::-1], [theano.tensor.argmax(crf_states[0][-1], axis=1)]]) 42 | return sequence, crf_states[0] 43 | else: 44 | return log_sum(crf_states[-1], axis=1) 45 | 46 | 47 | def construct(name, input_tensor, n_labels, gold_labels, fn_create_parameter_matrix): 48 | transition_weights = fn_create_parameter_matrix(name + "_crf_transition_weights", (n_labels + 2, n_labels + 2)) 49 | 50 | small = -1000.0 51 | padding_start = theano.tensor.zeros((input_tensor.shape[0], 1, n_labels + 2)) + 
small 52 | padding_start = theano.tensor.set_subtensor(padding_start[:,:,-2], 0.0) 53 | padding_end = theano.tensor.zeros((input_tensor.shape[0], 1, n_labels + 2)) + small 54 | padding_end = theano.tensor.set_subtensor(padding_end[:,:,-1], 0.0) 55 | observation_weights = theano.tensor.concatenate([input_tensor, theano.tensor.zeros((input_tensor.shape[0], input_tensor.shape[1], 2)) + small], axis=2) 56 | observation_weights = theano.tensor.concatenate([padding_start, observation_weights, padding_end], axis=1) 57 | observation_weights = observation_weights.dimshuffle(1,0,2) # reordering the tensor (words, sentences, labels) 58 | 59 | # Score from tags 60 | real_paths_scores = input_tensor[theano.tensor.arange(input_tensor.shape[0])[:, numpy.newaxis], theano.tensor.arange(input_tensor.shape[1]), gold_labels].sum(axis=1) 61 | 62 | # Score from transition_weights 63 | padding_id_start = theano.tensor.zeros((gold_labels.shape[0], 1), dtype=numpy.int32) + n_labels 64 | padding_id_end = theano.tensor.zeros((gold_labels.shape[0], 1), dtype=numpy.int32) + n_labels + 1 65 | padded_gold_labels = theano.tensor.concatenate([padding_id_start, gold_labels, padding_id_end], axis=1) 66 | real_paths_scores += transition_weights[ 67 | padded_gold_labels[theano.tensor.arange(gold_labels.shape[0])[:, numpy.newaxis], theano.tensor.arange(gold_labels.shape[1] + 1)], 68 | padded_gold_labels[theano.tensor.arange(gold_labels.shape[0])[:, numpy.newaxis], theano.tensor.arange(gold_labels.shape[1] + 1) + 1] 69 | ].sum(axis=1) 70 | 71 | all_paths_scores = forward(observation_weights, transition_weights) 72 | 73 | best_sequence, scores = forward(observation_weights, transition_weights, return_best_sequence=True) 74 | 75 | scores = scores.dimshuffle(1,0,2)[:,:-1,:-2] 76 | best_sequence = best_sequence.dimshuffle(1,0)[:,1:-1] 77 | 78 | return all_paths_scores, real_paths_scores, best_sequence, scores 79 | 80 | -------------------------------------------------------------------------------- /punctuator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy 3 | 4 | from collections import OrderedDict 5 | from sequence_labeler import SequenceLabeler 6 | from sequence_labeling_experiment import read_dataset, create_batches, parse_config, map_text_to_ids 7 | from punctuation_data_converter import EOS_TOKENS, SPACE, MAX_SEQUENCE_LEN 8 | 9 | def last_index_of(array, element): 10 | try: 11 | return len(array) -1 - array[::-1].index(element) 12 | except: 13 | return 0 14 | 15 | def up_to_last_instance_of(array, elements): 16 | idx = max(last_index_of(array, element) for element in elements) 17 | if idx == 0: 18 | return array 19 | else: 20 | return array[:idx + 1] 21 | 22 | def reverse_mapping(d): 23 | return OrderedDict([(v,k) for (k,v) in d.items()]) 24 | 25 | def convert_to_batch(word_sequence, lowercase_words, lowercase_chars, replace_digits, word2id, char2id): 26 | raw_word_ids = map_text_to_ids(" ".join(word_sequence), word2id, "", "", "", lowercase=lowercase_words, replace_digits=replace_digits) 27 | raw_char_ids = [map_text_to_ids("", char2id, "", "", "")] + \ 28 | [map_text_to_ids(" ".join(list(word)), char2id, "", "", "", lowercase=lowercase_chars, replace_digits=replace_digits) for word in word_sequence] + \ 29 | [map_text_to_ids("", char2id, "", "", "")] 30 | 31 | assert(len(raw_char_ids) == len(raw_word_ids)) 32 | 33 | # Mask and convert to numpy array 34 | batch_size = 1 35 | seq_len = len(raw_word_ids) 36 | 37 | max_word_length = numpy.array([len(c) for c 
in raw_char_ids]).max() 38 | 39 | word_ids = numpy.zeros((batch_size, seq_len), dtype=numpy.int32) 40 | char_ids = numpy.zeros((batch_size, seq_len, max_word_length), dtype=numpy.int32) 41 | char_mask = numpy.zeros((batch_size, seq_len, max_word_length), dtype=numpy.int32) 42 | 43 | for i in range(batch_size): 44 | for j in range(seq_len): 45 | word_ids[i][j] = raw_word_ids[j] 46 | for j in range(seq_len): 47 | for k in range(len(raw_char_ids[j])): 48 | char_ids[i][j][k] = raw_char_ids[j][k] 49 | char_mask[i][j][k] = 1 50 | 51 | return word_ids, char_ids, char_mask 52 | 53 | def punctuate(config_path): 54 | config = parse_config("config", config_path) 55 | if config["path_test"] is None: 56 | print("No test data configured") 57 | return 58 | 59 | sequencelabeler = SequenceLabeler.load(config["save"]) 60 | label2id = sequencelabeler.config["label2id"] 61 | word2id = sequencelabeler.config["word2id"] 62 | char2id = sequencelabeler.config["char2id"] 63 | 64 | config["word2id"] = word2id 65 | config["char2id"] = char2id 66 | config["label2id"] = label2id 67 | 68 | id2label = reverse_mapping(label2id) 69 | eos_labels = [label2id[l] for l in EOS_TOKENS if l in label2id] 70 | space_id = label2id[SPACE] 71 | 72 | all_predicted_labels = [] 73 | 74 | for path_test in config["path_test"].strip().split(":"): 75 | 76 | with open(path_test + '.orig', 'r') as f: 77 | all_words = [w for w in f.read().split() if w not in label2id] 78 | 79 | last_eos_idx = 0 80 | 81 | while True: 82 | word_sequence = all_words[last_eos_idx:last_eos_idx+MAX_SEQUENCE_LEN] 83 | if len(word_sequence) == 0: 84 | break 85 | word_ids, char_ids, char_mask = convert_to_batch(word_sequence, False, False, True, word2id, char2id) 86 | predicted_labels = sequencelabeler.predict(word_ids, char_ids, char_mask) 87 | predicted_labels = up_to_last_instance_of(list(predicted_labels.flatten()), eos_labels) 88 | if len(predicted_labels) == 0: 89 | break 90 | all_predicted_labels += predicted_labels 91 | last_eos_idx += len(predicted_labels) 92 | 93 | with open(path_test + '.pred', 'w') as f: 94 | for w, l_id in zip(all_words, all_predicted_labels): 95 | f.write('%s %s ' % (w, '' if l_id == space_id else id2label[l_id])) 96 | 97 | if __name__ == "__main__": 98 | punctuate(sys.argv[1]) 99 | 100 | -------------------------------------------------------------------------------- /recurrence.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import numpy 3 | 4 | floatX=theano.config.floatX 5 | 6 | def create_birnn(input_tensor, input_size, mask, recurrent_size, return_combined, fn_create_parameter_matrix, name): 7 | rnn_mask = mask.dimshuffle(1,0) if (mask is not None) else None 8 | recurrent_forward = create_lstm(input_tensor.dimshuffle(1,0,2), input_size, rnn_mask, 9 | recurrent_size, only_return_final=return_combined, go_backwards=False, fn_create_parameter_matrix=fn_create_parameter_matrix, name=name + "_forward") 10 | recurrent_backward = create_lstm(input_tensor.dimshuffle(1,0,2), input_size, rnn_mask, 11 | recurrent_size, only_return_final=return_combined, go_backwards=True, fn_create_parameter_matrix=fn_create_parameter_matrix, name=name + "_backward") 12 | if return_combined == True: 13 | return theano.tensor.concatenate([recurrent_forward, recurrent_backward], axis=1) 14 | else: 15 | return theano.tensor.concatenate([recurrent_forward.dimshuffle(1,0,2), recurrent_backward.dimshuffle(1,0,2)], axis=2) 16 | 17 | 18 | def create_lstm(input_tensor, input_size, mask, recurrent_size, 
only_return_final, go_backwards, fn_create_parameter_matrix, name): 19 | # LSTM. Following Graves et al. 20 | # "Hybrid speech recognition with deep bidirectional LSTM" 21 | def lstm_step(x, h_prev, c_prev, W_x, W_h, b, W_ci, W_cf, W_co): 22 | m_xhb = theano.tensor.dot(x, W_x) + theano.tensor.dot(h_prev, W_h) + b 23 | i = theano.tensor.nnet.sigmoid(_slice(m_xhb, 0, 4) + c_prev * W_ci) 24 | f = theano.tensor.nnet.sigmoid(_slice(m_xhb, 1, 4) + c_prev * W_cf) 25 | c = f * c_prev + i * theano.tensor.tanh(_slice(m_xhb, 2, 4)) 26 | o = theano.tensor.nnet.sigmoid(_slice(m_xhb, 3, 4) + c * W_co) 27 | h = o * theano.tensor.tanh(c) 28 | return h, c 29 | 30 | def lstm_mask_step(x, mask, h_prev, c_prev, W_x, W_h, b, W_ci, W_cf, W_co): 31 | h_new, c_new = lstm_step(x, h_prev, c_prev, W_x, W_h, b, W_ci, W_cf, W_co) 32 | h = theano.tensor.switch(mask, h_new, h_prev) 33 | c = theano.tensor.switch(mask, c_new, c_prev) 34 | return h, c 35 | 36 | def _slice(M, slice_num, total_slices): 37 | if M.ndim == 3: 38 | l = M.shape[2] / total_slices 39 | return M[:, :, slice_num*l:(slice_num+1)*l] 40 | elif M.ndim == 2: 41 | l = M.shape[1] / total_slices 42 | return M[:, slice_num*l:(slice_num+1)*l] 43 | elif M.ndim == 1: 44 | l = M.shape[0] / total_slices 45 | return M[slice_num*l:(slice_num+1)*l] 46 | 47 | h_initial = theano.tensor.alloc(numpy.array(0, dtype=floatX), input_tensor.shape[1], recurrent_size) 48 | c_initial = theano.tensor.alloc(numpy.array(0, dtype=floatX), input_tensor.shape[1], recurrent_size) 49 | 50 | if mask is not None: 51 | mask = mask.dimshuffle(0, 1, 'x') 52 | fn_step = locals()["lstm_mask_step"] 53 | sequences = [input_tensor, mask] 54 | else: 55 | fn_step = locals()["lstm_step"] 56 | sequences = input_tensor 57 | 58 | W_x = fn_create_parameter_matrix('W_x_'+name, (input_size, recurrent_size*4)) 59 | W_h = fn_create_parameter_matrix('W_h_'+name, (recurrent_size, recurrent_size*4)) 60 | b = fn_create_parameter_matrix('b_'+name, (recurrent_size*4,)) 61 | W_ci = fn_create_parameter_matrix('W_ci_'+name, (recurrent_size,)) 62 | W_cf = fn_create_parameter_matrix('W_cf_'+name, (recurrent_size,)) 63 | W_co = fn_create_parameter_matrix('W_co_'+name, (recurrent_size,)) 64 | result, _ = theano.scan( 65 | fn_step, 66 | sequences = sequences, 67 | outputs_info = [h_initial, c_initial], 68 | non_sequences = [W_x, W_h, b, W_ci, W_cf, W_co], 69 | go_backwards=go_backwards) 70 | 71 | h = result[0] 72 | if only_return_final == True: 73 | h = h[-1] 74 | else: 75 | if go_backwards == True: 76 | h = h[::-1] 77 | return h 78 | 79 | 80 | 81 | def create_feedforward(input_tensor, input_size, output_size, activation, fn_create_parameter_matrix, name): 82 | weights = fn_create_parameter_matrix('ff_weights_' + name, (input_size, output_size)) 83 | bias = fn_create_parameter_matrix('ff_bias_' + name, (output_size,)) 84 | output = theano.tensor.dot(input_tensor, weights) + bias 85 | if activation == "tanh": 86 | output = theano.tensor.tanh(output) 87 | elif activation == "sigmoid": 88 | output = theano.tensor.nnet.sigmoid(output) 89 | return output 90 | 91 | -------------------------------------------------------------------------------- /error_calculator.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | Computes and prints the overall classification error and precision, recall, F-score over punctuations. 
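Also reports the slot error rate, SER = (S + D + I) / (C + S + D), where C, S, D and I are the counts of correct, substituted, deleted and inserted punctuation slots, as well as an overall error rate over all slots (spaces included).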
5 | """ 6 | 7 | from numpy import nan 8 | import punctuation_data_converter as data 9 | import codecs 10 | import sys 11 | 12 | MAPPING = {}#{"!EXCLAMATIONMARK": ".PERIOD", "?QUESTIONMARK": ".PERIOD", ":COLON": ".PERIOD", ";SEMICOLON": ".PERIOD"} # Can be used to estimate 2-class performance for example 13 | 14 | def compute_error(target_paths, predicted_paths): 15 | counter = 0 16 | total_correct = 0 17 | 18 | correct = 0. 19 | substitutions = 0. 20 | deletions = 0. 21 | insertions = 0. 22 | 23 | true_positives = {} 24 | false_positives = {} 25 | false_negatives = {} 26 | 27 | for target_path, predicted_path in zip(target_paths, predicted_paths): 28 | 29 | target_punctuation = " " 30 | predicted_punctuation = " " 31 | 32 | t_i = 0 33 | p_i = 0 34 | 35 | with codecs.open(target_path, 'r', 'utf-8') as target, codecs.open(predicted_path, 'r', 'utf-8') as predicted: 36 | 37 | target_stream = target.read().split() 38 | predicted_stream = predicted.read().split() 39 | 40 | while True: 41 | 42 | if data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) in data.PUNCTUATION_VOCABULARY: 43 | while data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) in data.PUNCTUATION_VOCABULARY: # skip multiple consecutive punctuations 44 | target_punctuation = data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) 45 | target_punctuation = MAPPING.get(target_punctuation, target_punctuation) 46 | t_i += 1 47 | else: 48 | target_punctuation = " " 49 | 50 | if predicted_stream[p_i] in data.PUNCTUATION_VOCABULARY: 51 | predicted_punctuation = MAPPING.get(predicted_stream[p_i], predicted_stream[p_i]) 52 | p_i += 1 53 | else: 54 | predicted_punctuation = " " 55 | 56 | is_correct = target_punctuation == predicted_punctuation 57 | 58 | counter += 1 59 | total_correct += is_correct 60 | 61 | if predicted_punctuation == " " and target_punctuation != " ": 62 | deletions += 1 63 | elif predicted_punctuation != " " and target_punctuation == " ": 64 | insertions += 1 65 | elif predicted_punctuation != " " and target_punctuation != " " and predicted_punctuation == target_punctuation: 66 | correct += 1 67 | elif predicted_punctuation != " " and target_punctuation != " " and predicted_punctuation != target_punctuation: 68 | substitutions += 1 69 | 70 | true_positives[target_punctuation] = true_positives.get(target_punctuation, 0.) + float(is_correct) 71 | false_positives[predicted_punctuation] = false_positives.get(predicted_punctuation, 0.) + float(not is_correct) 72 | false_negatives[target_punctuation] = false_negatives.get(target_punctuation, 0.) + float(not is_correct) 73 | 74 | assert target_stream[t_i] == predicted_stream[p_i] or predicted_stream[p_i] == "", \ 75 | ("File: %s \n" + \ 76 | "Error: %s (%s) != %s (%s) \n" + \ 77 | "Target context: %s \n" + \ 78 | "Predicted context: %s") % \ 79 | (target_path, 80 | target_stream[t_i], t_i, predicted_stream[p_i], p_i, 81 | " ".join(target_stream[t_i-2:t_i+2]), 82 | " ".join(predicted_stream[p_i-2:p_i+2])) 83 | 84 | t_i += 1 85 | p_i += 1 86 | 87 | if t_i >= len(target_stream)-1 and p_i >= len(predicted_stream)-1: 88 | break 89 | 90 | overall_tp = 0.0 91 | overall_fp = 0.0 92 | overall_fn = 0.0 93 | 94 | print "-"*46 95 | print "{:<16} {:<9} {:<9} {:<9}".format('PUNCTUATION','PRECISION','RECALL','F-SCORE') 96 | for p in data.PUNCTUATION_VOCABULARY: 97 | 98 | if p == data.SPACE: 99 | continue 100 | 101 | overall_tp += true_positives.get(p,0.) 102 | overall_fp += false_positives.get(p,0.) 103 | overall_fn += false_negatives.get(p,0.) 
104 | 105 | punctuation = p 106 | precision = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_positives[p])) if p in false_positives else nan 107 | recall = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_negatives[p])) if p in false_negatives else nan 108 | f_score = (2. * precision * recall / (precision + recall)) if (precision + recall) > 0 else nan 109 | print "{:<16} {:<9} {:<9} {:<9}".format(punctuation, round(precision,3)*100, round(recall,3)*100, round(f_score,3)*100) 110 | print "-"*46 111 | pre = overall_tp/(overall_tp+overall_fp) if overall_fp else nan 112 | rec = overall_tp/(overall_tp+overall_fn) if overall_fn else nan 113 | f1 = (2.*pre*rec)/(pre+rec) if (pre + rec) else nan 114 | print "{:<16} {:<9} {:<9} {:<9}".format("Overall", round(pre,3)*100, round(rec,3)*100, round(f1,3)*100) 115 | print "Err: %s%%" % round((100.0 - float(total_correct) / float(counter-1) * 100.0), 2) 116 | print "SER: %s%%" % round((substitutions + deletions + insertions) / (correct + substitutions + deletions) * 100, 1) 117 | 118 | 119 | if __name__ == "__main__": 120 | 121 | if len(sys.argv) > 1: 122 | target_path = sys.argv[1] 123 | else: 124 | sys.exit("Ground truth file path argument missing") 125 | 126 | if len(sys.argv) > 2: 127 | predicted_path = sys.argv[2] 128 | else: 129 | sys.exit("Model predictions file path argument missing") 130 | 131 | compute_error([target_path], [predicted_path]) 132 | 133 | 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A fork from https://github.com/marekrei/sequence-labeler to enable punctuation restoration in unsegmented text. 2 | 3 | ## Performance on English TED talks 4 | (Training set size: 2.1M words) 5 | 6 | PUNCTUATION | PRECISION | RECALL | F-SCORE 7 | --- | --- | --- | --- 8 | ,COMMA | 58.5 | 58.7 | 58.6 9 | ?QUESTIONMARK | 71.4 | 54.3 | 61.7 10 | .PERIOD | 69.9 | 72.0 | 70.9 11 | _Overall_ | _64.3_ | _64.9_ | _64.6_ 12 | 13 | Performance is very similar (even slightly better) to https://github.com/ottokart/punctuator2 although they are not directly comparable as punctuator2 used pretrained embeddings that were trained on much less data and had much smaller size. More details can be found [here](http://www.isca-speech.org/archive/Interspeech_2016/pdfs/1517.PDF). 14 | 15 | Original README: 16 | ========================= 17 | 18 | Sequence labeler 19 | ========================= 20 | 21 | This is a neural network sequence labeling system. Given a sequence of tokens, it will learn to assign labels to each token. Can be used for named entity recognition, POS-tagging, error detection, chunking, CCG supertagging, etc. 22 | 23 | The main model implements a bidirectional LSTM for sequence tagging. In addition, you can incorporate character-level information -- either by concatenating a character-based representation, or by using an attention/gating mechanism for combining it with a word embedding. 24 | 25 | Run with: 26 | 27 | python sequence_labeling_experiment.py config.conf 28 | 29 | Preferably with Theano set up to use CUDA, so the process can run on a GPU. 30 | 31 | Requirements 32 | ------------------------- 33 | 34 | * numpy 35 | * theano 36 | * lasagne 37 | 38 | Configuration 39 | ------------------------- 40 | 41 | Edit the values in config.conf as needed: 42 | 43 | * **path_train** - Path to the training data, in CoNLL tab-separated format. 
One word per line, first column is the word, last column is the label. Empty lines between sentences. 44 | * **path_dev** - Path to the development data, used for choosing the best epoch. 45 | * **path_test** - Path to the test file. Can contain multiple files, colon separated. 46 | * **main_label** - The output label for which precision/recall/F-measure are calculated. 47 | * **conll_eval** - Whether the standard CoNLL NER evaluation should be run. 48 | * **preload_vectors** - Path to the pretrained word embeddings, in word2vec plain text format. If your embeddings are in binary, you can use [convertvec](https://github.com/marekrei/convertvec) to convert them to plain text. 49 | * **word_embedding_size** - Size of the word embeddings used in the model. 50 | * **char_embedding_size** - Size of the character embeddings. 51 | * **word_recurrent_size** - Size of the word-level LSTM hidden layers. 52 | * **char_recurrent_size** - Size of the char-level LSTM hidden layers. 53 | * **narrow_layer_size** - Size of the extra hidden layer on top of the bi-LSTM. 54 | * **best_model_selector** - What is measured on the dev set for model selection: "dev_conll_f:high" for NER and chunking, "dev_acc:high" for POS-tagging, "dev_f05:high" for error detection. 55 | * **epochs** - Maximum number of epochs to run. 56 | * **stop_if_no_improvement_for_epochs** - Training will be stopped if there has been no improvement for n epochs. 57 | * **learningrate** - Learning rate. 58 | * **min_word_freq** - Minimal frequency of words to be included in the vocabulary. Others will be considered OOV. 59 | * **max_batch_size** - Maximum batch size. 60 | * **save** - Path to save the model. 61 | * **load** - Path to load the model. 62 | * **random_seed** - Random seed for initialisation and data shuffling. This can affect results, so for robust conclusions I recommend running multiple experiments with different seeds and averaging the metrics. 63 | * **crf_on_top** - If True, use a CRF as the output layer. If False, use softmax instead. 64 | * **char_integration_method** - How character information is integrated. Options are: "none" (not integrated), "input" (concatenated), "attention" (the method proposed in Rei et al. (2016)). 65 | 66 | 67 | References 68 | ------------------------- 69 | 70 | If you use the main sequence labeling code, please reference: 71 | 72 | [**Compositional Sequence Labeling Models for Error Detection in Learner Writing**](http://aclweb.org/anthology/P/P16/P16-1112.pdf) 73 | Marek Rei and Helen Yannakoudakis 74 | *In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL-2016)* 75 | 76 | 77 | If you use the character-level attention component, please reference: 78 | 79 | [**Attending to characters in neural sequence labeling models**](https://aclweb.org/anthology/C/C16/C16-1030.pdf) 80 | Marek Rei, Sampo Pyysalo and Gamal K.O. 
Crichton 81 | *In Proceedings of the 26th International Conference on Computational Linguistics (COLING-2016)* 82 | 83 | 84 | The CRF implementation is based on: 85 | 86 | [**Neural Architectures for Named Entity Recognition**](https://arxiv.org/abs/1603.01360) 87 | Guillaume Lample, Miguel Ballesteros, Sandeep Subramanian, Kazuya Kawakami and Chris Dyer 88 | *In Proceedings of NAACL-HLT 2016* 89 | 90 | 91 | The conlleval.py script is from: https://github.com/spyysalo/conlleval.py 92 | 93 | 94 | License 95 | --------------------------- 96 | 97 | MIT License 98 | 99 | Copyright (c) 2016 Marek Rei 100 | 101 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 102 | 103 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 104 | 105 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 106 | -------------------------------------------------------------------------------- /punctuation_data_converter.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import division 3 | 4 | import random 5 | import os 6 | import sys 7 | import operator 8 | import cPickle 9 | import codecs 10 | import fnmatch 11 | 12 | DATA_PATH = "./data" 13 | 14 | END = "" 15 | UNK = "" 16 | 17 | SPACE = "_SPACE" 18 | 19 | MAX_WORD_VOCABULARY_SIZE = 100000 20 | MIN_WORD_COUNT_IN_VOCAB = 2 21 | MAX_SEQUENCE_LEN = 50 22 | 23 | TRAIN_FILE = os.path.join(DATA_PATH, "train") 24 | DEV_FILE = os.path.join(DATA_PATH, "dev") 25 | TEST_FILE = os.path.join(DATA_PATH, "test") 26 | 27 | PUNCTUATION_VOCABULARY = {SPACE, ",COMMA", ".PERIOD", "?QUESTIONMARK", "!EXCLAMATIONMARK", ":COLON", ";SEMICOLON", "-DASH"} 28 | PUNCTUATION_MAPPING = {} 29 | 30 | # Comma, period & question mark only: 31 | # PUNCTUATION_VOCABULARY = {SPACE, ",COMMA", ".PERIOD", "?QUESTIONMARK"} 32 | # PUNCTUATION_MAPPING = {"!EXCLAMATIONMARK": ".PERIOD", ":COLON": ",COMMA", ";SEMICOLON": ".PERIOD", "-DASH": ",COMMA"} 33 | 34 | EOS_TOKENS = {".PERIOD", "?QUESTIONMARK", "!EXCLAMATIONMARK"} 35 | CRAP_TOKENS = {"", ""} # punctuations that are not included in vocabulary nor mapping, must be added to CRAP_TOKENS 36 | 37 | def write_processed_dataset(input_files, output_file): 38 | """ 39 | data will consist of two sets of aligned subsequences (words and punctuations) of MAX_SEQUENCE_LEN tokens (actually punctuation sequence will be 1 element shorter). 
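Each output line pairs a word with the punctuation token that follows it in the source text (_SPACE when nothing follows); e.g. a fragment like "so we went .PERIOD" yields lines such as "so<TAB>_SPACE", "we<TAB>_SPACE", "went<TAB>.PERIOD", and subsequences are separated by an empty line, matching the CoNLL-style input that read_input_files() in sequence_labeling_experiment.py expects.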
40 | If a sentence is cut, then it will be added to next subsequence entirely (words before the cut belong to both sequences) 41 | """ 42 | 43 | current_words = [] 44 | current_punctuations = [] 45 | 46 | last_eos_idx = 0 # if it's still 0 when MAX_SEQUENCE_LEN is reached, then the sentence is too long and skipped. 47 | last_token_was_punctuation = True # skipt first token if it's punctuation 48 | 49 | skip_until_eos = False # if a sentence does not fit into subsequence, then we need to skip tokens until we find a new sentence 50 | 51 | for input_file in input_files: 52 | 53 | with codecs.open(input_file, 'r', 'utf-8') as text, \ 54 | codecs.open(output_file, 'w', 'utf-8') as text_out: 55 | 56 | for line in text: 57 | 58 | for token in line.split(): 59 | 60 | # First map oov punctuations to known punctuations 61 | if token in PUNCTUATION_MAPPING: 62 | token = PUNCTUATION_MAPPING[token] 63 | 64 | if skip_until_eos: 65 | 66 | if token in EOS_TOKENS: 67 | skip_until_eos = False 68 | 69 | continue 70 | 71 | elif token in CRAP_TOKENS: 72 | continue 73 | 74 | elif token in PUNCTUATION_VOCABULARY: 75 | 76 | if last_token_was_punctuation: # if we encounter sequences like: "... !EXLAMATIONMARK .PERIOD ...", then we only use the first punctuation and skip the ones that follow 77 | continue 78 | 79 | if token in EOS_TOKENS: 80 | last_eos_idx = len(current_punctuations) # no -1, because the token is not added yet 81 | 82 | punctuation = token 83 | 84 | current_punctuations.append(punctuation) 85 | last_token_was_punctuation = True 86 | 87 | else: 88 | 89 | if not last_token_was_punctuation: 90 | current_punctuations.append(SPACE) 91 | 92 | word = token 93 | 94 | current_words.append(word) 95 | last_token_was_punctuation = False 96 | 97 | if len(current_words) == MAX_SEQUENCE_LEN: # this also means, that last token was a word 98 | 99 | assert len(current_words) == len(current_punctuations) + 1, "#words: %d; #punctuations: %d" % (len(current_words), len(current_punctuations)) 100 | 101 | # Sentence did not fit into subsequence - skip it 102 | if last_eos_idx == 0: 103 | skip_until_eos = True 104 | 105 | current_words = [] 106 | current_punctuations = [] 107 | 108 | last_token_was_punctuation = True # next sequence starts with a new sentence, so is preceded by eos which is punctuation 109 | 110 | else: 111 | 112 | for w, p in zip(current_words, current_punctuations): 113 | text_out.write('%s\t%s\n' % (w, p)) 114 | text_out.write('\n') 115 | 116 | # Carry unfinished sentence to next subsequence 117 | current_words = current_words[last_eos_idx+1:] 118 | current_punctuations = current_punctuations[last_eos_idx+1:] 119 | 120 | last_eos_idx = 0 # sequence always starts with a new sentence 121 | 122 | def create_dev_test_train_split(root_path, train_output, dev_output, test_output): 123 | 124 | train_txt_files = [] 125 | dev_txt_files = [] 126 | test_txt_files = [] 127 | 128 | for root, dirnames, filenames in os.walk(root_path): 129 | for filename in fnmatch.filter(filenames, '*.txt'): 130 | 131 | path = os.path.join(root, filename) 132 | 133 | if filename.endswith(".test.txt"): 134 | test_txt_files.append(path) 135 | 136 | elif filename.endswith(".dev.txt"): 137 | dev_txt_files.append(path) 138 | 139 | else: 140 | train_txt_files.append(path) 141 | 142 | write_processed_dataset(train_txt_files, train_output) 143 | write_processed_dataset(dev_txt_files, dev_output) 144 | write_processed_dataset(test_txt_files, test_output) 145 | 146 | if __name__ == "__main__": 147 | 148 | if len(sys.argv) > 1: 149 | path = 
sys.argv[1] 150 | else: 151 | sys.exit("The path to source data directory with txt files is missing") 152 | 153 | if not os.path.exists(DATA_PATH): 154 | os.makedirs(DATA_PATH) 155 | else: 156 | sys.exit("Data already exists") 157 | 158 | create_dev_test_train_split(path, TRAIN_FILE, DEV_FILE, TEST_FILE) 159 | 160 | -------------------------------------------------------------------------------- /sequence_labeler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import theano 3 | import numpy 4 | import collections 5 | import cPickle 6 | import lasagne 7 | 8 | import crf 9 | import recurrence 10 | 11 | sys.setrecursionlimit(50000) 12 | floatX=theano.config.floatX 13 | 14 | class SequenceLabeler(object): 15 | def __init__(self, config): 16 | self.config = config 17 | self.params = collections.OrderedDict() 18 | self.rng = numpy.random.RandomState(config["random_seed"]) 19 | 20 | word_ids = theano.tensor.imatrix('word_ids') 21 | char_ids = theano.tensor.itensor3('char_ids') 22 | char_mask = theano.tensor.ftensor3('char_mask') 23 | label_ids = theano.tensor.imatrix('label_ids') 24 | learningrate = theano.tensor.fscalar('learningrate') 25 | 26 | cost = 0.0 27 | input_tensor = None 28 | input_vector_size = 0 29 | 30 | self.word_embeddings = self.create_parameter_matrix('word_embeddings', (config["n_words"], config["word_embedding_size"])) 31 | input_tensor = self.word_embeddings[word_ids] 32 | input_vector_size = config["word_embedding_size"] 33 | 34 | char_embeddings = self.create_parameter_matrix('char_embeddings', (config["n_chars"], config["char_embedding_size"])) 35 | char_input_tensor = char_embeddings[char_ids].reshape((char_ids.shape[0]*char_ids.shape[1],char_ids.shape[2],config["char_embedding_size"])) 36 | char_mask_reshaped = char_mask.reshape((char_ids.shape[0]*char_ids.shape[1],char_ids.shape[2])) 37 | 38 | char_output_tensor = recurrence.create_birnn(char_input_tensor, config["char_embedding_size"], char_mask_reshaped, config["char_recurrent_size"], return_combined=True, fn_create_parameter_matrix=self.create_parameter_matrix, name="char_birnn") 39 | char_output_tensor = recurrence.create_feedforward(char_output_tensor, config["char_recurrent_size"]*2, config["word_embedding_size"], "tanh", fn_create_parameter_matrix=self.create_parameter_matrix, name="char_ff") 40 | char_output_tensor = char_output_tensor.reshape((char_ids.shape[0],char_ids.shape[1],config["word_embedding_size"])) 41 | 42 | if config["char_integration_method"] == "input": 43 | input_tensor = theano.tensor.concatenate([input_tensor, char_output_tensor], axis=2) 44 | input_vector_size += config["word_embedding_size"] 45 | 46 | elif config["char_integration_method"] == "attention": 47 | static_input_tensor = theano.gradient.disconnected_grad(input_tensor) 48 | is_unk = theano.tensor.eq(word_ids, config["unk_token_id"]) 49 | is_unk_tensor = is_unk.dimshuffle(0,1,'x') 50 | char_output_tensor_normalised = char_output_tensor / char_output_tensor.norm(2, axis=2)[:, :, numpy.newaxis] 51 | static_input_tensor_normalised = static_input_tensor / static_input_tensor.norm(2, axis=2)[:, :, numpy.newaxis] 52 | cosine_cost = 1.0 - (char_output_tensor_normalised * static_input_tensor_normalised).sum(axis=2) 53 | cost += theano.tensor.switch(is_unk, 0.0, cosine_cost).sum() 54 | attention_evidence_tensor = theano.tensor.concatenate([input_tensor, char_output_tensor], axis=2) 55 | attention_output = recurrence.create_feedforward(attention_evidence_tensor, 
config["word_embedding_size"]*2, config["word_embedding_size"], "tanh", self.create_parameter_matrix, "attention_tanh") 56 | attention_output = recurrence.create_feedforward(attention_output, config["word_embedding_size"], config["word_embedding_size"], "sigmoid", self.create_parameter_matrix, "attention_sigmoid") 57 | input_tensor = input_tensor * attention_output + char_output_tensor * (1.0 - attention_output) 58 | 59 | processed_tensor = recurrence.create_birnn(input_tensor, input_vector_size, None, config["word_recurrent_size"], return_combined=False, fn_create_parameter_matrix=self.create_parameter_matrix, name="word_birnn") 60 | processed_tensor = recurrence.create_feedforward(processed_tensor, config["word_recurrent_size"]*2, config["narrow_layer_size"], "tanh", fn_create_parameter_matrix=self.create_parameter_matrix, name="narrow_ff") 61 | 62 | W_output = self.create_parameter_matrix('W_output', (config["narrow_layer_size"], config["n_labels"])) 63 | bias_output = self.create_parameter_matrix('bias_output', (config["n_labels"],)) 64 | output = theano.tensor.dot(processed_tensor, W_output) + bias_output 65 | output = output[:,1:-1,:] # removing and 66 | 67 | if config["crf_on_top"] == True: 68 | all_paths_scores, real_paths_scores, best_sequence, scores = crf.construct("crf", output, config["n_labels"], label_ids, self.create_parameter_matrix) 69 | predicted_labels = best_sequence 70 | output_probs = scores 71 | cost += - (real_paths_scores - all_paths_scores).sum() 72 | else: 73 | output_probs = theano.tensor.nnet.softmax(output.reshape((word_ids.shape[0]*(word_ids.shape[1]-2), config["n_labels"]))) 74 | predicted_labels = theano.tensor.argmax(output_probs.reshape((word_ids.shape[0], (word_ids.shape[1]-2), config["n_labels"])), axis=2) 75 | cost += theano.tensor.nnet.categorical_crossentropy(output_probs, label_ids.reshape((-1,))).sum() 76 | 77 | gradients = theano.tensor.grad(cost, self.params.values(), disconnected_inputs='ignore') 78 | updates = lasagne.updates.adadelta(gradients, self.params.values(), learningrate) 79 | 80 | input_vars_train = [word_ids, char_ids, char_mask, label_ids, learningrate] 81 | input_vars_test = [word_ids, char_ids, char_mask, label_ids] 82 | output_vars = [cost, predicted_labels] 83 | self.train = theano.function(input_vars_train, output_vars, updates=updates, on_unused_input='ignore', allow_input_downcast = True) 84 | self.test = theano.function(input_vars_test, output_vars, on_unused_input='ignore', allow_input_downcast = True) 85 | self.predict = theano.function([word_ids, char_ids, char_mask], predicted_labels, on_unused_input='ignore', allow_input_downcast = True) 86 | 87 | def create_parameter_matrix(self, name, size): 88 | param_vals = numpy.asarray(self.rng.normal(loc=0.0, scale=0.1, size=size), dtype=floatX) 89 | param_shared = theano.shared(param_vals, name) 90 | self.params[name] = param_shared 91 | return param_shared 92 | 93 | 94 | def get_parameter_count(self): 95 | total = 0 96 | for key, val in self.params.iteritems(): 97 | total += val.get_value().size 98 | return total 99 | 100 | def get_parameter_count_without_word_embeddings(self): 101 | total = 0 102 | for key, val in self.params.iteritems(): 103 | if val == self.word_embeddings: 104 | continue 105 | total += val.get_value().size 106 | return total 107 | 108 | def save(self, filename): 109 | dump = {} 110 | dump["config"] = self.config 111 | dump["params"] = {} 112 | for param_name in self.params: 113 | dump["params"][param_name] = self.params[param_name].get_value() 114 | f = 
file(filename, 'wb') 115 | cPickle.dump(dump, f, protocol=cPickle.HIGHEST_PROTOCOL) 116 | f.close() 117 | 118 | @staticmethod 119 | def load(filename, new_output_layer_size=None): 120 | f = file(filename, 'rb') 121 | dump = cPickle.load(f) 122 | f.close() 123 | if new_output_layer_size is not None: 124 | dump["n_labels"] = new_output_layer_size 125 | sequencelabeler = SequenceLabeler(dump["config"]) 126 | for param_name in sequencelabeler.params: 127 | assert(param_name in dump["params"]) 128 | if new_output_layer_size is not None and param_name in ["W_output", "bias_output"]: 129 | continue 130 | sequencelabeler.params[param_name].set_value(dump["params"][param_name]) 131 | return sequencelabeler 132 | -------------------------------------------------------------------------------- /conlleval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python version of the evaluation script from CoNLL'00- 4 | # Originates from: https://github.com/spyysalo/conlleval.py 5 | 6 | 7 | # Intentional differences: 8 | # - accept any space as delimiter by default 9 | # - optional file argument (default STDIN) 10 | # - option to set boundary (-b argument) 11 | # - LaTeX output (-l argument) not supported 12 | # - raw tags (-r argument) not supported 13 | 14 | import sys 15 | import re 16 | 17 | from collections import defaultdict, namedtuple 18 | 19 | ANY_SPACE = '' 20 | 21 | class FormatError(Exception): 22 | pass 23 | 24 | Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore') 25 | 26 | class EvalCounts(object): 27 | def __init__(self): 28 | self.correct_chunk = 0 # number of correctly identified chunks 29 | self.correct_tags = 0 # number of correct chunk tags 30 | self.found_correct = 0 # number of chunks in corpus 31 | self.found_guessed = 0 # number of identified chunks 32 | self.token_counter = 0 # token counter (ignores sentence breaks) 33 | 34 | # counts by type 35 | self.t_correct_chunk = defaultdict(int) 36 | self.t_found_correct = defaultdict(int) 37 | self.t_found_guessed = defaultdict(int) 38 | 39 | def parse_args(argv): 40 | import argparse 41 | parser = argparse.ArgumentParser( 42 | description='evaluate tagging results using CoNLL criteria', 43 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 44 | ) 45 | arg = parser.add_argument 46 | arg('-b', '--boundary', metavar='STR', default='-X-', 47 | help='sentence boundary') 48 | arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE, 49 | help='character delimiting items in input') 50 | arg('-o', '--otag', metavar='CHAR', default='O', 51 | help='alternative outside tag') 52 | arg('file', nargs='?', default=None) 53 | return parser.parse_args(argv) 54 | 55 | def parse_tag(t): 56 | m = re.match(r'^([^-]*)-(.*)$', t) 57 | return m.groups() if m else (t, '') 58 | 59 | def evaluate(iterable, options=None): 60 | if options is None: 61 | options = parse_args([]) # use defaults 62 | 63 | counts = EvalCounts() 64 | num_features = None # number of features per line 65 | in_correct = False # currently processed chunks is correct until now 66 | last_correct = 'O' # previous chunk tag in corpus 67 | last_correct_type = '' # type of previously identified chunk tag 68 | last_guessed = 'O' # previously identified chunk tag 69 | last_guessed_type = '' # type of previous chunk tag in corpus 70 | 71 | for line in iterable: 72 | line = line.rstrip('\r\n') 73 | 74 | if options.delimiter == ANY_SPACE: 75 | features = line.split() 76 | else: 77 | features = 
line.split(options.delimiter) 78 | 79 | if num_features is None: 80 | num_features = len(features) 81 | elif num_features != len(features) and len(features) != 0: 82 | raise FormatError('unexpected number of features: %d (%d)' % 83 | (len(features), num_features)) 84 | 85 | if len(features) == 0 or features[0] == options.boundary: 86 | features = [options.boundary, 'O', 'O'] 87 | if len(features) < 3: 88 | raise FormatError('unexpected number of features in line %s' % line) 89 | 90 | guessed, guessed_type = parse_tag(features.pop()) 91 | correct, correct_type = parse_tag(features.pop()) 92 | first_item = features.pop(0) 93 | 94 | if first_item == options.boundary: 95 | guessed = 'O' 96 | 97 | end_correct = end_of_chunk(last_correct, correct, 98 | last_correct_type, correct_type) 99 | end_guessed = end_of_chunk(last_guessed, guessed, 100 | last_guessed_type, guessed_type) 101 | start_correct = start_of_chunk(last_correct, correct, 102 | last_correct_type, correct_type) 103 | start_guessed = start_of_chunk(last_guessed, guessed, 104 | last_guessed_type, guessed_type) 105 | 106 | if in_correct: 107 | if (end_correct and end_guessed and 108 | last_guessed_type == last_correct_type): 109 | in_correct = False 110 | counts.correct_chunk += 1 111 | counts.t_correct_chunk[last_correct_type] += 1 112 | elif (end_correct != end_guessed or guessed_type != correct_type): 113 | in_correct = False 114 | 115 | if start_correct and start_guessed and guessed_type == correct_type: 116 | in_correct = True 117 | 118 | if start_correct: 119 | counts.found_correct += 1 120 | counts.t_found_correct[correct_type] += 1 121 | if start_guessed: 122 | counts.found_guessed += 1 123 | counts.t_found_guessed[guessed_type] += 1 124 | if first_item != options.boundary: 125 | if correct == guessed and guessed_type == correct_type: 126 | counts.correct_tags += 1 127 | counts.token_counter += 1 128 | 129 | last_guessed = guessed 130 | last_correct = correct 131 | last_guessed_type = guessed_type 132 | last_correct_type = correct_type 133 | 134 | if in_correct: 135 | counts.correct_chunk += 1 136 | counts.t_correct_chunk[last_correct_type] += 1 137 | 138 | return counts 139 | 140 | def uniq(iterable): 141 | seen = set() 142 | return [i for i in iterable if not (i in seen or seen.add(i))] 143 | 144 | def calculate_metrics(correct, guessed, total): 145 | tp, fp, fn = correct, guessed-correct, total-correct 146 | p = 0 if tp + fp == 0 else 1.*tp / (tp + fp) 147 | r = 0 if tp + fn == 0 else 1.*tp / (tp + fn) 148 | f = 0 if p + r == 0 else 2 * p * r / (p + r) 149 | return Metrics(tp, fp, fn, p, r, f) 150 | 151 | def metrics(counts): 152 | c = counts 153 | overall = calculate_metrics( 154 | c.correct_chunk, c.found_guessed, c.found_correct 155 | ) 156 | by_type = {} 157 | for t in uniq(c.t_found_correct.keys() + c.t_found_guessed.keys()): 158 | by_type[t] = calculate_metrics( 159 | c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t] 160 | ) 161 | return overall, by_type 162 | 163 | def report(counts, out=None): 164 | if out is None: 165 | out = sys.stdout 166 | 167 | overall, by_type = metrics(counts) 168 | 169 | c = counts 170 | out.write('processed %d tokens with %d phrases; ' % 171 | (c.token_counter, c.found_correct)) 172 | out.write('found: %d phrases; correct: %d.\n' % 173 | (c.found_guessed, c.correct_chunk)) 174 | 175 | if c.token_counter > 0: 176 | out.write('accuracy: %6.2f%%; ' % 177 | (100.*c.correct_tags/c.token_counter)) 178 | out.write('precision: %6.2f%%; ' % (100.*overall.prec)) 179 | out.write('recall: 
%6.2f%%; ' % (100.*overall.rec)) 180 | out.write('FB1: %6.2f\n' % (100.*overall.fscore)) 181 | 182 | for i, m in sorted(by_type.items()): 183 | out.write('%17s: ' % i) 184 | out.write('precision: %6.2f%%; ' % (100.*m.prec)) 185 | out.write('recall: %6.2f%%; ' % (100.*m.rec)) 186 | out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) 187 | 188 | def end_of_chunk(prev_tag, tag, prev_type, type_): 189 | # check if a chunk ended between the previous and current word 190 | # arguments: previous and current chunk tags, previous and current types 191 | chunk_end = False 192 | 193 | if prev_tag == 'E': chunk_end = True 194 | if prev_tag == 'S': chunk_end = True 195 | 196 | if prev_tag == 'B' and tag == 'B': chunk_end = True 197 | if prev_tag == 'B' and tag == 'S': chunk_end = True 198 | if prev_tag == 'B' and tag == 'O': chunk_end = True 199 | if prev_tag == 'I' and tag == 'B': chunk_end = True 200 | if prev_tag == 'I' and tag == 'S': chunk_end = True 201 | if prev_tag == 'I' and tag == 'O': chunk_end = True 202 | 203 | if prev_tag != 'O' and prev_tag != '.' and prev_type != type_: 204 | chunk_end = True 205 | 206 | # these chunks are assumed to have length 1 207 | if prev_tag == ']': chunk_end = True 208 | if prev_tag == '[': chunk_end = True 209 | 210 | return chunk_end 211 | 212 | def start_of_chunk(prev_tag, tag, prev_type, type_): 213 | # check if a chunk started between the previous and current word 214 | # arguments: previous and current chunk tags, previous and current types 215 | chunk_start = False 216 | 217 | if tag == 'B': chunk_start = True 218 | if tag == 'S': chunk_start = True 219 | 220 | if prev_tag == 'E' and tag == 'E': chunk_start = True 221 | if prev_tag == 'E' and tag == 'I': chunk_start = True 222 | if prev_tag == 'S' and tag == 'E': chunk_start = True 223 | if prev_tag == 'S' and tag == 'I': chunk_start = True 224 | if prev_tag == 'O' and tag == 'E': chunk_start = True 225 | if prev_tag == 'O' and tag == 'I': chunk_start = True 226 | 227 | if tag != 'O' and tag != '.' 
and prev_type != type_: 228 | chunk_start = True 229 | 230 | # these chunks are assumed to have length 1 231 | if tag == '[': chunk_start = True 232 | if tag == ']': chunk_start = True 233 | 234 | return chunk_start 235 | 236 | def main(argv): 237 | args = parse_args(argv[1:]) 238 | 239 | if args.file is None: 240 | counts = evaluate(sys.stdin, args) 241 | else: 242 | with open(args.file) as f: 243 | counts = evaluate(f, args) 244 | report(counts) 245 | 246 | if __name__ == '__main__': 247 | sys.exit(main(sys.argv)) 248 | -------------------------------------------------------------------------------- /sequence_labeling_experiment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import collections 3 | import numpy 4 | import random 5 | import math 6 | import gc 7 | import os 8 | import re 9 | import ConfigParser 10 | import theano 11 | 12 | from sequence_labeler import SequenceLabeler 13 | from sequence_labeling_evaluator import SequenceLabelingEvaluator 14 | 15 | floatX=theano.config.floatX 16 | 17 | def read_input_files(file_paths): 18 | sentences = [] 19 | for file_path in file_paths.strip().split(","): 20 | with open(file_path, "r") as f: 21 | words, labels = [], [] 22 | for line in f: 23 | if len(line.strip()) > 0: 24 | line_parts = line.strip().split() 25 | assert(len(line_parts) >= 2) 26 | words.append(line_parts[0]) 27 | labels.append(line_parts[-1]) 28 | elif len(line.strip()) == 0 and len(words) > 0: 29 | sentences.append((words, labels)) 30 | words, labels = [], [] 31 | if len(words) > 0: 32 | raise ValueError("The format expects an empty line at the end of the file in: " + file_path) 33 | return sentences 34 | 35 | 36 | def read_dataset(file_paths, lowercase_words, lowercase_chars, replace_digits, word2id, char2id, label2id): 37 | dataset = [] 38 | sentences = read_input_files(file_paths) 39 | 40 | for i in range(len(sentences)): 41 | word_ids = map_text_to_ids(" ".join(sentences[i][0]), word2id, "", "", "", lowercase=lowercase_words, replace_digits=replace_digits) 42 | char_ids = [map_text_to_ids("", char2id, "", "", "")] + \ 43 | [map_text_to_ids(" ".join(list(word)), char2id, "", "", "", lowercase=lowercase_chars, replace_digits=replace_digits) for word in sentences[i][0]] + \ 44 | [map_text_to_ids("", char2id, "", "", "")] 45 | label_ids = map_text_to_ids(" ".join(sentences[i][1]), label2id) 46 | 47 | assert(len(char_ids) == len(word_ids)) 48 | assert(len(char_ids) == len(label_ids) + 2) 49 | 50 | dataset.append((word_ids, char_ids, label_ids)) 51 | return dataset 52 | 53 | 54 | 55 | def create_batches(dataset, max_batch_size): 56 | """ 57 | Sort sentences by length and organise them into batches 58 | """ 59 | sentence_ids_by_length = collections.OrderedDict() 60 | for i in range(len(dataset)): 61 | length = len(dataset[i][0]) 62 | if length not in sentence_ids_by_length: 63 | sentence_ids_by_length[length] = [] 64 | sentence_ids_by_length[length].append(i) 65 | 66 | batches = [] 67 | for sentence_length in sentence_ids_by_length: 68 | for i in range(0, len(sentence_ids_by_length[sentence_length]), max_batch_size): 69 | sentence_ids_in_batch = sentence_ids_by_length[sentence_length][i:i + max_batch_size] 70 | max_word_length = numpy.array([[len(char_ids) for char_ids in dataset[sentence_id][1]] for sentence_id in sentence_ids_in_batch]).max() 71 | 72 | word_ids = numpy.zeros((len(sentence_ids_in_batch), sentence_length), dtype=numpy.int32) 73 | char_ids = numpy.zeros((len(sentence_ids_in_batch), sentence_length, 
max_word_length), dtype=numpy.int32) 74 | char_mask = numpy.zeros((len(sentence_ids_in_batch), sentence_length, max_word_length), dtype=numpy.int32) 75 | label_ids = numpy.zeros((len(sentence_ids_in_batch), sentence_length-2), dtype=numpy.int32) 76 | 77 | for i in range(len(sentence_ids_in_batch)): 78 | for j in range(sentence_length): 79 | word_ids[i][j] = dataset[sentence_ids_in_batch[i]][0][j] 80 | for j in range(sentence_length): 81 | for k in range(len(dataset[sentence_ids_in_batch[i]][1][j])): 82 | char_ids[i][j][k] = dataset[sentence_ids_in_batch[i]][1][j][k] 83 | char_mask[i][j][k] = 1 84 | for j in range(sentence_length-2): 85 | label_ids[i][j] = dataset[sentence_ids_in_batch[i]][2][j] 86 | batches.append((word_ids, char_ids, char_mask, label_ids, sentence_ids_in_batch)) 87 | return batches 88 | 89 | 90 | def process_batches(sequencelabeler, batches, testing, learningrate, name, main_label_id, label2id=None, conll_eval=False, verbose=True): 91 | evaluator = SequenceLabelingEvaluator(main_label_id, label2id, conll_eval) 92 | for word_ids, char_ids, char_mask, label_ids, sentence_ids_in_batch in batches: 93 | if testing == True: 94 | cost, predicted_labels = sequencelabeler.test(word_ids, char_ids, char_mask, label_ids) 95 | else: 96 | cost, predicted_labels = sequencelabeler.train(word_ids, char_ids, char_mask, label_ids, learningrate) 97 | evaluator.append_data(cost, predicted_labels, word_ids, label_ids) 98 | 99 | results = evaluator.get_results(name) 100 | if verbose == True: 101 | for key in results: 102 | print key + ": " + str(results[key]) 103 | return results[name + "_cost_sum"], results 104 | 105 | 106 | 107 | def is_float(value): 108 | try: 109 | float(value) 110 | return True 111 | except ValueError: 112 | return False 113 | 114 | def parse_config(config_section, config_path): 115 | config_parser = ConfigParser.SafeConfigParser(allow_no_value=True) 116 | config_parser.read(config_path) 117 | config = collections.OrderedDict() 118 | for key, value in config_parser.items(config_section): 119 | if value is None or len(value.strip()) == 0: 120 | config[key] = None 121 | elif value.lower() in ["true", "false"]: 122 | config[key] = config_parser.getboolean(config_section, key) 123 | elif value.isdigit(): 124 | config[key] = config_parser.getint(config_section, key) 125 | elif is_float(value): 126 | config[key] = config_parser.getfloat(config_section, key) 127 | else: 128 | config[key] = config_parser.get(config_section, key) 129 | return config 130 | 131 | 132 | def generate_word2id_dictionary(texts, min_freq=-1, insert_words=None, lowercase=False, replace_digits=False): 133 | counter = collections.Counter() 134 | for text in texts: 135 | if lowercase: 136 | text = text.lower() 137 | if replace_digits: 138 | text = re.sub(r'\d', '0', text) 139 | counter.update(text.strip().split()) 140 | 141 | word2id = collections.OrderedDict() 142 | if insert_words is not None: 143 | for word in insert_words: 144 | word2id[word] = len(word2id) 145 | 146 | word_count_list = counter.most_common() 147 | 148 | for (word, count) in word_count_list: 149 | if min_freq <= 0 or count >= min_freq: 150 | word2id[word] = len(word2id) 151 | 152 | return word2id 153 | 154 | 155 | def map_text_to_ids(text, word2id, start_token=None, end_token=None, unk_token=None, lowercase=False, replace_digits=False): 156 | ids = [] 157 | 158 | if lowercase: 159 | text = text.lower() 160 | if replace_digits: 161 | text = re.sub(r'\d', '0', text) 162 | 163 | if start_token != None: 164 | text = start_token + " " + text 
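# Note: tokens missing from word2id are mapped to unk_token in the loop below, or silently dropped when no unk_token is given.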
165 |     if end_token != None:
166 |         text = text + " " + end_token
167 |     for word in text.strip().split():
168 |         if word in word2id:
169 |             ids.append(word2id[word])
170 |         elif unk_token != None:
171 |             ids.append(word2id[unk_token])
172 |     return ids
173 | 
174 | 
175 | 
176 | def preload_vectors(word2id, vector_size, word2vec_path):
177 |     rng = numpy.random.RandomState(123)
178 |     preloaded_vectors = numpy.asarray(rng.normal(loc=0.0, scale=0.1, size=(len(word2id), vector_size)), dtype=floatX)
179 | 
180 |     with open(word2vec_path) as f:
181 |         for line in f:
182 |             line_parts = line.strip().split()
183 |             if len(line_parts) <= 2:
184 |                 continue
185 |             word = line_parts[0]
186 |             if word in word2id:
187 |                 word_id = word2id[word]
188 |                 vector = numpy.array(line_parts[1:])
189 |                 preloaded_vectors[word_id] = vector
190 |     return preloaded_vectors
191 | 
192 | 
193 | def run_experiment(config_path):
194 |     config = parse_config("config", config_path)
195 |     random.seed(config["random_seed"] + 1)
196 |     temp_model_path = config_path + ".model"
197 |     sequencelabeler = None
198 | 
199 |     # Preparing dictionaries
200 |     if config["path_train"] is not None and len(config["path_train"]) > 0:
201 |         sentences_train = read_input_files(config["path_train"])
202 |         word2id = generate_word2id_dictionary([" ".join(sentence[0]) for sentence in sentences_train],
203 |                                               min_freq=config["min_word_freq"],
204 |                                               insert_words=["", "", ""],
205 |                                               lowercase=False,
206 |                                               replace_digits=True)
207 |         label2id = generate_word2id_dictionary([" ".join(sentence[1]) for sentence in sentences_train])
208 |         char2id = generate_word2id_dictionary([" ".join([" ".join(list(word)) for word in sentence[0]]) for sentence in sentences_train],
209 |                                               min_freq=-1,
210 |                                               insert_words=["", "", "", "", ""],
211 |                                               lowercase=False,
212 |                                               replace_digits=True)
213 | 
214 |     if config["load"] is not None and len(config["load"]) > 0:
215 |         if config["rebuild_output_layer"] == True:
216 |             sequencelabeler = SequenceLabeler.load(config["load"], new_output_layer_size=len(label2id))
217 |             # label2id = label2id
218 |         else:
219 |             sequencelabeler = SequenceLabeler.load(config["load"])
220 |             label2id = sequencelabeler.config["label2id"]
221 |         word2id = sequencelabeler.config["word2id"]
222 |         char2id = sequencelabeler.config["char2id"]
223 | 
224 |     if config["path_train"] is not None and len(config["path_train"]) > 0:
225 |         data_train = read_dataset(config["path_train"], False, False, True, word2id, char2id, label2id)
226 | 
227 |     if config["load"] is None or len(config["load"]) == 0:
228 |         config["n_words"] = len(word2id)
229 |         config["n_chars"] = len(char2id)
230 |         config["n_labels"] = len(label2id)
231 |         config["unk_token"] = ""
232 |         config["unk_token_id"] = word2id[""]
233 |         sequencelabeler = SequenceLabeler(config)
234 |         if config['preload_vectors'] is not None:
235 |             new_embeddings = preload_vectors(word2id, config['word_embedding_size'], config['preload_vectors'])
236 |             sequencelabeler.word_embeddings.set_value(new_embeddings)
237 | 
238 |     if config["path_dev"] is not None and len(config["path_dev"]) > 0:
239 |         data_dev = read_dataset(config["path_dev"], False, False, True, word2id, char2id, label2id)
240 |         batches_dev = create_batches(data_dev, config['max_batch_size'])
241 | 
242 |     # printing config
243 |     for key, val in config.items():
244 |         print key, ": ", val
245 |     print "parameter_count: ", sequencelabeler.get_parameter_count()
246 |     print "parameter_count_without_word_embeddings: ", sequencelabeler.get_parameter_count_without_word_embeddings()
247 | 
248 |     config["word2id"] = word2id
config["word2id"] = word2id 249 | config["char2id"] = char2id 250 | config["label2id"] = label2id 251 | 252 | if config["path_train"] is not None and len(config["path_train"]) > 0: 253 | best_selector_value = 0.0 254 | learningrate = config["learningrate"] 255 | for epoch in xrange(config["epochs"]): 256 | print("EPOCH: " + str(epoch)) 257 | print("learningrate: " + str(learningrate)) 258 | random.shuffle(data_train) 259 | batches_train = create_batches(data_train, config['max_batch_size']) 260 | random.shuffle(batches_train) 261 | 262 | train_cost_sum, results_train = process_batches(sequencelabeler, batches_train, testing=False, learningrate=learningrate, name="train", main_label_id=label2id[str(config["main_label"])], label2id=label2id, conll_eval=config["conll_eval"], verbose=True) 263 | dev_cost_sum, results_dev = process_batches(sequencelabeler, batches_dev, testing=True, learningrate=0.0, name="dev", main_label_id=label2id[str(config["main_label"])], label2id=label2id, conll_eval=config["conll_eval"], verbose=True) 264 | 265 | if math.isnan(dev_cost_sum) or math.isinf(dev_cost_sum): 266 | sys.stderr.write("ERROR: Cost is NaN or Inf. Exiting.\n") 267 | break 268 | 269 | if (epoch == 0 or (config["best_model_selector"].split(":")[1] == "high" and results_dev[config["best_model_selector"].split(":")[0]] > best_selector_value) 270 | or (config["best_model_selector"].split(":")[1] == "low" and results_dev[config["best_model_selector"].split(":")[0]] < best_selector_value)): 271 | best_epoch = epoch 272 | best_selector_value = results_dev[config["best_model_selector"].split(":")[0]] 273 | sequencelabeler.save(temp_model_path) 274 | print("best_epoch: " + str(best_epoch)) 275 | 276 | batches_train = None 277 | gc.collect() 278 | 279 | if config["stop_if_no_improvement_for_epochs"] > 0 and (epoch - best_epoch) >= config["stop_if_no_improvement_for_epochs"]: 280 | break 281 | 282 | # loading the best model so far 283 | if config["epochs"] > 0: 284 | sequencelabeler = SequenceLabeler.load(temp_model_path) 285 | os.remove(temp_model_path) 286 | 287 | if config["save"] is not None and len(config["save"]) > 0: 288 | sequencelabeler.save(config["save"]) 289 | 290 | if config["path_test"] is not None: 291 | i = 0 292 | for path_test in config["path_test"].strip().split(":"): 293 | data_test = read_dataset(path_test, False, False, True, word2id, char2id, label2id) 294 | batches_test = create_batches(data_test, config['max_batch_size']) 295 | test_cost_sum, results_test = process_batches(sequencelabeler, batches_test, testing=True, learningrate=0.0, name="test" + (str(i) if len(batches_test) > 1 else ""), main_label_id=label2id[str(config["main_label"])], label2id=label2id, conll_eval=config["conll_eval"], verbose=True) 296 | i += 1 297 | 298 | 299 | if __name__ == "__main__": 300 | run_experiment(sys.argv[1]) 301 | --------------------------------------------------------------------------------