├── config.conf
├── sequence_labeling_evaluator.py
├── crf.py
├── punctuator.py
├── recurrence.py
├── error_calculator.py
├── README.md
├── punctuation_data_converter.py
├── sequence_labeler.py
├── conlleval.py
└── sequence_labeling_experiment.py
/config.conf:
--------------------------------------------------------------------------------
1 | [config]
2 | path_train = ./data/train
3 | path_dev = ./data/dev
4 | path_test = ./data/test
5 | main_label = .PERIOD
6 | conll_eval = False
7 | preload_vectors = /home/ottokar/old/large_files/GoogleNews-vectors-negative300.txt
8 | word_embedding_size = 300
9 | char_embedding_size = 50
10 | word_recurrent_size = 200
11 | char_recurrent_size = 200
12 | narrow_layer_size = 50
13 | best_model_selector = dev_f:high
14 | epochs = 20
15 | stop_if_no_improvement_for_epochs = 7
16 | learningrate = 1.0
17 | min_word_freq = 2
18 | max_batch_size = 64
19 | save = punctuator.model
20 | load =
21 | random_seed = 1001
22 | crf_on_top = True
23 | char_integration_method = attention
24 |
--------------------------------------------------------------------------------
/sequence_labeling_evaluator.py:
--------------------------------------------------------------------------------
1 | import time
2 | import collections
3 | import numpy
4 |
5 | import conlleval
6 |
7 | class SequenceLabelingEvaluator(object):
8 | def __init__(self, main_label_id, label2id=None, conll_eval=False):
9 | self.main_label_id = main_label_id
10 | self.label2id = label2id
11 | self.conll_eval = conll_eval
12 |
13 | self.cost_sum = 0.0
14 | self.correct_sum = 0.0
15 | self.main_predicted_count = 0
16 | self.main_total_count = 0
17 | self.main_correct_count = 0
18 | self.token_count = 0
19 | self.start_time = time.time()
20 |
21 | if self.label2id is not None:
22 | self.id2label = collections.OrderedDict()
23 | for label in self.label2id:
24 | self.id2label[self.label2id[label]] = label
25 |
26 | self.conll_format = []
27 |
28 | def append_data(self, cost, predicted_labels, word_ids, label_ids):
29 | self.cost_sum += cost
30 | self.token_count += label_ids.size
31 | self.correct_sum += numpy.equal(predicted_labels, label_ids).sum()
32 | self.main_predicted_count += (predicted_labels == self.main_label_id).sum()
33 | self.main_total_count += (label_ids == self.main_label_id).sum()
34 | self.main_correct_count += ((predicted_labels == self.main_label_id)*(label_ids == self.main_label_id)).sum()
35 |
36 | for i in range(word_ids.shape[0]):
37 | for j in range(word_ids.shape[1]-2):
38 | try:
39 | self.conll_format.append(str(word_ids[i][j+1]) + "\t" + str(self.id2label[label_ids[i][j]]) + "\t" + str(self.id2label[predicted_labels[i][j]]))
40 | except KeyError:
41 | print("Unexpected label id in predictions.") # Probably means the CRF decided to predict a start/end label, which it shouldn't
42 | self.conll_format.append("")
43 |
44 |
45 | def get_results(self, name):
46 | p = (float(self.main_correct_count) / float(self.main_predicted_count)) if (self.main_predicted_count > 0) else 0.0
47 | r = (float(self.main_correct_count) / float(self.main_total_count)) if (self.main_total_count > 0) else 0.0
48 | f = (2.0 * p * r / (p + r)) if (p+r > 0.0) else 0.0
49 | f05 = ((1.0 + 0.5*0.5) * p * r / ((0.5*0.5 * p) + r)) if (p+r > 0.0) else 0.0
50 |
51 | results = collections.OrderedDict()
52 | results[name + "_cost_avg"] = self.cost_sum / float(self.token_count)
53 | results[name + "_cost_sum"] = self.cost_sum
54 | results[name + "_main_predicted_count"] = self.main_predicted_count
55 | results[name + "_main_total_count"] = self.main_total_count
56 | results[name + "_main_correct_count"] = self.main_correct_count
57 | results[name + "_p"] = p
58 | results[name + "_r"] = r
59 | results[name + "_f"] = f
60 | results[name + "_f05"] = f05
61 | results[name + "_accuracy"] = self.correct_sum / float(self.token_count)
62 | results[name + "_token_count"] = self.token_count
63 | results[name + "_time"] = float(time.time()) - float(self.start_time)
64 |
65 | if self.label2id is not None and self.conll_eval == True:
66 | conll_counts = conlleval.evaluate(self.conll_format)
67 | conll_metrics_overall, conll_metrics_by_type = conlleval.metrics(conll_counts)
68 | results[name + "_conll_accuracy"] = float(conll_counts.correct_tags) / float(conll_counts.token_counter)
69 | results[name + "_conll_p"] = conll_metrics_overall.prec
70 | results[name + "_conll_r"] = conll_metrics_overall.rec
71 | results[name + "_conll_f"] = conll_metrics_overall.fscore
72 | # for i, m in sorted(conll_metrics_by_type.items()):
73 | # results[name + "_conll_p_" + str(i)] = m.prec
74 | # results[name + "_conll_r_" + str(i)] = m.rec
75 | # results[name + "_conll_f_" + str(i)] = m.fscore #str(m.fscore) + " " + str(conll_counts.t_found_guessed[i])
76 |
77 | return results
78 |
79 |
80 |
81 |
--------------------------------------------------------------------------------
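Note on the *_f05 metric above: it is the general F-beta measure with beta = 0.5, which weights precision more heavily than recall. A small stand-alone sketch of the same formula (added for illustration, not part of the repo):

    def f_beta(p, r, beta=0.5):
        # F_beta = (1 + beta^2) * p * r / (beta^2 * p + r); beta < 1 favours precision
        return ((1.0 + beta * beta) * p * r / (beta * beta * p + r)) if (p + r) > 0.0 else 0.0
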
/crf.py:
--------------------------------------------------------------------------------
1 | import theano
2 | import numpy
3 |
4 | # CRF implementation based on Lample et al.
5 | # "Neural Architectures for Named Entity Recognition"
6 |
7 | floatX=theano.config.floatX
8 |
9 | def log_sum(x, axis=None):
10 | x_max_value = x.max(axis=axis)
11 | x_max_tensor = x.max(axis=axis, keepdims=True)
12 | return x_max_value + theano.tensor.log(theano.tensor.exp(x - x_max_tensor).sum(axis=axis))
13 |
14 |
15 | def forward(observation_weights, transition_weights, return_best_sequence=False):
16 | def recurrence(observation_weights, previous_scores, transition_weights):
17 | previous_scores = previous_scores.dimshuffle(0, 1, 'x')
18 | observation_weights = observation_weights.dimshuffle(0, 'x', 1)
19 | scores = previous_scores + observation_weights + transition_weights.dimshuffle('x', 0, 1)
20 | if return_best_sequence:
21 | best_scores = scores.max(axis=1)
22 | best_states = scores.argmax(axis=1)
23 | return best_scores, best_states
24 | else:
25 | return log_sum(scores, axis=1)
26 |
27 | initial = observation_weights[0]
28 | crf_states, _ = theano.scan(
29 | fn=recurrence,
30 | outputs_info=(initial, None) if return_best_sequence else initial,
31 | sequences=[observation_weights[1:],],
32 | non_sequences=transition_weights
33 | )
34 |
35 | if return_best_sequence:
36 | sequence, _ = theano.scan(
37 | fn=lambda beta_i, previous: beta_i[theano.tensor.arange(previous.shape[0]), previous],
38 | outputs_info=theano.tensor.cast(theano.tensor.argmax(crf_states[0][-1], axis=1), 'int32'),
39 | sequences=theano.tensor.cast(crf_states[1][::-1], 'int32')
40 | )
41 | sequence = theano.tensor.concatenate([sequence[::-1], [theano.tensor.argmax(crf_states[0][-1], axis=1)]])
42 | return sequence, crf_states[0]
43 | else:
44 | return log_sum(crf_states[-1], axis=1)
45 |
46 |
47 | def construct(name, input_tensor, n_labels, gold_labels, fn_create_parameter_matrix):
48 | transition_weights = fn_create_parameter_matrix(name + "_crf_transition_weights", (n_labels + 2, n_labels + 2))
49 |
50 | small = -1000.0
51 | padding_start = theano.tensor.zeros((input_tensor.shape[0], 1, n_labels + 2)) + small
52 | padding_start = theano.tensor.set_subtensor(padding_start[:,:,-2], 0.0)
53 | padding_end = theano.tensor.zeros((input_tensor.shape[0], 1, n_labels + 2)) + small
54 | padding_end = theano.tensor.set_subtensor(padding_end[:,:,-1], 0.0)
55 | observation_weights = theano.tensor.concatenate([input_tensor, theano.tensor.zeros((input_tensor.shape[0], input_tensor.shape[1], 2)) + small], axis=2)
56 | observation_weights = theano.tensor.concatenate([padding_start, observation_weights, padding_end], axis=1)
57 | observation_weights = observation_weights.dimshuffle(1,0,2) # reordering the tensor (words, sentences, labels)
58 |
59 | # Score from tags
60 | real_paths_scores = input_tensor[theano.tensor.arange(input_tensor.shape[0])[:, numpy.newaxis], theano.tensor.arange(input_tensor.shape[1]), gold_labels].sum(axis=1)
61 |
62 | # Score from transition_weights
63 | padding_id_start = theano.tensor.zeros((gold_labels.shape[0], 1), dtype=numpy.int32) + n_labels
64 | padding_id_end = theano.tensor.zeros((gold_labels.shape[0], 1), dtype=numpy.int32) + n_labels + 1
65 | padded_gold_labels = theano.tensor.concatenate([padding_id_start, gold_labels, padding_id_end], axis=1)
66 | real_paths_scores += transition_weights[
67 | padded_gold_labels[theano.tensor.arange(gold_labels.shape[0])[:, numpy.newaxis], theano.tensor.arange(gold_labels.shape[1] + 1)],
68 | padded_gold_labels[theano.tensor.arange(gold_labels.shape[0])[:, numpy.newaxis], theano.tensor.arange(gold_labels.shape[1] + 1) + 1]
69 | ].sum(axis=1)
70 |
71 | all_paths_scores = forward(observation_weights, transition_weights)
72 |
73 | best_sequence, scores = forward(observation_weights, transition_weights, return_best_sequence=True)
74 |
75 | scores = scores.dimshuffle(1,0,2)[:,:-1,:-2]
76 | best_sequence = best_sequence.dimshuffle(1,0)[:,1:-1]
77 |
78 | return all_paths_scores, real_paths_scores, best_sequence, scores
79 |
80 |
--------------------------------------------------------------------------------
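The log_sum helper in crf.py is the standard numerically stable log-sum-exp (shift by the maximum before exponentiating). A plain-numpy transcription of the same trick, added only as an illustration that can be sanity-checked outside Theano:

    import numpy as np

    def log_sum_np(x, axis=None):
        # same max-shift trick as log_sum() above, so exp() never overflows
        x_max_value = x.max(axis=axis)
        x_max_keep = x.max(axis=axis, keepdims=True)
        return x_max_value + np.log(np.exp(x - x_max_keep).sum(axis=axis))

    # e.g. log_sum_np(np.array([1000.0, 1000.0])) equals 1000.0 + log(2), with no overflow
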
/punctuator.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy
3 |
4 | from collections import OrderedDict
5 | from sequence_labeler import SequenceLabeler
6 | from sequence_labeling_experiment import read_dataset, create_batches, parse_config, map_text_to_ids
7 | from punctuation_data_converter import EOS_TOKENS, SPACE, MAX_SEQUENCE_LEN
8 |
9 | def last_index_of(array, element):
10 | try:
11 | return len(array) - 1 - array[::-1].index(element)
12 | except ValueError:
13 | return 0
14 |
15 | def up_to_last_instance_of(array, elements):
16 | idx = max(last_index_of(array, element) for element in elements)
17 | if idx == 0:
18 | return array
19 | else:
20 | return array[:idx + 1]
21 |
22 | def reverse_mapping(d):
23 | return OrderedDict([(v,k) for (k,v) in d.items()])
24 |
25 | def convert_to_batch(word_sequence, lowercase_words, lowercase_chars, replace_digits, word2id, char2id):
26 | raw_word_ids = map_text_to_ids(" ".join(word_sequence), word2id, "", "", "", lowercase=lowercase_words, replace_digits=replace_digits)
27 | raw_char_ids = [map_text_to_ids("", char2id, "", "", "")] + \
28 | [map_text_to_ids(" ".join(list(word)), char2id, "", "", "", lowercase=lowercase_chars, replace_digits=replace_digits) for word in word_sequence] + \
29 | [map_text_to_ids("", char2id, "", "", "")]
30 |
31 | assert(len(raw_char_ids) == len(raw_word_ids))
32 |
33 | # Mask and convert to numpy array
34 | batch_size = 1
35 | seq_len = len(raw_word_ids)
36 |
37 | max_word_length = numpy.array([len(c) for c in raw_char_ids]).max()
38 |
39 | word_ids = numpy.zeros((batch_size, seq_len), dtype=numpy.int32)
40 | char_ids = numpy.zeros((batch_size, seq_len, max_word_length), dtype=numpy.int32)
41 | char_mask = numpy.zeros((batch_size, seq_len, max_word_length), dtype=numpy.int32)
42 |
43 | for i in range(batch_size):
44 | for j in range(seq_len):
45 | word_ids[i][j] = raw_word_ids[j]
46 | for j in range(seq_len):
47 | for k in range(len(raw_char_ids[j])):
48 | char_ids[i][j][k] = raw_char_ids[j][k]
49 | char_mask[i][j][k] = 1
50 |
51 | return word_ids, char_ids, char_mask
52 |
53 | def punctuate(config_path):
54 | config = parse_config("config", config_path)
55 | if config["path_test"] is None:
56 | print("No test data configured")
57 | return
58 |
59 | sequencelabeler = SequenceLabeler.load(config["save"])
60 | label2id = sequencelabeler.config["label2id"]
61 | word2id = sequencelabeler.config["word2id"]
62 | char2id = sequencelabeler.config["char2id"]
63 |
64 | config["word2id"] = word2id
65 | config["char2id"] = char2id
66 | config["label2id"] = label2id
67 |
68 | id2label = reverse_mapping(label2id)
69 | eos_labels = [label2id[l] for l in EOS_TOKENS if l in label2id]
70 | space_id = label2id[SPACE]
71 |
72 | all_predicted_labels = []
73 |
74 | for path_test in config["path_test"].strip().split(":"):
75 |
76 | with open(path_test + '.orig', 'r') as f:
77 | all_words = [w for w in f.read().split() if w not in label2id]
78 |
79 | last_eos_idx = 0
80 |
81 | while True:
82 | word_sequence = all_words[last_eos_idx:last_eos_idx+MAX_SEQUENCE_LEN]
83 | if len(word_sequence) == 0:
84 | break
85 | word_ids, char_ids, char_mask = convert_to_batch(word_sequence, False, False, True, word2id, char2id)
86 | predicted_labels = sequencelabeler.predict(word_ids, char_ids, char_mask)
87 | predicted_labels = up_to_last_instance_of(list(predicted_labels.flatten()), eos_labels)
88 | if len(predicted_labels) == 0:
89 | break
90 | all_predicted_labels += predicted_labels
91 | last_eos_idx += len(predicted_labels)
92 |
93 | with open(path_test + '.pred', 'w') as f:
94 | for w, l_id in zip(all_words, all_predicted_labels):
95 | f.write('%s %s ' % (w, '' if l_id == space_id else id2label[l_id]))
96 |
97 | if __name__ == "__main__":
98 | punctuate(sys.argv[1])
99 |
100 |
--------------------------------------------------------------------------------
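Usage sketch for punctuator.py (an illustration, assuming a model has already been trained and saved under the `save` path from config.conf, and that every entry of `path_test` has a corresponding `.orig` file containing the raw token stream):

    python punctuator.py config.conf

Predictions are written next to each test file with a `.pred` suffix: each word followed by its predicted punctuation token (nothing is written for _SPACE).
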
/recurrence.py:
--------------------------------------------------------------------------------
1 | import theano
2 | import numpy
3 |
4 | floatX=theano.config.floatX
5 |
6 | def create_birnn(input_tensor, input_size, mask, recurrent_size, return_combined, fn_create_parameter_matrix, name):
7 | rnn_mask = mask.dimshuffle(1,0) if (mask is not None) else None
8 | recurrent_forward = create_lstm(input_tensor.dimshuffle(1,0,2), input_size, rnn_mask,
9 | recurrent_size, only_return_final=return_combined, go_backwards=False, fn_create_parameter_matrix=fn_create_parameter_matrix, name=name + "_forward")
10 | recurrent_backward = create_lstm(input_tensor.dimshuffle(1,0,2), input_size, rnn_mask,
11 | recurrent_size, only_return_final=return_combined, go_backwards=True, fn_create_parameter_matrix=fn_create_parameter_matrix, name=name + "_backward")
12 | if return_combined == True:
13 | return theano.tensor.concatenate([recurrent_forward, recurrent_backward], axis=1)
14 | else:
15 | return theano.tensor.concatenate([recurrent_forward.dimshuffle(1,0,2), recurrent_backward.dimshuffle(1,0,2)], axis=2)
16 |
17 |
18 | def create_lstm(input_tensor, input_size, mask, recurrent_size, only_return_final, go_backwards, fn_create_parameter_matrix, name):
19 | # LSTM. Following Graves et al.
20 | # "Hybrid speech recognition with deep bidirectional LSTM"
21 | def lstm_step(x, h_prev, c_prev, W_x, W_h, b, W_ci, W_cf, W_co):
22 | m_xhb = theano.tensor.dot(x, W_x) + theano.tensor.dot(h_prev, W_h) + b
23 | i = theano.tensor.nnet.sigmoid(_slice(m_xhb, 0, 4) + c_prev * W_ci)
24 | f = theano.tensor.nnet.sigmoid(_slice(m_xhb, 1, 4) + c_prev * W_cf)
25 | c = f * c_prev + i * theano.tensor.tanh(_slice(m_xhb, 2, 4))
26 | o = theano.tensor.nnet.sigmoid(_slice(m_xhb, 3, 4) + c * W_co)
27 | h = o * theano.tensor.tanh(c)
28 | return h, c
29 |
30 | def lstm_mask_step(x, mask, h_prev, c_prev, W_x, W_h, b, W_ci, W_cf, W_co):
31 | h_new, c_new = lstm_step(x, h_prev, c_prev, W_x, W_h, b, W_ci, W_cf, W_co)
32 | h = theano.tensor.switch(mask, h_new, h_prev)
33 | c = theano.tensor.switch(mask, c_new, c_prev)
34 | return h, c
35 |
36 | def _slice(M, slice_num, total_slices):
37 | if M.ndim == 3:
38 | l = M.shape[2] / total_slices
39 | return M[:, :, slice_num*l:(slice_num+1)*l]
40 | elif M.ndim == 2:
41 | l = M.shape[1] / total_slices
42 | return M[:, slice_num*l:(slice_num+1)*l]
43 | elif M.ndim == 1:
44 | l = M.shape[0] / total_slices
45 | return M[slice_num*l:(slice_num+1)*l]
46 |
47 | h_initial = theano.tensor.alloc(numpy.array(0, dtype=floatX), input_tensor.shape[1], recurrent_size)
48 | c_initial = theano.tensor.alloc(numpy.array(0, dtype=floatX), input_tensor.shape[1], recurrent_size)
49 |
50 | if mask is not None:
51 | mask = mask.dimshuffle(0, 1, 'x')
52 | fn_step = locals()["lstm_mask_step"]
53 | sequences = [input_tensor, mask]
54 | else:
55 | fn_step = locals()["lstm_step"]
56 | sequences = input_tensor
57 |
58 | W_x = fn_create_parameter_matrix('W_x_'+name, (input_size, recurrent_size*4))
59 | W_h = fn_create_parameter_matrix('W_h_'+name, (recurrent_size, recurrent_size*4))
60 | b = fn_create_parameter_matrix('b_'+name, (recurrent_size*4,))
61 | W_ci = fn_create_parameter_matrix('W_ci_'+name, (recurrent_size,))
62 | W_cf = fn_create_parameter_matrix('W_cf_'+name, (recurrent_size,))
63 | W_co = fn_create_parameter_matrix('W_co_'+name, (recurrent_size,))
64 | result, _ = theano.scan(
65 | fn_step,
66 | sequences = sequences,
67 | outputs_info = [h_initial, c_initial],
68 | non_sequences = [W_x, W_h, b, W_ci, W_cf, W_co],
69 | go_backwards=go_backwards)
70 |
71 | h = result[0]
72 | if only_return_final == True:
73 | h = h[-1]
74 | else:
75 | if go_backwards == True:
76 | h = h[::-1]
77 | return h
78 |
79 |
80 |
81 | def create_feedforward(input_tensor, input_size, output_size, activation, fn_create_parameter_matrix, name):
82 | weights = fn_create_parameter_matrix('ff_weights_' + name, (input_size, output_size))
83 | bias = fn_create_parameter_matrix('ff_bias_' + name, (output_size,))
84 | output = theano.tensor.dot(input_tensor, weights) + bias
85 | if activation == "tanh":
86 | output = theano.tensor.tanh(output)
87 | elif activation == "sigmoid":
88 | output = theano.tensor.nnet.sigmoid(output)
89 | return output
90 |
91 |
--------------------------------------------------------------------------------
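For reference, a plain-numpy transcription of the peephole LSTM step above (a sketch added for illustration, not part of the repo); it makes explicit how the four gates are packed into the columns of W_x / W_h / b:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step_np(x, h_prev, c_prev, W_x, W_h, b, W_ci, W_cf, W_co):
        # column blocks of the packed matrices: [0:n]=input gate, [n:2n]=forget gate,
        # [2n:3n]=cell candidate, [3n:4n]=output gate (same order as _slice above)
        n = h_prev.shape[-1]
        m = x.dot(W_x) + h_prev.dot(W_h) + b
        i = sigmoid(m[..., 0 * n:1 * n] + c_prev * W_ci)
        f = sigmoid(m[..., 1 * n:2 * n] + c_prev * W_cf)
        c = f * c_prev + i * np.tanh(m[..., 2 * n:3 * n])
        o = sigmoid(m[..., 3 * n:4 * n] + c * W_co)
        return o * np.tanh(c), c
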
/error_calculator.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | """
4 | Computes and prints the overall classification error as well as precision, recall and F-score for each punctuation mark.
5 | """
6 |
7 | from numpy import nan
8 | import punctuation_data_converter as data
9 | import codecs
10 | import sys
11 |
12 | MAPPING = {}#{"!EXCLAMATIONMARK": ".PERIOD", "?QUESTIONMARK": ".PERIOD", ":COLON": ".PERIOD", ";SEMICOLON": ".PERIOD"} # Can be used to estimate 2-class performance for example
13 |
14 | def compute_error(target_paths, predicted_paths):
15 | counter = 0
16 | total_correct = 0
17 |
18 | correct = 0.
19 | substitutions = 0.
20 | deletions = 0.
21 | insertions = 0.
22 |
23 | true_positives = {}
24 | false_positives = {}
25 | false_negatives = {}
26 |
27 | for target_path, predicted_path in zip(target_paths, predicted_paths):
28 |
29 | target_punctuation = " "
30 | predicted_punctuation = " "
31 |
32 | t_i = 0
33 | p_i = 0
34 |
35 | with codecs.open(target_path, 'r', 'utf-8') as target, codecs.open(predicted_path, 'r', 'utf-8') as predicted:
36 |
37 | target_stream = target.read().split()
38 | predicted_stream = predicted.read().split()
39 |
40 | while True:
41 |
42 | if data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) in data.PUNCTUATION_VOCABULARY:
43 | while data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) in data.PUNCTUATION_VOCABULARY: # skip multiple consecutive punctuations
44 | target_punctuation = data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i])
45 | target_punctuation = MAPPING.get(target_punctuation, target_punctuation)
46 | t_i += 1
47 | else:
48 | target_punctuation = " "
49 |
50 | if predicted_stream[p_i] in data.PUNCTUATION_VOCABULARY:
51 | predicted_punctuation = MAPPING.get(predicted_stream[p_i], predicted_stream[p_i])
52 | p_i += 1
53 | else:
54 | predicted_punctuation = " "
55 |
56 | is_correct = target_punctuation == predicted_punctuation
57 |
58 | counter += 1
59 | total_correct += is_correct
60 |
61 | if predicted_punctuation == " " and target_punctuation != " ":
62 | deletions += 1
63 | elif predicted_punctuation != " " and target_punctuation == " ":
64 | insertions += 1
65 | elif predicted_punctuation != " " and target_punctuation != " " and predicted_punctuation == target_punctuation:
66 | correct += 1
67 | elif predicted_punctuation != " " and target_punctuation != " " and predicted_punctuation != target_punctuation:
68 | substitutions += 1
69 |
70 | true_positives[target_punctuation] = true_positives.get(target_punctuation, 0.) + float(is_correct)
71 | false_positives[predicted_punctuation] = false_positives.get(predicted_punctuation, 0.) + float(not is_correct)
72 | false_negatives[target_punctuation] = false_negatives.get(target_punctuation, 0.) + float(not is_correct)
73 |
74 | assert target_stream[t_i] == predicted_stream[p_i] or predicted_stream[p_i] == "", \
75 | ("File: %s \n" + \
76 | "Error: %s (%s) != %s (%s) \n" + \
77 | "Target context: %s \n" + \
78 | "Predicted context: %s") % \
79 | (target_path,
80 | target_stream[t_i], t_i, predicted_stream[p_i], p_i,
81 | " ".join(target_stream[t_i-2:t_i+2]),
82 | " ".join(predicted_stream[p_i-2:p_i+2]))
83 |
84 | t_i += 1
85 | p_i += 1
86 |
87 | if t_i >= len(target_stream)-1 and p_i >= len(predicted_stream)-1:
88 | break
89 |
90 | overall_tp = 0.0
91 | overall_fp = 0.0
92 | overall_fn = 0.0
93 |
94 | print "-"*46
95 | print "{:<16} {:<9} {:<9} {:<9}".format('PUNCTUATION','PRECISION','RECALL','F-SCORE')
96 | for p in data.PUNCTUATION_VOCABULARY:
97 |
98 | if p == data.SPACE:
99 | continue
100 |
101 | overall_tp += true_positives.get(p,0.)
102 | overall_fp += false_positives.get(p,0.)
103 | overall_fn += false_negatives.get(p,0.)
104 |
105 | punctuation = p
106 | precision = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_positives[p])) if p in false_positives else nan
107 | recall = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_negatives[p])) if p in false_negatives else nan
108 | f_score = (2. * precision * recall / (precision + recall)) if (precision + recall) > 0 else nan
109 | print "{:<16} {:<9} {:<9} {:<9}".format(punctuation, round(precision,3)*100, round(recall,3)*100, round(f_score,3)*100)
110 | print "-"*46
111 | pre = overall_tp/(overall_tp+overall_fp) if overall_fp else nan
112 | rec = overall_tp/(overall_tp+overall_fn) if overall_fn else nan
113 | f1 = (2.*pre*rec)/(pre+rec) if (pre + rec) else nan
114 | print "{:<16} {:<9} {:<9} {:<9}".format("Overall", round(pre,3)*100, round(rec,3)*100, round(f1,3)*100)
115 | print "Err: %s%%" % round((100.0 - float(total_correct) / float(counter-1) * 100.0), 2)
116 | print "SER: %s%%" % round((substitutions + deletions + insertions) / (correct + substitutions + deletions) * 100, 1)
117 |
118 |
119 | if __name__ == "__main__":
120 |
121 | if len(sys.argv) > 1:
122 | target_path = sys.argv[1]
123 | else:
124 | sys.exit("Ground truth file path argument missing")
125 |
126 | if len(sys.argv) > 2:
127 | predicted_path = sys.argv[2]
128 | else:
129 | sys.exit("Model predictions file path argument missing")
130 |
131 | compute_error([target_path], [predicted_path])
132 |
133 |
134 |
--------------------------------------------------------------------------------
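Usage sketch for error_calculator.py (the paths below are placeholders; the first argument is the ground-truth file, the second the model predictions, e.g. the .orig/.pred pair handled by punctuator.py):

    python error_calculator.py ./data/test.orig ./data/test.pred
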
/README.md:
--------------------------------------------------------------------------------
1 | A fork of https://github.com/marekrei/sequence-labeler to enable punctuation restoration in unsegmented text.
2 |
3 | ## Performance on English TED talks
4 | (Training set size: 2.1M words)
5 |
6 | PUNCTUATION | PRECISION | RECALL | F-SCORE
7 | --- | --- | --- | ---
8 | ,COMMA | 58.5 | 58.7 | 58.6
9 | ?QUESTIONMARK | 71.4 | 54.3 | 61.7
10 | .PERIOD | 69.9 | 72.0 | 70.9
11 | _Overall_ | _64.3_ | _64.9_ | _64.6_
12 |
13 | Performance is very similar to (and even slightly better than) https://github.com/ottokart/punctuator2, although the results are not directly comparable: punctuator2 used pretrained embeddings that were trained on much less data and were much smaller. More details can be found [here](http://www.isca-speech.org/archive/Interspeech_2016/pdfs/1517.PDF).
14 |
15 | Original README:
16 | =========================
17 |
18 | Sequence labeler
19 | =========================
20 |
21 | This is a neural network sequence labeling system. Given a sequence of tokens, it learns to assign a label to each token. It can be used for named entity recognition, POS-tagging, error detection, chunking, CCG supertagging, etc.
22 |
23 | The main model implements a bidirectional LSTM for sequence tagging. In addition, you can incorporate character-level information -- either by concatenating a character-based representation, or by using an attention/gating mechanism for combining it with a word embedding.
24 |
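With the "attention" char_integration_method, a sigmoid gate learned from the word and character evidence decides, per dimension, how much of each representation to keep. A minimal numpy sketch of that gating (an illustration with hypothetical parameter names, mirroring the logic in sequence_labeler.py):

    import numpy as np

    def combine_with_attention(word_emb, char_repr, W1, b1, W2, b2):
        # tanh layer over the concatenated evidence, then a sigmoid gate z;
        # the model input becomes z * word_emb + (1 - z) * char_repr
        evidence = np.tanh(np.dot(np.concatenate([word_emb, char_repr]), W1) + b1)
        gate = 1.0 / (1.0 + np.exp(-(np.dot(evidence, W2) + b2)))
        return gate * word_emb + (1.0 - gate) * char_repr
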
25 | Run with:
26 |
27 | python sequence_labeling_experiment.py config.conf
28 |
29 | Preferably with Theano set up to use CUDA, so the process can run on a GPU.
30 |
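For example, with the older Theano GPU backend (newer Theano versions use device=cuda instead of device=gpu):

    THEANO_FLAGS='device=gpu,floatX=float32' python sequence_labeling_experiment.py config.conf
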
31 | Requirements
32 | -------------------------
33 |
34 | * numpy
35 | * theano
36 | * lasagne
37 |
38 | Configuration
39 | -------------------------
40 |
41 | Edit the values in config.conf as needed:
42 |
43 | * **path_train** - Path to the training data, in CoNLL tab-separated format. One word per line, first column is the word, last column is the label. Empty lines between sentences (see the example after this list).
44 | * **path_dev** - Path to the development data, used for choosing the best epoch.
45 | * **path_test** - Path to the test file. Can contain multiple files, colon separated.
46 | * **main_label** - The output label for which precision/recall/F-measure are calculated.
47 | * **conll_eval** - Whether the standard CoNLL NER evaluation should be run.
48 | * **preload_vectors** - Path to the pretrained word embeddings, in word2vec plain text format. If your embeddings are in binary, you can use [convertvec](https://github.com/marekrei/convertvec) to convert them to plain text.
49 | * **word_embedding_size** - Size of the word embeddings used in the model.
50 | * **char_embedding_size** - Size of the character embeddings.
51 | * **word_recurrent_size** - Size of the word-level LSTM hidden layers.
52 | * **char_recurrent_size** - Size of the char-level LSTM hidden layers.
53 | * **narrow_layer_size** - Size of the extra hidden layer on top of the bi-LSTM.
54 | * **best_model_selector** - What is measured on the dev set for model selection: "dev_conll_f:high" for NER and chunking, "dev_acc:high" for POS-tagging, "dev_f05:high" for error detection.
55 | * **epochs** - Maximum number of epochs to run.
56 | * **stop_if_no_improvement_for_epochs** - Training will be stopped if there has been no improvement for n epochs.
57 | * **learningrate** - Learning rate.
58 | * **min_word_freq** - Minimum frequency for a word to be included in the vocabulary. Less frequent words are treated as OOV.
59 | * **max_batch_size** - Maximum batch size.
60 | * **save** - Path to save the model.
61 | * **load** - Path to load the model.
62 | * **random_seed** - Random seed for initialisation and data shuffling. This can affect results, so for robust conclusions I recommend running multiple experiments with different seeds and averaging the metrics.
63 | * **crf_on_top** - If True, use a CRF as the output layer. If False, use softmax instead.
64 | * **char_integration_method** - How character information is integrated. Options are: "none" (not integrated), "input" (concatenated), "attention" (the method proposed in Rei et al. (2016)).
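
A minimal example of the training data layout referenced by **path_train** (as produced by punctuation_data_converter.py; word and label are tab-separated in the actual files, and each word is labelled with the punctuation that follows it):

    this        _SPACE
    is          _SPACE
    an          _SPACE
    example     .PERIOD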
65 |
66 |
67 | References
68 | -------------------------
69 |
70 | If you use the main sequence labeling code, please reference:
71 |
72 | [**Compositional Sequence Labeling Models for Error Detection in Learner Writing**](http://aclweb.org/anthology/P/P16/P16-1112.pdf)
73 | Marek Rei and Helen Yannakoudakis
74 | *In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL-2016)*
75 |
76 |
77 | If you use the character-level attention component, please reference:
78 |
79 | [**Attending to characters in neural sequence labeling models**](https://aclweb.org/anthology/C/C16/C16-1030.pdf)
80 | Marek Rei, Sampo Pyysalo and Gamal K.O. Crichton
81 | *In Proceedings of the 26th International Conference on Computational Linguistics (COLING-2016)*
82 |
83 |
84 | The CRF implementation is based on:
85 |
86 | [**Neural Architectures for Named Entity Recognition**](https://arxiv.org/abs/1603.01360)
87 | Guillaume Lample, Miguel Ballesteros, Sandeep Subramanian, Kazuya Kawakami and Chris Dyer
88 | *In Proceedings of NAACL-HLT 2016*
89 |
90 |
91 | The conlleval.py script is from: https://github.com/spyysalo/conlleval.py
92 |
93 |
94 | License
95 | ---------------------------
96 |
97 | MIT License
98 |
99 | Copyright (c) 2016 Marek Rei
100 |
101 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
102 |
103 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
104 |
105 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
106 |
--------------------------------------------------------------------------------
/punctuation_data_converter.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import division
3 |
4 | import random
5 | import os
6 | import sys
7 | import operator
8 | import cPickle
9 | import codecs
10 | import fnmatch
11 |
12 | DATA_PATH = "./data"
13 |
14 | END = ""
15 | UNK = ""
16 |
17 | SPACE = "_SPACE"
18 |
19 | MAX_WORD_VOCABULARY_SIZE = 100000
20 | MIN_WORD_COUNT_IN_VOCAB = 2
21 | MAX_SEQUENCE_LEN = 50
22 |
23 | TRAIN_FILE = os.path.join(DATA_PATH, "train")
24 | DEV_FILE = os.path.join(DATA_PATH, "dev")
25 | TEST_FILE = os.path.join(DATA_PATH, "test")
26 |
27 | PUNCTUATION_VOCABULARY = {SPACE, ",COMMA", ".PERIOD", "?QUESTIONMARK", "!EXCLAMATIONMARK", ":COLON", ";SEMICOLON", "-DASH"}
28 | PUNCTUATION_MAPPING = {}
29 |
30 | # Comma, period & question mark only:
31 | # PUNCTUATION_VOCABULARY = {SPACE, ",COMMA", ".PERIOD", "?QUESTIONMARK"}
32 | # PUNCTUATION_MAPPING = {"!EXCLAMATIONMARK": ".PERIOD", ":COLON": ",COMMA", ";SEMICOLON": ".PERIOD", "-DASH": ",COMMA"}
33 |
34 | EOS_TOKENS = {".PERIOD", "?QUESTIONMARK", "!EXCLAMATIONMARK"}
35 | CRAP_TOKENS = {"", ""} # punctuation tokens that are in neither the vocabulary nor the mapping must be added to CRAP_TOKENS
36 |
37 | def write_processed_dataset(input_files, output_file):
38 | """
39 | The data will consist of two aligned sets of subsequences (words and punctuation) of MAX_SEQUENCE_LEN tokens (the punctuation sequence is actually 1 element shorter).
40 | If a sentence is cut, it is carried over entirely to the next subsequence (the words before the cut belong to both subsequences).
41 | """
42 |
43 | current_words = []
44 | current_punctuations = []
45 |
46 | last_eos_idx = 0 # if it's still 0 when MAX_SEQUENCE_LEN is reached, then the sentence is too long and skipped.
47 | last_token_was_punctuation = True # skip the first token if it is punctuation
48 |
49 | skip_until_eos = False # if a sentence does not fit into a subsequence, then we need to skip tokens until we find a new sentence
50 |
51 | for input_file in input_files:
52 |
53 | with codecs.open(input_file, 'r', 'utf-8') as text, \
54 | codecs.open(output_file, 'w', 'utf-8') as text_out:
55 |
56 | for line in text:
57 |
58 | for token in line.split():
59 |
60 | # First map oov punctuations to known punctuations
61 | if token in PUNCTUATION_MAPPING:
62 | token = PUNCTUATION_MAPPING[token]
63 |
64 | if skip_until_eos:
65 |
66 | if token in EOS_TOKENS:
67 | skip_until_eos = False
68 |
69 | continue
70 |
71 | elif token in CRAP_TOKENS:
72 | continue
73 |
74 | elif token in PUNCTUATION_VOCABULARY:
75 |
76 | if last_token_was_punctuation: # if we encounter sequences like: "... !EXCLAMATIONMARK .PERIOD ...", then we only use the first punctuation and skip the ones that follow
77 | continue
78 |
79 | if token in EOS_TOKENS:
80 | last_eos_idx = len(current_punctuations) # no -1, because the token is not added yet
81 |
82 | punctuation = token
83 |
84 | current_punctuations.append(punctuation)
85 | last_token_was_punctuation = True
86 |
87 | else:
88 |
89 | if not last_token_was_punctuation:
90 | current_punctuations.append(SPACE)
91 |
92 | word = token
93 |
94 | current_words.append(word)
95 | last_token_was_punctuation = False
96 |
97 | if len(current_words) == MAX_SEQUENCE_LEN: # this also means that the last token was a word
98 |
99 | assert len(current_words) == len(current_punctuations) + 1, "#words: %d; #punctuations: %d" % (len(current_words), len(current_punctuations))
100 |
101 | # Sentence did not fit into subsequence - skip it
102 | if last_eos_idx == 0:
103 | skip_until_eos = True
104 |
105 | current_words = []
106 | current_punctuations = []
107 |
108 | last_token_was_punctuation = True # the next sequence starts with a new sentence, so it is preceded by an EOS token, which is punctuation
109 |
110 | else:
111 |
112 | for w, p in zip(current_words, current_punctuations):
113 | text_out.write('%s\t%s\n' % (w, p))
114 | text_out.write('\n')
115 |
116 | # Carry unfinished sentence to next subsequence
117 | current_words = current_words[last_eos_idx+1:]
118 | current_punctuations = current_punctuations[last_eos_idx+1:]
119 |
120 | last_eos_idx = 0 # sequence always starts with a new sentence
121 |
122 | def create_dev_test_train_split(root_path, train_output, dev_output, test_output):
123 |
124 | train_txt_files = []
125 | dev_txt_files = []
126 | test_txt_files = []
127 |
128 | for root, dirnames, filenames in os.walk(root_path):
129 | for filename in fnmatch.filter(filenames, '*.txt'):
130 |
131 | path = os.path.join(root, filename)
132 |
133 | if filename.endswith(".test.txt"):
134 | test_txt_files.append(path)
135 |
136 | elif filename.endswith(".dev.txt"):
137 | dev_txt_files.append(path)
138 |
139 | else:
140 | train_txt_files.append(path)
141 |
142 | write_processed_dataset(train_txt_files, train_output)
143 | write_processed_dataset(dev_txt_files, dev_output)
144 | write_processed_dataset(test_txt_files, test_output)
145 |
146 | if __name__ == "__main__":
147 |
148 | if len(sys.argv) > 1:
149 | path = sys.argv[1]
150 | else:
151 | sys.exit("The path to source data directory with txt files is missing")
152 |
153 | if not os.path.exists(DATA_PATH):
154 | os.makedirs(DATA_PATH)
155 | else:
156 | sys.exit("Data already exists")
157 |
158 | create_dev_test_train_split(path, TRAIN_FILE, DEV_FILE, TEST_FILE)
159 |
160 |
--------------------------------------------------------------------------------
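Usage sketch for punctuation_data_converter.py (the argument is a placeholder path to a directory tree of *.txt files; files ending in .test.txt or .dev.txt go to the test and dev splits, everything else to train, and the converted output is written to ./data/train, ./data/dev and ./data/test):

    python punctuation_data_converter.py /path/to/source_txt_dir
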
/sequence_labeler.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import theano
3 | import numpy
4 | import collections
5 | import cPickle
6 | import lasagne
7 |
8 | import crf
9 | import recurrence
10 |
11 | sys.setrecursionlimit(50000)
12 | floatX=theano.config.floatX
13 |
14 | class SequenceLabeler(object):
15 | def __init__(self, config):
16 | self.config = config
17 | self.params = collections.OrderedDict()
18 | self.rng = numpy.random.RandomState(config["random_seed"])
19 |
20 | word_ids = theano.tensor.imatrix('word_ids')
21 | char_ids = theano.tensor.itensor3('char_ids')
22 | char_mask = theano.tensor.ftensor3('char_mask')
23 | label_ids = theano.tensor.imatrix('label_ids')
24 | learningrate = theano.tensor.fscalar('learningrate')
25 |
26 | cost = 0.0
27 | input_tensor = None
28 | input_vector_size = 0
29 |
30 | self.word_embeddings = self.create_parameter_matrix('word_embeddings', (config["n_words"], config["word_embedding_size"]))
31 | input_tensor = self.word_embeddings[word_ids]
32 | input_vector_size = config["word_embedding_size"]
33 |
34 | char_embeddings = self.create_parameter_matrix('char_embeddings', (config["n_chars"], config["char_embedding_size"]))
35 | char_input_tensor = char_embeddings[char_ids].reshape((char_ids.shape[0]*char_ids.shape[1],char_ids.shape[2],config["char_embedding_size"]))
36 | char_mask_reshaped = char_mask.reshape((char_ids.shape[0]*char_ids.shape[1],char_ids.shape[2]))
37 |
38 | char_output_tensor = recurrence.create_birnn(char_input_tensor, config["char_embedding_size"], char_mask_reshaped, config["char_recurrent_size"], return_combined=True, fn_create_parameter_matrix=self.create_parameter_matrix, name="char_birnn")
39 | char_output_tensor = recurrence.create_feedforward(char_output_tensor, config["char_recurrent_size"]*2, config["word_embedding_size"], "tanh", fn_create_parameter_matrix=self.create_parameter_matrix, name="char_ff")
40 | char_output_tensor = char_output_tensor.reshape((char_ids.shape[0],char_ids.shape[1],config["word_embedding_size"]))
41 |
42 | if config["char_integration_method"] == "input":
43 | input_tensor = theano.tensor.concatenate([input_tensor, char_output_tensor], axis=2)
44 | input_vector_size += config["word_embedding_size"]
45 |
46 | elif config["char_integration_method"] == "attention":
47 | static_input_tensor = theano.gradient.disconnected_grad(input_tensor)
48 | is_unk = theano.tensor.eq(word_ids, config["unk_token_id"])
49 | is_unk_tensor = is_unk.dimshuffle(0,1,'x')
50 | char_output_tensor_normalised = char_output_tensor / char_output_tensor.norm(2, axis=2)[:, :, numpy.newaxis]
51 | static_input_tensor_normalised = static_input_tensor / static_input_tensor.norm(2, axis=2)[:, :, numpy.newaxis]
52 | cosine_cost = 1.0 - (char_output_tensor_normalised * static_input_tensor_normalised).sum(axis=2)
53 | cost += theano.tensor.switch(is_unk, 0.0, cosine_cost).sum()
54 | attention_evidence_tensor = theano.tensor.concatenate([input_tensor, char_output_tensor], axis=2)
55 | attention_output = recurrence.create_feedforward(attention_evidence_tensor, config["word_embedding_size"]*2, config["word_embedding_size"], "tanh", self.create_parameter_matrix, "attention_tanh")
56 | attention_output = recurrence.create_feedforward(attention_output, config["word_embedding_size"], config["word_embedding_size"], "sigmoid", self.create_parameter_matrix, "attention_sigmoid")
57 | input_tensor = input_tensor * attention_output + char_output_tensor * (1.0 - attention_output)
58 |
59 | processed_tensor = recurrence.create_birnn(input_tensor, input_vector_size, None, config["word_recurrent_size"], return_combined=False, fn_create_parameter_matrix=self.create_parameter_matrix, name="word_birnn")
60 | processed_tensor = recurrence.create_feedforward(processed_tensor, config["word_recurrent_size"]*2, config["narrow_layer_size"], "tanh", fn_create_parameter_matrix=self.create_parameter_matrix, name="narrow_ff")
61 |
62 | W_output = self.create_parameter_matrix('W_output', (config["narrow_layer_size"], config["n_labels"]))
63 | bias_output = self.create_parameter_matrix('bias_output', (config["n_labels"],))
64 | output = theano.tensor.dot(processed_tensor, W_output) + bias_output
65 | output = output[:,1:-1,:] # remove the positions corresponding to the sentence start and end tokens
66 |
67 | if config["crf_on_top"] == True:
68 | all_paths_scores, real_paths_scores, best_sequence, scores = crf.construct("crf", output, config["n_labels"], label_ids, self.create_parameter_matrix)
69 | predicted_labels = best_sequence
70 | output_probs = scores
71 | cost += - (real_paths_scores - all_paths_scores).sum()
72 | else:
73 | output_probs = theano.tensor.nnet.softmax(output.reshape((word_ids.shape[0]*(word_ids.shape[1]-2), config["n_labels"])))
74 | predicted_labels = theano.tensor.argmax(output_probs.reshape((word_ids.shape[0], (word_ids.shape[1]-2), config["n_labels"])), axis=2)
75 | cost += theano.tensor.nnet.categorical_crossentropy(output_probs, label_ids.reshape((-1,))).sum()
76 |
77 | gradients = theano.tensor.grad(cost, self.params.values(), disconnected_inputs='ignore')
78 | updates = lasagne.updates.adadelta(gradients, self.params.values(), learningrate)
79 |
80 | input_vars_train = [word_ids, char_ids, char_mask, label_ids, learningrate]
81 | input_vars_test = [word_ids, char_ids, char_mask, label_ids]
82 | output_vars = [cost, predicted_labels]
83 | self.train = theano.function(input_vars_train, output_vars, updates=updates, on_unused_input='ignore', allow_input_downcast = True)
84 | self.test = theano.function(input_vars_test, output_vars, on_unused_input='ignore', allow_input_downcast = True)
85 | self.predict = theano.function([word_ids, char_ids, char_mask], predicted_labels, on_unused_input='ignore', allow_input_downcast = True)
86 |
87 | def create_parameter_matrix(self, name, size):
88 | param_vals = numpy.asarray(self.rng.normal(loc=0.0, scale=0.1, size=size), dtype=floatX)
89 | param_shared = theano.shared(param_vals, name)
90 | self.params[name] = param_shared
91 | return param_shared
92 |
93 |
94 | def get_parameter_count(self):
95 | total = 0
96 | for key, val in self.params.iteritems():
97 | total += val.get_value().size
98 | return total
99 |
100 | def get_parameter_count_without_word_embeddings(self):
101 | total = 0
102 | for key, val in self.params.iteritems():
103 | if val == self.word_embeddings:
104 | continue
105 | total += val.get_value().size
106 | return total
107 |
108 | def save(self, filename):
109 | dump = {}
110 | dump["config"] = self.config
111 | dump["params"] = {}
112 | for param_name in self.params:
113 | dump["params"][param_name] = self.params[param_name].get_value()
114 | f = file(filename, 'wb')
115 | cPickle.dump(dump, f, protocol=cPickle.HIGHEST_PROTOCOL)
116 | f.close()
117 |
118 | @staticmethod
119 | def load(filename, new_output_layer_size=None):
120 | f = file(filename, 'rb')
121 | dump = cPickle.load(f)
122 | f.close()
123 | if new_output_layer_size is not None:
124 | dump["config"]["n_labels"] = new_output_layer_size
125 | sequencelabeler = SequenceLabeler(dump["config"])
126 | for param_name in sequencelabeler.params:
127 | assert(param_name in dump["params"])
128 | if new_output_layer_size is not None and param_name in ["W_output", "bias_output"]:
129 | continue
130 | sequencelabeler.params[param_name].set_value(dump["params"][param_name])
131 | return sequencelabeler
132 |
--------------------------------------------------------------------------------
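A minimal sketch of reusing a saved model outside the experiment script (this mirrors what punctuator.py does; "punctuator.model" is the save value from config.conf):

    from sequence_labeler import SequenceLabeler

    model = SequenceLabeler.load("punctuator.model")
    # model.config holds word2id / char2id / label2id from training.
    # model.predict(word_ids, char_ids, char_mask) takes int32 word ids of shape
    # (batch, seq_len), plus char ids and a char mask of shape (batch, seq_len, max_word_len),
    # and returns predicted label ids for the positions between the sentence padding tokens.
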
/conlleval.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Python version of the evaluation script from CoNLL'00-
4 | # Originates from: https://github.com/spyysalo/conlleval.py
5 |
6 |
7 | # Intentional differences:
8 | # - accept any space as delimiter by default
9 | # - optional file argument (default STDIN)
10 | # - option to set boundary (-b argument)
11 | # - LaTeX output (-l argument) not supported
12 | # - raw tags (-r argument) not supported
13 |
14 | import sys
15 | import re
16 |
17 | from collections import defaultdict, namedtuple
18 |
19 | ANY_SPACE = ''
20 |
21 | class FormatError(Exception):
22 | pass
23 |
24 | Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')
25 |
26 | class EvalCounts(object):
27 | def __init__(self):
28 | self.correct_chunk = 0 # number of correctly identified chunks
29 | self.correct_tags = 0 # number of correct chunk tags
30 | self.found_correct = 0 # number of chunks in corpus
31 | self.found_guessed = 0 # number of identified chunks
32 | self.token_counter = 0 # token counter (ignores sentence breaks)
33 |
34 | # counts by type
35 | self.t_correct_chunk = defaultdict(int)
36 | self.t_found_correct = defaultdict(int)
37 | self.t_found_guessed = defaultdict(int)
38 |
39 | def parse_args(argv):
40 | import argparse
41 | parser = argparse.ArgumentParser(
42 | description='evaluate tagging results using CoNLL criteria',
43 | formatter_class=argparse.ArgumentDefaultsHelpFormatter
44 | )
45 | arg = parser.add_argument
46 | arg('-b', '--boundary', metavar='STR', default='-X-',
47 | help='sentence boundary')
48 | arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
49 | help='character delimiting items in input')
50 | arg('-o', '--otag', metavar='CHAR', default='O',
51 | help='alternative outside tag')
52 | arg('file', nargs='?', default=None)
53 | return parser.parse_args(argv)
54 |
55 | def parse_tag(t):
56 | m = re.match(r'^([^-]*)-(.*)$', t)
57 | return m.groups() if m else (t, '')
58 |
59 | def evaluate(iterable, options=None):
60 | if options is None:
61 | options = parse_args([]) # use defaults
62 |
63 | counts = EvalCounts()
64 | num_features = None # number of features per line
65 | in_correct = False # currently processed chunk is correct so far
66 | last_correct = 'O' # previous chunk tag in corpus
67 | last_correct_type = '' # type of previous chunk tag in corpus
68 | last_guessed = 'O' # previously identified chunk tag
69 | last_guessed_type = '' # type of previously identified chunk tag
70 |
71 | for line in iterable:
72 | line = line.rstrip('\r\n')
73 |
74 | if options.delimiter == ANY_SPACE:
75 | features = line.split()
76 | else:
77 | features = line.split(options.delimiter)
78 |
79 | if num_features is None:
80 | num_features = len(features)
81 | elif num_features != len(features) and len(features) != 0:
82 | raise FormatError('unexpected number of features: %d (%d)' %
83 | (len(features), num_features))
84 |
85 | if len(features) == 0 or features[0] == options.boundary:
86 | features = [options.boundary, 'O', 'O']
87 | if len(features) < 3:
88 | raise FormatError('unexpected number of features in line %s' % line)
89 |
90 | guessed, guessed_type = parse_tag(features.pop())
91 | correct, correct_type = parse_tag(features.pop())
92 | first_item = features.pop(0)
93 |
94 | if first_item == options.boundary:
95 | guessed = 'O'
96 |
97 | end_correct = end_of_chunk(last_correct, correct,
98 | last_correct_type, correct_type)
99 | end_guessed = end_of_chunk(last_guessed, guessed,
100 | last_guessed_type, guessed_type)
101 | start_correct = start_of_chunk(last_correct, correct,
102 | last_correct_type, correct_type)
103 | start_guessed = start_of_chunk(last_guessed, guessed,
104 | last_guessed_type, guessed_type)
105 |
106 | if in_correct:
107 | if (end_correct and end_guessed and
108 | last_guessed_type == last_correct_type):
109 | in_correct = False
110 | counts.correct_chunk += 1
111 | counts.t_correct_chunk[last_correct_type] += 1
112 | elif (end_correct != end_guessed or guessed_type != correct_type):
113 | in_correct = False
114 |
115 | if start_correct and start_guessed and guessed_type == correct_type:
116 | in_correct = True
117 |
118 | if start_correct:
119 | counts.found_correct += 1
120 | counts.t_found_correct[correct_type] += 1
121 | if start_guessed:
122 | counts.found_guessed += 1
123 | counts.t_found_guessed[guessed_type] += 1
124 | if first_item != options.boundary:
125 | if correct == guessed and guessed_type == correct_type:
126 | counts.correct_tags += 1
127 | counts.token_counter += 1
128 |
129 | last_guessed = guessed
130 | last_correct = correct
131 | last_guessed_type = guessed_type
132 | last_correct_type = correct_type
133 |
134 | if in_correct:
135 | counts.correct_chunk += 1
136 | counts.t_correct_chunk[last_correct_type] += 1
137 |
138 | return counts
139 |
140 | def uniq(iterable):
141 | seen = set()
142 | return [i for i in iterable if not (i in seen or seen.add(i))]
143 |
144 | def calculate_metrics(correct, guessed, total):
145 | tp, fp, fn = correct, guessed-correct, total-correct
146 | p = 0 if tp + fp == 0 else 1.*tp / (tp + fp)
147 | r = 0 if tp + fn == 0 else 1.*tp / (tp + fn)
148 | f = 0 if p + r == 0 else 2 * p * r / (p + r)
149 | return Metrics(tp, fp, fn, p, r, f)
150 |
151 | def metrics(counts):
152 | c = counts
153 | overall = calculate_metrics(
154 | c.correct_chunk, c.found_guessed, c.found_correct
155 | )
156 | by_type = {}
157 | for t in uniq(c.t_found_correct.keys() + c.t_found_guessed.keys()):
158 | by_type[t] = calculate_metrics(
159 | c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t]
160 | )
161 | return overall, by_type
162 |
163 | def report(counts, out=None):
164 | if out is None:
165 | out = sys.stdout
166 |
167 | overall, by_type = metrics(counts)
168 |
169 | c = counts
170 | out.write('processed %d tokens with %d phrases; ' %
171 | (c.token_counter, c.found_correct))
172 | out.write('found: %d phrases; correct: %d.\n' %
173 | (c.found_guessed, c.correct_chunk))
174 |
175 | if c.token_counter > 0:
176 | out.write('accuracy: %6.2f%%; ' %
177 | (100.*c.correct_tags/c.token_counter))
178 | out.write('precision: %6.2f%%; ' % (100.*overall.prec))
179 | out.write('recall: %6.2f%%; ' % (100.*overall.rec))
180 | out.write('FB1: %6.2f\n' % (100.*overall.fscore))
181 |
182 | for i, m in sorted(by_type.items()):
183 | out.write('%17s: ' % i)
184 | out.write('precision: %6.2f%%; ' % (100.*m.prec))
185 | out.write('recall: %6.2f%%; ' % (100.*m.rec))
186 | out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i]))
187 |
188 | def end_of_chunk(prev_tag, tag, prev_type, type_):
189 | # check if a chunk ended between the previous and current word
190 | # arguments: previous and current chunk tags, previous and current types
191 | chunk_end = False
192 |
193 | if prev_tag == 'E': chunk_end = True
194 | if prev_tag == 'S': chunk_end = True
195 |
196 | if prev_tag == 'B' and tag == 'B': chunk_end = True
197 | if prev_tag == 'B' and tag == 'S': chunk_end = True
198 | if prev_tag == 'B' and tag == 'O': chunk_end = True
199 | if prev_tag == 'I' and tag == 'B': chunk_end = True
200 | if prev_tag == 'I' and tag == 'S': chunk_end = True
201 | if prev_tag == 'I' and tag == 'O': chunk_end = True
202 |
203 | if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
204 | chunk_end = True
205 |
206 | # these chunks are assumed to have length 1
207 | if prev_tag == ']': chunk_end = True
208 | if prev_tag == '[': chunk_end = True
209 |
210 | return chunk_end
211 |
212 | def start_of_chunk(prev_tag, tag, prev_type, type_):
213 | # check if a chunk started between the previous and current word
214 | # arguments: previous and current chunk tags, previous and current types
215 | chunk_start = False
216 |
217 | if tag == 'B': chunk_start = True
218 | if tag == 'S': chunk_start = True
219 |
220 | if prev_tag == 'E' and tag == 'E': chunk_start = True
221 | if prev_tag == 'E' and tag == 'I': chunk_start = True
222 | if prev_tag == 'S' and tag == 'E': chunk_start = True
223 | if prev_tag == 'S' and tag == 'I': chunk_start = True
224 | if prev_tag == 'O' and tag == 'E': chunk_start = True
225 | if prev_tag == 'O' and tag == 'I': chunk_start = True
226 |
227 | if tag != 'O' and tag != '.' and prev_type != type_:
228 | chunk_start = True
229 |
230 | # these chunks are assumed to have length 1
231 | if tag == '[': chunk_start = True
232 | if tag == ']': chunk_start = True
233 |
234 | return chunk_start
235 |
236 | def main(argv):
237 | args = parse_args(argv[1:])
238 |
239 | if args.file is None:
240 | counts = evaluate(sys.stdin, args)
241 | else:
242 | with open(args.file) as f:
243 | counts = evaluate(f, args)
244 | report(counts)
245 |
246 | if __name__ == '__main__':
247 | sys.exit(main(sys.argv))
248 |
--------------------------------------------------------------------------------
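Usage sketch for conlleval.py: it expects CoNLL-style lines whose last two columns are the gold and predicted tags (the format built up in sequence_labeling_evaluator.py), read from an optional file argument or from STDIN; the file name below is a placeholder:

    python conlleval.py predictions.conll
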
/sequence_labeling_experiment.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import collections
3 | import numpy
4 | import random
5 | import math
6 | import gc
7 | import os
8 | import re
9 | import ConfigParser
10 | import theano
11 |
12 | from sequence_labeler import SequenceLabeler
13 | from sequence_labeling_evaluator import SequenceLabelingEvaluator
14 |
15 | floatX=theano.config.floatX
16 |
17 | def read_input_files(file_paths):
18 | sentences = []
19 | for file_path in file_paths.strip().split(","):
20 | with open(file_path, "r") as f:
21 | words, labels = [], []
22 | for line in f:
23 | if len(line.strip()) > 0:
24 | line_parts = line.strip().split()
25 | assert(len(line_parts) >= 2)
26 | words.append(line_parts[0])
27 | labels.append(line_parts[-1])
28 | elif len(line.strip()) == 0 and len(words) > 0:
29 | sentences.append((words, labels))
30 | words, labels = [], []
31 | if len(words) > 0:
32 | raise ValueError("The format expects an empty line at the end of the file in: " + file_path)
33 | return sentences
34 |
35 |
36 | def read_dataset(file_paths, lowercase_words, lowercase_chars, replace_digits, word2id, char2id, label2id):
37 | dataset = []
38 | sentences = read_input_files(file_paths)
39 |
40 | for i in range(len(sentences)):
41 | word_ids = map_text_to_ids(" ".join(sentences[i][0]), word2id, "", "", "", lowercase=lowercase_words, replace_digits=replace_digits)
42 | char_ids = [map_text_to_ids("", char2id, "", "", "")] + \
43 | [map_text_to_ids(" ".join(list(word)), char2id, "", "", "", lowercase=lowercase_chars, replace_digits=replace_digits) for word in sentences[i][0]] + \
44 | [map_text_to_ids("", char2id, "", "", "")]
45 | label_ids = map_text_to_ids(" ".join(sentences[i][1]), label2id)
46 |
47 | assert(len(char_ids) == len(word_ids))
48 | assert(len(char_ids) == len(label_ids) + 2)
49 |
50 | dataset.append((word_ids, char_ids, label_ids))
51 | return dataset
52 |
53 |
54 |
55 | def create_batches(dataset, max_batch_size):
56 | """
57 | Sort sentences by length and organise them into batches
58 | """
59 | sentence_ids_by_length = collections.OrderedDict()
60 | for i in range(len(dataset)):
61 | length = len(dataset[i][0])
62 | if length not in sentence_ids_by_length:
63 | sentence_ids_by_length[length] = []
64 | sentence_ids_by_length[length].append(i)
65 |
66 | batches = []
67 | for sentence_length in sentence_ids_by_length:
68 | for i in range(0, len(sentence_ids_by_length[sentence_length]), max_batch_size):
69 | sentence_ids_in_batch = sentence_ids_by_length[sentence_length][i:i + max_batch_size]
70 | max_word_length = numpy.array([[len(char_ids) for char_ids in dataset[sentence_id][1]] for sentence_id in sentence_ids_in_batch]).max()
71 |
72 | word_ids = numpy.zeros((len(sentence_ids_in_batch), sentence_length), dtype=numpy.int32)
73 | char_ids = numpy.zeros((len(sentence_ids_in_batch), sentence_length, max_word_length), dtype=numpy.int32)
74 | char_mask = numpy.zeros((len(sentence_ids_in_batch), sentence_length, max_word_length), dtype=numpy.int32)
75 | label_ids = numpy.zeros((len(sentence_ids_in_batch), sentence_length-2), dtype=numpy.int32)
76 |
77 | for i in range(len(sentence_ids_in_batch)):
78 | for j in range(sentence_length):
79 | word_ids[i][j] = dataset[sentence_ids_in_batch[i]][0][j]
80 | for j in range(sentence_length):
81 | for k in range(len(dataset[sentence_ids_in_batch[i]][1][j])):
82 | char_ids[i][j][k] = dataset[sentence_ids_in_batch[i]][1][j][k]
83 | char_mask[i][j][k] = 1
84 | for j in range(sentence_length-2):
85 | label_ids[i][j] = dataset[sentence_ids_in_batch[i]][2][j]
86 | batches.append((word_ids, char_ids, char_mask, label_ids, sentence_ids_in_batch))
87 | return batches
88 |
89 |
90 | def process_batches(sequencelabeler, batches, testing, learningrate, name, main_label_id, label2id=None, conll_eval=False, verbose=True):
91 | evaluator = SequenceLabelingEvaluator(main_label_id, label2id, conll_eval)
92 | for word_ids, char_ids, char_mask, label_ids, sentence_ids_in_batch in batches:
93 | if testing == True:
94 | cost, predicted_labels = sequencelabeler.test(word_ids, char_ids, char_mask, label_ids)
95 | else:
96 | cost, predicted_labels = sequencelabeler.train(word_ids, char_ids, char_mask, label_ids, learningrate)
97 | evaluator.append_data(cost, predicted_labels, word_ids, label_ids)
98 |
99 | results = evaluator.get_results(name)
100 | if verbose == True:
101 | for key in results:
102 | print key + ": " + str(results[key])
103 | return results[name + "_cost_sum"], results
104 |
105 |
106 |
107 | def is_float(value):
108 | try:
109 | float(value)
110 | return True
111 | except ValueError:
112 | return False
113 |
114 | def parse_config(config_section, config_path):
115 | config_parser = ConfigParser.SafeConfigParser(allow_no_value=True)
116 | config_parser.read(config_path)
117 | config = collections.OrderedDict()
118 | for key, value in config_parser.items(config_section):
119 | if value is None or len(value.strip()) == 0:
120 | config[key] = None
121 | elif value.lower() in ["true", "false"]:
122 | config[key] = config_parser.getboolean(config_section, key)
123 | elif value.isdigit():
124 | config[key] = config_parser.getint(config_section, key)
125 | elif is_float(value):
126 | config[key] = config_parser.getfloat(config_section, key)
127 | else:
128 | config[key] = config_parser.get(config_section, key)
129 | return config
130 |
131 |
132 | def generate_word2id_dictionary(texts, min_freq=-1, insert_words=None, lowercase=False, replace_digits=False):
133 | counter = collections.Counter()
134 | for text in texts:
135 | if lowercase:
136 | text = text.lower()
137 | if replace_digits:
138 | text = re.sub(r'\d', '0', text)
139 | counter.update(text.strip().split())
140 |
141 | word2id = collections.OrderedDict()
142 | if insert_words is not None:
143 | for word in insert_words:
144 | word2id[word] = len(word2id)
145 |
146 | word_count_list = counter.most_common()
147 |
148 | for (word, count) in word_count_list:
149 | if min_freq <= 0 or count >= min_freq:
150 | word2id[word] = len(word2id)
151 |
152 | return word2id
153 |
154 |
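# Maps a text to a list of token ids, optionally wrapping it in start/end tokens;
# tokens missing from word2id fall back to unk_token if one is given and are
# silently skipped otherwise.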
155 | def map_text_to_ids(text, word2id, start_token=None, end_token=None, unk_token=None, lowercase=False, replace_digits=False):
156 | ids = []
157 |
158 | if lowercase:
159 | text = text.lower()
160 | if replace_digits:
161 | text = re.sub(r'\d', '0', text)
162 |
163 | if start_token is not None:
164 | text = start_token + " " + text
165 | if end_token is not None:
166 | text = text + " " + end_token
167 | for word in text.strip().split():
168 | if word in word2id:
169 | ids.append(word2id[word])
170 | elif unk_token is not None:
171 | ids.append(word2id[unk_token])
172 | return ids
173 |
174 |
175 |
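# Creates an embedding matrix initialised with small random values, then overwrites
# the rows of all words found in the word2vec-style text file; rows for words that
# do not appear in the file keep their random initialisation.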
176 | def preload_vectors(word2id, vector_size, word2vec_path):
177 | rng = numpy.random.RandomState(123)
178 | preloaded_vectors = numpy.asarray(rng.normal(loc=0.0, scale=0.1, size=(len(word2id), vector_size)), dtype=floatX)
179 |
180 | with open(word2vec_path) as f:
181 | for line in f:
182 | line_parts = line.strip().split()
183 | if len(line_parts) <= 2:
184 | continue
185 | word = line_parts[0]
186 | if word in word2id:
187 | word_id = word2id[word]
188 | vector = numpy.array(line_parts[1:], dtype=floatX)
189 | preloaded_vectors[word_id] = vector
190 | return preloaded_vectors
191 |
192 |
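# End-to-end experiment driver: builds (or loads) the vocabularies and the
# SequenceLabeler, optionally initialises the word embeddings from pretrained
# vectors, trains with early stopping while checkpointing the best model according
# to best_model_selector, and finally evaluates on the test file(s).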
193 | def run_experiment(config_path):
194 | config = parse_config("config", config_path)
195 | random.seed(config["random_seed"] + 1)
196 | temp_model_path = config_path + ".model"
197 | sequencelabeler = None
198 |
199 | # Preparing dictionaries
200 | if config["path_train"] is not None and len(config["path_train"]) > 0:
201 | sentences_train = read_input_files(config["path_train"])
202 | word2id = generate_word2id_dictionary([" ".join(sentence[0]) for sentence in sentences_train],
203 | min_freq=config["min_word_freq"],
204 | insert_words=["<unk>", "<s>", "</s>"],
205 | lowercase=False,
206 | replace_digits=True)
207 | label2id = generate_word2id_dictionary([" ".join(sentence[1]) for sentence in sentences_train])
208 | char2id = generate_word2id_dictionary([" ".join([" ".join(list(word)) for word in sentence[0]]) for sentence in sentences_train],
209 | min_freq=-1,
210 | insert_words=["<cunk>", "<s>", "</s>", "<w>", "</w>"],
211 | lowercase=False,
212 | replace_digits=True)
213 |
214 | if config["load"] is not None and len(config["load"]) > 0:
215 | if config["rebuild_output_layer"]:
216 | sequencelabeler = SequenceLabeler.load(config["load"], new_output_layer_size=len(label2id))
217 | # label2id = label2id
218 | else:
219 | sequencelabeler = SequenceLabeler.load(config["load"])
220 | label2id = sequencelabeler.config["label2id"]
221 | word2id = sequencelabeler.config["word2id"]
222 | char2id = sequencelabeler.config["char2id"]
223 |
224 | if config["path_train"] is not None and len(config["path_train"]) > 0:
225 | data_train = read_dataset(config["path_train"], False, False, True, word2id, char2id, label2id)
226 |
227 | if config["load"] is None or len(config["load"]) == 0:
228 | config["n_words"] = len(word2id)
229 | config["n_chars"] = len(char2id)
230 | config["n_labels"] = len(label2id)
231 | config["unk_token"] = "<unk>"
232 | config["unk_token_id"] = word2id["<unk>"]
233 | sequencelabeler = SequenceLabeler(config)
234 | if config['preload_vectors'] is not None:
235 | new_embeddings = preload_vectors(word2id, config['word_embedding_size'], config['preload_vectors'])
236 | sequencelabeler.word_embeddings.set_value(new_embeddings)
237 |
238 | if config["path_dev"] is not None and len(config["path_dev"]) > 0:
239 | data_dev = read_dataset(config["path_dev"], False, False, True, word2id, char2id, label2id)
240 | batches_dev = create_batches(data_dev, config['max_batch_size'])
241 |
242 | # printing config
243 | for key, val in config.items():
244 | print(str(key) + ": " + str(val))
245 | print("parameter_count: " + str(sequencelabeler.get_parameter_count()))
246 | print("parameter_count_without_word_embeddings: " + str(sequencelabeler.get_parameter_count_without_word_embeddings()))
247 |
248 | config["word2id"] = word2id
249 | config["char2id"] = char2id
250 | config["label2id"] = label2id
251 |
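# Training loop: the training data is reshuffled and re-batched every epoch, the
# model is checkpointed to temp_model_path whenever the dev metric named in
# best_model_selector improves, and training stops early after
# stop_if_no_improvement_for_epochs epochs without improvement.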
252 | if config["path_train"] is not None and len(config["path_train"]) > 0:
253 | best_selector_value = 0.0
254 | learningrate = config["learningrate"]
255 | for epoch in xrange(config["epochs"]):
256 | print("EPOCH: " + str(epoch))
257 | print("learningrate: " + str(learningrate))
258 | random.shuffle(data_train)
259 | batches_train = create_batches(data_train, config['max_batch_size'])
260 | random.shuffle(batches_train)
261 |
262 | train_cost_sum, results_train = process_batches(sequencelabeler, batches_train, testing=False, learningrate=learningrate, name="train", main_label_id=label2id[str(config["main_label"])], label2id=label2id, conll_eval=config["conll_eval"], verbose=True)
263 | dev_cost_sum, results_dev = process_batches(sequencelabeler, batches_dev, testing=True, learningrate=0.0, name="dev", main_label_id=label2id[str(config["main_label"])], label2id=label2id, conll_eval=config["conll_eval"], verbose=True)
264 |
265 | if math.isnan(dev_cost_sum) or math.isinf(dev_cost_sum):
266 | sys.stderr.write("ERROR: Cost is NaN or Inf. Exiting.\n")
267 | break
268 |
269 | if (epoch == 0 or (config["best_model_selector"].split(":")[1] == "high" and results_dev[config["best_model_selector"].split(":")[0]] > best_selector_value)
270 | or (config["best_model_selector"].split(":")[1] == "low" and results_dev[config["best_model_selector"].split(":")[0]] < best_selector_value)):
271 | best_epoch = epoch
272 | best_selector_value = results_dev[config["best_model_selector"].split(":")[0]]
273 | sequencelabeler.save(temp_model_path)
274 | print("best_epoch: " + str(best_epoch))
275 |
276 | batches_train = None
277 | gc.collect()
278 |
279 | if config["stop_if_no_improvement_for_epochs"] > 0 and (epoch - best_epoch) >= config["stop_if_no_improvement_for_epochs"]:
280 | break
281 |
282 | # loading the best model so far
283 | if config["epochs"] > 0:
284 | sequencelabeler = SequenceLabeler.load(temp_model_path)
285 | os.remove(temp_model_path)
286 |
287 | if config["save"] is not None and len(config["save"]) > 0:
288 | sequencelabeler.save(config["save"])
289 |
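# path_test may list several test files separated by ":"; each file is read,
# batched and evaluated separately using the best model selected above.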
290 | if config["path_test"] is not None:
291 | i = 0
292 | for path_test in config["path_test"].strip().split(":"):
293 | data_test = read_dataset(path_test, False, False, True, word2id, char2id, label2id)
294 | batches_test = create_batches(data_test, config['max_batch_size'])
295 | test_cost_sum, results_test = process_batches(sequencelabeler, batches_test, testing=True, learningrate=0.0, name="test" + (str(i) if len(config["path_test"].strip().split(":")) > 1 else ""), main_label_id=label2id[str(config["main_label"])], label2id=label2id, conll_eval=config["conll_eval"], verbose=True)
296 | i += 1
297 |
298 |
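# Entry point: expects the path of the configuration file (e.g. config.conf)
# as the only command-line argument.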
299 | if __name__ == "__main__":
300 | run_experiment(sys.argv[1])
301 |
--------------------------------------------------------------------------------