├── config.conf ├── sequence_labeling_evaluator.py ├── crf.py ├── punctuator.py ├── recurrence.py ├── error_calculator.py ├── README.md ├── punctuation_data_converter.py ├── sequence_labeler.py ├── conlleval.py └── sequence_labeling_experiment.py /config.conf: -------------------------------------------------------------------------------- 1 | [config] 2 | path_train = ./data/train 3 | path_dev = ./data/dev 4 | path_test = ./data/test 5 | main_label = .PERIOD 6 | conll_eval = False 7 | preload_vectors = /home/ottokar/old/large_files/GoogleNews-vectors-negative300.txt 8 | word_embedding_size = 300 9 | char_embedding_size = 50 10 | word_recurrent_size = 200 11 | char_recurrent_size = 200 12 | narrow_layer_size = 50 13 | best_model_selector = dev_f:high 14 | epochs = 20 15 | stop_if_no_improvement_for_epochs = 7 16 | learningrate = 1.0 17 | min_word_freq = 2 18 | max_batch_size = 64 19 | save = punctuator.model 20 | load = 21 | random_seed = 1001 22 | crf_on_top = True 23 | char_integration_method = attention 24 | -------------------------------------------------------------------------------- /sequence_labeling_evaluator.py: -------------------------------------------------------------------------------- 1 | import time 2 | import collections 3 | import numpy 4 | 5 | import conlleval 6 | 7 | class SequenceLabelingEvaluator(object): 8 | def __init__(self, main_label_id, label2id=None, conll_eval=False): 9 | self.main_label_id = main_label_id 10 | self.label2id = label2id 11 | self.conll_eval = conll_eval 12 | 13 | self.cost_sum = 0.0 14 | self.correct_sum = 0.0 15 | self.main_predicted_count = 0 16 | self.main_total_count = 0 17 | self.main_correct_count = 0 18 | self.token_count = 0 19 | self.start_time = time.time() 20 | 21 | if self.label2id is not None: 22 | self.id2label = collections.OrderedDict() 23 | for label in self.label2id: 24 | self.id2label[self.label2id[label]] = label 25 | 26 | self.conll_format = [] 27 | 28 | def append_data(self, cost, predicted_labels, word_ids, label_ids): 29 | self.cost_sum += cost 30 | self.token_count += label_ids.size 31 | self.correct_sum += numpy.equal(predicted_labels, label_ids).sum() 32 | self.main_predicted_count += (predicted_labels == self.main_label_id).sum() 33 | self.main_total_count += (label_ids == self.main_label_id).sum() 34 | self.main_correct_count += ((predicted_labels == self.main_label_id)*(label_ids == self.main_label_id)).sum() 35 | 36 | for i in range(word_ids.shape[0]): 37 | for j in range(word_ids.shape[1]-2): 38 | try: 39 | self.conll_format.append(str(word_ids[i][j+1]) + "\t" + str(self.id2label[label_ids[i][j]]) + "\t" + str(self.id2label[predicted_labels[i][j]])) 40 | except KeyError: 41 | print("Unexpected label id in predictions.") # Probably means the CRF decided to predict a start/end label, which it shouldn't 42 | self.conll_format.append("") 43 | 44 | 45 | def get_results(self, name): 46 | p = (float(self.main_correct_count) / float(self.main_predicted_count)) if (self.main_predicted_count > 0) else 0.0 47 | r = (float(self.main_correct_count) / float(self.main_total_count)) if (self.main_total_count > 0) else 0.0 48 | f = (2.0 * p * r / (p + r)) if (p+r > 0.0) else 0.0 49 | f05 = ((1.0 + 0.5*0.5) * p * r / ((0.5*0.5 * p) + r)) if (p+r > 0.0) else 0.0 50 | 51 | results = collections.OrderedDict() 52 | results[name + "_cost_avg"] = self.cost_sum / float(self.token_count) 53 | results[name + "_cost_sum"] = self.cost_sum 54 | results[name + "_main_predicted_count"] = self.main_predicted_count 55 | results[name + 
"_main_total_count"] = self.main_total_count 56 | results[name + "_main_correct_count"] = self.main_correct_count 57 | results[name + "_p"] = p 58 | results[name + "_r"] = r 59 | results[name + "_f"] = f 60 | results[name + "_f05"] = f05 61 | results[name + "_accuracy"] = self.correct_sum / float(self.token_count) 62 | results[name + "_token_count"] = self.token_count 63 | results[name + "_time"] = float(time.time()) - float(self.start_time) 64 | 65 | if self.label2id is not None and self.conll_eval == True: 66 | conll_counts = conlleval.evaluate(self.conll_format) 67 | conll_metrics_overall, conll_metrics_by_type = conlleval.metrics(conll_counts) 68 | results[name + "_conll_accuracy"] = float(conll_counts.correct_tags) / float(conll_counts.token_counter) 69 | results[name + "_conll_p"] = conll_metrics_overall.prec 70 | results[name + "_conll_r"] = conll_metrics_overall.rec 71 | results[name + "_conll_f"] = conll_metrics_overall.fscore 72 | # for i, m in sorted(conll_metrics_by_type.items()): 73 | # results[name + "_conll_p_" + str(i)] = m.prec 74 | # results[name + "_conll_r_" + str(i)] = m.rec 75 | # results[name + "_conll_f_" + str(i)] = m.fscore #str(m.fscore) + " " + str(conll_counts.t_found_guessed[i]) 76 | 77 | return results 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /crf.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import numpy 3 | 4 | # CRF implementation based on Lample et al. 5 | # "Neural Architectures for Named Entity Recognition" 6 | 7 | floatX=theano.config.floatX 8 | 9 | def log_sum(x, axis=None): 10 | x_max_value = x.max(axis=axis) 11 | x_max_tensor = x.max(axis=axis, keepdims=True) 12 | return x_max_value + theano.tensor.log(theano.tensor.exp(x - x_max_tensor).sum(axis=axis)) 13 | 14 | 15 | def forward(observation_weights, transition_weights, return_best_sequence=False): 16 | def recurrence(observation_weights, previous_scores, transition_weights): 17 | previous_scores = previous_scores.dimshuffle(0, 1, 'x') 18 | observation_weights = observation_weights.dimshuffle(0, 'x', 1) 19 | scores = previous_scores + observation_weights + transition_weights.dimshuffle('x', 0, 1) 20 | if return_best_sequence: 21 | best_scores = scores.max(axis=1) 22 | best_states = scores.argmax(axis=1) 23 | return best_scores, best_states 24 | else: 25 | return log_sum(scores, axis=1) 26 | 27 | initial = observation_weights[0] 28 | crf_states, _ = theano.scan( 29 | fn=recurrence, 30 | outputs_info=(initial, None) if return_best_sequence else initial, 31 | sequences=[observation_weights[1:],], 32 | non_sequences=transition_weights 33 | ) 34 | 35 | if return_best_sequence: 36 | sequence, _ = theano.scan( 37 | fn=lambda beta_i, previous: beta_i[theano.tensor.arange(previous.shape[0]), previous], 38 | outputs_info=theano.tensor.cast(theano.tensor.argmax(crf_states[0][-1], axis=1), 'int32'), 39 | sequences=theano.tensor.cast(crf_states[1][::-1], 'int32') 40 | ) 41 | sequence = theano.tensor.concatenate([sequence[::-1], [theano.tensor.argmax(crf_states[0][-1], axis=1)]]) 42 | return sequence, crf_states[0] 43 | else: 44 | return log_sum(crf_states[-1], axis=1) 45 | 46 | 47 | def construct(name, input_tensor, n_labels, gold_labels, fn_create_parameter_matrix): 48 | transition_weights = fn_create_parameter_matrix(name + "_crf_transition_weights", (n_labels + 2, n_labels + 2)) 49 | 50 | small = -1000.0 51 | padding_start = theano.tensor.zeros((input_tensor.shape[0], 1, n_labels + 2)) + 
small 52 | padding_start = theano.tensor.set_subtensor(padding_start[:,:,-2], 0.0) 53 | padding_end = theano.tensor.zeros((input_tensor.shape[0], 1, n_labels + 2)) + small 54 | padding_end = theano.tensor.set_subtensor(padding_end[:,:,-1], 0.0) 55 | observation_weights = theano.tensor.concatenate([input_tensor, theano.tensor.zeros((input_tensor.shape[0], input_tensor.shape[1], 2)) + small], axis=2) 56 | observation_weights = theano.tensor.concatenate([padding_start, observation_weights, padding_end], axis=1) 57 | observation_weights = observation_weights.dimshuffle(1,0,2) # reordering the tensor (words, sentences, labels) 58 | 59 | # Score from tags 60 | real_paths_scores = input_tensor[theano.tensor.arange(input_tensor.shape[0])[:, numpy.newaxis], theano.tensor.arange(input_tensor.shape[1]), gold_labels].sum(axis=1) 61 | 62 | # Score from transition_weights 63 | padding_id_start = theano.tensor.zeros((gold_labels.shape[0], 1), dtype=numpy.int32) + n_labels 64 | padding_id_end = theano.tensor.zeros((gold_labels.shape[0], 1), dtype=numpy.int32) + n_labels + 1 65 | padded_gold_labels = theano.tensor.concatenate([padding_id_start, gold_labels, padding_id_end], axis=1) 66 | real_paths_scores += transition_weights[ 67 | padded_gold_labels[theano.tensor.arange(gold_labels.shape[0])[:, numpy.newaxis], theano.tensor.arange(gold_labels.shape[1] + 1)], 68 | padded_gold_labels[theano.tensor.arange(gold_labels.shape[0])[:, numpy.newaxis], theano.tensor.arange(gold_labels.shape[1] + 1) + 1] 69 | ].sum(axis=1) 70 | 71 | all_paths_scores = forward(observation_weights, transition_weights) 72 | 73 | best_sequence, scores = forward(observation_weights, transition_weights, return_best_sequence=True) 74 | 75 | scores = scores.dimshuffle(1,0,2)[:,:-1,:-2] 76 | best_sequence = best_sequence.dimshuffle(1,0)[:,1:-1] 77 | 78 | return all_paths_scores, real_paths_scores, best_sequence, scores 79 | 80 | -------------------------------------------------------------------------------- /punctuator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy 3 | 4 | from collections import OrderedDict 5 | from sequence_labeler import SequenceLabeler 6 | from sequence_labeling_experiment import read_dataset, create_batches, parse_config, map_text_to_ids 7 | from punctuation_data_converter import EOS_TOKENS, SPACE, MAX_SEQUENCE_LEN 8 | 9 | def last_index_of(array, element): 10 | try: 11 | return len(array) -1 - array[::-1].index(element) 12 | except: 13 | return 0 14 | 15 | def up_to_last_instance_of(array, elements): 16 | idx = max(last_index_of(array, element) for element in elements) 17 | if idx == 0: 18 | return array 19 | else: 20 | return array[:idx + 1] 21 | 22 | def reverse_mapping(d): 23 | return OrderedDict([(v,k) for (k,v) in d.items()]) 24 | 25 | def convert_to_batch(word_sequence, lowercase_words, lowercase_chars, replace_digits, word2id, char2id): 26 | raw_word_ids = map_text_to_ids(" ".join(word_sequence), word2id, "", "", "", lowercase=lowercase_words, replace_digits=replace_digits) 27 | raw_char_ids = [map_text_to_ids("", char2id, "", "", "")] + \ 28 | [map_text_to_ids(" ".join(list(word)), char2id, "", "", "", lowercase=lowercase_chars, replace_digits=replace_digits) for word in word_sequence] + \ 29 | [map_text_to_ids("", char2id, "", "", "")] 30 | 31 | assert(len(raw_char_ids) == len(raw_word_ids)) 32 | 33 | # Mask and convert to numpy array 34 | batch_size = 1 35 | seq_len = len(raw_word_ids) 36 | 37 | max_word_length = numpy.array([len(c) for c 
in raw_char_ids]).max() 38 | 39 | word_ids = numpy.zeros((batch_size, seq_len), dtype=numpy.int32) 40 | char_ids = numpy.zeros((batch_size, seq_len, max_word_length), dtype=numpy.int32) 41 | char_mask = numpy.zeros((batch_size, seq_len, max_word_length), dtype=numpy.int32) 42 | 43 | for i in range(batch_size): 44 | for j in range(seq_len): 45 | word_ids[i][j] = raw_word_ids[j] 46 | for j in range(seq_len): 47 | for k in range(len(raw_char_ids[j])): 48 | char_ids[i][j][k] = raw_char_ids[j][k] 49 | char_mask[i][j][k] = 1 50 | 51 | return word_ids, char_ids, char_mask 52 | 53 | def punctuate(config_path): 54 | config = parse_config("config", config_path) 55 | if config["path_test"] is None: 56 | print("No test data configured") 57 | return 58 | 59 | sequencelabeler = SequenceLabeler.load(config["save"]) 60 | label2id = sequencelabeler.config["label2id"] 61 | word2id = sequencelabeler.config["word2id"] 62 | char2id = sequencelabeler.config["char2id"] 63 | 64 | config["word2id"] = word2id 65 | config["char2id"] = char2id 66 | config["label2id"] = label2id 67 | 68 | id2label = reverse_mapping(label2id) 69 | eos_labels = [label2id[l] for l in EOS_TOKENS if l in label2id] 70 | space_id = label2id[SPACE] 71 | 72 | all_predicted_labels = [] 73 | 74 | for path_test in config["path_test"].strip().split(":"): 75 | 76 | with open(path_test + '.orig', 'r') as f: 77 | all_words = [w for w in f.read().split() if w not in label2id] 78 | 79 | last_eos_idx = 0 80 | 81 | while True: 82 | word_sequence = all_words[last_eos_idx:last_eos_idx+MAX_SEQUENCE_LEN] 83 | if len(word_sequence) == 0: 84 | break 85 | word_ids, char_ids, char_mask = convert_to_batch(word_sequence, False, False, True, word2id, char2id) 86 | predicted_labels = sequencelabeler.predict(word_ids, char_ids, char_mask) 87 | predicted_labels = up_to_last_instance_of(list(predicted_labels.flatten()), eos_labels) 88 | if len(predicted_labels) == 0: 89 | break 90 | all_predicted_labels += predicted_labels 91 | last_eos_idx += len(predicted_labels) 92 | 93 | with open(path_test + '.pred', 'w') as f: 94 | for w, l_id in zip(all_words, all_predicted_labels): 95 | f.write('%s %s ' % (w, '' if l_id == space_id else id2label[l_id])) 96 | 97 | if __name__ == "__main__": 98 | punctuate(sys.argv[1]) 99 | 100 | -------------------------------------------------------------------------------- /recurrence.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import numpy 3 | 4 | floatX=theano.config.floatX 5 | 6 | def create_birnn(input_tensor, input_size, mask, recurrent_size, return_combined, fn_create_parameter_matrix, name): 7 | rnn_mask = mask.dimshuffle(1,0) if (mask is not None) else None 8 | recurrent_forward = create_lstm(input_tensor.dimshuffle(1,0,2), input_size, rnn_mask, 9 | recurrent_size, only_return_final=return_combined, go_backwards=False, fn_create_parameter_matrix=fn_create_parameter_matrix, name=name + "_forward") 10 | recurrent_backward = create_lstm(input_tensor.dimshuffle(1,0,2), input_size, rnn_mask, 11 | recurrent_size, only_return_final=return_combined, go_backwards=True, fn_create_parameter_matrix=fn_create_parameter_matrix, name=name + "_backward") 12 | if return_combined == True: 13 | return theano.tensor.concatenate([recurrent_forward, recurrent_backward], axis=1) 14 | else: 15 | return theano.tensor.concatenate([recurrent_forward.dimshuffle(1,0,2), recurrent_backward.dimshuffle(1,0,2)], axis=2) 16 | 17 | 18 | def create_lstm(input_tensor, input_size, mask, recurrent_size, 
only_return_final, go_backwards, fn_create_parameter_matrix, name): 19 | # LSTM. Following Graves et al. 20 | # "Hybrid speech recognition with deep bidirectional LSTM" 21 | def lstm_step(x, h_prev, c_prev, W_x, W_h, b, W_ci, W_cf, W_co): 22 | m_xhb = theano.tensor.dot(x, W_x) + theano.tensor.dot(h_prev, W_h) + b 23 | i = theano.tensor.nnet.sigmoid(_slice(m_xhb, 0, 4) + c_prev * W_ci) 24 | f = theano.tensor.nnet.sigmoid(_slice(m_xhb, 1, 4) + c_prev * W_cf) 25 | c = f * c_prev + i * theano.tensor.tanh(_slice(m_xhb, 2, 4)) 26 | o = theano.tensor.nnet.sigmoid(_slice(m_xhb, 3, 4) + c * W_co) 27 | h = o * theano.tensor.tanh(c) 28 | return h, c 29 | 30 | def lstm_mask_step(x, mask, h_prev, c_prev, W_x, W_h, b, W_ci, W_cf, W_co): 31 | h_new, c_new = lstm_step(x, h_prev, c_prev, W_x, W_h, b, W_ci, W_cf, W_co) 32 | h = theano.tensor.switch(mask, h_new, h_prev) 33 | c = theano.tensor.switch(mask, c_new, c_prev) 34 | return h, c 35 | 36 | def _slice(M, slice_num, total_slices): 37 | if M.ndim == 3: 38 | l = M.shape[2] / total_slices 39 | return M[:, :, slice_num*l:(slice_num+1)*l] 40 | elif M.ndim == 2: 41 | l = M.shape[1] / total_slices 42 | return M[:, slice_num*l:(slice_num+1)*l] 43 | elif M.ndim == 1: 44 | l = M.shape[0] / total_slices 45 | return M[slice_num*l:(slice_num+1)*l] 46 | 47 | h_initial = theano.tensor.alloc(numpy.array(0, dtype=floatX), input_tensor.shape[1], recurrent_size) 48 | c_initial = theano.tensor.alloc(numpy.array(0, dtype=floatX), input_tensor.shape[1], recurrent_size) 49 | 50 | if mask is not None: 51 | mask = mask.dimshuffle(0, 1, 'x') 52 | fn_step = locals()["lstm_mask_step"] 53 | sequences = [input_tensor, mask] 54 | else: 55 | fn_step = locals()["lstm_step"] 56 | sequences = input_tensor 57 | 58 | W_x = fn_create_parameter_matrix('W_x_'+name, (input_size, recurrent_size*4)) 59 | W_h = fn_create_parameter_matrix('W_h_'+name, (recurrent_size, recurrent_size*4)) 60 | b = fn_create_parameter_matrix('b_'+name, (recurrent_size*4,)) 61 | W_ci = fn_create_parameter_matrix('W_ci_'+name, (recurrent_size,)) 62 | W_cf = fn_create_parameter_matrix('W_cf_'+name, (recurrent_size,)) 63 | W_co = fn_create_parameter_matrix('W_co_'+name, (recurrent_size,)) 64 | result, _ = theano.scan( 65 | fn_step, 66 | sequences = sequences, 67 | outputs_info = [h_initial, c_initial], 68 | non_sequences = [W_x, W_h, b, W_ci, W_cf, W_co], 69 | go_backwards=go_backwards) 70 | 71 | h = result[0] 72 | if only_return_final == True: 73 | h = h[-1] 74 | else: 75 | if go_backwards == True: 76 | h = h[::-1] 77 | return h 78 | 79 | 80 | 81 | def create_feedforward(input_tensor, input_size, output_size, activation, fn_create_parameter_matrix, name): 82 | weights = fn_create_parameter_matrix('ff_weights_' + name, (input_size, output_size)) 83 | bias = fn_create_parameter_matrix('ff_bias_' + name, (output_size,)) 84 | output = theano.tensor.dot(input_tensor, weights) + bias 85 | if activation == "tanh": 86 | output = theano.tensor.tanh(output) 87 | elif activation == "sigmoid": 88 | output = theano.tensor.nnet.sigmoid(output) 89 | return output 90 | 91 | -------------------------------------------------------------------------------- /error_calculator.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | Computes and prints the overall classification error and precision, recall, F-score over punctuations. 
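Also reports the slot error rate, SER = (S + D + I) / (C + S + D), where C, S, D and I are the counts of correct, substituted, deleted and inserted punctuation slots, as well as an overall error rate over all slots (spaces included).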
5 | """ 6 | 7 | from numpy import nan 8 | import punctuation_data_converter as data 9 | import codecs 10 | import sys 11 | 12 | MAPPING = {}#{"!EXCLAMATIONMARK": ".PERIOD", "?QUESTIONMARK": ".PERIOD", ":COLON": ".PERIOD", ";SEMICOLON": ".PERIOD"} # Can be used to estimate 2-class performance for example 13 | 14 | def compute_error(target_paths, predicted_paths): 15 | counter = 0 16 | total_correct = 0 17 | 18 | correct = 0. 19 | substitutions = 0. 20 | deletions = 0. 21 | insertions = 0. 22 | 23 | true_positives = {} 24 | false_positives = {} 25 | false_negatives = {} 26 | 27 | for target_path, predicted_path in zip(target_paths, predicted_paths): 28 | 29 | target_punctuation = " " 30 | predicted_punctuation = " " 31 | 32 | t_i = 0 33 | p_i = 0 34 | 35 | with codecs.open(target_path, 'r', 'utf-8') as target, codecs.open(predicted_path, 'r', 'utf-8') as predicted: 36 | 37 | target_stream = target.read().split() 38 | predicted_stream = predicted.read().split() 39 | 40 | while True: 41 | 42 | if data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) in data.PUNCTUATION_VOCABULARY: 43 | while data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) in data.PUNCTUATION_VOCABULARY: # skip multiple consecutive punctuations 44 | target_punctuation = data.PUNCTUATION_MAPPING.get(target_stream[t_i], target_stream[t_i]) 45 | target_punctuation = MAPPING.get(target_punctuation, target_punctuation) 46 | t_i += 1 47 | else: 48 | target_punctuation = " " 49 | 50 | if predicted_stream[p_i] in data.PUNCTUATION_VOCABULARY: 51 | predicted_punctuation = MAPPING.get(predicted_stream[p_i], predicted_stream[p_i]) 52 | p_i += 1 53 | else: 54 | predicted_punctuation = " " 55 | 56 | is_correct = target_punctuation == predicted_punctuation 57 | 58 | counter += 1 59 | total_correct += is_correct 60 | 61 | if predicted_punctuation == " " and target_punctuation != " ": 62 | deletions += 1 63 | elif predicted_punctuation != " " and target_punctuation == " ": 64 | insertions += 1 65 | elif predicted_punctuation != " " and target_punctuation != " " and predicted_punctuation == target_punctuation: 66 | correct += 1 67 | elif predicted_punctuation != " " and target_punctuation != " " and predicted_punctuation != target_punctuation: 68 | substitutions += 1 69 | 70 | true_positives[target_punctuation] = true_positives.get(target_punctuation, 0.) + float(is_correct) 71 | false_positives[predicted_punctuation] = false_positives.get(predicted_punctuation, 0.) + float(not is_correct) 72 | false_negatives[target_punctuation] = false_negatives.get(target_punctuation, 0.) + float(not is_correct) 73 | 74 | assert target_stream[t_i] == predicted_stream[p_i] or predicted_stream[p_i] == "", \ 75 | ("File: %s \n" + \ 76 | "Error: %s (%s) != %s (%s) \n" + \ 77 | "Target context: %s \n" + \ 78 | "Predicted context: %s") % \ 79 | (target_path, 80 | target_stream[t_i], t_i, predicted_stream[p_i], p_i, 81 | " ".join(target_stream[t_i-2:t_i+2]), 82 | " ".join(predicted_stream[p_i-2:p_i+2])) 83 | 84 | t_i += 1 85 | p_i += 1 86 | 87 | if t_i >= len(target_stream)-1 and p_i >= len(predicted_stream)-1: 88 | break 89 | 90 | overall_tp = 0.0 91 | overall_fp = 0.0 92 | overall_fn = 0.0 93 | 94 | print "-"*46 95 | print "{:<16} {:<9} {:<9} {:<9}".format('PUNCTUATION','PRECISION','RECALL','F-SCORE') 96 | for p in data.PUNCTUATION_VOCABULARY: 97 | 98 | if p == data.SPACE: 99 | continue 100 | 101 | overall_tp += true_positives.get(p,0.) 102 | overall_fp += false_positives.get(p,0.) 103 | overall_fn += false_negatives.get(p,0.) 
104 | 105 | punctuation = p 106 | precision = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_positives[p])) if p in false_positives else nan 107 | recall = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_negatives[p])) if p in false_negatives else nan 108 | f_score = (2. * precision * recall / (precision + recall)) if (precision + recall) > 0 else nan 109 | print "{:<16} {:<9} {:<9} {:<9}".format(punctuation, round(precision,3)*100, round(recall,3)*100, round(f_score,3)*100) 110 | print "-"*46 111 | pre = overall_tp/(overall_tp+overall_fp) if overall_fp else nan 112 | rec = overall_tp/(overall_tp+overall_fn) if overall_fn else nan 113 | f1 = (2.*pre*rec)/(pre+rec) if (pre + rec) else nan 114 | print "{:<16} {:<9} {:<9} {:<9}".format("Overall", round(pre,3)*100, round(rec,3)*100, round(f1,3)*100) 115 | print "Err: %s%%" % round((100.0 - float(total_correct) / float(counter-1) * 100.0), 2) 116 | print "SER: %s%%" % round((substitutions + deletions + insertions) / (correct + substitutions + deletions) * 100, 1) 117 | 118 | 119 | if __name__ == "__main__": 120 | 121 | if len(sys.argv) > 1: 122 | target_path = sys.argv[1] 123 | else: 124 | sys.exit("Ground truth file path argument missing") 125 | 126 | if len(sys.argv) > 2: 127 | predicted_path = sys.argv[2] 128 | else: 129 | sys.exit("Model predictions file path argument missing") 130 | 131 | compute_error([target_path], [predicted_path]) 132 | 133 | 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A fork from https://github.com/marekrei/sequence-labeler to enable punctuation restoration in unsegmented text. 2 | 3 | ## Performance on English TED talks 4 | (Training set size: 2.1M words) 5 | 6 | PUNCTUATION | PRECISION | RECALL | F-SCORE 7 | --- | --- | --- | --- 8 | ,COMMA | 58.5 | 58.7 | 58.6 9 | ?QUESTIONMARK | 71.4 | 54.3 | 61.7 10 | .PERIOD | 69.9 | 72.0 | 70.9 11 | _Overall_ | _64.3_ | _64.9_ | _64.6_ 12 | 13 | Performance is very similar (even slightly better) to https://github.com/ottokart/punctuator2 although they are not directly comparable as punctuator2 used pretrained embeddings that were trained on much less data and had much smaller size. More details can be found [here](http://www.isca-speech.org/archive/Interspeech_2016/pdfs/1517.PDF). 14 | 15 | Original README: 16 | ========================= 17 | 18 | Sequence labeler 19 | ========================= 20 | 21 | This is a neural network sequence labeling system. Given a sequence of tokens, it will learn to assign labels to each token. Can be used for named entity recognition, POS-tagging, error detection, chunking, CCG supertagging, etc. 22 | 23 | The main model implements a bidirectional LSTM for sequence tagging. In addition, you can incorporate character-level information -- either by concatenating a character-based representation, or by using an attention/gating mechanism for combining it with a word embedding. 24 | 25 | Run with: 26 | 27 | python sequence_labeling_experiment.py config.conf 28 | 29 | Preferably with Theano set up to use CUDA, so the process can run on a GPU. 30 | 31 | Requirements 32 | ------------------------- 33 | 34 | * numpy 35 | * theano 36 | * lasagne 37 | 38 | Configuration 39 | ------------------------- 40 | 41 | Edit the values in config.conf as needed: 42 | 43 | * **path_train** - Path to the training data, in CoNLL tab-separated format. 
One word per line, first column is the word, last column is the label. Empty lines between sentences. 44 | * **path_dev** - Path to the development data, used for choosing the best epoch. 45 | * **path_test** - Path to the test file. Can contain multiple files, colon separated. 46 | * **main_label** - The output label for which precision/recall/F-measure are calculated. 47 | * **conll_eval** - Whether the standard CoNLL NER evaluation should be run. 48 | * **preload_vectors** - Path to the pretrained word embeddings, in word2vec plain text format. If your embeddings are in binary, you can use [convertvec](https://github.com/marekrei/convertvec) to convert them to plain text. 49 | * **word_embedding_size** - Size of the word embeddings used in the model. 50 | * **char_embedding_size** - Size of the character embeddings. 51 | * **word_recurrent_size** - Size of the word-level LSTM hidden layers. 52 | * **char_recurrent_size** - Size of the char-level LSTM hidden layers. 53 | * **narrow_layer_size** - Size of the extra hidden layer on top of the bi-LSTM. 54 | * **best_model_selector** - What is measured on the dev set for model selection: "dev_conll_f:high" for NER and chunking, "dev_acc:high" for POS-tagging, "dev_f05:high" for error detection. 55 | * **epochs** - Maximum number of epochs to run. 56 | * **stop_if_no_improvement_for_epochs** - Training will be stopped if there has been no improvement for n epochs. 57 | * **learningrate** - Learning rate. 58 | * **min_word_freq** - Minimal frequency of words to be included in the vocabulary. Others will be considered OOV. 59 | * **max_batch_size** - Maximum batch size. 60 | * **save** - Path to save the model. 61 | * **load** - Path to load the model. 62 | * **random_seed** - Random seed for initialisation and data shuffling. This can affect results, so for robust conclusions I recommend running multiple experiments with different seeds and averaging the metrics. 63 | * **crf_on_top** - If True, use a CRF as the output layer. If False, use softmax instead. 64 | * **char_integration_method** - How character information is integrated. Options are: "none" (not integrated), "input" (concatenated), "attention" (the method proposed in Rei et al. (2016)). 65 | 66 | 67 | References 68 | ------------------------- 69 | 70 | If you use the main sequence labeling code, please reference: 71 | 72 | [**Compositional Sequence Labeling Models for Error Detection in Learner Writing**](http://aclweb.org/anthology/P/P16/P16-1112.pdf) 73 | Marek Rei and Helen Yannakoudakis 74 | *In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL-2016)* 75 | 76 | 77 | If you use the character-level attention component, please reference: 78 | 79 | [**Attending to characters in neural sequence labeling models**](https://aclweb.org/anthology/C/C16/C16-1030.pdf) 80 | Marek Rei, Sampo Pyysalo and Gamal K.O. 
Crichton 81 | *In Proceedings of the 26th International Conference on Computational Linguistics (COLING-2016)* 82 | 83 | 84 | The CRF implementation is based on: 85 | 86 | [**Neural Architectures for Named Entity Recognition**](https://arxiv.org/abs/1603.01360) 87 | Guillaume Lample, Miguel Ballesteros, Sandeep Subramanian, Kazuya Kawakami and Chris Dyer 88 | *In Proceedings of NAACL-HLT 2016* 89 | 90 | 91 | The conlleval.py script is from: https://github.com/spyysalo/conlleval.py 92 | 93 | 94 | License 95 | --------------------------- 96 | 97 | MIT License 98 | 99 | Copyright (c) 2016 Marek Rei 100 | 101 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 102 | 103 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 104 | 105 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 106 | -------------------------------------------------------------------------------- /punctuation_data_converter.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import division 3 | 4 | import random 5 | import os 6 | import sys 7 | import operator 8 | import cPickle 9 | import codecs 10 | import fnmatch 11 | 12 | DATA_PATH = "./data" 13 | 14 | END = "" 15 | UNK = "" 16 | 17 | SPACE = "_SPACE" 18 | 19 | MAX_WORD_VOCABULARY_SIZE = 100000 20 | MIN_WORD_COUNT_IN_VOCAB = 2 21 | MAX_SEQUENCE_LEN = 50 22 | 23 | TRAIN_FILE = os.path.join(DATA_PATH, "train") 24 | DEV_FILE = os.path.join(DATA_PATH, "dev") 25 | TEST_FILE = os.path.join(DATA_PATH, "test") 26 | 27 | PUNCTUATION_VOCABULARY = {SPACE, ",COMMA", ".PERIOD", "?QUESTIONMARK", "!EXCLAMATIONMARK", ":COLON", ";SEMICOLON", "-DASH"} 28 | PUNCTUATION_MAPPING = {} 29 | 30 | # Comma, period & question mark only: 31 | # PUNCTUATION_VOCABULARY = {SPACE, ",COMMA", ".PERIOD", "?QUESTIONMARK"} 32 | # PUNCTUATION_MAPPING = {"!EXCLAMATIONMARK": ".PERIOD", ":COLON": ",COMMA", ";SEMICOLON": ".PERIOD", "-DASH": ",COMMA"} 33 | 34 | EOS_TOKENS = {".PERIOD", "?QUESTIONMARK", "!EXCLAMATIONMARK"} 35 | CRAP_TOKENS = {"", ""} # punctuations that are not included in vocabulary nor mapping, must be added to CRAP_TOKENS 36 | 37 | def write_processed_dataset(input_files, output_file): 38 | """ 39 | data will consist of two sets of aligned subsequences (words and punctuations) of MAX_SEQUENCE_LEN tokens (actually punctuation sequence will be 1 element shorter). 
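Each output line pairs a word with the punctuation token that follows it in the source text (_SPACE when nothing follows); e.g. a fragment like "so we went .PERIOD" yields lines such as "so<TAB>_SPACE", "we<TAB>_SPACE", "went<TAB>.PERIOD", and subsequences are separated by an empty line, matching the CoNLL-style input that read_input_files() in sequence_labeling_experiment.py expects.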
40 | If a sentence is cut, then it will be added to next subsequence entirely (words before the cut belong to both sequences) 41 | """ 42 | 43 | current_words = [] 44 | current_punctuations = [] 45 | 46 | last_eos_idx = 0 # if it's still 0 when MAX_SEQUENCE_LEN is reached, then the sentence is too long and skipped. 47 | last_token_was_punctuation = True # skipt first token if it's punctuation 48 | 49 | skip_until_eos = False # if a sentence does not fit into subsequence, then we need to skip tokens until we find a new sentence 50 | 51 | for input_file in input_files: 52 | 53 | with codecs.open(input_file, 'r', 'utf-8') as text, \ 54 | codecs.open(output_file, 'w', 'utf-8') as text_out: 55 | 56 | for line in text: 57 | 58 | for token in line.split(): 59 | 60 | # First map oov punctuations to known punctuations 61 | if token in PUNCTUATION_MAPPING: 62 | token = PUNCTUATION_MAPPING[token] 63 | 64 | if skip_until_eos: 65 | 66 | if token in EOS_TOKENS: 67 | skip_until_eos = False 68 | 69 | continue 70 | 71 | elif token in CRAP_TOKENS: 72 | continue 73 | 74 | elif token in PUNCTUATION_VOCABULARY: 75 | 76 | if last_token_was_punctuation: # if we encounter sequences like: "... !EXLAMATIONMARK .PERIOD ...", then we only use the first punctuation and skip the ones that follow 77 | continue 78 | 79 | if token in EOS_TOKENS: 80 | last_eos_idx = len(current_punctuations) # no -1, because the token is not added yet 81 | 82 | punctuation = token 83 | 84 | current_punctuations.append(punctuation) 85 | last_token_was_punctuation = True 86 | 87 | else: 88 | 89 | if not last_token_was_punctuation: 90 | current_punctuations.append(SPACE) 91 | 92 | word = token 93 | 94 | current_words.append(word) 95 | last_token_was_punctuation = False 96 | 97 | if len(current_words) == MAX_SEQUENCE_LEN: # this also means, that last token was a word 98 | 99 | assert len(current_words) == len(current_punctuations) + 1, "#words: %d; #punctuations: %d" % (len(current_words), len(current_punctuations)) 100 | 101 | # Sentence did not fit into subsequence - skip it 102 | if last_eos_idx == 0: 103 | skip_until_eos = True 104 | 105 | current_words = [] 106 | current_punctuations = [] 107 | 108 | last_token_was_punctuation = True # next sequence starts with a new sentence, so is preceded by eos which is punctuation 109 | 110 | else: 111 | 112 | for w, p in zip(current_words, current_punctuations): 113 | text_out.write('%s\t%s\n' % (w, p)) 114 | text_out.write('\n') 115 | 116 | # Carry unfinished sentence to next subsequence 117 | current_words = current_words[last_eos_idx+1:] 118 | current_punctuations = current_punctuations[last_eos_idx+1:] 119 | 120 | last_eos_idx = 0 # sequence always starts with a new sentence 121 | 122 | def create_dev_test_train_split(root_path, train_output, dev_output, test_output): 123 | 124 | train_txt_files = [] 125 | dev_txt_files = [] 126 | test_txt_files = [] 127 | 128 | for root, dirnames, filenames in os.walk(root_path): 129 | for filename in fnmatch.filter(filenames, '*.txt'): 130 | 131 | path = os.path.join(root, filename) 132 | 133 | if filename.endswith(".test.txt"): 134 | test_txt_files.append(path) 135 | 136 | elif filename.endswith(".dev.txt"): 137 | dev_txt_files.append(path) 138 | 139 | else: 140 | train_txt_files.append(path) 141 | 142 | write_processed_dataset(train_txt_files, train_output) 143 | write_processed_dataset(dev_txt_files, dev_output) 144 | write_processed_dataset(test_txt_files, test_output) 145 | 146 | if __name__ == "__main__": 147 | 148 | if len(sys.argv) > 1: 149 | path = 
sys.argv[1] 150 | else: 151 | sys.exit("The path to source data directory with txt files is missing") 152 | 153 | if not os.path.exists(DATA_PATH): 154 | os.makedirs(DATA_PATH) 155 | else: 156 | sys.exit("Data already exists") 157 | 158 | create_dev_test_train_split(path, TRAIN_FILE, DEV_FILE, TEST_FILE) 159 | 160 | -------------------------------------------------------------------------------- /sequence_labeler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import theano 3 | import numpy 4 | import collections 5 | import cPickle 6 | import lasagne 7 | 8 | import crf 9 | import recurrence 10 | 11 | sys.setrecursionlimit(50000) 12 | floatX=theano.config.floatX 13 | 14 | class SequenceLabeler(object): 15 | def __init__(self, config): 16 | self.config = config 17 | self.params = collections.OrderedDict() 18 | self.rng = numpy.random.RandomState(config["random_seed"]) 19 | 20 | word_ids = theano.tensor.imatrix('word_ids') 21 | char_ids = theano.tensor.itensor3('char_ids') 22 | char_mask = theano.tensor.ftensor3('char_mask') 23 | label_ids = theano.tensor.imatrix('label_ids') 24 | learningrate = theano.tensor.fscalar('learningrate') 25 | 26 | cost = 0.0 27 | input_tensor = None 28 | input_vector_size = 0 29 | 30 | self.word_embeddings = self.create_parameter_matrix('word_embeddings', (config["n_words"], config["word_embedding_size"])) 31 | input_tensor = self.word_embeddings[word_ids] 32 | input_vector_size = config["word_embedding_size"] 33 | 34 | char_embeddings = self.create_parameter_matrix('char_embeddings', (config["n_chars"], config["char_embedding_size"])) 35 | char_input_tensor = char_embeddings[char_ids].reshape((char_ids.shape[0]*char_ids.shape[1],char_ids.shape[2],config["char_embedding_size"])) 36 | char_mask_reshaped = char_mask.reshape((char_ids.shape[0]*char_ids.shape[1],char_ids.shape[2])) 37 | 38 | char_output_tensor = recurrence.create_birnn(char_input_tensor, config["char_embedding_size"], char_mask_reshaped, config["char_recurrent_size"], return_combined=True, fn_create_parameter_matrix=self.create_parameter_matrix, name="char_birnn") 39 | char_output_tensor = recurrence.create_feedforward(char_output_tensor, config["char_recurrent_size"]*2, config["word_embedding_size"], "tanh", fn_create_parameter_matrix=self.create_parameter_matrix, name="char_ff") 40 | char_output_tensor = char_output_tensor.reshape((char_ids.shape[0],char_ids.shape[1],config["word_embedding_size"])) 41 | 42 | if config["char_integration_method"] == "input": 43 | input_tensor = theano.tensor.concatenate([input_tensor, char_output_tensor], axis=2) 44 | input_vector_size += config["word_embedding_size"] 45 | 46 | elif config["char_integration_method"] == "attention": 47 | static_input_tensor = theano.gradient.disconnected_grad(input_tensor) 48 | is_unk = theano.tensor.eq(word_ids, config["unk_token_id"]) 49 | is_unk_tensor = is_unk.dimshuffle(0,1,'x') 50 | char_output_tensor_normalised = char_output_tensor / char_output_tensor.norm(2, axis=2)[:, :, numpy.newaxis] 51 | static_input_tensor_normalised = static_input_tensor / static_input_tensor.norm(2, axis=2)[:, :, numpy.newaxis] 52 | cosine_cost = 1.0 - (char_output_tensor_normalised * static_input_tensor_normalised).sum(axis=2) 53 | cost += theano.tensor.switch(is_unk, 0.0, cosine_cost).sum() 54 | attention_evidence_tensor = theano.tensor.concatenate([input_tensor, char_output_tensor], axis=2) 55 | attention_output = recurrence.create_feedforward(attention_evidence_tensor, 
config["word_embedding_size"]*2, config["word_embedding_size"], "tanh", self.create_parameter_matrix, "attention_tanh") 56 | attention_output = recurrence.create_feedforward(attention_output, config["word_embedding_size"], config["word_embedding_size"], "sigmoid", self.create_parameter_matrix, "attention_sigmoid") 57 | input_tensor = input_tensor * attention_output + char_output_tensor * (1.0 - attention_output) 58 | 59 | processed_tensor = recurrence.create_birnn(input_tensor, input_vector_size, None, config["word_recurrent_size"], return_combined=False, fn_create_parameter_matrix=self.create_parameter_matrix, name="word_birnn") 60 | processed_tensor = recurrence.create_feedforward(processed_tensor, config["word_recurrent_size"]*2, config["narrow_layer_size"], "tanh", fn_create_parameter_matrix=self.create_parameter_matrix, name="narrow_ff") 61 | 62 | W_output = self.create_parameter_matrix('W_output', (config["narrow_layer_size"], config["n_labels"])) 63 | bias_output = self.create_parameter_matrix('bias_output', (config["n_labels"],)) 64 | output = theano.tensor.dot(processed_tensor, W_output) + bias_output 65 | output = output[:,1:-1,:] # removing and 66 | 67 | if config["crf_on_top"] == True: 68 | all_paths_scores, real_paths_scores, best_sequence, scores = crf.construct("crf", output, config["n_labels"], label_ids, self.create_parameter_matrix) 69 | predicted_labels = best_sequence 70 | output_probs = scores 71 | cost += - (real_paths_scores - all_paths_scores).sum() 72 | else: 73 | output_probs = theano.tensor.nnet.softmax(output.reshape((word_ids.shape[0]*(word_ids.shape[1]-2), config["n_labels"]))) 74 | predicted_labels = theano.tensor.argmax(output_probs.reshape((word_ids.shape[0], (word_ids.shape[1]-2), config["n_labels"])), axis=2) 75 | cost += theano.tensor.nnet.categorical_crossentropy(output_probs, label_ids.reshape((-1,))).sum() 76 | 77 | gradients = theano.tensor.grad(cost, self.params.values(), disconnected_inputs='ignore') 78 | updates = lasagne.updates.adadelta(gradients, self.params.values(), learningrate) 79 | 80 | input_vars_train = [word_ids, char_ids, char_mask, label_ids, learningrate] 81 | input_vars_test = [word_ids, char_ids, char_mask, label_ids] 82 | output_vars = [cost, predicted_labels] 83 | self.train = theano.function(input_vars_train, output_vars, updates=updates, on_unused_input='ignore', allow_input_downcast = True) 84 | self.test = theano.function(input_vars_test, output_vars, on_unused_input='ignore', allow_input_downcast = True) 85 | self.predict = theano.function([word_ids, char_ids, char_mask], predicted_labels, on_unused_input='ignore', allow_input_downcast = True) 86 | 87 | def create_parameter_matrix(self, name, size): 88 | param_vals = numpy.asarray(self.rng.normal(loc=0.0, scale=0.1, size=size), dtype=floatX) 89 | param_shared = theano.shared(param_vals, name) 90 | self.params[name] = param_shared 91 | return param_shared 92 | 93 | 94 | def get_parameter_count(self): 95 | total = 0 96 | for key, val in self.params.iteritems(): 97 | total += val.get_value().size 98 | return total 99 | 100 | def get_parameter_count_without_word_embeddings(self): 101 | total = 0 102 | for key, val in self.params.iteritems(): 103 | if val == self.word_embeddings: 104 | continue 105 | total += val.get_value().size 106 | return total 107 | 108 | def save(self, filename): 109 | dump = {} 110 | dump["config"] = self.config 111 | dump["params"] = {} 112 | for param_name in self.params: 113 | dump["params"][param_name] = self.params[param_name].get_value() 114 | f = 
file(filename, 'wb') 115 | cPickle.dump(dump, f, protocol=cPickle.HIGHEST_PROTOCOL) 116 | f.close() 117 | 118 | @staticmethod 119 | def load(filename, new_output_layer_size=None): 120 | f = file(filename, 'rb') 121 | dump = cPickle.load(f) 122 | f.close() 123 | if new_output_layer_size is not None: 124 | dump["n_labels"] = new_output_layer_size 125 | sequencelabeler = SequenceLabeler(dump["config"]) 126 | for param_name in sequencelabeler.params: 127 | assert(param_name in dump["params"]) 128 | if new_output_layer_size is not None and param_name in ["W_output", "bias_output"]: 129 | continue 130 | sequencelabeler.params[param_name].set_value(dump["params"][param_name]) 131 | return sequencelabeler 132 | -------------------------------------------------------------------------------- /conlleval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python version of the evaluation script from CoNLL'00- 4 | # Originates from: https://github.com/spyysalo/conlleval.py 5 | 6 | 7 | # Intentional differences: 8 | # - accept any space as delimiter by default 9 | # - optional file argument (default STDIN) 10 | # - option to set boundary (-b argument) 11 | # - LaTeX output (-l argument) not supported 12 | # - raw tags (-r argument) not supported 13 | 14 | import sys 15 | import re 16 | 17 | from collections import defaultdict, namedtuple 18 | 19 | ANY_SPACE = '' 20 | 21 | class FormatError(Exception): 22 | pass 23 | 24 | Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore') 25 | 26 | class EvalCounts(object): 27 | def __init__(self): 28 | self.correct_chunk = 0 # number of correctly identified chunks 29 | self.correct_tags = 0 # number of correct chunk tags 30 | self.found_correct = 0 # number of chunks in corpus 31 | self.found_guessed = 0 # number of identified chunks 32 | self.token_counter = 0 # token counter (ignores sentence breaks) 33 | 34 | # counts by type 35 | self.t_correct_chunk = defaultdict(int) 36 | self.t_found_correct = defaultdict(int) 37 | self.t_found_guessed = defaultdict(int) 38 | 39 | def parse_args(argv): 40 | import argparse 41 | parser = argparse.ArgumentParser( 42 | description='evaluate tagging results using CoNLL criteria', 43 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 44 | ) 45 | arg = parser.add_argument 46 | arg('-b', '--boundary', metavar='STR', default='-X-', 47 | help='sentence boundary') 48 | arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE, 49 | help='character delimiting items in input') 50 | arg('-o', '--otag', metavar='CHAR', default='O', 51 | help='alternative outside tag') 52 | arg('file', nargs='?', default=None) 53 | return parser.parse_args(argv) 54 | 55 | def parse_tag(t): 56 | m = re.match(r'^([^-]*)-(.*)$', t) 57 | return m.groups() if m else (t, '') 58 | 59 | def evaluate(iterable, options=None): 60 | if options is None: 61 | options = parse_args([]) # use defaults 62 | 63 | counts = EvalCounts() 64 | num_features = None # number of features per line 65 | in_correct = False # currently processed chunks is correct until now 66 | last_correct = 'O' # previous chunk tag in corpus 67 | last_correct_type = '' # type of previously identified chunk tag 68 | last_guessed = 'O' # previously identified chunk tag 69 | last_guessed_type = '' # type of previous chunk tag in corpus 70 | 71 | for line in iterable: 72 | line = line.rstrip('\r\n') 73 | 74 | if options.delimiter == ANY_SPACE: 75 | features = line.split() 76 | else: 77 | features = 
line.split(options.delimiter) 78 | 79 | if num_features is None: 80 | num_features = len(features) 81 | elif num_features != len(features) and len(features) != 0: 82 | raise FormatError('unexpected number of features: %d (%d)' % 83 | (len(features), num_features)) 84 | 85 | if len(features) == 0 or features[0] == options.boundary: 86 | features = [options.boundary, 'O', 'O'] 87 | if len(features) < 3: 88 | raise FormatError('unexpected number of features in line %s' % line) 89 | 90 | guessed, guessed_type = parse_tag(features.pop()) 91 | correct, correct_type = parse_tag(features.pop()) 92 | first_item = features.pop(0) 93 | 94 | if first_item == options.boundary: 95 | guessed = 'O' 96 | 97 | end_correct = end_of_chunk(last_correct, correct, 98 | last_correct_type, correct_type) 99 | end_guessed = end_of_chunk(last_guessed, guessed, 100 | last_guessed_type, guessed_type) 101 | start_correct = start_of_chunk(last_correct, correct, 102 | last_correct_type, correct_type) 103 | start_guessed = start_of_chunk(last_guessed, guessed, 104 | last_guessed_type, guessed_type) 105 | 106 | if in_correct: 107 | if (end_correct and end_guessed and 108 | last_guessed_type == last_correct_type): 109 | in_correct = False 110 | counts.correct_chunk += 1 111 | counts.t_correct_chunk[last_correct_type] += 1 112 | elif (end_correct != end_guessed or guessed_type != correct_type): 113 | in_correct = False 114 | 115 | if start_correct and start_guessed and guessed_type == correct_type: 116 | in_correct = True 117 | 118 | if start_correct: 119 | counts.found_correct += 1 120 | counts.t_found_correct[correct_type] += 1 121 | if start_guessed: 122 | counts.found_guessed += 1 123 | counts.t_found_guessed[guessed_type] += 1 124 | if first_item != options.boundary: 125 | if correct == guessed and guessed_type == correct_type: 126 | counts.correct_tags += 1 127 | counts.token_counter += 1 128 | 129 | last_guessed = guessed 130 | last_correct = correct 131 | last_guessed_type = guessed_type 132 | last_correct_type = correct_type 133 | 134 | if in_correct: 135 | counts.correct_chunk += 1 136 | counts.t_correct_chunk[last_correct_type] += 1 137 | 138 | return counts 139 | 140 | def uniq(iterable): 141 | seen = set() 142 | return [i for i in iterable if not (i in seen or seen.add(i))] 143 | 144 | def calculate_metrics(correct, guessed, total): 145 | tp, fp, fn = correct, guessed-correct, total-correct 146 | p = 0 if tp + fp == 0 else 1.*tp / (tp + fp) 147 | r = 0 if tp + fn == 0 else 1.*tp / (tp + fn) 148 | f = 0 if p + r == 0 else 2 * p * r / (p + r) 149 | return Metrics(tp, fp, fn, p, r, f) 150 | 151 | def metrics(counts): 152 | c = counts 153 | overall = calculate_metrics( 154 | c.correct_chunk, c.found_guessed, c.found_correct 155 | ) 156 | by_type = {} 157 | for t in uniq(c.t_found_correct.keys() + c.t_found_guessed.keys()): 158 | by_type[t] = calculate_metrics( 159 | c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t] 160 | ) 161 | return overall, by_type 162 | 163 | def report(counts, out=None): 164 | if out is None: 165 | out = sys.stdout 166 | 167 | overall, by_type = metrics(counts) 168 | 169 | c = counts 170 | out.write('processed %d tokens with %d phrases; ' % 171 | (c.token_counter, c.found_correct)) 172 | out.write('found: %d phrases; correct: %d.\n' % 173 | (c.found_guessed, c.correct_chunk)) 174 | 175 | if c.token_counter > 0: 176 | out.write('accuracy: %6.2f%%; ' % 177 | (100.*c.correct_tags/c.token_counter)) 178 | out.write('precision: %6.2f%%; ' % (100.*overall.prec)) 179 | out.write('recall: 
%6.2f%%; ' % (100.*overall.rec)) 180 | out.write('FB1: %6.2f\n' % (100.*overall.fscore)) 181 | 182 | for i, m in sorted(by_type.items()): 183 | out.write('%17s: ' % i) 184 | out.write('precision: %6.2f%%; ' % (100.*m.prec)) 185 | out.write('recall: %6.2f%%; ' % (100.*m.rec)) 186 | out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) 187 | 188 | def end_of_chunk(prev_tag, tag, prev_type, type_): 189 | # check if a chunk ended between the previous and current word 190 | # arguments: previous and current chunk tags, previous and current types 191 | chunk_end = False 192 | 193 | if prev_tag == 'E': chunk_end = True 194 | if prev_tag == 'S': chunk_end = True 195 | 196 | if prev_tag == 'B' and tag == 'B': chunk_end = True 197 | if prev_tag == 'B' and tag == 'S': chunk_end = True 198 | if prev_tag == 'B' and tag == 'O': chunk_end = True 199 | if prev_tag == 'I' and tag == 'B': chunk_end = True 200 | if prev_tag == 'I' and tag == 'S': chunk_end = True 201 | if prev_tag == 'I' and tag == 'O': chunk_end = True 202 | 203 | if prev_tag != 'O' and prev_tag != '.' and prev_type != type_: 204 | chunk_end = True 205 | 206 | # these chunks are assumed to have length 1 207 | if prev_tag == ']': chunk_end = True 208 | if prev_tag == '[': chunk_end = True 209 | 210 | return chunk_end 211 | 212 | def start_of_chunk(prev_tag, tag, prev_type, type_): 213 | # check if a chunk started between the previous and current word 214 | # arguments: previous and current chunk tags, previous and current types 215 | chunk_start = False 216 | 217 | if tag == 'B': chunk_start = True 218 | if tag == 'S': chunk_start = True 219 | 220 | if prev_tag == 'E' and tag == 'E': chunk_start = True 221 | if prev_tag == 'E' and tag == 'I': chunk_start = True 222 | if prev_tag == 'S' and tag == 'E': chunk_start = True 223 | if prev_tag == 'S' and tag == 'I': chunk_start = True 224 | if prev_tag == 'O' and tag == 'E': chunk_start = True 225 | if prev_tag == 'O' and tag == 'I': chunk_start = True 226 | 227 | if tag != 'O' and tag != '.' 
and prev_type != type_: 228 | chunk_start = True 229 | 230 | # these chunks are assumed to have length 1 231 | if tag == '[': chunk_start = True 232 | if tag == ']': chunk_start = True 233 | 234 | return chunk_start 235 | 236 | def main(argv): 237 | args = parse_args(argv[1:]) 238 | 239 | if args.file is None: 240 | counts = evaluate(sys.stdin, args) 241 | else: 242 | with open(args.file) as f: 243 | counts = evaluate(f, args) 244 | report(counts) 245 | 246 | if __name__ == '__main__': 247 | sys.exit(main(sys.argv)) 248 | -------------------------------------------------------------------------------- /sequence_labeling_experiment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import collections 3 | import numpy 4 | import random 5 | import math 6 | import gc 7 | import os 8 | import re 9 | import ConfigParser 10 | import theano 11 | 12 | from sequence_labeler import SequenceLabeler 13 | from sequence_labeling_evaluator import SequenceLabelingEvaluator 14 | 15 | floatX=theano.config.floatX 16 | 17 | def read_input_files(file_paths): 18 | sentences = [] 19 | for file_path in file_paths.strip().split(","): 20 | with open(file_path, "r") as f: 21 | words, labels = [], [] 22 | for line in f: 23 | if len(line.strip()) > 0: 24 | line_parts = line.strip().split() 25 | assert(len(line_parts) >= 2) 26 | words.append(line_parts[0]) 27 | labels.append(line_parts[-1]) 28 | elif len(line.strip()) == 0 and len(words) > 0: 29 | sentences.append((words, labels)) 30 | words, labels = [], [] 31 | if len(words) > 0: 32 | raise ValueError("The format expects an empty line at the end of the file in: " + file_path) 33 | return sentences 34 | 35 | 36 | def read_dataset(file_paths, lowercase_words, lowercase_chars, replace_digits, word2id, char2id, label2id): 37 | dataset = [] 38 | sentences = read_input_files(file_paths) 39 | 40 | for i in range(len(sentences)): 41 | word_ids = map_text_to_ids(" ".join(sentences[i][0]), word2id, "", "", "", lowercase=lowercase_words, replace_digits=replace_digits) 42 | char_ids = [map_text_to_ids("", char2id, "", "", "")] + \ 43 | [map_text_to_ids(" ".join(list(word)), char2id, "", "", "", lowercase=lowercase_chars, replace_digits=replace_digits) for word in sentences[i][0]] + \ 44 | [map_text_to_ids("", char2id, "", "", "")] 45 | label_ids = map_text_to_ids(" ".join(sentences[i][1]), label2id) 46 | 47 | assert(len(char_ids) == len(word_ids)) 48 | assert(len(char_ids) == len(label_ids) + 2) 49 | 50 | dataset.append((word_ids, char_ids, label_ids)) 51 | return dataset 52 | 53 | 54 | 55 | def create_batches(dataset, max_batch_size): 56 | """ 57 | Sort sentences by length and organise them into batches 58 | """ 59 | sentence_ids_by_length = collections.OrderedDict() 60 | for i in range(len(dataset)): 61 | length = len(dataset[i][0]) 62 | if length not in sentence_ids_by_length: 63 | sentence_ids_by_length[length] = [] 64 | sentence_ids_by_length[length].append(i) 65 | 66 | batches = [] 67 | for sentence_length in sentence_ids_by_length: 68 | for i in range(0, len(sentence_ids_by_length[sentence_length]), max_batch_size): 69 | sentence_ids_in_batch = sentence_ids_by_length[sentence_length][i:i + max_batch_size] 70 | max_word_length = numpy.array([[len(char_ids) for char_ids in dataset[sentence_id][1]] for sentence_id in sentence_ids_in_batch]).max() 71 | 72 | word_ids = numpy.zeros((len(sentence_ids_in_batch), sentence_length), dtype=numpy.int32) 73 | char_ids = numpy.zeros((len(sentence_ids_in_batch), sentence_length, 
max_word_length), dtype=numpy.int32) 74 | char_mask = numpy.zeros((len(sentence_ids_in_batch), sentence_length, max_word_length), dtype=numpy.int32) 75 | label_ids = numpy.zeros((len(sentence_ids_in_batch), sentence_length-2), dtype=numpy.int32) 76 | 77 | for i in range(len(sentence_ids_in_batch)): 78 | for j in range(sentence_length): 79 | word_ids[i][j] = dataset[sentence_ids_in_batch[i]][0][j] 80 | for j in range(sentence_length): 81 | for k in range(len(dataset[sentence_ids_in_batch[i]][1][j])): 82 | char_ids[i][j][k] = dataset[sentence_ids_in_batch[i]][1][j][k] 83 | char_mask[i][j][k] = 1 84 | for j in range(sentence_length-2): 85 | label_ids[i][j] = dataset[sentence_ids_in_batch[i]][2][j] 86 | batches.append((word_ids, char_ids, char_mask, label_ids, sentence_ids_in_batch)) 87 | return batches 88 | 89 | 90 | def process_batches(sequencelabeler, batches, testing, learningrate, name, main_label_id, label2id=None, conll_eval=False, verbose=True): 91 | evaluator = SequenceLabelingEvaluator(main_label_id, label2id, conll_eval) 92 | for word_ids, char_ids, char_mask, label_ids, sentence_ids_in_batch in batches: 93 | if testing == True: 94 | cost, predicted_labels = sequencelabeler.test(word_ids, char_ids, char_mask, label_ids) 95 | else: 96 | cost, predicted_labels = sequencelabeler.train(word_ids, char_ids, char_mask, label_ids, learningrate) 97 | evaluator.append_data(cost, predicted_labels, word_ids, label_ids) 98 | 99 | results = evaluator.get_results(name) 100 | if verbose == True: 101 | for key in results: 102 | print key + ": " + str(results[key]) 103 | return results[name + "_cost_sum"], results 104 | 105 | 106 | 107 | def is_float(value): 108 | try: 109 | float(value) 110 | return True 111 | except ValueError: 112 | return False 113 | 114 | def parse_config(config_section, config_path): 115 | config_parser = ConfigParser.SafeConfigParser(allow_no_value=True) 116 | config_parser.read(config_path) 117 | config = collections.OrderedDict() 118 | for key, value in config_parser.items(config_section): 119 | if value is None or len(value.strip()) == 0: 120 | config[key] = None 121 | elif value.lower() in ["true", "false"]: 122 | config[key] = config_parser.getboolean(config_section, key) 123 | elif value.isdigit(): 124 | config[key] = config_parser.getint(config_section, key) 125 | elif is_float(value): 126 | config[key] = config_parser.getfloat(config_section, key) 127 | else: 128 | config[key] = config_parser.get(config_section, key) 129 | return config 130 | 131 | 132 | def generate_word2id_dictionary(texts, min_freq=-1, insert_words=None, lowercase=False, replace_digits=False): 133 | counter = collections.Counter() 134 | for text in texts: 135 | if lowercase: 136 | text = text.lower() 137 | if replace_digits: 138 | text = re.sub(r'\d', '0', text) 139 | counter.update(text.strip().split()) 140 | 141 | word2id = collections.OrderedDict() 142 | if insert_words is not None: 143 | for word in insert_words: 144 | word2id[word] = len(word2id) 145 | 146 | word_count_list = counter.most_common() 147 | 148 | for (word, count) in word_count_list: 149 | if min_freq <= 0 or count >= min_freq: 150 | word2id[word] = len(word2id) 151 | 152 | return word2id 153 | 154 | 155 | def map_text_to_ids(text, word2id, start_token=None, end_token=None, unk_token=None, lowercase=False, replace_digits=False): 156 | ids = [] 157 | 158 | if lowercase: 159 | text = text.lower() 160 | if replace_digits: 161 | text = re.sub(r'\d', '0', text) 162 | 163 | if start_token != None: 164 | text = start_token + " " + text 
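# Note: tokens missing from word2id are mapped to unk_token in the loop below, or silently dropped when no unk_token is given.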
165 |     if end_token != None:
166 |         text = text + " " + end_token
167 |     for word in text.strip().split():
168 |         if word in word2id:
169 |             ids.append(word2id[word])
170 |         elif unk_token != None:
171 |             ids.append(word2id[unk_token])
172 |     return ids
173 | 
174 | 
175 | 
176 | def preload_vectors(word2id, vector_size, word2vec_path):
177 |     rng = numpy.random.RandomState(123)
178 |     preloaded_vectors = numpy.asarray(rng.normal(loc=0.0, scale=0.1, size=(len(word2id), vector_size)), dtype=floatX)
179 | 
180 |     with open(word2vec_path) as f:
181 |         for line in f:
182 |             line_parts = line.strip().split()
183 |             if len(line_parts) <= 2:
184 |                 continue
185 |             word = line_parts[0]
186 |             if word in word2id:
187 |                 word_id = word2id[word]
188 |                 vector = numpy.array(line_parts[1:])
189 |                 preloaded_vectors[word_id] = vector
190 |     return preloaded_vectors
191 | 
192 | 
193 | def run_experiment(config_path):
194 |     config = parse_config("config", config_path)
195 |     random.seed(config["random_seed"] + 1)
196 |     temp_model_path = config_path + ".model"
197 |     sequencelabeler = None
198 | 
199 |     # Preparing dictionaries
200 |     if config["path_train"] is not None and len(config["path_train"]) > 0:
201 |         sentences_train = read_input_files(config["path_train"])
202 |         word2id = generate_word2id_dictionary([" ".join(sentence[0]) for sentence in sentences_train],
203 |                                               min_freq=config["min_word_freq"],
204 |                                               insert_words=["", "", ""],
205 |                                               lowercase=False,
206 |                                               replace_digits=True)
207 |         label2id = generate_word2id_dictionary([" ".join(sentence[1]) for sentence in sentences_train])
208 |         char2id = generate_word2id_dictionary([" ".join([" ".join(list(word)) for word in sentence[0]]) for sentence in sentences_train],
209 |                                               min_freq=-1,
210 |                                               insert_words=["", "", "", "", ""],
211 |                                               lowercase=False,
212 |                                               replace_digits=True)
213 | 
214 |     if config["load"] is not None and len(config["load"]) > 0:
215 |         if config["rebuild_output_layer"] == True:
216 |             sequencelabeler = SequenceLabeler.load(config["load"], new_output_layer_size=len(label2id))
217 |             # label2id = label2id
218 |         else:
219 |             sequencelabeler = SequenceLabeler.load(config["load"])
220 |             label2id = sequencelabeler.config["label2id"]
221 |         word2id = sequencelabeler.config["word2id"]
222 |         char2id = sequencelabeler.config["char2id"]
223 | 
224 |     if config["path_train"] is not None and len(config["path_train"]) > 0:
225 |         data_train = read_dataset(config["path_train"], False, False, True, word2id, char2id, label2id)
226 | 
227 |     if config["load"] is None or len(config["load"]) == 0:
228 |         config["n_words"] = len(word2id)
229 |         config["n_chars"] = len(char2id)
230 |         config["n_labels"] = len(label2id)
231 |         config["unk_token"] = ""
232 |         config["unk_token_id"] = word2id[""]
233 |         sequencelabeler = SequenceLabeler(config)
234 |         if config['preload_vectors'] is not None:
235 |             new_embeddings = preload_vectors(word2id, config['word_embedding_size'], config['preload_vectors'])
236 |             sequencelabeler.word_embeddings.set_value(new_embeddings)
237 | 
238 |     if config["path_dev"] is not None and len(config["path_dev"]) > 0:
239 |         data_dev = read_dataset(config["path_dev"], False, False, True, word2id, char2id, label2id)
240 |         batches_dev = create_batches(data_dev, config['max_batch_size'])
241 | 
242 |     # printing config
243 |     for key, val in config.items():
244 |         print key, ": ", val
245 |     print "parameter_count: ", sequencelabeler.get_parameter_count()
246 |     print "parameter_count_without_word_embeddings: ", sequencelabeler.get_parameter_count_without_word_embeddings()
247 | 
248 |     config["word2id"] = word2id
config["word2id"] = word2id 249 | config["char2id"] = char2id 250 | config["label2id"] = label2id 251 | 252 | if config["path_train"] is not None and len(config["path_train"]) > 0: 253 | best_selector_value = 0.0 254 | learningrate = config["learningrate"] 255 | for epoch in xrange(config["epochs"]): 256 | print("EPOCH: " + str(epoch)) 257 | print("learningrate: " + str(learningrate)) 258 | random.shuffle(data_train) 259 | batches_train = create_batches(data_train, config['max_batch_size']) 260 | random.shuffle(batches_train) 261 | 262 | train_cost_sum, results_train = process_batches(sequencelabeler, batches_train, testing=False, learningrate=learningrate, name="train", main_label_id=label2id[str(config["main_label"])], label2id=label2id, conll_eval=config["conll_eval"], verbose=True) 263 | dev_cost_sum, results_dev = process_batches(sequencelabeler, batches_dev, testing=True, learningrate=0.0, name="dev", main_label_id=label2id[str(config["main_label"])], label2id=label2id, conll_eval=config["conll_eval"], verbose=True) 264 | 265 | if math.isnan(dev_cost_sum) or math.isinf(dev_cost_sum): 266 | sys.stderr.write("ERROR: Cost is NaN or Inf. Exiting.\n") 267 | break 268 | 269 | if (epoch == 0 or (config["best_model_selector"].split(":")[1] == "high" and results_dev[config["best_model_selector"].split(":")[0]] > best_selector_value) 270 | or (config["best_model_selector"].split(":")[1] == "low" and results_dev[config["best_model_selector"].split(":")[0]] < best_selector_value)): 271 | best_epoch = epoch 272 | best_selector_value = results_dev[config["best_model_selector"].split(":")[0]] 273 | sequencelabeler.save(temp_model_path) 274 | print("best_epoch: " + str(best_epoch)) 275 | 276 | batches_train = None 277 | gc.collect() 278 | 279 | if config["stop_if_no_improvement_for_epochs"] > 0 and (epoch - best_epoch) >= config["stop_if_no_improvement_for_epochs"]: 280 | break 281 | 282 | # loading the best model so far 283 | if config["epochs"] > 0: 284 | sequencelabeler = SequenceLabeler.load(temp_model_path) 285 | os.remove(temp_model_path) 286 | 287 | if config["save"] is not None and len(config["save"]) > 0: 288 | sequencelabeler.save(config["save"]) 289 | 290 | if config["path_test"] is not None: 291 | i = 0 292 | for path_test in config["path_test"].strip().split(":"): 293 | data_test = read_dataset(path_test, False, False, True, word2id, char2id, label2id) 294 | batches_test = create_batches(data_test, config['max_batch_size']) 295 | test_cost_sum, results_test = process_batches(sequencelabeler, batches_test, testing=True, learningrate=0.0, name="test" + (str(i) if len(batches_test) > 1 else ""), main_label_id=label2id[str(config["main_label"])], label2id=label2id, conll_eval=config["conll_eval"], verbose=True) 296 | i += 1 297 | 298 | 299 | if __name__ == "__main__": 300 | run_experiment(sys.argv[1]) 301 | --------------------------------------------------------------------------------