├── data └── ATIS_samples │ ├── test │ ├── test.label │ ├── test.seq.in │ └── test.seq.out │ ├── train │ ├── train.label │ ├── train.seq.in │ └── train.seq.out │ └── valid │ ├── valid.label │ ├── valid.seq.in │ └── valid.seq.out ├── README.md ├── seq_classification.py ├── seq_labeling.py ├── data_utils.py ├── conlleval.pl ├── run_multi-task_rnn.py └── multi_task_model.py /data/ATIS_samples/test/test.label: -------------------------------------------------------------------------------- 1 | flight 2 | airfare 3 | flight 4 | flight 5 | flight 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/train/train.label: -------------------------------------------------------------------------------- 1 | airfare 2 | flight 3 | flight 4 | airfare 5 | flight 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/valid/valid.label: -------------------------------------------------------------------------------- 1 | flight 2 | flight 3 | flight_time 4 | airfare 5 | airfare 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/train/train.seq.in: -------------------------------------------------------------------------------- 1 | what's the lowest round trip fare from dallas to atlanta 2 | find me the earliest flight from boston to atlanta on any day of the week 3 | display all flights from boston to baltimore on july thirty first 4 | economy fares new york to miami round trip 5 | i need to fly from boston to denver on to san francisco and back 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/test/test.seq.in: -------------------------------------------------------------------------------- 1 | i would like to find a flight from charlotte to las vegas that makes a stop in st. 
louis 2 | on april first i need a ticket from tacoma to san jose departing before DIGIT am 3 | on april first i need a flight going from phoenix to san diego 4 | i would like a flight traveling one way from phoenix to san diego on april first 5 | i would like a flight from orlando to salt lake city for april first on delta airlines 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/valid/valid.seq.in: -------------------------------------------------------------------------------- 1 | i want to fly from boston at DIGITDIGITDIGIT am and arrive in denver at DIGITDIGITDIGITDIGIT in the morning 2 | what flights are available from pittsburgh to baltimore on thursday morning 3 | what is the arrival time in san francisco for the DIGITDIGITDIGIT am flight leaving washington 4 | cheapest airfare from tacoma to orlando 5 | round trip fares from pittsburgh to philadelphia under DIGITDIGITDIGITDIGIT dollars 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/train/train.seq.out: -------------------------------------------------------------------------------- 1 | O O B-cost_relative B-round_trip I-round_trip O O B-fromloc.city_name O B-toloc.city_name 2 | O O O B-flight_mod O O B-fromloc.city_name O B-toloc.city_name O O O O O O 3 | O O O O B-fromloc.city_name O B-toloc.city_name O B-depart_date.month_name B-depart_date.day_number I-depart_date.day_number 4 | B-economy O B-fromloc.city_name I-fromloc.city_name O B-toloc.city_name B-round_trip I-round_trip 5 | O O O O O B-fromloc.city_name O B-toloc.city_name O O B-toloc.city_name I-toloc.city_name O O 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/valid/valid.seq.out: -------------------------------------------------------------------------------- 1 | O O O O O B-fromloc.city_name O B-depart_time.time I-depart_time.time O O O B-toloc.city_name O B-arrive_time.time O O B-arrive_time.period_of_day 2 | O O O O O B-fromloc.city_name O B-toloc.city_name O B-depart_date.day_name B-depart_time.period_of_day 3 | O O O B-flight_time I-flight_time O B-fromloc.city_name I-fromloc.city_name O O B-depart_time.time I-depart_time.time O O B-fromloc.city_name 4 | B-cost_relative O O B-fromloc.city_name O B-toloc.city_name 5 | B-round_trip I-round_trip O O B-fromloc.city_name O B-toloc.city_name B-cost_relative B-fare_amount I-fare_amount 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/test/test.seq.out: -------------------------------------------------------------------------------- 1 | O O O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O O O O O B-stoploc.city_name I-stoploc.city_name 2 | O B-depart_date.month_name B-depart_date.day_number O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O B-depart_time.time_relative B-depart_time.time I-depart_time.time 3 | O B-depart_date.month_name B-depart_date.day_number O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name 4 | O O O O O O B-round_trip I-round_trip O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O B-depart_date.month_name B-depart_date.day_number 5 | O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name I-toloc.city_name O B-depart_date.month_name B-depart_date.day_number O B-airline_name I-airline_name 6 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | Attention-based RNN model for Spoken Language Understanding (Intent Detection & Slot Filling) 2 | ================== 3 | TensorFlow implementation of attention-based LSTM models for sequence classification and sequence labeling. 4 | 5 | **Updates - 2017/07/29** 6 | * Updated code to work with the latest TensorFlow API: r1.2 7 | * Code cleanup and formatting 8 | * Note that this published code does not include the modeling of output label dependencies. One may add a loop function, as in the rnn_decoder function in TensorFlow's seq2seq.py example, to feed the emitted label embedding back to the RNN state. Alternatively, sequence-level optimization can be performed by adding a CRF layer on top of the RNN outputs. 9 | * The dataset used in the paper can be found at: https://github.com/yvchen/JointSLU/tree/master/data. We used the training set in the original ATIS train/test split, which has 4978 training samples. There are 15 test samples that have multiple intent labels for an utterance. We used the more frequent label (most likely, "flight") as the true label during evaluation. 10 | 11 | 12 | **Setup** 13 | 14 | * TensorFlow, version r1.2 (https://www.tensorflow.org/api_docs/) 15 | 16 | **Usage**: 17 | ```bash 18 | data_dir=data/ATIS_samples 19 | model_dir=model_tmp 20 | max_sequence_length=50 # max length for train/valid/test sequences 21 | task=joint # available options: intent; tagging; joint 22 | bidirectional_rnn=True # available options: True; False 23 | use_attention=True # available options: True; False 24 | 25 | python run_multi-task_rnn.py --data_dir $data_dir \ 26 | --train_dir $model_dir \ 27 | --max_sequence_length $max_sequence_length \ 28 | --task $task \ 29 | --bidirectional_rnn $bidirectional_rnn \ 30 | --use_attention $use_attention 31 | ``` 32 | 33 | **Reference** 34 | 35 | * Bing Liu, Ian Lane, "Attention-Based Recurrent Neural Network Models for Joint Intent Detection and Slot Filling", Interspeech, 2016 (PDF) 36 | 37 | ``` 38 | @inproceedings{Liu+2016, 39 | author={Bing Liu and Ian Lane}, 40 | title={Attention-Based Recurrent Neural Network Models for Joint Intent Detection and Slot Filling}, 41 | year=2016, 42 | booktitle={Interspeech 2016}, 43 | doi={10.21437/Interspeech.2016-1352}, 44 | url={http://dx.doi.org/10.21437/Interspeech.2016-1352}, 45 | pages={685--689} 46 | } 47 | ``` 48 | 49 | **Contact** 50 | 51 | Feel free to email liubing@cmu.edu with any questions or bug reports regarding the code. 52 | -------------------------------------------------------------------------------- /seq_classification.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Feb 28 15:28:44 2016 4 | 5 | @author: Bing Liu (liubing@cmu.edu) 6 | """ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | from six.moves import xrange # pylint: disable=redefined-builtin 13 | # We disable pylint because we need python3 compatibility.
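# Editorial note: rnn_cell_impl._linear (aliased below) is a private
# TensorFlow API that was removed after the r1.x series. A minimal
# drop-in sketch, assuming TF r1.x variable-scope semantics (hypothetical
# helper, not part of this repo):
#
#   def _linear_fallback(args, output_size, bias):
#     """Computes concat(args) * W (+ b) in the current variable scope."""
#     if not isinstance(args, (list, tuple)):
#       args = [args]
#     total_size = sum(a.get_shape()[1].value for a in args)
#     weights = tf.get_variable("Matrix", [total_size, output_size])
#     res = tf.matmul(tf.concat(args, 1), weights)
#     if bias:
#       res += tf.get_variable("Bias", [output_size],
#                              initializer=tf.constant_initializer(0.0))
#     return res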
14 | import tensorflow as tf 15 | from tensorflow.python.ops import rnn_cell_impl 16 | 17 | linear = rnn_cell_impl._linear 18 | 19 | def attention_single_output_decoder(initial_state, 20 | attention_states, 21 | output_size=None, 22 | num_heads=1, 23 | dtype=tf.float32, 24 | scope=None, 25 | sequence_length=tf.ones([16]), 26 | initial_state_attention=True, 27 | use_attention=False): 28 | 29 | if num_heads < 1: 30 | raise ValueError("With fewer than one head, use a non-attention decoder.") 31 | if not attention_states.get_shape()[1:2].is_fully_defined(): 32 | raise ValueError("Shape[1] and [2] of attention_states must be known: %s" 33 | % attention_states.get_shape()) 34 | 35 | with tf.variable_scope(scope or "decoder_single_output"): 36 | # print (initial_state.eval().shape) 37 | batch_size = tf.shape(initial_state)[0] # Needed for reshaping. 38 | # print (attention_states.get_shape()) 39 | attn_length = attention_states.get_shape()[1].value 40 | attn_size = attention_states.get_shape()[2].value 41 | 42 | # To calculate W1 * h_t we use a 1-by-1 convolution; we need to reshape first. 43 | hidden = tf.reshape( 44 | attention_states, [-1, attn_length, 1, attn_size]) 45 | hidden_features = [] 46 | v = [] 47 | attention_vec_size = attn_size # Size of query vectors for attention. 48 | for a in xrange(num_heads): 49 | k = tf.get_variable("AttnW_%d" % a, 50 | [1, 1, attn_size, attention_vec_size]) 51 | hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) 52 | v.append(tf.get_variable("AttnV_%d" % a, 53 | [attention_vec_size])) 54 | 55 | # state = initial_state 56 | 57 | def attention(query, use_attention=False): 58 | """Put attention masks on hidden using hidden_features and query.""" 59 | attn_weights = [] 60 | ds = [] # Results of attention reads will be stored here. 61 | for i in xrange(num_heads): 62 | with tf.variable_scope("Attention_%d" % i): 63 | # Project the query to the attention vector size. 64 | y = linear(query, attention_vec_size, True) 65 | y = tf.reshape(y, [-1, 1, 1, attention_vec_size]) 66 | # Attention mask is a softmax of v^T * tanh(...). 67 | s = tf.reduce_sum( 68 | v[i] * tf.tanh(hidden_features[i] + y), [2, 3]) 69 | if use_attention is False: # apply mean pooling 70 | weights = tf.tile(sequence_length, tf.stack([attn_length])) 71 | weights = tf.reshape(weights, tf.shape(s)) 72 | a = tf.ones(tf.shape(s), dtype=dtype) / tf.to_float(weights) 73 | # a = tf.ones(tf.shape(s), dtype=dtype) / tf.to_float(tf.shape(s)[1]) 74 | else: 75 | a = tf.nn.softmax(s) 76 | attn_weights.append(a) 77 | # Now calculate the attention-weighted vector d. 78 | d = tf.reduce_sum( 79 | tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, 80 | [1, 2]) 81 | ds.append(tf.reshape(d, [-1, attn_size])) 82 | return attn_weights, ds 83 | 84 | batch_attn_size = tf.stack([batch_size, attn_size]) 85 | attns = [tf.zeros(batch_attn_size, dtype=dtype) 86 | for _ in xrange(num_heads)] 87 | for a in attns: # Ensure the second shape of attention vectors is set.
88 | a.set_shape([None, attn_size]) 89 | if initial_state_attention: 90 | attn_weights, attns = attention(initial_state, use_attention=use_attention) 91 | 92 | #with variable_scope.variable_scope(scope or "Linear"): 93 | matrix = tf.get_variable("Out_Matrix", [attn_size, output_size]) 94 | res = tf.matmul(attns[0], matrix) 95 | # NOTE: here we temporarily assume num_head = 1 96 | bias_start = 0.0 97 | bias_term = tf.get_variable("Out_Bias", 98 | [output_size], 99 | initializer=tf.constant_initializer(bias_start)) 100 | output = res + bias_term 101 | # NOTE: here we temporarily assume num_head = 1 102 | return attention_states, attn_weights[0], attns[0], [output] 103 | 104 | def generate_single_output(encoder_state, attention_states, sequence_length, 105 | targets, num_classes, buckets, 106 | use_mean_attention=False, 107 | softmax_loss_function=None, per_example_loss=False, 108 | name=None, use_attention=False): 109 | all_inputs = targets 110 | with tf.name_scope(name, "model_with_buckets", all_inputs): 111 | with tf.variable_scope(tf.get_variable_scope(), 112 | reuse=None): 113 | single_outputs = attention_single_output_decoder(encoder_state, 114 | attention_states, 115 | output_size=num_classes, 116 | num_heads=1, 117 | sequence_length=sequence_length, 118 | use_attention=use_attention) 119 | _, _, _, bucket_outputs = single_outputs 120 | 121 | if softmax_loss_function is None: 122 | assert len(bucket_outputs) == len(targets) == 1 123 | # We need to make the target an int64 tensor and set its shape. 124 | bucket_target = tf.reshape(tf.to_int64(targets[0]), [-1]) 125 | crossent = tf.nn.sparse_softmax_cross_entropy_with_logits( 126 | logits=bucket_outputs[0], labels=bucket_target) 127 | else: 128 | assert len(bucket_outputs) == len(targets) == 1 129 | crossent = softmax_loss_function(bucket_outputs[0], targets[0]) 130 | 131 | batch_size = tf.shape(targets[0])[0] 132 | loss = tf.reduce_sum(crossent) / tf.cast(batch_size, tf.float32) 133 | 134 | return bucket_outputs, loss -------------------------------------------------------------------------------- /seq_labeling.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Feb 28 11:32:21 2016 4 | 5 | @author: Bing Liu (liubing@cmu.edu) 6 | """ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | # We disable pylint because we need python3 compatibility.
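# Editorial note: a minimal usage sketch for this module (hypothetical
# variable names and shapes, not part of the original repo).
# generate_sequence_output, defined below, turns per-step encoder outputs
# into one slot logit per token:
#
#   logits, loss = generate_sequence_output(
#       num_encoder_symbols=len(vocab),
#       encoder_outputs=encoder_outputs,  # length-T list of [batch, hidden]
#       encoder_state=encoder_state,      # final RNN state, [batch, state]
#       targets=tag_ids,                  # length-T list of [batch] int ids
#       sequence_length=seq_len,          # [batch] true utterance lengths
#       num_decoder_symbols=len(tag_vocab),
#       weights=tag_weights,              # length-T list masking the padding
#       buckets=_buckets,
#       use_attention=True)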
13 | from six.moves import xrange # pylint: disable=redefined-builtin 14 | #from six.moves import zip # pylint: disable=redefined-builtin 15 | 16 | import tensorflow as tf 17 | from tensorflow.python.ops import control_flow_ops 18 | from tensorflow.python.framework import tensor_shape 19 | from tensorflow.contrib.legacy_seq2seq import sequence_loss_by_example 20 | from tensorflow.contrib.legacy_seq2seq import sequence_loss 21 | 22 | from tensorflow.python.ops import rnn_cell_impl 23 | 24 | linear = rnn_cell_impl._linear 25 | 26 | def _step(time, sequence_length, min_sequence_length, 27 | max_sequence_length, zero_logit, generate_logit): 28 | # Step 1: determine whether we need to call_cell or not 29 | empty_update = lambda: zero_logit 30 | logit = control_flow_ops.cond( 31 | time < max_sequence_length, generate_logit, empty_update) 32 | 33 | # Step 2: determine whether we need to copy through state and/or outputs 34 | existing_logit = lambda: logit 35 | 36 | def copy_through(): 37 | # Use broadcasting select to determine which values should get 38 | # the previous state & zero output, and which values should get 39 | # a calculated state & output. 40 | copy_cond = (time >= sequence_length) 41 | return tf.where(copy_cond, zero_logit, logit) 42 | 43 | logit = control_flow_ops.cond( 44 | time < min_sequence_length, existing_logit, copy_through) 45 | logit.set_shape(zero_logit.get_shape()) 46 | return logit 47 | 48 | def attention_RNN(encoder_outputs, 49 | encoder_state, 50 | num_decoder_symbols, 51 | sequence_length, 52 | num_heads=1, 53 | dtype=tf.float32, 54 | use_attention=True, 55 | loop_function=None, 56 | scope=None): 57 | if use_attention: 58 | print ('Use the attention RNN model') 59 | if num_heads < 1: 60 | raise ValueError("With fewer than one head, use a non-attention decoder.") 61 | 62 | with tf.variable_scope(scope or "attention_RNN"): 63 | output_size = encoder_outputs[0].get_shape()[1].value 64 | top_states = [tf.reshape(e, [-1, 1, output_size]) 65 | for e in encoder_outputs] 66 | attention_states = tf.concat(top_states, 1) 67 | if not attention_states.get_shape()[1:2].is_fully_defined(): 68 | raise ValueError("Shape[1] and [2] of attention_states must be known: %s" 69 | % attention_states.get_shape()) 70 | 71 | batch_size = tf.shape(top_states[0])[0] # Needed for reshaping. 72 | attn_length = attention_states.get_shape()[1].value 73 | attn_size = attention_states.get_shape()[2].value 74 | 75 | # To calculate W1 * h_t we use a 1-by-1 convolution; we need to reshape first. 76 | hidden = tf.reshape( 77 | attention_states, [-1, attn_length, 1, attn_size]) 78 | hidden_features = [] 79 | v = [] 80 | attention_vec_size = attn_size # Size of query vectors for attention. 81 | for a in xrange(num_heads): 82 | k = tf.get_variable("AttnW_%d" % a, 83 | [1, 1, attn_size, attention_vec_size]) 84 | hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) 85 | v.append(tf.get_variable("AttnV_%d" % a, 86 | [attention_vec_size])) 87 | 88 | def attention(query): 89 | """Put attention masks on hidden using hidden_features and query.""" 90 | attn_weights = [] 91 | ds = [] # Results of attention reads will be stored here. 92 | for i in xrange(num_heads): 93 | with tf.variable_scope("Attention_%d" % i): 94 | # Project the query to the attention vector size. 95 | y = linear(query, attention_vec_size, True) 96 | y = tf.reshape(y, [-1, 1, 1, attention_vec_size]) 97 | # Attention mask is a softmax of v^T * tanh(...).
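# Editorial note: in equation form (single head), with encoder outputs h_j
# and query q, the block below computes
#   e_j = v^T tanh(W1 h_j + W2 q),   a = softmax(e),   d = sum_j a_j h_j,
# where W1 h_j is the 1x1 convolution applied to `hidden` above and
# W2 q is the projection `y`.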
98 | s = tf.reduce_sum( 99 | v[i] * tf.tanh(hidden_features[i] + y), [2, 3]) 100 | a = tf.nn.softmax(s) 101 | attn_weights.append(a) 102 | # Now calculate the attention-weighted vector d. 103 | d = tf.reduce_sum( 104 | tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, 105 | [1, 2]) 106 | ds.append(tf.reshape(d, [-1, attn_size])) 107 | return attn_weights, ds 108 | 109 | batch_attn_size = tf.stack([batch_size, attn_size]) 110 | attns = [tf.zeros(batch_attn_size, dtype=dtype) 111 | for _ in xrange(num_heads)] 112 | for a in attns: # Ensure the second shape of attention vectors is set. 113 | a.set_shape([None, attn_size]) 114 | 115 | # loop through the encoder_outputs 116 | attention_encoder_outputs = list() 117 | sequence_attention_weights = list() 118 | for i in xrange(len(encoder_outputs)): 119 | if i > 0: 120 | tf.get_variable_scope().reuse_variables() 121 | if i == 0: 122 | with tf.variable_scope("Initial_Decoder_Attention"): 123 | initial_state = linear(encoder_state, output_size, True) 124 | attn_weights, ds = attention(initial_state) 125 | else: 126 | attn_weights, ds = attention(encoder_outputs[i]) 127 | output = tf.concat([ds[0], encoder_outputs[i]], 1) 128 | # NOTE: here we temporarily assume num_head = 1 129 | with tf.variable_scope("AttnRnnOutputProjection"): 130 | logit = linear(output, num_decoder_symbols, True) 131 | attention_encoder_outputs.append(logit) 132 | # NOTE: here we temporarily assume num_head = 1 133 | sequence_attention_weights.append(attn_weights[0]) 134 | # NOTE: here we temporarily assume num_head = 1 135 | else: 136 | print ('Use the NON attention RNN model') 137 | with tf.variable_scope(scope or "non-attention_RNN"): 138 | attention_encoder_outputs = list() 139 | sequence_attention_weights = list() 140 | 141 | # copy over logits once out of sequence_length 142 | if encoder_outputs[0].get_shape().ndims != 1: 143 | (fixed_batch_size, output_size) = encoder_outputs[0].get_shape().with_rank(2) 144 | else: 145 | fixed_batch_size = encoder_outputs[0].get_shape().with_rank_at_least(1)[0] 146 | 147 | if fixed_batch_size.value: 148 | batch_size = fixed_batch_size.value 149 | else: 150 | batch_size = tf.shape(encoder_outputs[0])[0] 151 | if sequence_length is not None: 152 | sequence_length = tf.to_int32(sequence_length) 153 | if sequence_length is not None: # Prepare variables 154 | zero_logit = tf.zeros( 155 | tf.stack([batch_size, num_decoder_symbols]), encoder_outputs[0].dtype) 156 | zero_logit.set_shape( 157 | tensor_shape.TensorShape([fixed_batch_size.value, 158 | num_decoder_symbols])) 159 | min_sequence_length = tf.reduce_min(sequence_length) 160 | max_sequence_length = tf.reduce_max(sequence_length) 161 | 162 | #reuse = False 163 | for time, input_ in enumerate(encoder_outputs): 164 | if time > 0: 165 | tf.get_variable_scope().reuse_variables() 166 | #reuse = True 167 | # pylint: disable=cell-var-from-loop 168 | # call_cell = lambda: cell(input_, state) 169 | generate_logit = lambda: linear(encoder_outputs[time], 170 | num_decoder_symbols, 171 | True) 172 | # pylint: enable=cell-var-from-loop 173 | if sequence_length is not None: 174 | logit = _step(time, sequence_length, min_sequence_length, 175 | max_sequence_length, zero_logit, generate_logit) 176 | else: 177 | logit = generate_logit() 178 | attention_encoder_outputs.append(logit) 179 | 180 | return attention_encoder_outputs, sequence_attention_weights 181 | 182 | 183 | def generate_sequence_output(num_encoder_symbols, 184 | encoder_outputs, 185 | encoder_state, 186 | targets, 187 | sequence_length, 188 |
num_decoder_symbols, 189 | weights, 190 | buckets, 191 | softmax_loss_function=None, 192 | per_example_loss=False, 193 | name=None, 194 | use_attention=False): 195 | if len(targets) < buckets[-1][1]: 196 | raise ValueError("Length of targets (%d) must be at least that of last " 197 | "bucket (%d)." % (len(targets), buckets[-1][1])) 198 | 199 | all_inputs = encoder_outputs + targets + weights 200 | with tf.name_scope(name, "model_with_buckets", all_inputs): 201 | with tf.variable_scope("decoder_sequence_output", reuse=None): 202 | logits, attention_weights = attention_RNN(encoder_outputs, 203 | encoder_state, 204 | num_decoder_symbols, 205 | sequence_length, 206 | use_attention=use_attention) 207 | if per_example_loss: 208 | assert len(logits) == len(targets) 209 | # We need to make the targets int64 tensors and set their shapes. 210 | bucket_target = [tf.reshape(tf.to_int64(x), [-1]) for x in targets] 211 | crossent = sequence_loss_by_example( 212 | logits, bucket_target, weights, 213 | softmax_loss_function=softmax_loss_function) 214 | else: 215 | assert len(logits) == len(targets) 216 | bucket_target = [tf.reshape(tf.to_int64(x), [-1]) for x in targets] 217 | crossent = sequence_loss( 218 | logits, bucket_target, weights, 219 | softmax_loss_function=softmax_loss_function) 220 | 221 | return logits, crossent 222 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Feb 27 09:33:32 2016 4 | 5 | @author: Bing Liu (liubing@cmu.edu) 6 | 7 | Prepare data for multi-task RNN model. 8 | """ 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import os 15 | import re 16 | 17 | from tensorflow.python.platform import gfile 18 | 19 | # Special vocabulary symbols - we always put them at the start. 20 | _PAD = "_PAD" 21 | _UNK = "_UNK" 22 | _START_VOCAB = [_PAD, _UNK] 23 | 24 | START_VOCAB_dict = dict() 25 | START_VOCAB_dict['with_padding'] = [_PAD, _UNK] 26 | START_VOCAB_dict['no_padding'] = [_UNK] 27 | 28 | 29 | PAD_ID = 0 30 | 31 | UNK_ID_dict = dict() 32 | UNK_ID_dict['with_padding'] = 1 33 | UNK_ID_dict['no_padding'] = 0 34 | 35 | # Regular expressions used to tokenize. 36 | _WORD_SPLIT = re.compile("([.,!?\"':;)(])") 37 | _DIGIT_RE = re.compile(r"\d") 38 | 39 | def basic_tokenizer(sentence): 40 | """Very basic tokenizer: split the sentence into a list of tokens.""" 41 | words = [] 42 | for space_separated_fragment in sentence.strip().split(): 43 | words.extend(re.split(_WORD_SPLIT, space_separated_fragment)) 44 | return [w for w in words if w] 45 | 46 | def naive_tokenizer(sentence): 47 | """Naive tokenizer: split the sentence by space into a list of tokens.""" 48 | return sentence.split() 49 | 50 | 51 | def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, 52 | tokenizer=None, normalize_digits=True): 53 | """Create vocabulary file (if it does not exist yet) from data file. 54 | 55 | Data file is assumed to contain one sentence per line. Each sentence is 56 | tokenized and digits are normalized (if normalize_digits is set). 57 | Vocabulary contains the most-frequent tokens up to max_vocabulary_size. 58 | We write it to vocabulary_path in a one-token-per-line format, so that the
60 | 61 | Args: 62 | vocabulary_path: path where the vocabulary will be created. 63 | data_path: data file that will be used to create vocabulary. 64 | max_vocabulary_size: limit on the size of the created vocabulary. 65 | tokenizer: a function to use to tokenize each data sentence; 66 | if None, basic_tokenizer will be used. 67 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 68 | """ 69 | if not gfile.Exists(vocabulary_path): 70 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) 71 | vocab = {} 72 | with gfile.GFile(data_path, mode="r") as f: 73 | counter = 0 74 | for line in f: 75 | counter += 1 76 | if counter % 100000 == 0: 77 | print(" processing line %d" % counter) 78 | tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) 79 | for w in tokens: 80 | word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w 81 | if word in vocab: 82 | vocab[word] += 1 83 | else: 84 | vocab[word] = 1 85 | vocab_list = START_VOCAB_dict['with_padding'] + \ 86 | sorted(vocab, key=vocab.get, reverse=True) 87 | if len(vocab_list) > max_vocabulary_size: 88 | vocab_list = vocab_list[:max_vocabulary_size] 89 | with gfile.GFile(vocabulary_path, mode="w") as vocab_file: 90 | for w in vocab_list: 91 | vocab_file.write(w + "\n") 92 | 93 | 94 | def initialize_vocab(vocabulary_path): 95 | """Initialize vocabulary from file. 96 | 97 | We assume the vocabulary is stored one-item-per-line, so a file: 98 | dog 99 | cat 100 | will result in a vocabulary {"dog": 0, "cat": 1}, and this function will 101 | also return the reversed-vocabulary ["dog", "cat"]. 102 | 103 | Args: 104 | vocabulary_path: path to the file containing the vocabulary. 105 | 106 | Returns: 107 | a pair: the vocabulary (a dictionary mapping string to integers), and 108 | the reversed vocabulary (a list, which reverses the vocabulary mapping). 109 | 110 | Raises: 111 | ValueError: if the provided vocabulary_path does not exist. 112 | """ 113 | if gfile.Exists(vocabulary_path): 114 | rev_vocab = [] 115 | with gfile.GFile(vocabulary_path, mode="r") as f: 116 | rev_vocab.extend(f.readlines()) 117 | rev_vocab = [line.strip() for line in rev_vocab] 118 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 119 | return vocab, rev_vocab 120 | else: 121 | raise ValueError("Vocabulary file %s not found.", vocabulary_path) 122 | 123 | 124 | def sentence_to_token_ids(sentence, vocabulary, UNK_ID, 125 | tokenizer=None, normalize_digits=True): 126 | """Convert a string to list of integers representing token-ids. 127 | 128 | For example, a sentence "I have a dog" may become tokenized into 129 | ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2, 130 | "a": 4, "dog": 7"} this function will return [1, 2, 4, 7]. 131 | 132 | Args: 133 | sentence: a string, the sentence to convert to token-ids. 134 | vocabulary: a dictionary mapping tokens to integers. 135 | tokenizer: a function to use to tokenize each sentence; 136 | if None, basic_tokenizer will be used. 137 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 138 | 139 | Returns: 140 | a list of integers, the token-ids for the sentence. 141 | """ 142 | if tokenizer: 143 | words = tokenizer(sentence) 144 | else: 145 | words = basic_tokenizer(sentence) 146 | if not normalize_digits: 147 | return [vocabulary.get(w, UNK_ID) for w in words] 148 | # Normalize digits by 0 before looking words up in the vocabulary. 
149 | return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words] 150 | 151 | 152 | def data_to_token_ids(data_path, target_path, vocabulary_path, 153 | tokenizer=None, normalize_digits=True, use_padding=True): 154 | """Tokenize data file and turn into token-ids using given vocabulary file. 155 | 156 | This function loads data line-by-line from data_path, calls the above 157 | sentence_to_token_ids, and saves the result to target_path. See comment 158 | for sentence_to_token_ids on the details of token-ids format. 159 | 160 | Args: 161 | data_path: path to the data file in one-sentence-per-line format. 162 | target_path: path where the file with token-ids will be created. 163 | vocabulary_path: path to the vocabulary file. 164 | tokenizer: a function to use to tokenize each sentence; 165 | if None, basic_tokenizer will be used. 166 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 167 | """ 168 | if not gfile.Exists(target_path): 169 | print("Tokenizing data in %s" % data_path) 170 | vocab, _ = initialize_vocab(vocabulary_path) 171 | with gfile.GFile(data_path, mode="r") as data_file: 172 | with gfile.GFile(target_path, mode="w") as tokens_file: 173 | counter = 0 174 | for line in data_file: 175 | counter += 1 176 | if counter % 100000 == 0: 177 | print(" tokenizing line %d" % counter) 178 | if use_padding: 179 | UNK_ID = UNK_ID_dict['with_padding'] 180 | else: 181 | UNK_ID = UNK_ID_dict['no_padding'] 182 | token_ids = sentence_to_token_ids(line, vocab, UNK_ID, tokenizer, 183 | normalize_digits) 184 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n") 185 | 186 | 187 | 188 | def create_label_vocab(vocabulary_path, data_path): 189 | if not gfile.Exists(vocabulary_path): 190 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) 191 | vocab = {} 192 | with gfile.GFile(data_path, mode="r") as f: 193 | counter = 0 194 | for line in f: 195 | counter += 1 196 | if counter % 100000 == 0: 197 | print(" processing line %d" % counter) 198 | label = line.strip() 199 | vocab[label] = 1 200 | label_list = START_VOCAB_dict['no_padding'] + sorted(vocab) 201 | with gfile.GFile(vocabulary_path, mode="w") as vocab_file: 202 | for k in label_list: 203 | vocab_file.write(k + "\n") 204 | 205 | def prepare_multi_task_data(data_dir, in_vocab_size, out_vocab_size): 206 | train_path = data_dir + '/train/train' 207 | dev_path = data_dir + '/valid/valid' 208 | test_path = data_dir + '/test/test' 209 | 210 | # Create vocabularies of the appropriate sizes. 211 | in_vocab_path = os.path.join(data_dir, "in_vocab_%d.txt" % in_vocab_size) 212 | out_vocab_path = os.path.join(data_dir, "out_vocab_%d.txt" % out_vocab_size) 213 | label_path = os.path.join(data_dir, "label.txt") 214 | 215 | create_vocabulary(in_vocab_path, 216 | train_path + ".seq.in", 217 | in_vocab_size, 218 | tokenizer=naive_tokenizer) 219 | create_vocabulary(out_vocab_path, 220 | train_path + ".seq.out", 221 | out_vocab_size, 222 | tokenizer=naive_tokenizer) 223 | create_label_vocab(label_path, train_path + ".label") 224 | 225 | # Create token ids for the training data. 
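# Editorial note: with the default vocabulary sizes (10000), the blocks
# below write files such as train.ids10000.seq.in, train.ids10000.seq.out
# and train.ids.label next to the raw data; the valid/ and test/ blocks
# that follow mirror this layout.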
226 | in_seq_train_ids_path = train_path + (".ids%d.seq.in" % in_vocab_size) 227 | out_seq_train_ids_path = train_path + (".ids%d.seq.out" % out_vocab_size) 228 | label_train_ids_path = train_path + (".ids.label") 229 | 230 | data_to_token_ids(train_path + ".seq.in", 231 | in_seq_train_ids_path, 232 | in_vocab_path, 233 | tokenizer=naive_tokenizer) 234 | data_to_token_ids(train_path + ".seq.out", 235 | out_seq_train_ids_path, 236 | out_vocab_path, 237 | tokenizer=naive_tokenizer) 238 | data_to_token_ids(train_path + ".label", 239 | label_train_ids_path, 240 | label_path, 241 | normalize_digits=False, 242 | use_padding=False) 243 | 244 | # Create token ids for the development data. 245 | in_seq_dev_ids_path = dev_path + (".ids%d.seq.in" % in_vocab_size) 246 | out_seq_dev_ids_path = dev_path + (".ids%d.seq.out" % out_vocab_size) 247 | label_dev_ids_path = dev_path + (".ids.label") 248 | 249 | data_to_token_ids(dev_path + ".seq.in", 250 | in_seq_dev_ids_path, 251 | in_vocab_path, 252 | tokenizer=naive_tokenizer) 253 | data_to_token_ids(dev_path + ".seq.out", 254 | out_seq_dev_ids_path, 255 | out_vocab_path, 256 | tokenizer=naive_tokenizer) 257 | data_to_token_ids(dev_path + ".label", 258 | label_dev_ids_path, 259 | label_path, 260 | normalize_digits=False, 261 | use_padding=False) 262 | 263 | # Create token ids for the test data. 264 | in_seq_test_ids_path = test_path + (".ids%d.seq.in" % in_vocab_size) 265 | out_seq_test_ids_path = test_path + (".ids%d.seq.out" % out_vocab_size) 266 | label_test_ids_path = test_path + (".ids.label") 267 | 268 | data_to_token_ids(test_path + ".seq.in", 269 | in_seq_test_ids_path, 270 | in_vocab_path, 271 | tokenizer=naive_tokenizer) 272 | data_to_token_ids(test_path + ".seq.out", 273 | out_seq_test_ids_path, 274 | out_vocab_path, 275 | tokenizer=naive_tokenizer) 276 | data_to_token_ids(test_path + ".label", 277 | label_test_ids_path, 278 | label_path, 279 | normalize_digits=False, 280 | use_padding=False) 281 | 282 | return [(in_seq_train_ids_path,out_seq_train_ids_path,label_train_ids_path), 283 | (in_seq_dev_ids_path, out_seq_dev_ids_path, label_dev_ids_path), 284 | (in_seq_test_ids_path, out_seq_test_ids_path, label_test_ids_path), 285 | (in_vocab_path, out_vocab_path, label_path)] -------------------------------------------------------------------------------- /conlleval.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 
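# note (editorial): run_multi-task_rnn.py feeds this script one token per
# line in the form "word gold-tag guessed-tag", for example (an
# illustration, not repository data):
#   BOS O O
#   flights O O
#   from O O
#   boston B-fromloc.city_name B-fromloc.city_name
#   EOS O O
# with an empty line between sentences.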
17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires outside tag"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while (<STDIN>) { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | 86 | #printf $line; 87 | #printf STDERR $#features; 88 | #printf "\n"; 89 | 90 | #printf $nbrOfFeatures; 91 | #printf "\n"; 92 | #printf $#features; 93 | #printf "\n"; 94 | 95 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 96 | elsif ($nbrOfFeatures != $#features and @features != 0) { 97 | printf STDERR "unexpected number of features: %d (%d)\n", 98 | $#features+1,$nbrOfFeatures+1; 99 | exit(1); 100 | } 101 | if (@features == 0 or 102 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 103 | if (@features < 2) { 104 | die "conlleval: unexpected number of features in line $line\n"; 105 | } 106 | if ($raw) { 107 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 108 | if
($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 109 | if ($features[$#features] ne "O") { 110 | $features[$#features] = "B-$features[$#features]"; 111 | } 112 | if ($features[$#features-1] ne "O") { 113 | $features[$#features-1] = "B-$features[$#features-1]"; 114 | } 115 | } 116 | # 20040126 ET code which allows hyphens in the types 117 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 118 | $guessed = $1; 119 | $guessedType = $2; 120 | } else { 121 | $guessed = $features[$#features]; 122 | $guessedType = ""; 123 | } 124 | pop(@features); 125 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 126 | $correct = $1; 127 | $correctType = $2; 128 | } else { 129 | $correct = $features[$#features]; 130 | $correctType = ""; 131 | } 132 | pop(@features); 133 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 134 | # ($correct,$correctType) = split(/-/,pop(@features)); 135 | $guessedType = $guessedType ? $guessedType : ""; 136 | $correctType = $correctType ? $correctType : ""; 137 | $firstItem = shift(@features); 138 | 139 | # 1999-06-26 sentence breaks should always be counted as out of chunk 140 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 141 | 142 | if ($inCorrect) { 143 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 144 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 145 | $lastGuessedType eq $lastCorrectType) { 146 | $inCorrect=$false; 147 | $correctChunk++; 148 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 149 | $correctChunk{$lastCorrectType}+1 : 1; 150 | } elsif ( 151 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 152 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 153 | $guessedType ne $correctType ) { 154 | $inCorrect=$false; 155 | } 156 | } 157 | 158 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 159 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 160 | $guessedType eq $correctType) { $inCorrect = $true; } 161 | 162 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 163 | $foundCorrect++; 164 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 165 | $foundCorrect{$correctType}+1 : 1; 166 | } 167 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 168 | $foundGuessed++; 169 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 170 | $foundGuessed{$guessedType}+1 : 1; 171 | } 172 | if ( $firstItem ne $boundary ) { 173 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 174 | $correctTags++; 175 | } 176 | $tokenCounter++; 177 | } 178 | 179 | $lastGuessed = $guessed; 180 | $lastCorrect = $correct; 181 | $lastGuessedType = $guessedType; 182 | $lastCorrectType = $correctType; 183 | } 184 | if ($inCorrect) { 185 | $correctChunk++; 186 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 
187 | $correctChunk{$lastCorrectType}+1 : 1; 188 | } 189 | 190 | if (not $latex) { 191 | # compute overall precision, recall and FB1 (default values are 0.0) 192 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 193 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 194 | $FB1 = 2*$precision*$recall/($precision+$recall) 195 | if ($precision+$recall > 0); 196 | 197 | # print overall performance 198 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 199 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 200 | if ($tokenCounter>0) { 201 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 202 | print "$correctChunk $foundCorrect $foundGuessed "; 203 | printf "precision: %6.2f%%; ",$precision; 204 | printf "recall: %6.2f%%; ",$recall; 205 | printf "FB1: %6.2f\n",$FB1; 206 | } 207 | } 208 | 209 | # sort chunk type names 210 | undef($lastType); 211 | @sortedTypes = (); 212 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 213 | if (not($lastType) or $lastType ne $i) { 214 | push(@sortedTypes,($i)); 215 | } 216 | $lastType = $i; 217 | } 218 | # print performance per chunk type 219 | if (not $latex) { 220 | for $i (@sortedTypes) { 221 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 222 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 223 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 224 | if (not($foundCorrect{$i})) { $recall = 0.0; } 225 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 226 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 227 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 228 | printf "%17s: ",$i; 229 | printf "% 4d % 4d % 4d ", $correctChunk{$i}, $foundCorrect{$i}, $foundGuessed{$i}; 230 | printf "precision: %6.2f%%; ",$precision; 231 | printf "recall: %6.2f%%; ",$recall; 232 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 233 | } 234 | } else { 235 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 236 | for $i (@sortedTypes) { 237 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 238 | if (not($foundGuessed{$i})) { $precision = 0.0; } 239 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 240 | if (not($foundCorrect{$i})) { $recall = 0.0; } 241 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 242 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 243 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 244 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 245 | $i,$precision,$recall,$FB1; 246 | } 247 | print "\\hline\n"; 248 | $precision = 0.0; 249 | $recall = 0; 250 | $FB1 = 0.0; 251 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 252 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 253 | $FB1 = 2*$precision*$recall/($precision+$recall) 254 | if ($precision+$recall > 0); 255 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 256 | $precision,$recall,$FB1; 257 | } 258 | 259 | exit 0; 260 | 261 | # endOfChunk: checks if a chunk ended between the previous and current word 262 | # arguments: previous and current chunk tags, previous and current types 263 | # note: this code is capable of handling other chunk representations 264 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 265 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 266 | 267 | sub endOfChunk { 268 | my $prevTag = shift(@_); 269 | my $tag = shift(@_); 270 | my $prevType = shift(@_); 271 | my $type = shift(@_); 272 | my $chunkEnd = $false; 273 | 274 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 275 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 276 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 277 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 278 | 279 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 280 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 281 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 282 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 283 | 284 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 285 | $chunkEnd = $true; 286 | } 287 | 288 | # corrected 1998-12-22: these chunks are assumed to have length 1 289 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 290 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 291 | 292 | return($chunkEnd); 293 | } 294 | 295 | # startOfChunk: checks if a chunk started between the previous and current word 296 | # arguments: previous and current chunk tags, previous and current types 297 | # note: this code is capable of handling other chunk representations 298 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 299 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 300 | 301 | sub startOfChunk { 302 | my $prevTag = shift(@_); 303 | my $tag = shift(@_); 304 | my $prevType = shift(@_); 305 | my $type = shift(@_); 306 | my $chunkStart = $false; 307 | 308 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 309 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 310 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 311 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 312 | 313 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 314 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 315 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 316 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 317 | 318 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 319 | $chunkStart = $true; 320 | } 321 | 322 | # corrected 1998-12-22: these chunks are assumed to have length 1 323 | if ( $tag eq "[" ) { $chunkStart = $true; } 324 | if ( $tag eq "]" ) { $chunkStart = $true; } 325 | 326 | return($chunkStart); 327 | } 328 | -------------------------------------------------------------------------------- /run_multi-task_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Feb 28 16:23:37 2016 4 | 5 | @author: Bing Liu (liubing@cmu.edu) 6 | """ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import math 13 | import os 14 | import sys 15 | import time 16 | 17 | import numpy as np 18 | from six.moves import xrange # pylint: disable=redefined-builtin 19 | import tensorflow as tf 20 | 21 | import data_utils 22 | import multi_task_model 23 | 24 | import subprocess 25 | import stat 26 | 27 | 28 | #tf.app.flags.DEFINE_float("learning_rate", 0.1, "Learning rate.") 29 | #tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.9, 30 | # "Learning rate decays by this much.") 31 | tf.app.flags.DEFINE_float("max_gradient_norm", 5.0, 32 | "Clip gradients to this norm.") 33 | tf.app.flags.DEFINE_integer("batch_size", 16, 34 | "Batch size to use during training.") 35 | tf.app.flags.DEFINE_integer("size", 128, "Size of each model layer.") 36 | tf.app.flags.DEFINE_integer("word_embedding_size", 128, "Word embedding size.") 37 | tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.") 38 | tf.app.flags.DEFINE_integer("in_vocab_size", 10000, "Max vocabulary size.") 39 | tf.app.flags.DEFINE_integer("out_vocab_size", 10000, "Max tag vocabulary size.") 40 | tf.app.flags.DEFINE_string("data_dir", "/tmp", "Data directory.") 41 | tf.app.flags.DEFINE_string("train_dir", "/tmp", "Training directory.") 42 | tf.app.flags.DEFINE_integer("max_train_data_size", 0, 43 | "Limit on the size of training data (0: no limit).") 44 |
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 100, 45 | "How many training steps to do per checkpoint.") 46 | tf.app.flags.DEFINE_integer("max_training_steps", 30000, 47 | "Max training steps.") 48 | tf.app.flags.DEFINE_integer("max_test_data_size", 0, 49 | "Max size of test set.") 50 | tf.app.flags.DEFINE_boolean("use_attention", True, 51 | "Use attention-based RNN.") 52 | tf.app.flags.DEFINE_integer("max_sequence_length", 0, 53 | "Max sequence length.") 54 | tf.app.flags.DEFINE_float("dropout_keep_prob", 0.5, 55 | "Dropout keep probability for cell input and output.") 56 | tf.app.flags.DEFINE_boolean("bidirectional_rnn", True, 57 | "Use bidirectional RNN.") 58 | tf.app.flags.DEFINE_string("task", None, "Options: joint; intent; tagging") 59 | FLAGS = tf.app.flags.FLAGS 60 | 61 | if FLAGS.max_sequence_length == 0: 62 | print ('Please indicate max sequence length. Exiting.') 63 | exit() 64 | 65 | if FLAGS.task is None: 66 | print ('Please indicate task to run. ' + 67 | 'Available options: intent; tagging; joint') 68 | exit() 69 | 70 | task = dict({'intent':0, 'tagging':0, 'joint':0}) 71 | if FLAGS.task == 'intent': 72 | task['intent'] = 1 73 | elif FLAGS.task == 'tagging': 74 | task['tagging'] = 1 75 | elif FLAGS.task == 'joint': 76 | task['intent'] = 1 77 | task['tagging'] = 1 78 | task['joint'] = 1 79 | 80 | _buckets = [(FLAGS.max_sequence_length, FLAGS.max_sequence_length)] 81 | #_buckets = [(3, 10), (10, 25)] 82 | 83 | # metrics function using conlleval.pl 84 | def conlleval(p, g, w, filename): 85 | ''' 86 | INPUT: 87 | p :: predictions 88 | g :: groundtruth 89 | w :: corresponding words 90 | 91 | OUTPUT: 92 | filename :: name of the file where the predictions 93 | are written. It will be the input of the conlleval.pl script 94 | for computing the performance in terms of precision, 95 | recall and F1 score 96 | ''' 97 | out = '' 98 | for sl, sp, sw in zip(g, p, w): 99 | out += 'BOS O O\n' 100 | for wl, wp, word in zip(sl, sp, sw): 101 | out += word + ' ' + wl + ' ' + wp + '\n' 102 | out += 'EOS O O\n\n' 103 | 104 | f = open(filename, 'w') 105 | f.writelines(out[:-1]) # remove the ending \n on last line 106 | f.close() 107 | 108 | return get_perf(filename) 109 | 110 | def get_perf(filename): 111 | ''' run the conlleval.pl perl script to obtain 112 | precision/recall and F1 score ''' 113 | _conlleval = os.path.dirname(os.path.realpath(__file__)) + '/conlleval.pl' 114 | os.chmod(_conlleval, stat.S_IRWXU) # give execute permission 115 | 116 | proc = subprocess.Popen(["perl", 117 | _conlleval], 118 | stdin=subprocess.PIPE, 119 | stdout=subprocess.PIPE) 120 | 121 | stdout, _ = proc.communicate(''.join(open(filename).readlines())) 122 | for line in stdout.split('\n'): 123 | if 'accuracy' in line: 124 | out = line.split() 125 | break 126 | 127 | precision = float(out[6][:-2]) 128 | recall = float(out[8][:-2]) 129 | f1score = float(out[10]) 130 | 131 | return {'p': precision, 'r': recall, 'f1': f1score} 132 | 133 | 134 | def read_data(source_path, target_path, label_path, max_size=None): 135 | """Read data from source and target files and put into buckets. 136 | 137 | Args: 138 | source_path: path to the files with token-ids for the word sequence. 139 | target_path: path to the file with token-ids for the tag sequence; 140 | it must be aligned with the source file: n-th line contains the desired 141 | output for n-th line from the source_path.
142 | label_path: path to the file with token-ids for the intent labels 143 | max_size: maximum number of lines to read; all others will be ignored; 144 | if 0 or None, data files will be read completely (no limit). 145 | 146 | Returns: 147 | data_set: a list of length len(_buckets); data_set[n] contains a list of 148 | (source, target, label) tuples read from the provided data files that fit 149 | into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and 150 | len(target) < _buckets[n][1]; source, target and label are lists of token-ids 151 | """ 152 | data_set = [[] for _ in _buckets] 153 | with tf.gfile.GFile(source_path, mode="r") as source_file: 154 | with tf.gfile.GFile(target_path, mode="r") as target_file: 155 | with tf.gfile.GFile(label_path, mode="r") as label_file: 156 | source = source_file.readline() 157 | target = target_file.readline() 158 | label = label_file.readline() 159 | counter = 0 160 | while source and target and label and (not max_size \ 161 | or counter < max_size): 162 | counter += 1 163 | if counter % 100000 == 0: 164 | print(" reading data line %d" % counter) 165 | sys.stdout.flush() 166 | source_ids = [int(x) for x in source.split()] 167 | target_ids = [int(x) for x in target.split()] 168 | label_ids = [int(x) for x in label.split()] 169 | # target_ids.append(data_utils.EOS_ID) 170 | for bucket_id, (source_size, target_size) in enumerate(_buckets): 171 | if len(source_ids) < source_size and len(target_ids) < target_size: 172 | data_set[bucket_id].append([source_ids, target_ids, label_ids]) 173 | break 174 | source = source_file.readline() 175 | target = target_file.readline() 176 | label = label_file.readline() 177 | return data_set # 3 outputs in each unit: source_ids, target_ids, label_ids 178 | 179 | def create_model(session, 180 | source_vocab_size, 181 | target_vocab_size, 182 | label_vocab_size): 183 | """Create model and initialize or load parameters in session.""" 184 | with tf.variable_scope("model", reuse=None): 185 | model_train = multi_task_model.MultiTaskModel( 186 | source_vocab_size, 187 | target_vocab_size, 188 | label_vocab_size, 189 | _buckets, 190 | FLAGS.word_embedding_size, 191 | FLAGS.size, FLAGS.num_layers, 192 | FLAGS.max_gradient_norm, 193 | FLAGS.batch_size, 194 | dropout_keep_prob=FLAGS.dropout_keep_prob, 195 | use_lstm=True, 196 | forward_only=False, 197 | use_attention=FLAGS.use_attention, 198 | bidirectional_rnn=FLAGS.bidirectional_rnn, 199 | task=task) 200 | with tf.variable_scope("model", reuse=True): 201 | model_test = multi_task_model.MultiTaskModel( 202 | source_vocab_size, 203 | target_vocab_size, 204 | label_vocab_size, 205 | _buckets, 206 | FLAGS.word_embedding_size, 207 | FLAGS.size, 208 | FLAGS.num_layers, 209 | FLAGS.max_gradient_norm, 210 | FLAGS.batch_size, 211 | dropout_keep_prob=FLAGS.dropout_keep_prob, 212 | use_lstm=True, 213 | forward_only=True, 214 | use_attention=FLAGS.use_attention, 215 | bidirectional_rnn=FLAGS.bidirectional_rnn, 216 | task=task) 217 | 218 | ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) 219 | if ckpt: 220 | print("Reading model parameters from %s" % ckpt.model_checkpoint_path) 221 | model_train.saver.restore(session, ckpt.model_checkpoint_path) 222 | else: 223 | print("Created model with fresh parameters.") 224 | session.run(tf.global_variables_initializer()) 225 | return model_train, model_test 226 | 227 | def train(): 228 | print ('Applying Parameters:') 229 | for k, v in FLAGS.__dict__['__flags'].iteritems(): 230 | print ('%s: %s' % (k, str(v))) 231 | print("Preparing data in
%s" % FLAGS.data_dir) 232 | vocab_path = '' 233 | tag_vocab_path = '' 234 | label_vocab_path = '' 235 | date_set = data_utils.prepare_multi_task_data( 236 | FLAGS.data_dir, FLAGS.in_vocab_size, FLAGS.out_vocab_size) 237 | in_seq_train, out_seq_train, label_train = date_set[0] 238 | in_seq_dev, out_seq_dev, label_dev = date_set[1] 239 | in_seq_test, out_seq_test, label_test = date_set[2] 240 | vocab_path, tag_vocab_path, label_vocab_path = date_set[3] 241 | 242 | result_dir = FLAGS.train_dir + '/test_results' 243 | if not os.path.isdir(result_dir): 244 | os.makedirs(result_dir) 245 | 246 | current_taging_valid_out_file = result_dir + '/tagging.valid.hyp.txt' 247 | current_taging_test_out_file = result_dir + '/tagging.test.hyp.txt' 248 | 249 | vocab, rev_vocab = data_utils.initialize_vocab(vocab_path) 250 | tag_vocab, rev_tag_vocab = data_utils.initialize_vocab(tag_vocab_path) 251 | label_vocab, rev_label_vocab = data_utils.initialize_vocab(label_vocab_path) 252 | 253 | config = tf.ConfigProto( 254 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.23), 255 | #device_count = {'gpu': 2} 256 | ) 257 | 258 | with tf.Session(config=config) as sess: 259 | # Create model. 260 | print("Max sequence length: %d." % _buckets[0][0]) 261 | print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) 262 | 263 | model, model_test = create_model(sess, 264 | len(vocab), 265 | len(tag_vocab), 266 | len(label_vocab)) 267 | print ("Creating model with " + 268 | "source_vocab_size=%d, target_vocab_size=%d, label_vocab_size=%d." \ 269 | % (len(vocab), len(tag_vocab), len(label_vocab))) 270 | 271 | # Read data into buckets and compute their sizes. 272 | print ("Reading train/valid/test data (training set limit: %d)." 273 | % FLAGS.max_train_data_size) 274 | dev_set = read_data(in_seq_dev, out_seq_dev, label_dev) 275 | test_set = read_data(in_seq_test, out_seq_test, label_test) 276 | train_set = read_data(in_seq_train, out_seq_train, label_train) 277 | train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))] 278 | train_total_size = float(sum(train_bucket_sizes)) 279 | 280 | train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size 281 | for i in xrange(len(train_bucket_sizes))] 282 | 283 | # This is the training loop. 284 | step_time, loss = 0.0, 0.0 285 | current_step = 0 286 | 287 | best_valid_score = 0 288 | best_test_score = 0 289 | while model.global_step.eval() < FLAGS.max_training_steps: 290 | random_number_01 = np.random.random_sample() 291 | bucket_id = min([i for i in xrange(len(train_buckets_scale)) 292 | if train_buckets_scale[i] > random_number_01]) 293 | 294 | # Get a batch and make a step. 

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0

    best_valid_score = 0
    best_test_score = 0
    while model.global_step.eval() < FLAGS.max_training_steps:
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      batch_data = model.get_batch(train_set, bucket_id)
      encoder_inputs, tags, tag_weights, batch_sequence_length, labels = batch_data
      if task['joint'] == 1:
        step_outputs = model.joint_step(sess,
                                        encoder_inputs,
                                        tags,
                                        tag_weights,
                                        labels,
                                        batch_sequence_length,
                                        bucket_id,
                                        False)
        _, step_loss, tagging_logits, class_logits = step_outputs
      elif task['tagging'] == 1:
        step_outputs = model.tagging_step(sess,
                                          encoder_inputs,
                                          tags,
                                          tag_weights,
                                          batch_sequence_length,
                                          bucket_id,
                                          False)
        _, step_loss, tagging_logits = step_outputs
      elif task['intent'] == 1:
        step_outputs = model.classification_step(sess,
                                                 encoder_inputs,
                                                 labels,
                                                 batch_sequence_length,
                                                 bucket_id,
                                                 False)
        _, step_loss, class_logits = step_outputs

      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save a checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        perplexity = math.exp(loss) if loss < 300 else float('inf')
        print("global step %d step-time %.2f. Training perplexity %.2f"
              % (model.global_step.eval(), step_time, perplexity))
        sys.stdout.flush()
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0
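
        # Note on the statistic above: loss is the running mean of per-step
        # losses over the last steps_per_checkpoint steps, so a mean loss of
        # 2.0 reports a training perplexity of exp(2.0) ~= 7.39; the
        # loss < 300 guard only avoids math-range overflow in math.exp.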

        def run_valid_test(data_set, mode):  # mode: 'Eval' or 'Test'
          # Run evals on the development/test set and report accuracy / F1.
          word_list = list()
          ref_tag_list = list()
          hyp_tag_list = list()
          ref_label_list = list()
          hyp_label_list = list()
          correct_count = 0
          accuracy = 0.0
          tagging_eval_result = dict()
          for bucket_id in xrange(len(_buckets)):
            eval_loss = 0.0
            count = 0
            for i in xrange(len(data_set[bucket_id])):
              count += 1
              sample = model_test.get_one(data_set, bucket_id, i)
              encoder_inputs, tags, tag_weights, sequence_length, labels = sample
              tagging_logits = []
              class_logits = []
              if task['joint'] == 1:
                step_outputs = model_test.joint_step(sess,
                                                     encoder_inputs,
                                                     tags,
                                                     tag_weights,
                                                     labels,
                                                     sequence_length,
                                                     bucket_id,
                                                     True)
                _, step_loss, tagging_logits, class_logits = step_outputs
              elif task['tagging'] == 1:
                step_outputs = model_test.tagging_step(sess,
                                                       encoder_inputs,
                                                       tags,
                                                       tag_weights,
                                                       sequence_length,
                                                       bucket_id,
                                                       True)
                _, step_loss, tagging_logits = step_outputs
              elif task['intent'] == 1:
                step_outputs = model_test.classification_step(sess,
                                                              encoder_inputs,
                                                              labels,
                                                              sequence_length,
                                                              bucket_id,
                                                              True)
                _, step_loss, class_logits = step_outputs
              eval_loss += step_loss / len(data_set[bucket_id])
              hyp_label = None
              if task['intent'] == 1:
                ref_label_list.append(rev_label_vocab[labels[0][0]])
                hyp_label = np.argmax(class_logits[0], 0)
                hyp_label_list.append(rev_label_vocab[hyp_label])
                if labels[0] == hyp_label:
                  correct_count += 1
              if task['tagging'] == 1:
                word_list.append([rev_vocab[x[0]] for x in
                                  encoder_inputs[:sequence_length[0]]])
                ref_tag_list.append([rev_tag_vocab[x[0]] for x in
                                     tags[:sequence_length[0]]])
                hyp_tag_list.append(
                    [rev_tag_vocab[np.argmax(x)] for x in
                     tagging_logits[:sequence_length[0]]])

          accuracy = float(correct_count) * 100 / count
          if task['intent'] == 1:
            print("  %s accuracy: %.2f %d/%d"
                  % (mode, accuracy, correct_count, count))
            sys.stdout.flush()
          if task['tagging'] == 1:
            if mode == 'Eval':
              tagging_out_file = current_tagging_valid_out_file
            elif mode == 'Test':
              tagging_out_file = current_tagging_test_out_file
            tagging_eval_result = conlleval(hyp_tag_list,
                                            ref_tag_list,
                                            word_list,
                                            tagging_out_file)
            print("  %s f1-score: %.2f" % (mode, tagging_eval_result['f1']))
            sys.stdout.flush()
          return accuracy, tagging_eval_result
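
        # Checkpoint-time evaluation flow: run_valid_test writes the word /
        # reference-tag / hypothesis-tag triples to the hyp file and scores
        # them (via the bundled conlleval.pl) with chunk-level precision,
        # recall, and F1; the F1 value drives the best-model bookkeeping below.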
        # Validation.
        valid_accuracy, valid_tagging_result = run_valid_test(dev_set, 'Eval')
        if task['tagging'] == 1 \
            and valid_tagging_result['f1'] > best_valid_score:
          best_valid_score = valid_tagging_result['f1']
          # Save the best output file.
          subprocess.call(['mv',
                           current_tagging_valid_out_file,
                           current_tagging_valid_out_file + '.best_f1_%.2f'
                           % best_valid_score])
        # Test: run the test set after each validation, for development purposes.
        test_accuracy, test_tagging_result = run_valid_test(test_set, 'Test')
        if task['tagging'] == 1 \
            and test_tagging_result['f1'] > best_test_score:
          best_test_score = test_tagging_result['f1']
          # Save the best output file.
          subprocess.call(['mv',
                           current_tagging_test_out_file,
                           current_tagging_test_out_file + '.best_f1_%.2f'
                           % best_test_score])

def main(_):
  train()

if __name__ == "__main__":
  tf.app.run()
--------------------------------------------------------------------------------
/multi_task_model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 28 17:28:22 2016

@author: Bing Liu (liubing@cmu.edu)

Multi-task RNN model with an attention mechanism.
- Developed on top of the TensorFlow seq2seq_model.py example:
  https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/seq2seq_model.py
- Note that this example code does not include output label dependency
  modeling. One may add a loop function, as in the rnn_decoder function in
  the TensorFlow seq2seq.py example, to feed the emitted label embedding
  back to the RNN state.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random

import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

import data_utils
import seq_labeling
import seq_classification
from tensorflow.contrib.rnn import BasicLSTMCell
from tensorflow.contrib.rnn import MultiRNNCell
from tensorflow.contrib.rnn import DropoutWrapper
from tensorflow.contrib.rnn import static_rnn
from tensorflow.contrib.rnn import static_bidirectional_rnn


class MultiTaskModel(object):
  def __init__(self,
               source_vocab_size,
               tag_vocab_size,
               label_vocab_size,
               buckets,
               word_embedding_size,
               size,
               num_layers,
               max_gradient_norm,
               batch_size,
               dropout_keep_prob=1.0,
               use_lstm=False,
               bidirectional_rnn=True,
               num_samples=1024,
               use_attention=False,
               task=None,
               forward_only=False):
    self.source_vocab_size = source_vocab_size
    self.tag_vocab_size = tag_vocab_size
    self.label_vocab_size = label_vocab_size
    self.word_embedding_size = word_embedding_size
    self.cell_size = size
    self.num_layers = num_layers
    self.buckets = buckets
    self.batch_size = batch_size
    self.bidirectional_rnn = bidirectional_rnn
    self.global_step = tf.Variable(0, trainable=False)

    # Sampled softmax is not used here, so no output projection is needed.
    softmax_loss_function = None

    # Create the internal multi-layer cell for our RNN; dropout is applied
    # only during training (forward_only=False) and only if keep_prob < 1.
    def create_cell():
      single_cell = lambda: BasicLSTMCell(self.cell_size)
      cell = MultiRNNCell([single_cell() for _ in range(self.num_layers)])
      if not forward_only and dropout_keep_prob < 1.0:
        cell = DropoutWrapper(cell,
                              input_keep_prob=dropout_keep_prob,
                              output_keep_prob=dropout_keep_prob)
      return cell

    self.cell_fw = create_cell()
    self.cell_bw = create_cell()
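
    # Dropout note: DropoutWrapper above wraps the whole MultiRNNCell, so
    # dropout is applied only at the input and output of the stack, not
    # between layers. A per-layer variant (an alternative sketch, not what
    # this model does) would wrap each BasicLSTMCell individually:
    #
    #   def single_cell_with_dropout():
    #     return DropoutWrapper(BasicLSTMCell(self.cell_size),
    #                           input_keep_prob=dropout_keep_prob,
    #                           output_keep_prob=dropout_keep_prob)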

    # Feeds for inputs.
    self.encoder_inputs = []
    self.tags = []
    self.tag_weights = []
    self.labels = []
    self.sequence_length = tf.placeholder(tf.int32, [None],
                                          name="sequence_length")

    for i in xrange(buckets[-1][0]):
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1]):
      self.tags.append(tf.placeholder(tf.float32, shape=[None],
                                      name="tag{0}".format(i)))
      self.tag_weights.append(tf.placeholder(tf.float32, shape=[None],
                                             name="weight{0}".format(i)))
    self.labels.append(tf.placeholder(tf.float32, shape=[None], name="label"))

    base_rnn_output = self.generate_rnn_output()
    encoder_outputs, encoder_state, attention_states = base_rnn_output

    if task['tagging'] == 1:
      seq_labeling_outputs = seq_labeling.generate_sequence_output(
          self.source_vocab_size,
          encoder_outputs,
          encoder_state,
          self.tags,
          self.sequence_length,
          self.tag_vocab_size,
          self.tag_weights,
          buckets,
          softmax_loss_function=softmax_loss_function,
          use_attention=use_attention)
      self.tagging_output, self.tagging_loss = seq_labeling_outputs
    if task['intent'] == 1:
      seq_intent_outputs = seq_classification.generate_single_output(
          encoder_state,
          attention_states,
          self.sequence_length,
          self.labels,
          self.label_vocab_size,
          buckets,
          softmax_loss_function=softmax_loss_function,
          use_attention=use_attention)
      self.classification_output, self.classification_loss = seq_intent_outputs

    if task['tagging'] == 1:
      self.loss = self.tagging_loss
    elif task['intent'] == 1:
      self.loss = self.classification_loss

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
      opt = tf.train.AdamOptimizer()
      if task['joint'] == 1:
        # Backpropagate both the intent and the tagging loss; one may further
        # adjust the relative weights of the two costs.
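        # For example (an illustrative alternative, not enabled in this
        # code), a weighted combination with a hypothetical coefficient
        # alpha:
        #   gradients = tf.gradients(
        #       alpha * self.tagging_loss
        #       + (1.0 - alpha) * self.classification_loss, params)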
        gradients = tf.gradients([self.tagging_loss, self.classification_loss],
                                 params)
      elif task['tagging'] == 1:
        gradients = tf.gradients(self.tagging_loss, params)
      elif task['intent'] == 1:
        gradients = tf.gradients(self.classification_loss, params)

      clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                       max_gradient_norm)
      self.gradient_norm = norm
      self.update = opt.apply_gradients(
          zip(clipped_gradients, params), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables())

  def generate_rnn_output(self):
    """Generate RNN state outputs with word embeddings as inputs."""
    with tf.variable_scope("generate_seq_output"):
      if self.bidirectional_rnn:
        embedding = tf.get_variable("embedding",
                                    [self.source_vocab_size,
                                     self.word_embedding_size])
        encoder_emb_inputs = [tf.nn.embedding_lookup(embedding, encoder_input)
                              for encoder_input in self.encoder_inputs]
        rnn_outputs = static_bidirectional_rnn(self.cell_fw,
                                               self.cell_bw,
                                               encoder_emb_inputs,
                                               sequence_length=self.sequence_length,
                                               dtype=tf.float32)
        encoder_outputs, encoder_state_fw, encoder_state_bw = rnn_outputs
        # With state_is_tuple=True and num_layers > 1, we simply use the
        # state of the last layer as the encoder state.
        state_fw = encoder_state_fw[-1]
        state_bw = encoder_state_bw[-1]
        encoder_state = tf.concat([tf.concat(state_fw, 1),
                                   tf.concat(state_bw, 1)], 1)
        top_states = [tf.reshape(e, [-1, 1, self.cell_fw.output_size
                                     + self.cell_bw.output_size])
                      for e in encoder_outputs]
        attention_states = tf.concat(top_states, 1)
      else:
        embedding = tf.get_variable("embedding",
                                    [self.source_vocab_size,
                                     self.word_embedding_size])
        encoder_emb_inputs = [tf.nn.embedding_lookup(embedding, encoder_input)
                              for encoder_input in self.encoder_inputs]
        rnn_outputs = static_rnn(self.cell_fw,
                                 encoder_emb_inputs,
                                 sequence_length=self.sequence_length,
                                 dtype=tf.float32)
        encoder_outputs, encoder_state = rnn_outputs
        # With state_is_tuple=True and num_layers > 1, we use the state of
        # the last layer as the encoder state.
        state = encoder_state[-1]
        encoder_state = tf.concat(state, 1)
        top_states = [tf.reshape(e, [-1, 1, self.cell_fw.output_size])
                      for e in encoder_outputs]
        attention_states = tf.concat(top_states, 1)
      return encoder_outputs, encoder_state, attention_states
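
  # Shape note for the bidirectional case above: state_fw and state_bw are
  # LSTMStateTuples, so tf.concat(state_fw, 1) joins c and h into a single
  # [batch, 2 * cell_size] tensor, and the outer concat yields a
  # [batch, 4 * cell_size] encoder_state for the tagging and intent heads.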

  def joint_step(self, session, encoder_inputs, tags, tag_weights,
                 labels, batch_sequence_length,
                 bucket_id, forward_only):
    """Run a step of the joint model feeding the given inputs.

    Args:
      session: tensorflow session to use.
      encoder_inputs: list of numpy int vectors to feed as encoder inputs.
      tags: list of numpy int vectors to feed as decoder inputs.
      tag_weights: list of numpy float vectors to feed as tag weights.
      labels: list of numpy int vectors to feed as sequence class labels.
      batch_sequence_length: numpy int vector of the true (unpadded)
        sequence lengths in the batch.
      bucket_id: which bucket of the model to use.
      forward_only: whether to do the backward step or only forward.

    Returns:
      A quadruple consisting of gradient norm (or None if we did not do
      backward), average perplexity, output tags, and the output class label.

    Raises:
      ValueError: if the length of encoder_inputs, tags, or tag_weights
        disagrees with the bucket size for the specified bucket_id.
    """
    # Check if the sizes match.
    encoder_size, tag_size = self.buckets[bucket_id]
    if len(encoder_inputs) != encoder_size:
      raise ValueError("Encoder length must be equal to the one in bucket,"
                       " %d != %d." % (len(encoder_inputs), encoder_size))
    if len(tags) != tag_size:
      raise ValueError("Tag length must be equal to the one in bucket,"
                       " %d != %d." % (len(tags), tag_size))
    if len(labels) != 1:
      raise ValueError("Labels length must be 1, %d != 1." % len(labels))

    input_feed = {}
    input_feed[self.sequence_length.name] = batch_sequence_length
    for l in xrange(encoder_size):
      input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
      input_feed[self.tags[l].name] = tags[l]
      input_feed[self.tag_weights[l].name] = tag_weights[l]
    input_feed[self.labels[0].name] = labels[0]

    # Output feed: depends on whether we do a backward step or not.
    if not forward_only:
      output_feed = [self.update,  # Update Op that does SGD.
                     self.gradient_norm,  # Gradient norm.
                     self.loss]  # Loss for this batch.
      for i in range(tag_size):
        output_feed.append(self.tagging_output[i])
      output_feed.append(self.classification_output[0])
    else:
      output_feed = [self.loss]
      for i in range(tag_size):
        output_feed.append(self.tagging_output[i])
      output_feed.append(self.classification_output[0])

    outputs = session.run(output_feed, input_feed)
    if not forward_only:
      return outputs[1], outputs[2], outputs[3:3 + tag_size], outputs[-1]
    else:
      return None, outputs[0], outputs[1:1 + tag_size], outputs[-1]
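
  # Usage sketch (mirrors the call in run_multi-task_rnn.py): in training
  # the first returned element is the gradient norm; in forward-only
  # evaluation it is None:
  #   _, step_loss, tagging_logits, class_logits = model.joint_step(
  #       sess, encoder_inputs, tags, tag_weights, labels,
  #       batch_sequence_length, bucket_id, True)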

  def tagging_step(self, session, encoder_inputs, tags, tag_weights,
                   batch_sequence_length, bucket_id, forward_only):
    """Run a step of the tagging model feeding the given inputs.

    Args:
      session: tensorflow session to use.
      encoder_inputs: list of numpy int vectors to feed as encoder inputs.
      tags: list of numpy int vectors to feed as decoder inputs.
      tag_weights: list of numpy float vectors to feed as target weights.
      batch_sequence_length: numpy int vector of the true (unpadded)
        sequence lengths in the batch.
      bucket_id: which bucket of the model to use.
      forward_only: whether to do the backward step or only forward.

    Returns:
      A triple consisting of gradient norm (or None if we did not do
      backward), average perplexity, and the output tags.

    Raises:
      ValueError: if the length of encoder_inputs, tags, or tag_weights
        disagrees with the bucket size for the specified bucket_id.
    """
    # Check if the sizes match.
    encoder_size, tag_size = self.buckets[bucket_id]
    if len(encoder_inputs) != encoder_size:
      raise ValueError("Encoder length must be equal to the one in bucket,"
                       " %d != %d." % (len(encoder_inputs), encoder_size))
    if len(tags) != tag_size:
      raise ValueError("Tag length must be equal to the one in bucket,"
                       " %d != %d." % (len(tags), tag_size))

    # Input feed: encoder inputs, tags, and tag weights, as provided.
    input_feed = {}
    input_feed[self.sequence_length.name] = batch_sequence_length
    for l in xrange(encoder_size):
      input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
      input_feed[self.tags[l].name] = tags[l]
      input_feed[self.tag_weights[l].name] = tag_weights[l]

    # Output feed: depends on whether we do a backward step or not.
    if not forward_only:
      output_feed = [self.update,  # Update Op that does SGD.
                     self.gradient_norm,  # Gradient norm.
                     self.loss]  # Loss for this batch.
      for i in range(tag_size):
        output_feed.append(self.tagging_output[i])
    else:
      output_feed = [self.loss]
      for i in range(tag_size):
        output_feed.append(self.tagging_output[i])

    outputs = session.run(output_feed, input_feed)
    if not forward_only:
      return outputs[1], outputs[2], outputs[3:3 + tag_size]
    else:
      return None, outputs[0], outputs[1:1 + tag_size]

  def classification_step(self, session, encoder_inputs, labels,
                          batch_sequence_length, bucket_id, forward_only):
    """Run a step of the intent classification model feeding the given inputs.

    Args:
      session: tensorflow session to use.
      encoder_inputs: list of numpy int vectors to feed as encoder inputs.
      labels: list of numpy int vectors to feed as sequence class labels.
      batch_sequence_length: numpy int vector of the true (unpadded)
        sequence lengths in the batch.
      bucket_id: which bucket of the model to use.
      forward_only: whether to do the backward step or only forward.

    Returns:
      A triple consisting of gradient norm (or None if we did not do
      backward), average perplexity, and the output class label.

    Raises:
      ValueError: if the length of encoder_inputs disagrees with the bucket
        size for the specified bucket_id.
    """
    # Check if the sizes match.
    encoder_size, target_size = self.buckets[bucket_id]
    if len(encoder_inputs) != encoder_size:
      raise ValueError("Encoder length must be equal to the one in bucket,"
                       " %d != %d." % (len(encoder_inputs), encoder_size))

    # Input feed: encoder inputs and labels, as provided.
    input_feed = {}
    input_feed[self.sequence_length.name] = batch_sequence_length
    for l in xrange(encoder_size):
      input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
    input_feed[self.labels[0].name] = labels[0]

    # Output feed: depends on whether we do a backward step or not.
    if not forward_only:
      output_feed = [self.update,  # Update Op that does SGD.
                     self.gradient_norm,  # Gradient norm.
                     self.loss,  # Loss for this batch.
                     self.classification_output[0]]
    else:
      output_feed = [self.loss,
                     self.classification_output[0]]

    outputs = session.run(output_feed, input_feed)
    if not forward_only:
      return outputs[1], outputs[2], outputs[3]  # Gradient norm, loss, outputs.
    else:
      return None, outputs[0], outputs[1]  # No gradient norm, loss, outputs.
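
  # Feed pattern shared by the three *_step methods above: every time step
  # has its own placeholder, so input_feed maps placeholder names to one
  # [batch_size] vector per step (a time-major list of batch-major vectors),
  # with the true lengths supplied through self.sequence_length.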

  def get_batch(self, data, bucket_id):
    """Get a random batch of data from the specified bucket, prepare for step.

    To feed data in step(..) it must be a list of batch-major vectors, while
    data here contains single length-major cases. So the main logic of this
    function is to re-index data cases to be in the proper format for feeding.

    Args:
      data: a tuple of size len(self.buckets) in which each element contains
        lists of (source, target, label) triples that we use to create a batch.
      bucket_id: integer, which bucket to get the batch for.

    Returns:
      The tuple (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
      batch_sequence_length, batch_labels) for the constructed batch, in the
      proper format to call a *_step function later.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    encoder_inputs, decoder_inputs, labels = [], [], []

    # Get a random batch of encoder and decoder inputs from data and pad them
    # if needed. Unlike the seq2seq example, encoder inputs are not reversed
    # and no GO symbol is added to the decoder inputs.
    batch_sequence_length_list = list()
    for _ in xrange(self.batch_size):
      encoder_input, decoder_input, label = random.choice(data[bucket_id])
      batch_sequence_length_list.append(len(encoder_input))

      # Encoder inputs are padded (reversal from the seq2seq example is
      # disabled).
      encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input))
      # encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))
      encoder_inputs.append(list(encoder_input + encoder_pad))

      # Decoder inputs (tags) are padded to the bucket's decoder size.
      decoder_pad_size = decoder_size - len(decoder_input)
      decoder_inputs.append(decoder_input +
                            [data_utils.PAD_ID] * decoder_pad_size)
      labels.append(label)

    # Now we create batch-major vectors from the data selected above.
    batch_encoder_inputs = []
    batch_decoder_inputs = []
    batch_weights = []
    batch_labels = []

    # Batch encoder inputs are just re-indexed encoder_inputs.
    for length_idx in xrange(encoder_size):
      batch_encoder_inputs.append(
          np.array([encoder_inputs[batch_idx][length_idx]
                    for batch_idx in xrange(self.batch_size)], dtype=np.int32))

    # Batch decoder inputs are re-indexed decoder_inputs; we create weights.
    for length_idx in xrange(decoder_size):
      batch_decoder_inputs.append(
          np.array([decoder_inputs[batch_idx][length_idx]
                    for batch_idx in xrange(self.batch_size)], dtype=np.int32))
      # Create target_weights to be 0 for targets that are padding.
      batch_weight = np.ones(self.batch_size, dtype=np.float32)
      for batch_idx in xrange(self.batch_size):
        # We set weight to 0 if the corresponding target is a PAD symbol.
        # The corresponding target is decoder_input shifted by 1 forward.
        # if length_idx < decoder_size - 1:
        #   target = decoder_inputs[batch_idx][length_idx + 1]
        # print (length_idx)
        if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID:
          batch_weight[batch_idx] = 0.0
      batch_weights.append(batch_weight)

    batch_labels.append(
        np.array([labels[batch_idx][0]
                  for batch_idx in xrange(self.batch_size)], dtype=np.int32))

    batch_sequence_length = np.array(batch_sequence_length_list, dtype=np.int32)
    return (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
            batch_sequence_length, batch_labels)
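
  # Shape note for get_batch: for bucket (encoder_size, decoder_size) it
  # returns encoder inputs and decoder inputs (tags) as time-major lists of
  # [batch_size] int32 vectors, matching the per-time-step placeholders,
  # plus per-step float32 weights that zero out PAD positions and a single
  # [batch_size] int32 label vector.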

  def get_one(self, data, bucket_id, sample_id):
    """Get a single sample from the specified bucket, prepare for step.

    To feed data in step(..) it must be a list of batch-major vectors, while
    data here contains single length-major cases. So the main logic of this
    function is to re-index data cases to be in the proper format for feeding.

    Args:
      data: a tuple of size len(self.buckets) in which each element contains
        lists of (source, target, label) triples that we use to create a batch.
      bucket_id: integer, which bucket to get the sample from.
      sample_id: integer, index of the sample within the bucket.

    Returns:
      The tuple (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
      batch_sequence_length, batch_labels) for a "batch" of size one, in the
      proper format to call a *_step function later.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    encoder_inputs, decoder_inputs, labels = [], [], []

    # Get the sample_id-th example from the bucket and pad it if needed.
    # As in get_batch, encoder inputs are not reversed and no GO symbol is
    # added to the decoder inputs.
    batch_sequence_length_list = list()
    encoder_input, decoder_input, label = data[bucket_id][sample_id]
    batch_sequence_length_list.append(len(encoder_input))

    # Encoder inputs are padded (reversal from the seq2seq example is
    # disabled).
    encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input))
    # encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))
    encoder_inputs.append(list(encoder_input + encoder_pad))

    # Decoder inputs (tags) are padded to the bucket's decoder size.
    decoder_pad_size = decoder_size - len(decoder_input)
    decoder_inputs.append(decoder_input +
                          [data_utils.PAD_ID] * decoder_pad_size)
    labels.append(label)

    # Now we create batch-major vectors from the data selected above.
    batch_encoder_inputs = []
    batch_decoder_inputs = []
    batch_weights = []
    batch_labels = []

    # Batch encoder inputs are just re-indexed encoder_inputs.
    for length_idx in xrange(encoder_size):
      batch_encoder_inputs.append(
          np.array([encoder_inputs[batch_idx][length_idx]
                    for batch_idx in xrange(1)], dtype=np.int32))

    # Batch decoder inputs are re-indexed decoder_inputs; we create weights.
    for length_idx in xrange(decoder_size):
      batch_decoder_inputs.append(
          np.array([decoder_inputs[batch_idx][length_idx]
                    for batch_idx in xrange(1)], dtype=np.int32))

      # Create target_weights to be 0 for targets that are padding.
      batch_weight = np.ones(1, dtype=np.float32)
      for batch_idx in xrange(1):
        # We set weight to 0 if the corresponding target is a PAD symbol.
        # The corresponding target is decoder_input shifted by 1 forward.
        # if length_idx < decoder_size - 1:
        #   target = decoder_inputs[batch_idx][length_idx + 1]
        # print (length_idx)
        if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID:
          batch_weight[batch_idx] = 0.0
      batch_weights.append(batch_weight)

    batch_labels.append(
        np.array([labels[batch_idx][0]
                  for batch_idx in xrange(1)], dtype=np.int32))

    batch_sequence_length = np.array(batch_sequence_length_list, dtype=np.int32)
    return (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
            batch_sequence_length, batch_labels)
--------------------------------------------------------------------------------