├── data
│   └── ATIS_samples
│       ├── test
│       │   ├── test.label
│       │   ├── test.seq.in
│       │   └── test.seq.out
│       ├── train
│       │   ├── train.label
│       │   ├── train.seq.in
│       │   └── train.seq.out
│       └── valid
│           ├── valid.label
│           ├── valid.seq.in
│           └── valid.seq.out
├── README.md
├── seq_classification.py
├── seq_labeling.py
├── data_utils.py
├── conlleval.pl
├── run_multi-task_rnn.py
└── multi_task_model.py
/data/ATIS_samples/test/test.label:
--------------------------------------------------------------------------------
1 | flight
2 | airfare
3 | flight
4 | flight
5 | flight
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/train/train.label:
--------------------------------------------------------------------------------
1 | airfare
2 | flight
3 | flight
4 | airfare
5 | flight
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/valid/valid.label:
--------------------------------------------------------------------------------
1 | flight
2 | flight
3 | flight_time
4 | airfare
5 | airfare
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/train/train.seq.in:
--------------------------------------------------------------------------------
1 | what's the lowest round trip fare from dallas to atlanta
2 | find me the earliest flight from boston to atlanta on any day of the week
3 | display all flights from boston to baltimore on july thirty first
4 | economy fares new york to miami round trip
5 | i need to fly from boston to denver on to san francisco and back
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/test/test.seq.in:
--------------------------------------------------------------------------------
1 | i would like to find a flight from charlotte to las vegas that makes a stop in st. louis
2 | on april first i need a ticket from tacoma to san jose departing before DIGIT am
3 | on april first i need a flight going from phoenix to san diego
4 | i would like a flight traveling one way from phoenix to san diego on april first
5 | i would like a flight from orlando to salt lake city for april first on delta airlines
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/valid/valid.seq.in:
--------------------------------------------------------------------------------
1 | i want to fly from boston at DIGITDIGITDIGIT am and arrive in denver at DIGITDIGITDIGITDIGIT in the morning
2 | what flights are available from pittsburgh to baltimore on thursday morning
3 | what is the arrival time in san francisco for the DIGITDIGITDIGIT am flight leaving washington
4 | cheapest airfare from tacoma to orlando
5 | round trip fares from pittsburgh to philadelphia under DIGITDIGITDIGITDIGIT dollars
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/train/train.seq.out:
--------------------------------------------------------------------------------
1 | O O B-cost_relative B-round_trip I-round_trip O O B-fromloc.city_name O B-toloc.city_name
2 | O O O B-flight_mod O O B-fromloc.city_name O B-toloc.city_name O O O O O O
3 | O O O O B-fromloc.city_name O B-toloc.city_name O B-depart_date.month_name B-depart_date.day_number I-depart_date.day_number
4 | B-economy O B-fromloc.city_name I-fromloc.city_name O B-toloc.city_name B-round_trip I-round_trip
5 | O O O O O B-fromloc.city_name O B-toloc.city_name O O B-toloc.city_name I-toloc.city_name O O
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/valid/valid.seq.out:
--------------------------------------------------------------------------------
1 | O O O O O B-fromloc.city_name O B-depart_time.time I-depart_time.time O O O B-toloc.city_name O B-arrive_time.time O O B-arrive_time.period_of_day
2 | O O O O O B-fromloc.city_name O B-toloc.city_name O B-depart_date.day_name B-depart_time.period_of_day
3 | O O O B-flight_time I-flight_time O B-fromloc.city_name I-fromloc.city_name O O B-depart_time.time I-depart_time.time O O B-fromloc.city_name
4 | B-cost_relative O O B-fromloc.city_name O B-toloc.city_name
5 | B-round_trip I-round_trip O O B-fromloc.city_name O B-toloc.city_name B-cost_relative B-fare_amount I-fare_amount
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/test/test.seq.out:
--------------------------------------------------------------------------------
1 | O O O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O O O O O B-stoploc.city_name I-stoploc.city_name
2 | O B-depart_date.month_name B-depart_date.day_number O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O B-depart_time.time_relative B-depart_time.time I-depart_time.time
3 | O B-depart_date.month_name B-depart_date.day_number O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name
4 | O O O O O O B-round_trip I-round_trip O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O B-depart_date.month_name B-depart_date.day_number
5 | O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name I-toloc.city_name O B-depart_date.month_name B-depart_date.day_number O B-airline_name I-airline_name
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Attention-based RNN model for Spoken Language Understanding (Intent Detection & Slot Filling)
2 | ==================
3 | TensorFlow implementation of attention-based LSTM models for sequence classification and sequence labeling.
4 |
5 | **Updates - 2017/07/29**
6 | * Updated code to work with the latest TensorFlow API: r1.2
7 | * Code cleanup and formatting
8 | * Note that this published code does not include the modeling of output label dependencies. One may add a loop function, as in the rnn_decoder function in TensorFlow's seq2seq.py example, to feed the emitted label embedding back into the RNN state. Alternatively, sequence-level optimization can be performed by adding a CRF layer on top of the RNN outputs (a sketch follows this file).
9 | * The dataset used in the paper can be found at: https://github.com/yvchen/JointSLU/tree/master/data. We used the training set in the original ATIS train/test split, which has 4978 training samples. There are 15 test samples that have multiple intent labels for an utterance. We used the more frequent label (most likely, "flight") as the true label during evaluation.
10 |
11 |
12 | **Setup**
13 |
14 | * TensorFlow, version r1.2 (https://www.tensorflow.org/api_docs/)
15 |
16 | **Usage**:
17 | ```bash
18 | data_dir=data/ATIS_samples
19 | model_dir=model_tmp
20 | max_sequence_length=50 # max length for train/valid/test sequence
21 | task=joint # available options: intent; tagging; joint
22 | bidirectional_rnn=True # available options: True; False
23 | use_attention=True # available options: True; False
24 |
25 | python run_multi-task_rnn.py --data_dir $data_dir \
26 |       --train_dir $model_dir \
27 | --max_sequence_length $max_sequence_length \
28 | --task $task \
29 | --bidirectional_rnn $bidirectional_rnn \
30 | --use_attention $use_attention
31 | ```
32 |
33 | **Reference**
34 |
35 | * Bing Liu, Ian Lane, "Attention-Based Recurrent Neural Network Models for Joint Intent Detection and Slot Filling", Interspeech, 2016 (PDF)
36 |
37 | ```
38 | @inproceedings{Liu+2016,
39 | author={Bing Liu and Ian Lane},
40 | title={Attention-Based Recurrent Neural Network Models for Joint Intent Detection and Slot Filling},
41 | year=2016,
42 | booktitle={Interspeech 2016},
43 | doi={10.21437/Interspeech.2016-1352},
44 | url={http://dx.doi.org/10.21437/Interspeech.2016-1352},
45 | pages={685--689}
46 | }
47 | ```
48 |
49 | **Contact**
50 |
51 | Feel free to email liubing@cmu.edu with any questions or bug reports regarding the code.
52 |
--------------------------------------------------------------------------------
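The CRF extension mentioned in the README note can be made concrete. Below is a minimal, hypothetical sketch (not part of this repository) built on `tf.contrib.crf`, which is available in TensorFlow r1.2; the tensor names and shapes are assumptions for illustration.

```python
# Hypothetical sketch: sequence-level optimization with a CRF layer on top
# of the per-step tagging logits, as suggested in the README note.
import tensorflow as tf

def crf_tagging_loss(logits, tags, sequence_lengths):
    # Assumed shapes: logits [batch, max_len, num_tags] from the tagging RNN,
    # tags [batch, max_len] gold tag ids, sequence_lengths [batch].
    # crf_log_likelihood learns a [num_tags, num_tags] transition matrix,
    # which models the output label dependencies.
    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
        logits, tags, sequence_lengths)
    return tf.reduce_mean(-log_likelihood), transition_params
```

At test time, `tf.contrib.crf.viterbi_decode` can be applied to each example's score matrix together with the learned transition parameters to recover the best-scoring tag sequence.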
/seq_classification.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Feb 28 15:28:44 2016
4 |
5 | @author: Bing Liu (liubing@cmu.edu)
6 | """
7 |
8 | from __future__ import absolute_import
9 | from __future__ import division
10 | from __future__ import print_function
11 |
12 | from six.moves import xrange # pylint: disable=redefined-builtin
13 | # We disable pylint because we need python3 compatibility.
14 | import tensorflow as tf
15 | from tensorflow.python.ops import rnn_cell_impl
16 |
17 | linear = rnn_cell_impl._linear
18 |
19 | def attention_single_output_decoder(initial_state,
20 | attention_states,
21 | output_size=None,
22 | num_heads=1,
23 | dtype=tf.float32,
24 | scope=None,
25 | sequence_length=tf.ones([16]),
26 | initial_state_attention=True,
27 | use_attention=False):
28 |
29 | if num_heads < 1:
30 | raise ValueError("With less than 1 heads, use a non-attention decoder.")
31 | if not attention_states.get_shape()[1:].is_fully_defined():
32 | raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
33 | % attention_states.get_shape())
34 |
35 | with tf.variable_scope(scope or "decoder_single_output"):
36 | # print (initial_state.eval().shape)
37 | batch_size = tf.shape(initial_state)[0] # Needed for reshaping.
38 | # print (attention_states.get_shape())
39 | attn_length = attention_states.get_shape()[1].value
40 | attn_size = attention_states.get_shape()[2].value
41 |
42 | # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
43 | hidden = tf.reshape(
44 | attention_states, [-1, attn_length, 1, attn_size])
45 | hidden_features = []
46 | v = []
47 | attention_vec_size = attn_size # Size of query vectors for attention.
48 | for a in xrange(num_heads):
49 | k = tf.get_variable("AttnW_%d" % a,
50 | [1, 1, attn_size, attention_vec_size])
51 | hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
52 | v.append(tf.get_variable("AttnV_%d" % a,
53 | [attention_vec_size]))
54 |
55 | # state = initial_state
56 |
57 | def attention(query, use_attention=False):
58 | """Put attention masks on hidden using hidden_features and query."""
59 | attn_weights = []
60 | ds = [] # Results of attention reads will be stored here.
61 | for i in xrange(num_heads):
62 | with tf.variable_scope("Attention_%d" % i):
63 | # y = linear(query, attention_vec_size, True)
64 | y = linear(query, attention_vec_size, True)
65 | y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
66 | # Attention mask is a softmax of v^T * tanh(...).
67 | s = tf.reduce_sum(
68 | v[i] * tf.tanh(hidden_features[i] + y), [2, 3])
69 | if use_attention is False: # apply mean pooling
70 | weights = tf.tile(sequence_length, tf.stack([attn_length]))
71 | weights = tf.reshape(weights, tf.shape(s))
72 | a = tf.ones(tf.shape(s), dtype=dtype) / tf.to_float(weights)
73 | # a = tf.ones(tf.shape(s), dtype=dtype) / tf.to_float(tf.shape(s)[1])
74 | else:
75 | a = tf.nn.softmax(s)
76 | attn_weights.append(a)
77 | # Now calculate the attention-weighted vector d.
78 | d = tf.reduce_sum(
79 | tf.reshape(a, [-1, attn_length, 1, 1]) * hidden,
80 | [1, 2])
81 | ds.append(tf.reshape(d, [-1, attn_size]))
82 | return attn_weights, ds
83 |
84 | batch_attn_size = tf.stack([batch_size, attn_size])
85 | attns = [tf.zeros(batch_attn_size, dtype=dtype)
86 | for _ in xrange(num_heads)]
87 | for a in attns: # Ensure the second shape of attention vectors is set.
88 | a.set_shape([None, attn_size])
89 | if initial_state_attention:
90 | attn_weights, attns = attention(initial_state, use_attention=use_attention)
91 |
92 | #with variable_scope.variable_scope(scope or "Linear"):
93 | matrix = tf.get_variable("Out_Matrix", [attn_size, output_size])
94 | res = tf.matmul(attns[0], matrix)
95 | # NOTE: here we temporarily assume num_head = 1
96 | bias_start = 0.0
97 | bias_term = tf.get_variable("Out_Bias",
98 | [output_size],
99 | initializer=tf.constant_initializer(bias_start))
100 | output = res + bias_term
101 | # NOTE: here we temporarily assume num_head = 1
102 | return attention_states, attn_weights[0], attns[0], [output]
103 |
104 | def generate_single_output(encoder_state, attention_states, sequence_length,
105 | targets, num_classes, buckets,
106 | use_mean_attention=False,
107 | softmax_loss_function=None, per_example_loss=False,
108 | name=None, use_attention=False):
109 | all_inputs = targets
110 | with tf.name_scope(name, "model_with_buckets", all_inputs):
111 | with tf.variable_scope(tf.get_variable_scope(),
112 | reuse=None):
113 | single_outputs = attention_single_output_decoder(encoder_state,
114 | attention_states,
115 | output_size=num_classes,
116 | num_heads=1,
117 | sequence_length=sequence_length,
118 | use_attention=use_attention)
119 | _, _, _, bucket_outputs = single_outputs
120 |
121 | if softmax_loss_function is None:
122 | assert len(bucket_outputs) == len(targets) == 1
123 | # We need to make the target an int64-tensor and set its shape.
124 | bucket_target = tf.reshape(tf.to_int64(targets[0]), [-1])
125 | crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
126 | logits=bucket_outputs[0], labels=bucket_target)
127 | else:
128 | assert len(bucket_outputs) == len(targets) == 1
129 | crossent = softmax_loss_function(bucket_outputs[0], targets[0])
130 |
131 | batch_size = tf.shape(targets[0])[0]
132 | loss = tf.reduce_sum(crossent) / tf.cast(batch_size, tf.float32)
133 |
134 | return bucket_outputs, loss
--------------------------------------------------------------------------------
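The decoder above scores each encoder position with v^T tanh(W1 h_i + W2 q) and pools the hidden states with the resulting weights: softmax weights when `use_attention` is set, and (approximately) uniform mean-pooling weights otherwise. The following is a minimal NumPy sketch of that single-head computation, with illustrative shapes that are not taken from the repository.

```python
# Minimal NumPy sketch of the single-head attention pooling in
# attention_single_output_decoder (T encoder steps, H hidden units).
import numpy as np

def attention_pool(hidden, query, W1, W2, v, use_attention=True):
    # hidden: [T, H] encoder outputs; query: [H] final encoder state.
    scores = np.tanh(hidden @ W1 + query @ W2) @ v         # [T] scores
    if use_attention:
        weights = np.exp(scores - scores.max())
        weights /= weights.sum()                           # softmax weights
    else:
        weights = np.full(len(hidden), 1.0 / len(hidden))  # mean pooling
    return weights @ hidden                                # [H] context vector

T, H = 10, 128
rng = np.random.RandomState(0)
context = attention_pool(rng.randn(T, H), rng.randn(H),
                         rng.randn(H, H), rng.randn(H, H), rng.randn(H))
```

The intent logits are then a single affine projection of this pooled vector, which is what the Out_Matrix and Out_Bias variables implement.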
/seq_labeling.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Feb 28 11:32:21 2016
4 |
5 | @author: Bing Liu (liubing@cmu.edu)
6 | """
7 |
8 | from __future__ import absolute_import
9 | from __future__ import division
10 | from __future__ import print_function
11 |
12 | # We disable pylint because we need python3 compatibility.
13 | from six.moves import xrange # pylint: disable=redefined-builtin
14 | #from six.moves import zip # pylint: disable=redefined-builtin
15 |
16 | import tensorflow as tf
17 | from tensorflow.python.ops import control_flow_ops
18 | from tensorflow.python.framework import tensor_shape
19 | from tensorflow.contrib.legacy_seq2seq import sequence_loss_by_example
20 | from tensorflow.contrib.legacy_seq2seq import sequence_loss
21 |
22 | from tensorflow.python.ops import rnn_cell_impl
23 |
24 | linear = rnn_cell_impl._linear
25 |
26 | def _step(time, sequence_length, min_sequence_length,
27 | max_sequence_length, zero_logit, generate_logit):
28 | # Step 1: determine whether we need to call_cell or not
29 | empty_update = lambda: zero_logit
30 | logit = control_flow_ops.cond(
31 | time < max_sequence_length, generate_logit, empty_update)
32 |
33 | # Step 2: determine whether we need to copy through state and/or outputs
34 | existing_logit = lambda: logit
35 |
36 | def copy_through():
37 | # Use broadcasting select to determine which values should get
38 | # the previous state & zero output, and which values should get
39 | # a calculated state & output.
40 | copy_cond = (time >= sequence_length)
41 | return tf.where(copy_cond, zero_logit, logit)
42 |
43 | logit = control_flow_ops.cond(
44 | time < min_sequence_length, existing_logit, copy_through)
45 | logit.set_shape(zero_logit.get_shape())
46 | return logit
47 |
48 | def attention_RNN(encoder_outputs,
49 | encoder_state,
50 | num_decoder_symbols,
51 | sequence_length,
52 | num_heads=1,
53 | dtype=tf.float32,
54 | use_attention=True,
55 | loop_function=None,
56 | scope=None):
57 | if use_attention:
58 | print ('Use the attention RNN model')
59 | if num_heads < 1:
60 | raise ValueError("With less than 1 heads, use a non-attention decoder.")
61 |
62 | with tf.variable_scope(scope or "attention_RNN"):
63 | output_size = encoder_outputs[0].get_shape()[1].value
64 | top_states = [tf.reshape(e, [-1, 1, output_size])
65 | for e in encoder_outputs]
66 | attention_states = tf.concat(top_states, 1)
67 | if not attention_states.get_shape()[1:].is_fully_defined():
68 | raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
69 | % attention_states.get_shape())
70 |
71 | batch_size = tf.shape(top_states[0])[0] # Needed for reshaping.
72 | attn_length = attention_states.get_shape()[1].value
73 | attn_size = attention_states.get_shape()[2].value
74 |
75 | # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
76 | hidden = tf.reshape(
77 | attention_states, [-1, attn_length, 1, attn_size])
78 | hidden_features = []
79 | v = []
80 | attention_vec_size = attn_size # Size of query vectors for attention.
81 | for a in xrange(num_heads):
82 | k = tf.get_variable("AttnW_%d" % a,
83 | [1, 1, attn_size, attention_vec_size])
84 | hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
85 | v.append(tf.get_variable("AttnV_%d" % a,
86 | [attention_vec_size]))
87 |
88 | def attention(query):
89 | """Put attention masks on hidden using hidden_features and query."""
90 | attn_weights = []
91 | ds = [] # Results of attention reads will be stored here.
92 | for i in xrange(num_heads):
93 | with tf.variable_scope("Attention_%d" % i):
94 | #y = linear(query, attention_vec_size, True)
95 | y = linear(query, attention_vec_size, True)
96 | y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
97 | # Attention mask is a softmax of v^T * tanh(...).
98 | s = tf.reduce_sum(
99 | v[i] * tf.tanh(hidden_features[i] + y), [2, 3])
100 | a = tf.nn.softmax(s)
101 | attn_weights.append(a)
102 | # Now calculate the attention-weighted vector d.
103 | d = tf.reduce_sum(
104 | tf.reshape(a, [-1, attn_length, 1, 1]) * hidden,
105 | [1, 2])
106 | ds.append(tf.reshape(d, [-1, attn_size]))
107 | return attn_weights, ds
108 |
109 | batch_attn_size = tf.stack([batch_size, attn_size])
110 | attns = [tf.zeros(batch_attn_size, dtype=dtype)
111 | for _ in xrange(num_heads)]
112 | for a in attns: # Ensure the second shape of attention vectors is set.
113 | a.set_shape([None, attn_size])
114 |
115 | # loop through the encoder_outputs
116 | attention_encoder_outputs = list()
117 | sequence_attention_weights = list()
118 | for i in xrange(len(encoder_outputs)):
119 | if i > 0:
120 | tf.get_variable_scope().reuse_variables()
121 | if i == 0:
122 | with tf.variable_scope("Initial_Decoder_Attention"):
123 | initial_state = linear(encoder_state, output_size, True)
124 | attn_weights, ds = attention(initial_state)
125 | else:
126 | attn_weights, ds = attention(encoder_outputs[i])
127 | output = tf.concat([ds[0], encoder_outputs[i]], 1)
128 | # NOTE: here we temporarily assume num_head = 1
129 | with tf.variable_scope("AttnRnnOutputProjection"):
130 | logit = linear(output, num_decoder_symbols, True)
131 | attention_encoder_outputs.append(logit)
132 | # NOTE: here we temporarily assume num_head = 1
133 | sequence_attention_weights.append(attn_weights[0])
134 | # NOTE: here we temporarily assume num_head = 1
135 | else:
136 | print ('Use the NON attention RNN model')
137 | with tf.variable_scope(scope or "non-attention_RNN"):
138 | attention_encoder_outputs = list()
139 | sequence_attention_weights = list()
140 |
141 | # copy over logits once out of sequence_length
142 | if encoder_outputs[0].get_shape().ndims != 1:
143 | (fixed_batch_size, output_size) = encoder_outputs[0].get_shape().with_rank(2)
144 | else:
145 | fixed_batch_size = encoder_outputs[0].get_shape().with_rank_at_least(1)[0]
146 |
147 | if fixed_batch_size.value:
148 | batch_size = fixed_batch_size.value
149 | else:
150 | batch_size = tf.shape(encoder_outputs[0])[0]
151 | if sequence_length is not None:
152 | sequence_length = tf.to_int32(sequence_length)
153 | if sequence_length is not None: # Prepare variables
154 | zero_logit = tf.zeros(
155 | tf.stack([batch_size, num_decoder_symbols]), encoder_outputs[0].dtype)
156 | zero_logit.set_shape(
157 | tensor_shape.TensorShape([fixed_batch_size.value,
158 | num_decoder_symbols]))
159 | min_sequence_length = tf.reduce_min(sequence_length)
160 | max_sequence_length = tf.reduce_max(sequence_length)
161 |
162 | #reuse = False
163 | for time, input_ in enumerate(encoder_outputs):
164 | if time > 0:
165 | tf.get_variable_scope().reuse_variables()
166 | #reuse = True
167 | # pylint: disable=cell-var-from-loop
168 | # call_cell = lambda: cell(input_, state)
169 | generate_logit = lambda: linear(encoder_outputs[time],
170 | num_decoder_symbols,
171 | True)
172 | # pylint: enable=cell-var-from-loop
173 | if sequence_length is not None:
174 | logit = _step(time, sequence_length, min_sequence_length,
175 | max_sequence_length, zero_logit, generate_logit)
176 | else:
177 | logit = generate_logit()
178 | attention_encoder_outputs.append(logit)
179 |
180 | return attention_encoder_outputs, sequence_attention_weights
181 |
182 |
183 | def generate_sequence_output(num_encoder_symbols,
184 | encoder_outputs,
185 | encoder_state,
186 | targets,
187 | sequence_length,
188 | num_decoder_symbols,
189 | weights,
190 | buckets,
191 | softmax_loss_function=None,
192 | per_example_loss=False,
193 | name=None,
194 | use_attention=False):
195 | if len(targets) < buckets[-1][1]:
196 | raise ValueError("Length of targets (%d) must be at least that of the "
197 | "last bucket (%d)." % (len(targets), buckets[-1][1]))
198 |
199 | all_inputs = encoder_outputs + targets + weights
200 | with tf.name_scope(name, "model_with_buckets", all_inputs):
201 | with tf.variable_scope("decoder_sequence_output", reuse=None):
202 | logits, attention_weights = attention_RNN(encoder_outputs,
203 | encoder_state,
204 | num_decoder_symbols,
205 | sequence_length,
206 | use_attention=use_attention)
207 | if per_example_loss:
208 | assert len(logits) == len(targets)
209 | # We need to make the target an int64-tensor and set its shape.
210 | bucket_target = [tf.reshape(tf.to_int64(x), [-1]) for x in targets]
211 | crossent = sequence_loss_by_example(
212 | logits, bucket_target, weights,
213 | softmax_loss_function=softmax_loss_function)
214 | else:
215 | assert len(logits) == len(targets)
216 | bucket_target = [tf.reshape(tf.to_int64(x), [-1]) for x in targets]
217 | crossent = sequence_loss(
218 | logits, bucket_target, weights,
219 | softmax_loss_function=softmax_loss_function)
220 |
221 | return logits, crossent
222 |
--------------------------------------------------------------------------------
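In the non-attention branch, the `_step` helper reproduces the masking behavior of TensorFlow's dynamic RNN loop: for time steps at or beyond an example's true length, the computed logit is replaced with zeros so padding positions contribute nothing. A minimal NumPy sketch of the same idea, with illustrative shapes:

```python
# Minimal NumPy sketch of the copy-through masking that _step() performs:
# logits for positions past a sequence's length are zeroed out.
import numpy as np

def mask_logits(logits, sequence_length):
    # logits: [T, batch, num_tags]; sequence_length: [batch] true lengths.
    T = logits.shape[0]
    time = np.arange(T)[:, None]              # [T, 1] step indices
    valid = time < sequence_length[None, :]   # [T, batch] validity mask
    return logits * valid[:, :, None]         # zero out padded steps

logits = np.ones((5, 2, 3))
masked = mask_logits(logits, np.array([3, 5]))
print(masked[:, 0, 0])  # -> [1. 1. 1. 0. 0.] for the length-3 example
```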
/data_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Feb 27 09:33:32 2016
4 |
5 | @author: Bing Liu (liubing@cmu.edu)
6 |
7 | Prepare data for multi-task RNN model.
8 | """
9 |
10 | from __future__ import absolute_import
11 | from __future__ import division
12 | from __future__ import print_function
13 |
14 | import os
15 | import re
16 |
17 | from tensorflow.python.platform import gfile
18 |
19 | # Special vocabulary symbols - we always put them at the start.
20 | _PAD = "_PAD"
21 | _UNK = "_UNK"
22 | _START_VOCAB = [_PAD, _UNK]
23 |
24 | START_VOCAB_dict = dict()
25 | START_VOCAB_dict['with_padding'] = [_PAD, _UNK]
26 | START_VOCAB_dict['no_padding'] = [_UNK]
27 |
28 |
29 | PAD_ID = 0
30 |
31 | UNK_ID_dict = dict()
32 | UNK_ID_dict['with_padding'] = 1
33 | UNK_ID_dict['no_padding'] = 0
34 |
35 | # Regular expressions used to tokenize.
36 | _WORD_SPLIT = re.compile("([.,!?\"':;)(])")
37 | _DIGIT_RE = re.compile(r"\d")
38 |
39 | def basic_tokenizer(sentence):
40 | """Very basic tokenizer: split the sentence into a list of tokens."""
41 | words = []
42 | for space_separated_fragment in sentence.strip().split():
43 | words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
44 | return [w for w in words if w]
45 |
46 | def naive_tokenizer(sentence):
47 | """Naive tokenizer: split the sentence by space into a list of tokens."""
48 | return sentence.split()
49 |
50 |
51 | def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
52 | tokenizer=None, normalize_digits=True):
53 | """Create vocabulary file (if it does not exist yet) from data file.
54 |
55 | Data file is assumed to contain one sentence per line. Each sentence is
56 | tokenized and digits are normalized (if normalize_digits is set).
57 | Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
58 | We write it to vocabulary_path in a one-token-per-line format, so that, later,
59 | the token in the first line gets id=0, the token in the second line gets id=1, and so on.
60 |
61 | Args:
62 | vocabulary_path: path where the vocabulary will be created.
63 | data_path: data file that will be used to create vocabulary.
64 | max_vocabulary_size: limit on the size of the created vocabulary.
65 | tokenizer: a function to use to tokenize each data sentence;
66 | if None, basic_tokenizer will be used.
67 | normalize_digits: Boolean; if true, all digits are replaced by 0s.
68 | """
69 | if not gfile.Exists(vocabulary_path):
70 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
71 | vocab = {}
72 | with gfile.GFile(data_path, mode="r") as f:
73 | counter = 0
74 | for line in f:
75 | counter += 1
76 | if counter % 100000 == 0:
77 | print(" processing line %d" % counter)
78 | tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
79 | for w in tokens:
80 | word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w
81 | if word in vocab:
82 | vocab[word] += 1
83 | else:
84 | vocab[word] = 1
85 | vocab_list = START_VOCAB_dict['with_padding'] + \
86 | sorted(vocab, key=vocab.get, reverse=True)
87 | if len(vocab_list) > max_vocabulary_size:
88 | vocab_list = vocab_list[:max_vocabulary_size]
89 | with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
90 | for w in vocab_list:
91 | vocab_file.write(w + "\n")
92 |
93 |
94 | def initialize_vocab(vocabulary_path):
95 | """Initialize vocabulary from file.
96 |
97 | We assume the vocabulary is stored one-item-per-line, so a file:
98 | dog
99 | cat
100 | will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
101 | also return the reversed-vocabulary ["dog", "cat"].
102 |
103 | Args:
104 | vocabulary_path: path to the file containing the vocabulary.
105 |
106 | Returns:
107 | a pair: the vocabulary (a dictionary mapping string to integers), and
108 | the reversed vocabulary (a list, which reverses the vocabulary mapping).
109 |
110 | Raises:
111 | ValueError: if the provided vocabulary_path does not exist.
112 | """
113 | if gfile.Exists(vocabulary_path):
114 | rev_vocab = []
115 | with gfile.GFile(vocabulary_path, mode="r") as f:
116 | rev_vocab.extend(f.readlines())
117 | rev_vocab = [line.strip() for line in rev_vocab]
118 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
119 | return vocab, rev_vocab
120 | else:
121 | raise ValueError("Vocabulary file %s not found." % vocabulary_path)
122 |
123 |
124 | def sentence_to_token_ids(sentence, vocabulary, UNK_ID,
125 | tokenizer=None, normalize_digits=True):
126 | """Convert a string to list of integers representing token-ids.
127 |
128 | For example, a sentence "I have a dog" may become tokenized into
129 | ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
130 | "a": 4, "dog": 7"} this function will return [1, 2, 4, 7].
131 |
132 | Args:
133 | sentence: a string, the sentence to convert to token-ids.
134 | vocabulary: a dictionary mapping tokens to integers.
135 | tokenizer: a function to use to tokenize each sentence;
136 | if None, basic_tokenizer will be used.
137 | normalize_digits: Boolean; if true, all digits are replaced by 0s.
138 |
139 | Returns:
140 | a list of integers, the token-ids for the sentence.
141 | """
142 | if tokenizer:
143 | words = tokenizer(sentence)
144 | else:
145 | words = basic_tokenizer(sentence)
146 | if not normalize_digits:
147 | return [vocabulary.get(w, UNK_ID) for w in words]
149 | # Normalize digits to 0 before looking words up in the vocabulary.
149 | return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words]
150 |
151 |
152 | def data_to_token_ids(data_path, target_path, vocabulary_path,
153 | tokenizer=None, normalize_digits=True, use_padding=True):
154 | """Tokenize data file and turn into token-ids using given vocabulary file.
155 |
156 | This function loads data line-by-line from data_path, calls the above
157 | sentence_to_token_ids, and saves the result to target_path. See comment
158 | for sentence_to_token_ids on the details of token-ids format.
159 |
160 | Args:
161 | data_path: path to the data file in one-sentence-per-line format.
162 | target_path: path where the file with token-ids will be created.
163 | vocabulary_path: path to the vocabulary file.
164 | tokenizer: a function to use to tokenize each sentence;
165 | if None, basic_tokenizer will be used.
166 | normalize_digits: Boolean; if true, all digits are replaced by 0s.
167 | """
168 | if not gfile.Exists(target_path):
169 | print("Tokenizing data in %s" % data_path)
170 | vocab, _ = initialize_vocab(vocabulary_path)
171 | with gfile.GFile(data_path, mode="r") as data_file:
172 | with gfile.GFile(target_path, mode="w") as tokens_file:
173 | counter = 0
174 | for line in data_file:
175 | counter += 1
176 | if counter % 100000 == 0:
177 | print(" tokenizing line %d" % counter)
178 | if use_padding:
179 | UNK_ID = UNK_ID_dict['with_padding']
180 | else:
181 | UNK_ID = UNK_ID_dict['no_padding']
182 | token_ids = sentence_to_token_ids(line, vocab, UNK_ID, tokenizer,
183 | normalize_digits)
184 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
185 |
186 |
187 |
188 | def create_label_vocab(vocabulary_path, data_path):
189 | if not gfile.Exists(vocabulary_path):
190 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
191 | vocab = {}
192 | with gfile.GFile(data_path, mode="r") as f:
193 | counter = 0
194 | for line in f:
195 | counter += 1
196 | if counter % 100000 == 0:
197 | print(" processing line %d" % counter)
198 | label = line.strip()
199 | vocab[label] = 1
200 | label_list = START_VOCAB_dict['no_padding'] + sorted(vocab)
201 | with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
202 | for k in label_list:
203 | vocab_file.write(k + "\n")
204 |
205 | def prepare_multi_task_data(data_dir, in_vocab_size, out_vocab_size):
206 | train_path = data_dir + '/train/train'
207 | dev_path = data_dir + '/valid/valid'
208 | test_path = data_dir + '/test/test'
209 |
210 | # Create vocabularies of the appropriate sizes.
211 | in_vocab_path = os.path.join(data_dir, "in_vocab_%d.txt" % in_vocab_size)
212 | out_vocab_path = os.path.join(data_dir, "out_vocab_%d.txt" % out_vocab_size)
213 | label_path = os.path.join(data_dir, "label.txt")
214 |
215 | create_vocabulary(in_vocab_path,
216 | train_path + ".seq.in",
217 | in_vocab_size,
218 | tokenizer=naive_tokenizer)
219 | create_vocabulary(out_vocab_path,
220 | train_path + ".seq.out",
221 | out_vocab_size,
222 | tokenizer=naive_tokenizer)
223 | create_label_vocab(label_path, train_path + ".label")
224 |
225 | # Create token ids for the training data.
226 | in_seq_train_ids_path = train_path + (".ids%d.seq.in" % in_vocab_size)
227 | out_seq_train_ids_path = train_path + (".ids%d.seq.out" % out_vocab_size)
228 | label_train_ids_path = train_path + (".ids.label")
229 |
230 | data_to_token_ids(train_path + ".seq.in",
231 | in_seq_train_ids_path,
232 | in_vocab_path,
233 | tokenizer=naive_tokenizer)
234 | data_to_token_ids(train_path + ".seq.out",
235 | out_seq_train_ids_path,
236 | out_vocab_path,
237 | tokenizer=naive_tokenizer)
238 | data_to_token_ids(train_path + ".label",
239 | label_train_ids_path,
240 | label_path,
241 | normalize_digits=False,
242 | use_padding=False)
243 |
244 | # Create token ids for the development data.
245 | in_seq_dev_ids_path = dev_path + (".ids%d.seq.in" % in_vocab_size)
246 | out_seq_dev_ids_path = dev_path + (".ids%d.seq.out" % out_vocab_size)
247 | label_dev_ids_path = dev_path + (".ids.label")
248 |
249 | data_to_token_ids(dev_path + ".seq.in",
250 | in_seq_dev_ids_path,
251 | in_vocab_path,
252 | tokenizer=naive_tokenizer)
253 | data_to_token_ids(dev_path + ".seq.out",
254 | out_seq_dev_ids_path,
255 | out_vocab_path,
256 | tokenizer=naive_tokenizer)
257 | data_to_token_ids(dev_path + ".label",
258 | label_dev_ids_path,
259 | label_path,
260 | normalize_digits=False,
261 | use_padding=False)
262 |
263 | # Create token ids for the test data.
264 | in_seq_test_ids_path = test_path + (".ids%d.seq.in" % in_vocab_size)
265 | out_seq_test_ids_path = test_path + (".ids%d.seq.out" % out_vocab_size)
266 | label_test_ids_path = test_path + (".ids.label")
267 |
268 | data_to_token_ids(test_path + ".seq.in",
269 | in_seq_test_ids_path,
270 | in_vocab_path,
271 | tokenizer=naive_tokenizer)
272 | data_to_token_ids(test_path + ".seq.out",
273 | out_seq_test_ids_path,
274 | out_vocab_path,
275 | tokenizer=naive_tokenizer)
276 | data_to_token_ids(test_path + ".label",
277 | label_test_ids_path,
278 | label_path,
279 | normalize_digits=False,
280 | use_padding=False)
281 |
282 | return [(in_seq_train_ids_path,out_seq_train_ids_path,label_train_ids_path),
283 | (in_seq_dev_ids_path, out_seq_dev_ids_path, label_dev_ids_path),
284 | (in_seq_test_ids_path, out_seq_test_ids_path, label_test_ids_path),
285 | (in_vocab_path, out_vocab_path, label_path)]
--------------------------------------------------------------------------------
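As a quick orientation, here is a short usage sketch of the pipeline above on the bundled samples; the vocabulary sizes and paths are illustrative, and `run_multi-task_rnn.py` calls the same function during training.

```python
# Usage sketch: build vocabularies and token-id files for the sample data.
# Paths and sizes are illustrative; adjust to your checkout.
import data_utils

data_set = data_utils.prepare_multi_task_data('data/ATIS_samples',
                                              in_vocab_size=10000,
                                              out_vocab_size=10000)
train_ids, dev_ids, test_ids, vocab_paths = data_set
in_vocab_path, tag_vocab_path, label_vocab_path = vocab_paths
vocab, rev_vocab = data_utils.initialize_vocab(in_vocab_path)
print('input vocabulary size: %d' % len(rev_vocab))
```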
/conlleval.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | # conlleval: evaluate result of processing CoNLL-2000 shared task
3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file
4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html
5 | # options: l: generate LaTeX output for tables like in
6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex
7 | # r: accept raw result tags (without B- and I- prefix;
8 | # assumes one word per chunk)
9 | # d: alternative delimiter tag (default is single space)
10 | # o: alternative outside tag (default is O)
11 | # note: the file should contain lines with items separated
12 | # by $delimiter characters (default space). The final
13 | # two items should contain the correct tag and the
14 | # guessed tag in that order. Sentences should be
15 | # separated from each other by empty lines or lines
16 | # with $boundary fields (default -X-).
17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/
18 | # started: 1998-09-25
19 | # version: 2004-01-26
20 | # author: Erik Tjong Kim Sang
21 |
22 | use strict;
23 |
24 | my $false = 0;
25 | my $true = 42;
26 |
27 | my $boundary = "-X-"; # sentence boundary
28 | my $correct; # current corpus chunk tag (I,O,B)
29 | my $correctChunk = 0; # number of correctly identified chunks
30 | my $correctTags = 0; # number of correct chunk tags
31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.)
32 | my $delimiter = " "; # field delimiter
33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979)
34 | my $firstItem; # first feature (for sentence boundary checks)
35 | my $foundCorrect = 0; # number of chunks in corpus
36 | my $foundGuessed = 0; # number of identified chunks
37 | my $guessed; # current guessed chunk tag
38 | my $guessedType; # type of current guessed chunk tag
39 | my $i; # miscellaneous counter
40 | my $inCorrect = $false; # currently processed chunk is correct until now
41 | my $lastCorrect = "O"; # previous chunk tag in corpus
42 | my $latex = 0; # generate LaTeX formatted output
43 | my $lastCorrectType = ""; # type of previously identified chunk tag
44 | my $lastGuessed = "O"; # previously identified chunk tag
45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus
46 | my $lastType; # temporary storage for detecting duplicates
47 | my $line; # line
48 | my $nbrOfFeatures = -1; # number of features per line
49 | my $precision = 0.0; # precision score
50 | my $oTag = "O"; # outside tag, default O
51 | my $raw = 0; # raw input: add B to every token
52 | my $recall = 0.0; # recall score
53 | my $tokenCounter = 0; # token counter (ignores sentence breaks)
54 |
55 | my %correctChunk = (); # number of correctly identified chunks per type
56 | my %foundCorrect = (); # number of chunks in corpus per type
57 | my %foundGuessed = (); # number of identified chunks per type
58 |
59 | my @features; # features on line
60 | my @sortedTypes; # sorted list of chunk type names
61 |
62 | # sanity check
63 | while (@ARGV and $ARGV[0] =~ /^-/) {
64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); }
65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); }
66 | elsif ($ARGV[0] eq "-d") {
67 | shift(@ARGV);
68 | if (not defined $ARGV[0]) {
69 | die "conlleval: -d requires delimiter character";
70 | }
71 | $delimiter = shift(@ARGV);
72 | } elsif ($ARGV[0] eq "-o") {
73 | shift(@ARGV);
74 | if (not defined $ARGV[0]) {
75 | die "conlleval: -o requires delimiter character";
76 | }
77 | $oTag = shift(@ARGV);
78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; }
79 | }
80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; }
81 | # process input
82 | while (<STDIN>) {
83 | chomp($line = $_);
84 | @features = split(/$delimiter/,$line);
85 |
86 | #printf $line;
87 | #printf STDERR $#features;
88 | #printf "\n";
89 |
90 | #printf $nbrOfFeatures;
91 | #printf "\n";
92 | #printf $#features;
93 | #printf "\n";
94 |
95 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; }
96 | elsif ($nbrOfFeatures != $#features and @features != 0) {
97 | printf STDERR "unexpected number of features: %d (%d)\n",
98 | $#features+1,$nbrOfFeatures+1;
99 | exit(1);
100 | }
101 | if (@features == 0 or
102 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); }
103 | if (@features < 2) {
104 | die "conlleval: unexpected number of features in line $line\n";
105 | }
106 | if ($raw) {
107 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; }
108 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; }
109 | if ($features[$#features] ne "O") {
110 | $features[$#features] = "B-$features[$#features]";
111 | }
112 | if ($features[$#features-1] ne "O") {
113 | $features[$#features-1] = "B-$features[$#features-1]";
114 | }
115 | }
116 | # 20040126 ET code which allows hyphens in the types
117 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
118 | $guessed = $1;
119 | $guessedType = $2;
120 | } else {
121 | $guessed = $features[$#features];
122 | $guessedType = "";
123 | }
124 | pop(@features);
125 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
126 | $correct = $1;
127 | $correctType = $2;
128 | } else {
129 | $correct = $features[$#features];
130 | $correctType = "";
131 | }
132 | pop(@features);
133 | # ($guessed,$guessedType) = split(/-/,pop(@features));
134 | # ($correct,$correctType) = split(/-/,pop(@features));
135 | $guessedType = $guessedType ? $guessedType : "";
136 | $correctType = $correctType ? $correctType : "";
137 | $firstItem = shift(@features);
138 |
139 | # 1999-06-26 sentence breaks should always be counted as out of chunk
140 | if ( $firstItem eq $boundary ) { $guessed = "O"; }
141 |
142 | if ($inCorrect) {
143 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and
144 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and
145 | $lastGuessedType eq $lastCorrectType) {
146 | $inCorrect=$false;
147 | $correctChunk++;
148 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ?
149 | $correctChunk{$lastCorrectType}+1 : 1;
150 | } elsif (
151 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) !=
152 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or
153 | $guessedType ne $correctType ) {
154 | $inCorrect=$false;
155 | }
156 | }
157 |
158 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and
159 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and
160 | $guessedType eq $correctType) { $inCorrect = $true; }
161 |
162 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) {
163 | $foundCorrect++;
164 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ?
165 | $foundCorrect{$correctType}+1 : 1;
166 | }
167 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) {
168 | $foundGuessed++;
169 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ?
170 | $foundGuessed{$guessedType}+1 : 1;
171 | }
172 | if ( $firstItem ne $boundary ) {
173 | if ( $correct eq $guessed and $guessedType eq $correctType ) {
174 | $correctTags++;
175 | }
176 | $tokenCounter++;
177 | }
178 |
179 | $lastGuessed = $guessed;
180 | $lastCorrect = $correct;
181 | $lastGuessedType = $guessedType;
182 | $lastCorrectType = $correctType;
183 | }
184 | if ($inCorrect) {
185 | $correctChunk++;
186 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ?
187 | $correctChunk{$lastCorrectType}+1 : 1;
188 | }
189 |
190 | if (not $latex) {
191 | # compute overall precision, recall and FB1 (default values are 0.0)
192 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0);
193 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0);
194 | $FB1 = 2*$precision*$recall/($precision+$recall)
195 | if ($precision+$recall > 0);
196 |
197 | # print overall performance
198 | printf "processed $tokenCounter tokens with $foundCorrect phrases; ";
199 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n";
200 | if ($tokenCounter>0) {
201 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter;
202 | print "$correctChunk $foundCorrect $foundGuessed ";
203 | printf "precision: %6.2f%%; ",$precision;
204 | printf "recall: %6.2f%%; ",$recall;
205 | printf "FB1: %6.2f\n",$FB1;
206 | }
207 | }
208 |
209 | # sort chunk type names
210 | undef($lastType);
211 | @sortedTypes = ();
212 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) {
213 | if (not($lastType) or $lastType ne $i) {
214 | push(@sortedTypes,($i));
215 | }
216 | $lastType = $i;
217 | }
218 | # print performance per chunk type
219 | if (not $latex) {
220 | for $i (@sortedTypes) {
221 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
222 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; }
223 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; }
224 | if (not($foundCorrect{$i})) { $recall = 0.0; }
225 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; }
226 | if ($precision+$recall == 0.0) { $FB1 = 0.0; }
227 | else { $FB1 = 2*$precision*$recall/($precision+$recall); }
228 | printf "%17s: ",$i;
229 | printf "% 4d % 4d % 4d ", $correctChunk{$i}, $foundCorrect{$i}, $foundGuessed{$i};
230 | printf "precision: %6.2f%%; ",$precision;
231 | printf "recall: %6.2f%%; ",$recall;
232 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i};
233 | }
234 | } else {
235 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline";
236 | for $i (@sortedTypes) {
237 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
238 | if (not($foundGuessed{$i})) { $precision = 0.0; }
239 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; }
240 | if (not($foundCorrect{$i})) { $recall = 0.0; }
241 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; }
242 | if ($precision+$recall == 0.0) { $FB1 = 0.0; }
243 | else { $FB1 = 2*$precision*$recall/($precision+$recall); }
244 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\",
245 | $i,$precision,$recall,$FB1;
246 | }
247 | print "\\hline\n";
248 | $precision = 0.0;
249 | $recall = 0;
250 | $FB1 = 0.0;
251 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0);
252 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0);
253 | $FB1 = 2*$precision*$recall/($precision+$recall)
254 | if ($precision+$recall > 0);
255 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n",
256 | $precision,$recall,$FB1;
257 | }
258 |
259 | exit 0;
260 |
261 | # endOfChunk: checks if a chunk ended between the previous and current word
262 | # arguments: previous and current chunk tags, previous and current types
263 | # note: this code is capable of handling other chunk representations
264 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
265 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
266 |
267 | sub endOfChunk {
268 | my $prevTag = shift(@_);
269 | my $tag = shift(@_);
270 | my $prevType = shift(@_);
271 | my $type = shift(@_);
272 | my $chunkEnd = $false;
273 |
274 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; }
275 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; }
276 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; }
277 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; }
278 |
279 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; }
280 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; }
281 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; }
282 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; }
283 |
284 | if ($prevTag ne "O" and $prevTag ne "." and $prevType ne $type) {
285 | $chunkEnd = $true;
286 | }
287 |
288 | # corrected 1998-12-22: these chunks are assumed to have length 1
289 | if ( $prevTag eq "]" ) { $chunkEnd = $true; }
290 | if ( $prevTag eq "[" ) { $chunkEnd = $true; }
291 |
292 | return($chunkEnd);
293 | }
294 |
295 | # startOfChunk: checks if a chunk started between the previous and current word
296 | # arguments: previous and current chunk tags, previous and current types
297 | # note: this code is capable of handling other chunk representations
298 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
299 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
300 |
301 | sub startOfChunk {
302 | my $prevTag = shift(@_);
303 | my $tag = shift(@_);
304 | my $prevType = shift(@_);
305 | my $type = shift(@_);
306 | my $chunkStart = $false;
307 |
308 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; }
309 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; }
310 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; }
311 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; }
312 |
313 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; }
314 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; }
315 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; }
316 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; }
317 |
318 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) {
319 | $chunkStart = $true;
320 | }
321 |
322 | # corrected 1998-12-22: these chunks are assumed to have length 1
323 | if ( $tag eq "[" ) { $chunkStart = $true; }
324 | if ( $tag eq "]" ) { $chunkStart = $true; }
325 |
326 | return($chunkStart);
327 | }
328 |
--------------------------------------------------------------------------------
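`conlleval.pl` expects one token per line, with the gold tag and the guessed tag as the final two fields and blank lines separating sentences; `get_perf()` in `run_multi-task_rnn.py` feeds it exactly this format. A small self-contained sketch follows; the file name and tag values are made up for illustration.

```python
# Sketch: write one sentence in conlleval's "word gold-tag guessed-tag"
# format and score it with the Perl script.
import subprocess

lines = [
    'flights O O',
    'to O O',
    'boston B-toloc.city_name B-toloc.city_name',
    '',  # blank line marks the sentence boundary
]
with open('sample.hyp.txt', 'w') as f:
    f.write('\n'.join(lines) + '\n')

with open('sample.hyp.txt') as f:
    print(subprocess.check_output(['perl', 'conlleval.pl'], stdin=f))
```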
/run_multi-task_rnn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Feb 28 16:23:37 2016
4 |
5 | @author: Bing Liu (liubing@cmu.edu)
6 | """
7 |
8 | from __future__ import absolute_import
9 | from __future__ import division
10 | from __future__ import print_function
11 |
12 | import math
13 | import os
14 | import sys
15 | import time
16 |
17 | import numpy as np
18 | from six.moves import xrange # pylint: disable=redefined-builtin
19 | import tensorflow as tf
20 |
21 | import data_utils
22 | import multi_task_model
23 |
24 | import subprocess
25 | import stat
26 |
27 |
28 | #tf.app.flags.DEFINE_float("learning_rate", 0.1, "Learning rate.")
29 | #tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.9,
30 | # "Learning rate decays by this much.")
31 | tf.app.flags.DEFINE_float("max_gradient_norm", 5.0,
32 | "Clip gradients to this norm.")
33 | tf.app.flags.DEFINE_integer("batch_size", 16,
34 | "Batch size to use during training.")
35 | tf.app.flags.DEFINE_integer("size", 128, "Size of each model layer.")
36 | tf.app.flags.DEFINE_integer("word_embedding_size", 128, "word embedding size")
37 | tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.")
38 | tf.app.flags.DEFINE_integer("in_vocab_size", 10000, "max vocab Size.")
39 | tf.app.flags.DEFINE_integer("out_vocab_size", 10000, "max tag vocab Size.")
40 | tf.app.flags.DEFINE_string("data_dir", "/tmp", "Data directory")
41 | tf.app.flags.DEFINE_string("train_dir", "/tmp", "Training directory.")
42 | tf.app.flags.DEFINE_integer("max_train_data_size", 0,
43 | "Limit on the size of training data (0: no limit)")
44 | tf.app.flags.DEFINE_integer("steps_per_checkpoint", 100,
45 | "How many training steps to do per checkpoint.")
46 | tf.app.flags.DEFINE_integer("max_training_steps", 30000,
47 | "Max training steps.")
48 | tf.app.flags.DEFINE_integer("max_test_data_size", 0,
49 | "Max size of test set.")
50 | tf.app.flags.DEFINE_boolean("use_attention", True,
51 | "Use attention based RNN")
52 | tf.app.flags.DEFINE_integer("max_sequence_length", 0,
53 | "Max sequence length.")
54 | tf.app.flags.DEFINE_float("dropout_keep_prob", 0.5,
55 | "dropout keep cell input and output prob.")
56 | tf.app.flags.DEFINE_boolean("bidirectional_rnn", True,
57 | "Use birectional RNN")
58 | tf.app.flags.DEFINE_string("task", None, "Options: joint; intent; tagging")
59 | FLAGS = tf.app.flags.FLAGS
60 |
61 | if FLAGS.max_sequence_length == 0:
62 | print ('Please indicate max sequence length. Exit')
63 | exit()
64 |
65 | if FLAGS.task is None:
66 | print ('Please indicate task to run. ' +
67 | 'Available options: intent; tagging; joint')
68 | exit()
69 |
70 | task = dict({'intent':0, 'tagging':0, 'joint':0})
71 | if FLAGS.task == 'intent':
72 | task['intent'] = 1
73 | elif FLAGS.task == 'tagging':
74 | task['tagging'] = 1
75 | elif FLAGS.task == 'joint':
76 | task['intent'] = 1
77 | task['tagging'] = 1
78 | task['joint'] = 1
79 |
80 | _buckets = [(FLAGS.max_sequence_length, FLAGS.max_sequence_length)]
81 | #_buckets = [(3, 10), (10, 25)]
82 |
83 | # metrics function using conlleval.pl
84 | def conlleval(p, g, w, filename):
85 | '''
86 | INPUT:
87 | p :: predictions
88 | g :: groundtruth
89 | w :: corresponding words
90 |
91 | OUTPUT:
92 | filename :: name of the file where the predictions
93 | are written. it will be the input of conlleval.pl script
94 | for computing the performance in terms of precision
95 | recall and f1 score
96 | '''
97 | out = ''
98 | for sl, sp, sw in zip(g, p, w):
99 | out += 'BOS O O\n'
100 | for wl, wp, w in zip(sl, sp, sw):
101 | out += w + ' ' + wl + ' ' + wp + '\n'
102 | out += 'EOS O O\n\n'
103 |
104 | f = open(filename, 'w')
105 | f.writelines(out[:-1]) # remove the ending \n on last line
106 | f.close()
107 |
108 | return get_perf(filename)
109 |
110 | def get_perf(filename):
111 | ''' run conlleval.pl perl script to obtain
112 | precision/recall and F1 score '''
113 | _conlleval = os.path.dirname(os.path.realpath(__file__)) + '/conlleval.pl'
114 | os.chmod(_conlleval, stat.S_IRWXU) # give the execute permissions
115 |
116 | proc = subprocess.Popen(["perl",
117 | _conlleval],
118 | stdin=subprocess.PIPE,
119 | stdout=subprocess.PIPE)
120 |
121 | stdout, _ = proc.communicate(''.join(open(filename).readlines()))
122 | for line in stdout.split('\n'):
123 | if 'accuracy' in line:
124 | out = line.split()
125 | break
126 |
127 | precision = float(out[6][:-2])
128 | recall = float(out[8][:-2])
129 | f1score = float(out[10])
130 |
131 | return {'p': precision, 'r': recall, 'f1': f1score}
132 |
133 |
134 | def read_data(source_path, target_path, label_path, max_size=None):
135 | """Read data from source and target files and put into buckets.
136 |
137 | Args:
138 | source_path: path to the files with token-ids for the word sequence.
139 | target_path: path to the file with token-ids for the tag sequence;
140 | it must be aligned with the source file: n-th line contains the desired
141 | output for n-th line from the source_path.
142 | label_path: path to the file with token-ids for the intent label
143 | max_size: maximum number of lines to read, all other will be ignored;
144 | if 0 or None, data files will be read completely (no limit).
145 |
146 | Returns:
147 | data_set: a list of length len(_buckets); data_set[n] contains a list of
148 | (source, target, label) tuple read from the provided data files that fit
149 | into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
150 | len(target) < _buckets[n][1]; source, target, and label are lists of token-ids.
151 | """
152 | data_set = [[] for _ in _buckets]
153 | with tf.gfile.GFile(source_path, mode="r") as source_file:
154 | with tf.gfile.GFile(target_path, mode="r") as target_file:
155 | with tf.gfile.GFile(label_path, mode="r") as label_file:
156 | source = source_file.readline()
157 | target = target_file.readline()
158 | label = label_file.readline()
159 | counter = 0
160 | while source and target and label and (not max_size \
161 | or counter < max_size):
162 | counter += 1
163 | if counter % 100000 == 0:
164 | print(" reading data line %d" % counter)
165 | sys.stdout.flush()
166 | source_ids = [int(x) for x in source.split()]
167 | target_ids = [int(x) for x in target.split()]
168 | label_ids = [int(x) for x in label.split()]
169 | # target_ids.append(data_utils.EOS_ID)
170 | for bucket_id, (source_size, target_size) in enumerate(_buckets):
171 | if len(source_ids) < source_size and len(target_ids) < target_size:
172 | data_set[bucket_id].append([source_ids, target_ids, label_ids])
173 | break
174 | source = source_file.readline()
175 | target = target_file.readline()
176 | label = label_file.readline()
177 | return data_set # 3 outputs in each unit: source_ids, target_ids, label_ids
178 |
179 | def create_model(session,
180 | source_vocab_size,
181 | target_vocab_size,
182 | label_vocab_size):
183 | """Create model and initialize or load parameters in session."""
184 | with tf.variable_scope("model", reuse=None):
185 | model_train = multi_task_model.MultiTaskModel(
186 | source_vocab_size,
187 | target_vocab_size,
188 | label_vocab_size,
189 | _buckets,
190 | FLAGS.word_embedding_size,
191 | FLAGS.size, FLAGS.num_layers,
192 | FLAGS.max_gradient_norm,
193 | FLAGS.batch_size,
194 | dropout_keep_prob=FLAGS.dropout_keep_prob,
195 | use_lstm=True,
196 | forward_only=False,
197 | use_attention=FLAGS.use_attention,
198 | bidirectional_rnn=FLAGS.bidirectional_rnn,
199 | task=task)
200 | with tf.variable_scope("model", reuse=True):
201 | model_test = multi_task_model.MultiTaskModel(
202 | source_vocab_size,
203 | target_vocab_size,
204 | label_vocab_size,
205 | _buckets,
206 | FLAGS.word_embedding_size,
207 | FLAGS.size,
208 | FLAGS.num_layers,
209 | FLAGS.max_gradient_norm,
210 | FLAGS.batch_size,
211 | dropout_keep_prob=FLAGS.dropout_keep_prob,
212 | use_lstm=True,
213 | forward_only=True,
214 | use_attention=FLAGS.use_attention,
215 | bidirectional_rnn=FLAGS.bidirectional_rnn,
216 | task=task)
217 |
218 | ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
219 | if ckpt:
220 | print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
221 | model_train.saver.restore(session, ckpt.model_checkpoint_path)
222 | else:
223 | print("Created model with fresh parameters.")
224 | session.run(tf.global_variables_initializer())
225 | return model_train, model_test
226 |
227 | def train():
228 | print ('Applying Parameters:')
229 | for k,v in FLAGS.__dict__['__flags'].iteritems():
230 | print ('%s: %s' % (k, str(v)))
231 | print("Preparing data in %s" % FLAGS.data_dir)
232 | vocab_path = ''
233 | tag_vocab_path = ''
234 | label_vocab_path = ''
235 | data_set = data_utils.prepare_multi_task_data(
236 | FLAGS.data_dir, FLAGS.in_vocab_size, FLAGS.out_vocab_size)
237 | in_seq_train, out_seq_train, label_train = data_set[0]
238 | in_seq_dev, out_seq_dev, label_dev = data_set[1]
239 | in_seq_test, out_seq_test, label_test = data_set[2]
240 | vocab_path, tag_vocab_path, label_vocab_path = data_set[3]
241 |
242 | result_dir = FLAGS.train_dir + '/test_results'
243 | if not os.path.isdir(result_dir):
244 | os.makedirs(result_dir)
245 |
246 | current_taging_valid_out_file = result_dir + '/tagging.valid.hyp.txt'
247 | current_taging_test_out_file = result_dir + '/tagging.test.hyp.txt'
248 |
249 | vocab, rev_vocab = data_utils.initialize_vocab(vocab_path)
250 | tag_vocab, rev_tag_vocab = data_utils.initialize_vocab(tag_vocab_path)
251 | label_vocab, rev_label_vocab = data_utils.initialize_vocab(label_vocab_path)
252 |
253 | config = tf.ConfigProto(
254 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.23),
255 | #device_count = {'gpu': 2}
256 | )
257 |
258 | with tf.Session(config=config) as sess:
259 | # Create model.
260 | print("Max sequence length: %d." % _buckets[0][0])
261 | print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
262 |
263 | model, model_test = create_model(sess,
264 | len(vocab),
265 | len(tag_vocab),
266 | len(label_vocab))
267 | print ("Creating model with " +
268 | "source_vocab_size=%d, target_vocab_size=%d, label_vocab_size=%d." \
269 | % (len(vocab), len(tag_vocab), len(label_vocab)))
270 |
271 | # Read data into buckets and compute their sizes.
272 | print ("Reading train/valid/test data (training set limit: %d)."
273 | % FLAGS.max_train_data_size)
274 | dev_set = read_data(in_seq_dev, out_seq_dev, label_dev)
275 | test_set = read_data(in_seq_test, out_seq_test, label_test)
276 | train_set = read_data(in_seq_train, out_seq_train, label_train)
277 | train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
278 | train_total_size = float(sum(train_bucket_sizes))
279 |
280 | train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
281 | for i in xrange(len(train_bucket_sizes))]
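    # For illustration: bucket sizes of [300, 100] give train_buckets_scale =
    # [0.75, 1.0], so drawing a uniform number in [0, 1) below selects bucket 0
    # with probability 0.75 and bucket 1 with probability 0.25.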
282 |
283 | # This is the training loop.
284 | step_time, loss = 0.0, 0.0
285 | current_step = 0
286 |
287 | best_valid_score = 0
288 | best_test_score = 0
289 | while model.global_step.eval() < FLAGS.max_training_steps:
290 | random_number_01 = np.random.random_sample()
291 | bucket_id = min([i for i in xrange(len(train_buckets_scale))
292 | if train_buckets_scale[i] > random_number_01])
293 |
294 | # Get a batch and make a step.
295 | start_time = time.time()
296 | batch_data = model.get_batch(train_set, bucket_id)
297 |       encoder_inputs, tags, tag_weights, batch_sequence_length, labels = batch_data
298 | if task['joint'] == 1:
299 | step_outputs = model.joint_step(sess,
300 | encoder_inputs,
301 | tags,
302 | tag_weights,
303 | labels,
304 | batch_sequence_length,
305 | bucket_id,
306 | False)
307 | _, step_loss, tagging_logits, class_logits = step_outputs
308 | elif task['tagging'] == 1:
309 | step_outputs = model.tagging_step(sess,
310 | encoder_inputs,
311 | tags,
312 | tag_weights,
313 | batch_sequence_length,
314 | bucket_id,
315 | False)
316 | _, step_loss, tagging_logits = step_outputs
317 | elif task['intent'] == 1:
318 | step_outputs = model.classification_step(sess,
319 | encoder_inputs,
320 | labels,
321 | batch_sequence_length,
322 | bucket_id,
323 | False)
324 | _, step_loss, class_logits = step_outputs
325 |
326 | step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
327 | loss += step_loss / FLAGS.steps_per_checkpoint
328 | current_step += 1
329 |
330 | # Once in a while, we save checkpoint, print statistics, and run evals.
331 | if current_step % FLAGS.steps_per_checkpoint == 0:
332 | perplexity = math.exp(loss) if loss < 300 else float('inf')
333 | print ("global step %d step-time %.2f. Training perplexity %.2f"
334 | % (model.global_step.eval(), step_time, perplexity))
335 | sys.stdout.flush()
336 | # Save checkpoint and zero timer and loss.
337 | checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
338 | model.saver.save(sess, checkpoint_path, global_step=model.global_step)
339 | step_time, loss = 0.0, 0.0
340 |
341 | def run_valid_test(data_set, mode): # mode: Eval, Test
342 | # Run evals on development/test set and print the accuracy.
343 | word_list = list()
344 | ref_tag_list = list()
345 | hyp_tag_list = list()
346 | ref_label_list = list()
347 | hyp_label_list = list()
348 | correct_count = 0
349 | accuracy = 0.0
350 | tagging_eval_result = dict()
351 | for bucket_id in xrange(len(_buckets)):
352 | eval_loss = 0.0
353 | count = 0
354 | for i in xrange(len(data_set[bucket_id])):
355 | count += 1
356 | sample = model_test.get_one(data_set, bucket_id, i)
357 |           encoder_inputs, tags, tag_weights, sequence_length, labels = sample
358 | tagging_logits = []
359 | class_logits = []
360 | if task['joint'] == 1:
361 | step_outputs = model_test.joint_step(sess,
362 | encoder_inputs,
363 | tags,
364 | tag_weights,
365 | labels,
366 | sequence_length,
367 | bucket_id,
368 | True)
369 | _, step_loss, tagging_logits, class_logits = step_outputs
370 | elif task['tagging'] == 1:
371 | step_outputs = model_test.tagging_step(sess,
372 | encoder_inputs,
373 | tags,
374 | tag_weights,
375 | sequence_length,
376 | bucket_id,
377 | True)
378 | _, step_loss, tagging_logits = step_outputs
379 | elif task['intent'] == 1:
380 | step_outputs = model_test.classification_step(sess,
381 | encoder_inputs,
382 | labels,
383 | sequence_length,
384 | bucket_id,
385 | True)
386 | _, step_loss, class_logits = step_outputs
387 | eval_loss += step_loss / len(data_set[bucket_id])
388 | hyp_label = None
389 | if task['intent'] == 1:
390 | ref_label_list.append(rev_label_vocab[labels[0][0]])
391 | hyp_label = np.argmax(class_logits[0],0)
392 | hyp_label_list.append(rev_label_vocab[hyp_label])
393 | if labels[0] == hyp_label:
394 | correct_count += 1
395 | if task['tagging'] == 1:
396 | word_list.append([rev_vocab[x[0]] for x in \
397 | encoder_inputs[:sequence_length[0]]])
398 | ref_tag_list.append([rev_tag_vocab[x[0]] for x in \
399 | tags[:sequence_length[0]]])
400 | hyp_tag_list.append(
401 | [rev_tag_vocab[np.argmax(x)] for x in \
402 | tagging_logits[:sequence_length[0]]])
403 |
404 | accuracy = float(correct_count)*100/count
405 | if task['intent'] == 1:
406 | print(" %s accuracy: %.2f %d/%d" \
407 | % (mode, accuracy, correct_count, count))
408 | sys.stdout.flush()
409 | if task['tagging'] == 1:
410 |         if mode == 'Eval':
411 |           tagging_out_file = current_tagging_valid_out_file
412 |         elif mode == 'Test':
413 |           tagging_out_file = current_tagging_test_out_file
414 |         tagging_eval_result = conlleval(hyp_tag_list,
415 |                                         ref_tag_list,
416 |                                         word_list,
417 |                                         tagging_out_file)
418 | print(" %s f1-score: %.2f" % (mode, tagging_eval_result['f1']))
419 | sys.stdout.flush()
420 | return accuracy, tagging_eval_result
421 |
422 | # valid
423 | valid_accuracy, valid_tagging_result = run_valid_test(dev_set, 'Eval')
424 | if task['tagging'] == 1 \
425 | and valid_tagging_result['f1'] > best_valid_score:
426 | best_valid_score = valid_tagging_result['f1']
427 | # save the best output file
428 |       subprocess.call(['mv',
429 |                        current_tagging_valid_out_file,
430 |                        current_tagging_valid_out_file + '.best_f1_%.2f' \
431 |                        % best_valid_score])
432 | # test, run test after each validation for development purpose.
433 | test_accuracy, test_tagging_result = run_valid_test(test_set, 'Test')
434 | if task['tagging'] == 1 \
435 | and test_tagging_result['f1'] > best_test_score:
436 | best_test_score = test_tagging_result['f1']
437 | # save the best output file
438 |       subprocess.call(['mv',
439 |                        current_tagging_test_out_file,
440 |                        current_tagging_test_out_file + '.best_f1_%.2f' \
441 |                        % best_test_score])
442 |
443 | def main(_):
444 | train()
445 |
446 | if __name__ == "__main__":
447 | tf.app.run()
448 |
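# A sample invocation (values are illustrative; --data_dir and --train_dir
# correspond to the FLAGS referenced above, and the flag selecting the
# joint/tagging/intent mode is assumed here to be --task):
#   python run_multi-task_rnn.py --data_dir data/ATIS_samples \
#       --train_dir ./model_tmp --task joint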
--------------------------------------------------------------------------------
/multi_task_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Feb 28 17:28:22 2016
4 |
5 | @author: Bing Liu (liubing@cmu.edu)
6 |
7 | Multi-task RNN model with an attention mechanism.
8 | - Developed on top of the TensorFlow seq2seq_model.py example:
9 |   https://github.com/tensorflow/models/blob/master/tutorials/rnn/translate/seq2seq_model.py
10 | - Note that this example code does not include output label dependency modeling.
11 |   One may add a loop function, as in the rnn_decoder function of the TensorFlow
12 |   seq2seq.py example, to feed the emitted label embedding back into the RNN state.
13 | """
14 |
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 |
19 | import random
20 |
21 | import numpy as np
22 | from six.moves import xrange # pylint: disable=redefined-builtin
23 | import tensorflow as tf
24 |
25 |
26 | import data_utils
27 | import seq_labeling
28 | import seq_classification
29 | from tensorflow.contrib.rnn import BasicLSTMCell
30 | from tensorflow.contrib.rnn import MultiRNNCell
31 | from tensorflow.contrib.rnn import DropoutWrapper
32 | from tensorflow.contrib.rnn import static_rnn
33 | from tensorflow.contrib.rnn import static_bidirectional_rnn
34 |
35 |
36 | class MultiTaskModel(object):
37 | def __init__(self,
38 | source_vocab_size,
39 | tag_vocab_size,
40 | label_vocab_size,
41 | buckets,
42 | word_embedding_size,
43 | size,
44 | num_layers,
45 | max_gradient_norm,
46 | batch_size,
47 | dropout_keep_prob=1.0,
48 | use_lstm=False,
49 | bidirectional_rnn=True,
50 | num_samples=1024,
51 | use_attention=False,
52 | task=None,
53 | forward_only=False):
54 | self.source_vocab_size = source_vocab_size
55 | self.tag_vocab_size = tag_vocab_size
56 | self.label_vocab_size = label_vocab_size
57 | self.word_embedding_size = word_embedding_size
58 | self.cell_size = size
59 | self.num_layers = num_layers
60 | self.buckets = buckets
61 | self.batch_size = batch_size
62 | self.bidirectional_rnn = bidirectional_rnn
63 | self.global_step = tf.Variable(0, trainable=False)
64 |
65 | # If we use sampled softmax, we need an output projection.
66 | softmax_loss_function = None
67 |
68 | # Create the internal multi-layer cell for our RNN.
69 |     def create_cell():
70 |       # Build the per-direction RNN cell: a stack of num_layers LSTM cells.
71 |       cell = MultiRNNCell(
72 |           [BasicLSTMCell(self.cell_size) for _ in range(self.num_layers)])
73 |       # During training (forward_only=False) with dropout_keep_prob < 1.0,
74 |       # apply dropout to the stack's inputs and outputs.
75 |       if not forward_only and dropout_keep_prob < 1.0:
76 |         cell = DropoutWrapper(cell,
77 |                               input_keep_prob=dropout_keep_prob,
78 |                               output_keep_prob=dropout_keep_prob)
79 |       return cell
80 |
81 | self.cell_fw = create_cell()
82 | self.cell_bw = create_cell()
83 |
84 | # Feeds for inputs.
85 | self.encoder_inputs = []
86 | self.tags = []
87 | self.tag_weights = []
88 | self.labels = []
89 | self.sequence_length = tf.placeholder(tf.int32, [None],
90 | name="sequence_length")
91 |
92 | for i in xrange(buckets[-1][0]):
93 | self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
94 | name="encoder{0}".format(i)))
95 | for i in xrange(buckets[-1][1]):
96 | self.tags.append(tf.placeholder(tf.float32, shape=[None],
97 | name="tag{0}".format(i)))
98 | self.tag_weights.append(tf.placeholder(tf.float32, shape=[None],
99 | name="weight{0}".format(i)))
100 | self.labels.append(tf.placeholder(tf.float32, shape=[None], name="label"))
101 |
102 | base_rnn_output = self.generate_rnn_output()
103 | encoder_outputs, encoder_state, attention_states = base_rnn_output
104 |
105 | if task['tagging'] == 1:
106 | seq_labeling_outputs = seq_labeling.generate_sequence_output(
107 | self.source_vocab_size,
108 | encoder_outputs,
109 | encoder_state,
110 | self.tags,
111 | self.sequence_length,
112 | self.tag_vocab_size,
113 | self.tag_weights,
114 | buckets,
115 | softmax_loss_function=softmax_loss_function,
116 | use_attention=use_attention)
117 | self.tagging_output, self.tagging_loss = seq_labeling_outputs
118 | if task['intent'] == 1:
119 | seq_intent_outputs = seq_classification.generate_single_output(
120 | encoder_state,
121 | attention_states,
122 | self.sequence_length,
123 | self.labels,
124 | self.label_vocab_size,
125 | buckets,
126 | softmax_loss_function=softmax_loss_function,
127 | use_attention=use_attention)
128 | self.classification_output, self.classification_loss = seq_intent_outputs
129 |
130 | if task['tagging'] == 1:
131 | self.loss = self.tagging_loss
132 | elif task['intent'] == 1:
133 | self.loss = self.classification_loss
134 |
135 | # Gradients and SGD update operation for training the model.
136 | params = tf.trainable_variables()
137 | if not forward_only:
138 | opt = tf.train.AdamOptimizer()
139 | if task['joint'] == 1:
140 | # backpropagate the intent and tagging loss, one may further adjust
141 | # the weights for the two costs.
142 | gradients = tf.gradients([self.tagging_loss, self.classification_loss],
143 | params)
144 | elif task['tagging'] == 1:
145 | gradients = tf.gradients(self.tagging_loss, params)
146 | elif task['intent'] == 1:
147 | gradients = tf.gradients(self.classification_loss, params)
148 |
149 | clipped_gradients, norm = tf.clip_by_global_norm(gradients,
150 | max_gradient_norm)
151 | self.gradient_norm = norm
152 | self.update = opt.apply_gradients(
153 | zip(clipped_gradients, params), global_step=self.global_step)
154 |
155 | self.saver = tf.train.Saver(tf.global_variables())
156 |
157 | def generate_rnn_output(self):
158 | """
159 | Generate RNN state outputs with word embeddings as inputs
160 | """
161 | with tf.variable_scope("generate_seq_output"):
162 | if self.bidirectional_rnn:
163 | embedding = tf.get_variable("embedding",
164 | [self.source_vocab_size,
165 | self.word_embedding_size])
166 |         # Embed each timestep's token ids: a list of [batch, embedding_size] tensors.
167 |         encoder_emb_inputs = [tf.nn.embedding_lookup(embedding, encoder_input)
168 |                               for encoder_input in self.encoder_inputs]
169 | rnn_outputs = static_bidirectional_rnn(self.cell_fw,
170 | self.cell_bw,
171 | encoder_emb_inputs,
172 | sequence_length=self.sequence_length,
173 | dtype=tf.float32)
174 | encoder_outputs, encoder_state_fw, encoder_state_bw = rnn_outputs
175 | # with state_is_tuple = True, if num_layers > 1,
176 | # here we simply use the state from last layer as the encoder state
177 | state_fw = encoder_state_fw[-1]
178 | state_bw = encoder_state_bw[-1]
179 | encoder_state = tf.concat([tf.concat(state_fw, 1),
180 | tf.concat(state_bw, 1)], 1)
181 | top_states = [tf.reshape(e, [-1, 1, self.cell_fw.output_size \
182 | + self.cell_bw.output_size])
183 | for e in encoder_outputs]
184 | attention_states = tf.concat(top_states, 1)
185 | else:
186 | embedding = tf.get_variable("embedding",
187 | [self.source_vocab_size,
188 | self.word_embedding_size])
189 |         # Embed each timestep's token ids: a list of [batch, embedding_size] tensors.
190 |         encoder_emb_inputs = [tf.nn.embedding_lookup(embedding, encoder_input)
191 |                               for encoder_input in self.encoder_inputs]
192 | rnn_outputs = static_rnn(self.cell_fw,
193 | encoder_emb_inputs,
194 | sequence_length=self.sequence_length,
195 | dtype=tf.float32)
196 | encoder_outputs, encoder_state = rnn_outputs
197 | # with state_is_tuple = True, if num_layers > 1,
198 | # here we use the state from last layer as the encoder state
199 | state = encoder_state[-1]
200 | encoder_state = tf.concat(state, 1)
201 | top_states = [tf.reshape(e, [-1, 1, self.cell_fw.output_size])
202 | for e in encoder_outputs]
203 | attention_states = tf.concat(top_states, 1)
204 | return encoder_outputs, encoder_state, attention_states
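    # Shape sketch for the bidirectional case: each direction's top-layer
    # LSTMStateTuple (c, h) concatenates to [batch, 2 * cell_size], so
    # encoder_state is [batch, 4 * cell_size], and attention_states stacks the
    # per-step outputs into [batch, time, 2 * cell_size].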
205 |
206 | def joint_step(self, session, encoder_inputs, tags, tag_weights,
207 | labels, batch_sequence_length,
208 | bucket_id, forward_only):
209 | """Run a step of the joint model feeding the given inputs.
210 |
211 | Args:
212 | session: tensorflow session to use.
213 | encoder_inputs: list of numpy int vectors to feed as encoder inputs.
214 | tags: list of numpy int vectors to feed as decoder inputs.
215 | tag_weights: list of numpy float vectors to feed as tag weights.
216 | labels: list of numpy int vectors to feed as sequence class labels.
217 |       batch_sequence_length: numpy int vector of the true (un-padded)
218 |         length of each sequence in the batch.
219 |       bucket_id: which bucket of the model to use.
220 | forward_only: whether to do the backward step or only forward.
221 |
222 | Returns:
223 |       A 4-tuple of the gradient norm (or None if no backward pass was run),
224 |       the batch loss, the output tags, and the output class label.
225 |
226 | Raises:
227 | ValueError: if length of encoder_inputs, decoder_inputs, or
228 | target_weights disagrees with bucket size for the specified bucket_id.
229 | """
230 | # Check if the sizes match.
231 | encoder_size, tag_size = self.buckets[bucket_id]
232 | if len(encoder_inputs) != encoder_size:
233 | raise ValueError("Encoder length must be equal to the one in bucket,"
234 | " %d != %d." % (len(encoder_inputs), encoder_size))
235 | if len(tags) != tag_size:
236 | raise ValueError("Decoder length must be equal to the one in bucket,"
237 | " %d != %d." % (len(tags), tag_size))
238 |     if len(labels) != 1:
239 |       raise ValueError("Exactly one class label is expected per sample,"
240 |                        " got %d." % len(labels))
241 |
242 | input_feed = {}
243 | input_feed[self.sequence_length.name] = batch_sequence_length
244 | for l in xrange(encoder_size):
245 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
246 | input_feed[self.tags[l].name] = tags[l]
247 | input_feed[self.tag_weights[l].name] = tag_weights[l]
248 | input_feed[self.labels[0].name] = labels[0]
249 |
250 | # Output feed: depends on whether we do a backward step or not.
251 | if not forward_only:
252 | output_feed = [self.update, # Update Op that does SGD.
253 | self.gradient_norm, # Gradient norm.
254 | self.loss] # Loss for this batch.
255 | for i in range(tag_size):
256 | output_feed.append(self.tagging_output[i])
257 | output_feed.append(self.classification_output[0])
258 | else:
259 | output_feed = [self.loss]
260 | for i in range(tag_size):
261 | output_feed.append(self.tagging_output[i])
262 | output_feed.append(self.classification_output[0])
263 |
264 | outputs = session.run(output_feed, input_feed)
265 | if not forward_only:
266 | return outputs[1], outputs[2], outputs[3:3+tag_size], outputs[-1]
267 | else:
268 | return None, outputs[0], outputs[1:1+tag_size], outputs[-1]
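    # The slices above recover the per-timestep tagging logits appended to
    # output_feed (tag_size entries), with the final element being the
    # classification logits; hence [3:3+tag_size] when the update op and
    # gradient norm are present, and [1:1+tag_size] in forward-only mode.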
269 |
270 |
271 | def tagging_step(self, session, encoder_inputs, tags, tag_weights,
272 | batch_sequence_length, bucket_id, forward_only):
273 | """Run a step of the tagging model feeding the given inputs.
274 |
275 | Args:
276 | session: tensorflow session to use.
277 | encoder_inputs: list of numpy int vectors to feed as encoder inputs.
278 | tags: list of numpy int vectors to feed as decoder inputs.
279 | tag_weights: list of numpy float vectors to feed as target weights.
280 |       batch_sequence_length: numpy int vector of true (un-padded) sequence lengths.
281 | bucket_id: which bucket of the model to use.
282 | forward_only: whether to do the backward step or only forward.
283 |
284 | Returns:
285 |       A triple of the gradient norm (or None if no backward pass was run),
286 |       the batch loss, and the output tags.
287 |
288 | Raises:
289 | ValueError: if length of encoder_inputs, decoder_inputs, or
290 | target_weights disagrees with bucket size for the specified bucket_id.
291 | """
292 | # Check if the sizes match.
293 | encoder_size, tag_size = self.buckets[bucket_id]
294 | if len(encoder_inputs) != encoder_size:
295 | raise ValueError("Encoder length must be equal to the one in bucket,"
296 | " %d != %d." % (len(encoder_inputs), encoder_size))
297 | if len(tags) != tag_size:
298 | raise ValueError("Decoder length must be equal to the one in bucket,"
299 | " %d != %d." % (len(tags), tag_size))
300 |
301 |     # Input feed: encoder inputs, tags, and tag weights, as provided.
302 | input_feed = {}
303 | input_feed[self.sequence_length.name] = batch_sequence_length
304 | for l in xrange(encoder_size):
305 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
306 | input_feed[self.tags[l].name] = tags[l]
307 | input_feed[self.tag_weights[l].name] = tag_weights[l]
308 |
309 | # Output feed: depends on whether we do a backward step or not.
310 | if not forward_only:
311 | output_feed = [self.update, # Update Op that does SGD.
312 | self.gradient_norm, # Gradient norm.
313 | self.loss] # Loss for this batch.
314 | for i in range(tag_size):
315 | output_feed.append(self.tagging_output[i])
316 | else:
317 | output_feed = [self.loss]
318 | for i in range(tag_size):
319 | output_feed.append(self.tagging_output[i])
320 |
321 | outputs = session.run(output_feed, input_feed)
322 | if not forward_only:
323 | return outputs[1], outputs[2], outputs[3:3+tag_size]
324 | else:
325 | return None, outputs[0], outputs[1:1+tag_size]
326 |
327 | def classification_step(self, session, encoder_inputs, labels,
328 | batch_sequence_length, bucket_id, forward_only):
329 | """Run a step of the intent classification model feeding the given inputs.
330 |
331 | Args:
332 | session: tensorflow session to use.
333 | encoder_inputs: list of numpy int vectors to feed as encoder inputs.
334 | labels: list of numpy int vectors to feed as sequence class labels.
335 |       batch_sequence_length: numpy int vector of true (un-padded) sequence lengths.
336 | bucket_id: which bucket of the model to use.
337 | forward_only: whether to do the backward step or only forward.
338 |
339 | Returns:
340 |       A triple of the gradient norm (or None if no backward pass was run),
341 |       the batch loss, and the output class label.
342 |
343 | Raises:
344 | ValueError: if length of encoder_inputs, decoder_inputs, or
345 | target_weights disagrees with bucket size for the specified bucket_id.
346 | """
347 | # Check if the sizes match.
348 | encoder_size, target_size = self.buckets[bucket_id]
349 | if len(encoder_inputs) != encoder_size:
350 | raise ValueError("Encoder length must be equal to the one in bucket,"
351 | " %d != %d." % (len(encoder_inputs), encoder_size))
352 |
353 |     # Input feed: encoder inputs and the class label, as provided.
354 | input_feed = {}
355 | input_feed[self.sequence_length.name] = batch_sequence_length
356 | for l in xrange(encoder_size):
357 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
358 | input_feed[self.labels[0].name] = labels[0]
359 |
360 | # Output feed: depends on whether we do a backward step or not.
361 | if not forward_only:
362 | output_feed = [self.update, # Update Op that does SGD.
363 | self.gradient_norm, # Gradient norm.
364 | self.loss, # Loss for this batch.
365 | self.classification_output[0]]
366 | else:
367 | output_feed = [self.loss,
368 | self.classification_output[0],]
369 |
370 | outputs = session.run(output_feed, input_feed)
371 | if not forward_only:
372 | return outputs[1], outputs[2], outputs[3] # Gradient norm, loss, outputs.
373 | else:
374 | return None, outputs[0], outputs[1] # No gradient norm, loss, outputs.
375 |
376 |
377 | def get_batch(self, data, bucket_id):
378 | """Get a random batch of data from the specified bucket, prepare for step.
379 |
380 | To feed data in step(..) it must be a list of batch-major vectors, while
381 | data here contains single length-major cases. So the main logic of this
382 | function is to re-index data cases to be in the proper format for feeding.
383 |
384 | Args:
385 |       data: a list of size len(self.buckets) in which each element holds
386 |         lists of (input, tags, label) triples used to create a batch.
387 | bucket_id: integer, which bucket to get the batch for.
388 |
389 | Returns:
390 |       The 5-tuple (encoder_inputs, tags, tag_weights, sequence_length,
391 |       labels) for the constructed batch, formatted for the *_step calls.
392 | """
393 | encoder_size, decoder_size = self.buckets[bucket_id]
394 | encoder_inputs, decoder_inputs, labels = [], [], []
395 |
396 |     # Get a random batch of encoder inputs and tag sequences from data and
397 |     # pad them to the bucket size (inputs are not reversed, no GO symbol).
398 |     batch_sequence_length_list = list()
399 | for _ in xrange(self.batch_size):
400 | encoder_input, decoder_input, label = random.choice(data[bucket_id])
401 | batch_sequence_length_list.append(len(encoder_input))
402 |
403 |       # Encoder inputs are padded to the bucket size; unlike the seq2seq
404 |       # example, they are not reversed.
405 |       encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input))
406 |       encoder_inputs.append(list(encoder_input + encoder_pad))
407 |
408 |       # Tag sequences are padded to the bucket size (no GO symbol is added).
409 | decoder_pad_size = decoder_size - len(decoder_input)
410 | decoder_inputs.append(decoder_input +
411 | [data_utils.PAD_ID] * decoder_pad_size)
412 | labels.append(label)
413 |
414 | # Now we create batch-major vectors from the data selected above.
415 | batch_encoder_inputs = []
416 | batch_decoder_inputs = []
417 | batch_weights = []
418 | batch_labels = []
419 |
420 | # Batch encoder inputs are just re-indexed encoder_inputs.
421 | for length_idx in xrange(encoder_size):
422 | batch_encoder_inputs.append(
423 | np.array([encoder_inputs[batch_idx][length_idx]
424 | for batch_idx in xrange(self.batch_size)], dtype=np.int32))
425 |
426 | # Batch decoder inputs are re-indexed decoder_inputs, we create weights.
427 | for length_idx in xrange(decoder_size):
428 | batch_decoder_inputs.append(
429 | np.array([decoder_inputs[batch_idx][length_idx]
430 | for batch_idx in xrange(self.batch_size)], dtype=np.int32))
431 | # Create target_weights to be 0 for targets that are padding.
432 | batch_weight = np.ones(self.batch_size, dtype=np.float32)
433 | for batch_idx in xrange(self.batch_size):
434 |         # Set the weight to 0 where the tag at this position is a PAD
435 |         # symbol. The tags are aligned one-to-one with the encoder
436 |         # inputs, so, unlike the seq2seq translation example, no
437 |         # one-step shift is applied when looking up the target that
438 |         # corresponds to this weight.
439 |         if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID:
440 | batch_weight[batch_idx] = 0.0
441 | batch_weights.append(batch_weight)
442 |
443 | batch_labels.append(
444 | np.array([labels[batch_idx][0]
445 | for batch_idx in xrange(self.batch_size)], dtype=np.int32))
446 |
447 | batch_sequence_length = np.array(batch_sequence_length_list, dtype=np.int32)
448 | return (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
449 | batch_sequence_length, batch_labels)
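    # Note the time-major layout: batch_encoder_inputs[t] is an int32 vector of
    # length batch_size holding the token ids at position t across the batch,
    # matching the per-timestep placeholders fed by the *_step methods above.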
450 |
451 |
452 | def get_one(self, data, bucket_id, sample_id):
453 | """Get a single sample data from the specified bucket, prepare for step.
454 |
455 | To feed data in step(..) it must be a list of batch-major vectors, while
456 | data here contains single length-major cases. So the main logic of this
457 | function is to re-index data cases to be in the proper format for feeding.
458 |
459 | Args:
460 |       data: a list of size len(self.buckets) in which each element holds
461 |         lists of (input, tags, label) triples used to create a batch.
462 |       bucket_id: integer, which bucket to get the sample from.
463 |       sample_id: integer, index of the sample to fetch within the bucket.
464 | Returns:
465 |       The 5-tuple (encoder_inputs, tags, tag_weights, sequence_length,
466 |       labels) for the single-sample "batch", formatted for the *_step calls.
467 | """
468 | encoder_size, decoder_size = self.buckets[bucket_id]
469 | encoder_inputs, decoder_inputs, labels = [], [], []
470 |
471 |     # Get the single sample at sample_id from the bucket and pad it to the
472 |     # bucket size (the input is not reversed, no GO symbol is added).
473 |     batch_sequence_length_list = list()
474 |     # A "batch" of size one: fetch just the requested sample.
475 | encoder_input, decoder_input, label = data[bucket_id][sample_id]
476 | batch_sequence_length_list.append(len(encoder_input))
477 |
478 |     # Encoder inputs are padded to the bucket size; unlike the seq2seq
479 |     # example, they are not reversed.
480 |     encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input))
481 |     encoder_inputs.append(list(encoder_input + encoder_pad))
482 |
483 |     # Tag sequences are padded to the bucket size (no GO symbol is added).
484 | decoder_pad_size = decoder_size - len(decoder_input)
485 | decoder_inputs.append(decoder_input +
486 | [data_utils.PAD_ID] * decoder_pad_size)
487 | labels.append(label)
488 |
489 | # Now we create batch-major vectors from the data selected above.
490 | batch_encoder_inputs = []
491 | batch_decoder_inputs = []
492 | batch_weights = []
493 | batch_labels = []
494 |
495 | # Batch encoder inputs are just re-indexed encoder_inputs.
496 | for length_idx in xrange(encoder_size):
497 | batch_encoder_inputs.append(
498 | np.array([encoder_inputs[batch_idx][length_idx]
499 | for batch_idx in xrange(1)], dtype=np.int32))
500 |
501 | # Batch decoder inputs are re-indexed decoder_inputs, we create weights.
502 | for length_idx in xrange(decoder_size):
503 | batch_decoder_inputs.append(
504 | np.array([decoder_inputs[batch_idx][length_idx]
505 | for batch_idx in xrange(1)], dtype=np.int32))
506 |
507 | # Create target_weights to be 0 for targets that are padding.
508 | batch_weight = np.ones(1, dtype=np.float32)
509 | for batch_idx in xrange(1):
510 |       # Set the weight to 0 where the tag at this position is a PAD
511 |       # symbol. The tags are aligned one-to-one with the encoder
512 |       # inputs, so, unlike the seq2seq translation example, no
513 |       # one-step shift is applied when looking up the target that
514 |       # corresponds to this weight.
515 |       if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID:
516 | batch_weight[batch_idx] = 0.0
517 | batch_weights.append(batch_weight)
518 |
519 | batch_labels.append(
520 | np.array([labels[batch_idx][0]
521 | for batch_idx in xrange(1)], dtype=np.int32))
522 |
523 | batch_sequence_length = np.array(batch_sequence_length_list, dtype=np.int32)
524 | return (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
525 | batch_sequence_length, batch_labels)
--------------------------------------------------------------------------------