├── README.md
├── conlleval.pl
├── data
│   └── ATIS_samples
│       ├── test
│       │   ├── test.label
│       │   ├── test.seq.in
│       │   └── test.seq.out
│       ├── train
│       │   ├── train.label
│       │   ├── train.seq.in
│       │   └── train.seq.out
│       └── valid
│           ├── valid.label
│           ├── valid.seq.in
│           └── valid.seq.out
├── data_utils.py
├── generate_encoder_output.py
├── multi_task_model.py
├── run_rnn_joint.py
└── seq_labeling.py
/README.md:
--------------------------------------------------------------------------------
1 | Joint Online Spoken Language Understanding and Language Modeling with Recurrent Neural Networks
2 | ==================
3 |
4 | Tensorflow implementation of joint modeling of SLU (intent & slot filling) and LM with RNNs.
5 |
6 | This package implements joint models 14 and 15 in Table 1 of the reference paper [1], i.e. the joint models with *recurrent* (and *local*) intent + slot label context. The other joint models in the paper can be obtained with simple modifications to generate_encoder_output.py and seq_labeling.py.
7 |
8 | **Setup**
9 |
10 | * Tensorflow, version >= r0.9 (https://www.tensorflow.org/versions/r0.9/get_started/index.html)
11 |
12 | **Usage**:
13 | ```bash
14 | data_dir=data/ATIS_samples
15 | model_dir=model_tmp
16 | max_sequence_length=50 # max length for train/valid/test sequence
17 | use_local_context=False # boolean, whether to use local context
18 | DNN_at_output=True # boolean, set to True to use one hidden layer DNN at task output
19 |
20 | python run_rnn_joint.py --data_dir $data_dir \
21 |                            --train_dir $model_dir \
22 | --max_sequence_length $max_sequence_length \
23 | --use_local_context $use_local_context \
24 | --DNN_at_output $DNN_at_output
25 | ```
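
Slot filling predictions can be scored with the included conlleval.pl. A minimal sketch, assuming a predictions file named slot_pred.txt (a hypothetical name) that contains one "word true-tag predicted-tag" triple per space-separated line, with sentences separated by empty lines, as described in the header of conlleval.pl:

```bash
# slot_pred.txt is hypothetical; conlleval.pl expects the final two columns
# of each line to be the correct tag and the guessed tag, in that order
perl conlleval.pl < slot_pred.txt
```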
26 |
27 | **Reference**
28 |
29 | [1] Bing Liu, Ian Lane, "Joint Online Spoken Language Understanding and Language Modeling with Recurrent Neural Networks", SIGdial, 2016.
30 |
31 | ```
32 | @InProceedings{liu-lane:2016:SIGDIAL,
33 | author = {Liu, Bing and Lane, Ian},
34 | title = {Joint Online Spoken Language Understanding and Language Modeling With Recurrent Neural Networks},
35 | booktitle = {Proceedings of the 17th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
36 | month = {September},
37 | year = {2016},
38 | address = {Los Angeles},
39 | publisher = {Association for Computational Linguistics},
40 | pages = {22--30},
41 | url = {http://www.aclweb.org/anthology/W16-3603}
42 | }
43 | ```
44 |
45 | **Contact**
46 |
47 | Feel free to email liubing@cmu.edu with any questions or bug reports regarding the code.
48 |
--------------------------------------------------------------------------------
/conlleval.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | # conlleval: evaluate result of processing CoNLL-2000 shared task
3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file
4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html
5 | # options: l: generate LaTeX output for tables like in
6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex
7 | # r: accept raw result tags (without B- and I- prefix;
8 | # assumes one word per chunk)
9 | # d: alternative delimiter tag (default is single space)
10 | # o: alternative outside tag (default is O)
11 | # note: the file should contain lines with items separated
12 | # by $delimiter characters (default space). The final
13 | # two items should contain the correct tag and the
14 | # guessed tag in that order. Sentences should be
15 | # separated from each other by empty lines or lines
16 | # with $boundary fields (default -X-).
17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/
18 | # started: 1998-09-25
19 | # version: 2004-01-26
20 | # author: Erik Tjong Kim Sang
21 |
22 | use strict;
23 |
24 | my $false = 0;
25 | my $true = 42;
26 |
27 | my $boundary = "-X-"; # sentence boundary
28 | my $correct; # current corpus chunk tag (I,O,B)
29 | my $correctChunk = 0; # number of correctly identified chunks
30 | my $correctTags = 0; # number of correct chunk tags
31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.)
32 | my $delimiter = " "; # field delimiter
33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979)
34 | my $firstItem; # first feature (for sentence boundary checks)
35 | my $foundCorrect = 0; # number of chunks in corpus
36 | my $foundGuessed = 0; # number of identified chunks
37 | my $guessed; # current guessed chunk tag
38 | my $guessedType; # type of current guessed chunk tag
39 | my $i; # miscellaneous counter
40 | my $inCorrect = $false; # currently processed chunk is correct until now
41 | my $lastCorrect = "O"; # previous chunk tag in corpus
42 | my $latex = 0; # generate LaTeX formatted output
43 | my $lastCorrectType = ""; # type of previously identified chunk tag
44 | my $lastGuessed = "O"; # previously identified chunk tag
45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus
46 | my $lastType; # temporary storage for detecting duplicates
47 | my $line; # line
48 | my $nbrOfFeatures = -1; # number of features per line
49 | my $precision = 0.0; # precision score
50 | my $oTag = "O"; # outside tag, default O
51 | my $raw = 0; # raw input: add B to every token
52 | my $recall = 0.0; # recall score
53 | my $tokenCounter = 0; # token counter (ignores sentence breaks)
54 |
55 | my %correctChunk = (); # number of correctly identified chunks per type
56 | my %foundCorrect = (); # number of chunks in corpus per type
57 | my %foundGuessed = (); # number of identified chunks per type
58 |
59 | my @features; # features on line
60 | my @sortedTypes; # sorted list of chunk type names
61 |
62 | # sanity check
63 | while (@ARGV and $ARGV[0] =~ /^-/) {
64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); }
65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); }
66 | elsif ($ARGV[0] eq "-d") {
67 | shift(@ARGV);
68 | if (not defined $ARGV[0]) {
69 | die "conlleval: -d requires delimiter character";
70 | }
71 | $delimiter = shift(@ARGV);
72 | } elsif ($ARGV[0] eq "-o") {
73 | shift(@ARGV);
74 | if (not defined $ARGV[0]) {
75 | die "conlleval: -o requires delimiter character";
76 | }
77 | $oTag = shift(@ARGV);
78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; }
79 | }
80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; }
81 | # process input
82 | while (<STDIN>) {
83 | chomp($line = $_);
84 | @features = split(/$delimiter/,$line);
85 |
86 | #printf $line;
87 | #printf STDERR $#features;
88 | #printf "\n";
89 |
90 | #printf $nbrOfFeatures;
91 | #printf "\n";
92 | #printf $#features;
93 | #printf "\n";
94 |
95 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; }
96 | elsif ($nbrOfFeatures != $#features and @features != 0) {
97 | printf STDERR "unexpected number of features: %d (%d)\n",
98 | $#features+1,$nbrOfFeatures+1;
99 | exit(1);
100 | }
101 | if (@features == 0 or
102 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); }
103 | if (@features < 2) {
104 | die "conlleval: unexpected number of features in line $line\n";
105 | }
106 | if ($raw) {
107 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; }
108 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; }
109 | if ($features[$#features] ne "O") {
110 | $features[$#features] = "B-$features[$#features]";
111 | }
112 | if ($features[$#features-1] ne "O") {
113 | $features[$#features-1] = "B-$features[$#features-1]";
114 | }
115 | }
116 | # 20040126 ET code which allows hyphens in the types
117 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
118 | $guessed = $1;
119 | $guessedType = $2;
120 | } else {
121 | $guessed = $features[$#features];
122 | $guessedType = "";
123 | }
124 | pop(@features);
125 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
126 | $correct = $1;
127 | $correctType = $2;
128 | } else {
129 | $correct = $features[$#features];
130 | $correctType = "";
131 | }
132 | pop(@features);
133 | # ($guessed,$guessedType) = split(/-/,pop(@features));
134 | # ($correct,$correctType) = split(/-/,pop(@features));
135 | $guessedType = $guessedType ? $guessedType : "";
136 | $correctType = $correctType ? $correctType : "";
137 | $firstItem = shift(@features);
138 |
139 | # 1999-06-26 sentence breaks should always be counted as out of chunk
140 | if ( $firstItem eq $boundary ) { $guessed = "O"; }
141 |
142 | if ($inCorrect) {
143 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and
144 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and
145 | $lastGuessedType eq $lastCorrectType) {
146 | $inCorrect=$false;
147 | $correctChunk++;
148 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ?
149 | $correctChunk{$lastCorrectType}+1 : 1;
150 | } elsif (
151 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) !=
152 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or
153 | $guessedType ne $correctType ) {
154 | $inCorrect=$false;
155 | }
156 | }
157 |
158 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and
159 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and
160 | $guessedType eq $correctType) { $inCorrect = $true; }
161 |
162 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) {
163 | $foundCorrect++;
164 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ?
165 | $foundCorrect{$correctType}+1 : 1;
166 | }
167 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) {
168 | $foundGuessed++;
169 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ?
170 | $foundGuessed{$guessedType}+1 : 1;
171 | }
172 | if ( $firstItem ne $boundary ) {
173 | if ( $correct eq $guessed and $guessedType eq $correctType ) {
174 | $correctTags++;
175 | }
176 | $tokenCounter++;
177 | }
178 |
179 | $lastGuessed = $guessed;
180 | $lastCorrect = $correct;
181 | $lastGuessedType = $guessedType;
182 | $lastCorrectType = $correctType;
183 | }
184 | if ($inCorrect) {
185 | $correctChunk++;
186 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ?
187 | $correctChunk{$lastCorrectType}+1 : 1;
188 | }
189 |
190 | if (not $latex) {
191 | # compute overall precision, recall and FB1 (default values are 0.0)
192 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0);
193 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0);
194 | $FB1 = 2*$precision*$recall/($precision+$recall)
195 | if ($precision+$recall > 0);
196 |
197 | # print overall performance
198 | printf "processed $tokenCounter tokens with $foundCorrect phrases; ";
199 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n";
200 | if ($tokenCounter>0) {
201 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter;
202 | print "$correctChunk $foundCorrect $foundGuessed ";
203 | printf "precision: %6.2f%%; ",$precision;
204 | printf "recall: %6.2f%%; ",$recall;
205 | printf "FB1: %6.2f\n",$FB1;
206 | }
207 | }
208 |
209 | # sort chunk type names
210 | undef($lastType);
211 | @sortedTypes = ();
212 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) {
213 | if (not($lastType) or $lastType ne $i) {
214 | push(@sortedTypes,($i));
215 | }
216 | $lastType = $i;
217 | }
218 | # print performance per chunk type
219 | if (not $latex) {
220 | for $i (@sortedTypes) {
221 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
222 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; }
223 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; }
224 | if (not($foundCorrect{$i})) { $recall = 0.0; }
225 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; }
226 | if ($precision+$recall == 0.0) { $FB1 = 0.0; }
227 | else { $FB1 = 2*$precision*$recall/($precision+$recall); }
228 | printf "%17s: ",$i;
229 | printf "% 4d % 4d % 4d ", $correctChunk{$i}, $foundCorrect{$i}, $foundGuessed{$i};
230 | printf "precision: %6.2f%%; ",$precision;
231 | printf "recall: %6.2f%%; ",$recall;
232 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i};
233 | }
234 | } else {
235 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline";
236 | for $i (@sortedTypes) {
237 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
238 | if (not($foundGuessed{$i})) { $precision = 0.0; }
239 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; }
240 | if (not($foundCorrect{$i})) { $recall = 0.0; }
241 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; }
242 | if ($precision+$recall == 0.0) { $FB1 = 0.0; }
243 | else { $FB1 = 2*$precision*$recall/($precision+$recall); }
244 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\",
245 | $i,$precision,$recall,$FB1;
246 | }
247 | print "\\hline\n";
248 | $precision = 0.0;
249 | $recall = 0;
250 | $FB1 = 0.0;
251 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0);
252 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0);
253 | $FB1 = 2*$precision*$recall/($precision+$recall)
254 | if ($precision+$recall > 0);
255 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n",
256 | $precision,$recall,$FB1;
257 | }
258 |
259 | exit 0;
260 |
261 | # endOfChunk: checks if a chunk ended between the previous and current word
262 | # arguments: previous and current chunk tags, previous and current types
263 | # note: this code is capable of handling other chunk representations
264 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
265 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
266 |
267 | sub endOfChunk {
268 | my $prevTag = shift(@_);
269 | my $tag = shift(@_);
270 | my $prevType = shift(@_);
271 | my $type = shift(@_);
272 | my $chunkEnd = $false;
273 |
274 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; }
275 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; }
276 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; }
277 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; }
278 |
279 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; }
280 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; }
281 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; }
282 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; }
283 |
284 | if ($prevTag ne "O" and $prevTag ne "." and $prevType ne $type) {
285 | $chunkEnd = $true;
286 | }
287 |
288 | # corrected 1998-12-22: these chunks are assumed to have length 1
289 | if ( $prevTag eq "]" ) { $chunkEnd = $true; }
290 | if ( $prevTag eq "[" ) { $chunkEnd = $true; }
291 |
292 | return($chunkEnd);
293 | }
294 |
295 | # startOfChunk: checks if a chunk started between the previous and current word
296 | # arguments: previous and current chunk tags, previous and current types
297 | # note: this code is capable of handling other chunk representations
298 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
299 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
300 |
301 | sub startOfChunk {
302 | my $prevTag = shift(@_);
303 | my $tag = shift(@_);
304 | my $prevType = shift(@_);
305 | my $type = shift(@_);
306 | my $chunkStart = $false;
307 |
308 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; }
309 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; }
310 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; }
311 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; }
312 |
313 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; }
314 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; }
315 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; }
316 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; }
317 |
318 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) {
319 | $chunkStart = $true;
320 | }
321 |
322 | # corrected 1998-12-22: these chunks are assumed to have length 1
323 | if ( $tag eq "[" ) { $chunkStart = $true; }
324 | if ( $tag eq "]" ) { $chunkStart = $true; }
325 |
326 | return($chunkStart);
327 | }
328 |
--------------------------------------------------------------------------------
/data/ATIS_samples/test/test.label:
--------------------------------------------------------------------------------
1 | flight
2 | airfare
3 | flight
4 | flight
5 | flight
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/test/test.seq.in:
--------------------------------------------------------------------------------
1 | i would like to find a flight from charlotte to las vegas that makes a stop in st. louis
2 | on april first i need a ticket from tacoma to san jose departing before DIGIT am
3 | on april first i need a flight going from phoenix to san diego
4 | i would like a flight traveling one way from phoenix to san diego on april first
5 | i would like a flight from orlando to salt lake city for april first on delta airlines
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/test/test.seq.out:
--------------------------------------------------------------------------------
1 | O O O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O O O O O B-stoploc.city_name I-stoploc.city_name
2 | O B-depart_date.month_name B-depart_date.day_number O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O B-depart_time.time_relative B-depart_time.time I-depart_time.time
3 | O B-depart_date.month_name B-depart_date.day_number O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name
4 | O O O O O O B-round_trip I-round_trip O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O B-depart_date.month_name B-depart_date.day_number
5 | O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name I-toloc.city_name O B-depart_date.month_name B-depart_date.day_number O B-airline_name I-airline_name
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/train/train.label:
--------------------------------------------------------------------------------
1 | airfare
2 | flight
3 | flight
4 | airfare
5 | flight
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/train/train.seq.in:
--------------------------------------------------------------------------------
1 | what's the lowest round trip fare from dallas to atlanta
2 | find me the earliest flight from boston to atlanta on any day of the week
3 | display all flights from boston to baltimore on july thirty first
4 | economy fares new york to miami round trip
5 | i need to fly from boston to denver on to san francisco and back
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/train/train.seq.out:
--------------------------------------------------------------------------------
1 | O O B-cost_relative B-round_trip I-round_trip O O B-fromloc.city_name O B-toloc.city_name
2 | O O O B-flight_mod O O B-fromloc.city_name O B-toloc.city_name O O O O O O
3 | O O O O B-fromloc.city_name O B-toloc.city_name O B-depart_date.month_name B-depart_date.day_number I-depart_date.day_number
4 | B-economy O B-fromloc.city_name I-fromloc.city_name O B-toloc.city_name B-round_trip I-round_trip
5 | O O O O O B-fromloc.city_name O B-toloc.city_name O O B-toloc.city_name I-toloc.city_name O O
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/valid/valid.label:
--------------------------------------------------------------------------------
1 | flight
2 | flight
3 | flight_time
4 | airfare
5 | airfare
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/valid/valid.seq.in:
--------------------------------------------------------------------------------
1 | i want to fly from boston at DIGITDIGITDIGIT am and arrive in denver at DIGITDIGITDIGITDIGIT in the morning
2 | what flights are available from pittsburgh to baltimore on thursday morning
3 | what is the arrival time in san francisco for the DIGITDIGITDIGIT am flight leaving washington
4 | cheapest airfare from tacoma to orlando
5 | round trip fares from pittsburgh to philadelphia under DIGITDIGITDIGITDIGIT dollars
6 |
--------------------------------------------------------------------------------
/data/ATIS_samples/valid/valid.seq.out:
--------------------------------------------------------------------------------
1 | O O O O O B-fromloc.city_name O B-depart_time.time I-depart_time.time O O O B-toloc.city_name O B-arrive_time.time O O B-arrive_time.period_of_day
2 | O O O O O B-fromloc.city_name O B-toloc.city_name O B-depart_date.day_name B-depart_time.period_of_day
3 | O O O B-flight_time I-flight_time O B-fromloc.city_name I-fromloc.city_name O O B-depart_time.time I-depart_time.time O O B-fromloc.city_name
4 | B-cost_relative O O B-fromloc.city_name O B-toloc.city_name
5 | B-round_trip I-round_trip O O B-fromloc.city_name O B-toloc.city_name B-cost_relative B-fare_amount I-fare_amount
6 |
--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Utilities for downloading data from WMT, tokenizing, vocabularies."""
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import gzip
22 | import os
23 | import re
24 | import tarfile
25 |
26 | from six.moves import urllib
27 |
28 | from tensorflow.python.platform import gfile
29 |
30 | # Special vocabulary symbols - we always put them at the start.
31 | _PAD = "_PAD"
32 | _UNK = "_UNK"
33 | _BOS = "_BOS"
34 | _EOS = "_EOS"
35 | _START_VOCAB = [_PAD, _UNK]
36 |
37 | START_VOCAB_dict = dict()
38 | START_VOCAB_dict['with_padding'] = [_PAD, _UNK]
39 | START_VOCAB_dict['no_padding'] = [_UNK]
40 | START_VOCAB_dict['lm'] = [_PAD, _UNK, _BOS]
41 |
42 |
43 | PAD_ID = 0
44 | BOS_ID = 2
45 | #EOS_ID = 3
46 |
47 | UNK_ID_dict = dict()
48 | UNK_ID_dict['with_padding'] = 1
49 | UNK_ID_dict['no_padding'] = 0
50 | # UNK_ID = 1
51 |
52 | # Regular expressions used to tokenize.
53 | _WORD_SPLIT = re.compile("([.,!?\"':;)(])")
54 | _DIGIT_RE = re.compile(r"\d")
55 |
56 | # URLs for WMT data.
57 | _WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/training-giga-fren.tar"
58 | _WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/dev-v2.tgz"
59 |
60 | def basic_tokenizer(sentence):
61 | """Very basic tokenizer: split the sentence into a list of tokens."""
62 | words = []
63 | for space_separated_fragment in sentence.strip().split():
64 | words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
65 | return [w for w in words if w]
66 |
67 | def naive_tokenizer(sentence):
68 | """Naive tokenizer: split the sentence by space into a list of tokens."""
69 | return sentence.split()
70 |
71 |
72 | def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
73 | tokenizer=None, normalize_digits=True):
74 | """Create vocabulary file (if it does not exist yet) from data file.
75 |
76 | Data file is assumed to contain one sentence per line. Each sentence is
77 | tokenized and digits are normalized (if normalize_digits is set).
78 | Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
79 |   We write it to vocabulary_path in a one-token-per-line format, so that later
80 |   the token in the first line gets id=0, the token in the second line gets id=1, and so on.
81 |
82 | Args:
83 | vocabulary_path: path where the vocabulary will be created.
84 | data_path: data file that will be used to create vocabulary.
85 | max_vocabulary_size: limit on the size of the created vocabulary.
86 | tokenizer: a function to use to tokenize each data sentence;
87 | if None, basic_tokenizer will be used.
88 | normalize_digits: Boolean; if true, all digits are replaced by 0s.
89 | """
90 | if not gfile.Exists(vocabulary_path):
91 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
92 | vocab = {}
93 | with gfile.GFile(data_path, mode="r") as f:
94 | counter = 0
95 | for line in f:
96 | counter += 1
97 | if counter % 100000 == 0:
98 | print(" processing line %d" % counter)
99 | tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
100 | for w in tokens:
101 | word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w
102 | if word in vocab:
103 | vocab[word] += 1
104 | else:
105 | vocab[word] = 1
106 | vocab_list = START_VOCAB_dict['lm'] + sorted(vocab, key=vocab.get, reverse=True)
107 | if len(vocab_list) > max_vocabulary_size:
108 | vocab_list = vocab_list[:max_vocabulary_size]
109 | with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
110 | for w in vocab_list:
111 | vocab_file.write(w + "\n")
112 |
113 |
114 | def initialize_vocabulary(vocabulary_path):
115 | """Initialize vocabulary from file.
116 |
117 | We assume the vocabulary is stored one-item-per-line, so a file:
118 | dog
119 | cat
120 | will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
121 | also return the reversed-vocabulary ["dog", "cat"].
122 |
123 | Args:
124 | vocabulary_path: path to the file containing the vocabulary.
125 |
126 | Returns:
127 | a pair: the vocabulary (a dictionary mapping string to integers), and
128 | the reversed vocabulary (a list, which reverses the vocabulary mapping).
129 |
130 | Raises:
131 | ValueError: if the provided vocabulary_path does not exist.
132 | """
133 | if gfile.Exists(vocabulary_path):
134 | rev_vocab = []
135 | with gfile.GFile(vocabulary_path, mode="r") as f:
136 | rev_vocab.extend(f.readlines())
137 | rev_vocab = [line.strip() for line in rev_vocab]
138 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
139 | return vocab, rev_vocab
140 | else:
141 |     raise ValueError("Vocabulary file %s not found." % vocabulary_path)
142 |
143 | def sentence_to_token_ids(sentence, vocabulary, UNK_ID,
144 | tokenizer=None, normalize_digits=True):
145 | """Convert a string to list of integers representing token-ids.
146 |
147 | For example, a sentence "I have a dog" may become tokenized into
148 | ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
149 | "a": 4, "dog": 7"} this function will return [1, 2, 4, 7].
150 |
151 | Args:
152 | sentence: a string, the sentence to convert to token-ids.
153 | vocabulary: a dictionary mapping tokens to integers.
154 | tokenizer: a function to use to tokenize each sentence;
155 | if None, basic_tokenizer will be used.
156 | normalize_digits: Boolean; if true, all digits are replaced by 0s.
157 |
158 | Returns:
159 | a list of integers, the token-ids for the sentence.
160 | """
161 | if tokenizer:
162 | words = tokenizer(sentence)
163 | else:
164 | words = basic_tokenizer(sentence)
165 | if not normalize_digits:
166 | return [vocabulary.get(w, UNK_ID) for w in words]
167 | # Normalize digits by 0 before looking words up in the vocabulary.
168 | return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words]
169 |
170 |
171 | def data_to_token_ids(data_path, target_path, vocabulary_path,
172 | tokenizer=None, normalize_digits=True, use_padding=True):
173 | """Tokenize data file and turn into token-ids using given vocabulary file.
174 |
175 | This function loads data line-by-line from data_path, calls the above
176 | sentence_to_token_ids, and saves the result to target_path. See comment
177 | for sentence_to_token_ids on the details of token-ids format.
178 |
179 | Args:
180 | data_path: path to the data file in one-sentence-per-line format.
181 | target_path: path where the file with token-ids will be created.
182 | vocabulary_path: path to the vocabulary file.
183 | tokenizer: a function to use to tokenize each sentence;
184 | if None, basic_tokenizer will be used.
185 | normalize_digits: Boolean; if true, all digits are replaced by 0s.
186 | """
187 | if not gfile.Exists(target_path):
188 | print("Tokenizing data in %s" % data_path)
189 | vocab, _ = initialize_vocabulary(vocabulary_path)
190 | with gfile.GFile(data_path, mode="r") as data_file:
191 | with gfile.GFile(target_path, mode="w") as tokens_file:
192 | counter = 0
193 | for line in data_file:
194 | counter += 1
195 | if counter % 100000 == 0:
196 | print(" tokenizing line %d" % counter)
197 | if use_padding:
198 | UNK_ID = UNK_ID_dict['with_padding']
199 | else:
200 | UNK_ID = UNK_ID_dict['no_padding']
201 | token_ids = sentence_to_token_ids(line, vocab, UNK_ID, tokenizer,
202 | normalize_digits)
203 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
204 |
205 |
206 | def create_label_vocab(vocabulary_path, data_path):
207 | if not gfile.Exists(vocabulary_path):
208 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
209 | vocab = {}
210 | with gfile.GFile(data_path, mode="r") as f:
211 | counter = 0
212 | for line in f:
213 | counter += 1
214 | if counter % 100000 == 0:
215 | print(" processing line %d" % counter)
216 | # tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
217 | label = line.strip()
218 | vocab[label] = 1
219 | label_list = START_VOCAB_dict['no_padding'] + sorted(vocab)
220 | # label_list = sorted(vocab)
221 | with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
222 | for k in label_list:
223 | vocab_file.write(k + "\n")
224 |
225 | def prepare_multi_task_data(data_dir, in_vocab_size, out_vocab_size):
226 | #current_dir = os.path.realpath(__file__)
227 | # data_dir = 'trec_qa_data'
228 | train_path = data_dir + '/train/train'
229 | dev_path = data_dir + '/valid/valid'
230 | test_path = data_dir + '/test/test'
231 |
232 | # Create vocabularies of the appropriate sizes.
233 | in_vocab_path = os.path.join(data_dir, "in_vocab_%d.txt" % in_vocab_size)
234 | out_vocab_path = os.path.join(data_dir, "out_vocab_%d.txt" % out_vocab_size)
235 | label_path = os.path.join(data_dir, "label.txt")
236 |
237 | create_vocabulary(in_vocab_path, train_path + ".seq.in", in_vocab_size, tokenizer=naive_tokenizer)
238 | create_vocabulary(out_vocab_path, train_path + ".seq.out", out_vocab_size, tokenizer=naive_tokenizer)
239 | create_label_vocab(label_path, train_path + ".label")
240 |
241 | # Create token ids for the training data.
242 | in_seq_train_ids_path = train_path + (".ids%d.seq.in" % in_vocab_size)
243 | out_seq_train_ids_path = train_path + (".ids%d.seq.out" % out_vocab_size)
244 | label_train_ids_path = train_path + (".ids.label")
245 |
246 | data_to_token_ids(train_path + ".seq.in", in_seq_train_ids_path, in_vocab_path, tokenizer=naive_tokenizer)
247 | data_to_token_ids(train_path + ".seq.out", out_seq_train_ids_path, out_vocab_path, tokenizer=naive_tokenizer)
248 | data_to_token_ids(train_path + ".label", label_train_ids_path, label_path, normalize_digits=False, use_padding=False)
249 |
250 | # Create token ids for the development data.
251 | in_seq_dev_ids_path = dev_path + (".ids%d.seq.in" % in_vocab_size)
252 | out_seq_dev_ids_path = dev_path + (".ids%d.seq.out" % out_vocab_size)
253 | label_dev_ids_path = dev_path + (".ids.label")
254 |
255 | data_to_token_ids(dev_path + ".seq.in", in_seq_dev_ids_path, in_vocab_path, tokenizer=naive_tokenizer)
256 | data_to_token_ids(dev_path + ".seq.out", out_seq_dev_ids_path, out_vocab_path, tokenizer=naive_tokenizer)
257 | data_to_token_ids(dev_path + ".label", label_dev_ids_path, label_path, normalize_digits=False, use_padding=False)
258 |
259 | # Create token ids for the test data.
260 | in_seq_test_ids_path = test_path + (".ids%d.seq.in" % in_vocab_size)
261 | out_seq_test_ids_path = test_path + (".ids%d.seq.out" % out_vocab_size)
262 | label_test_ids_path = test_path + (".ids.label")
263 |
264 | data_to_token_ids(test_path + ".seq.in", in_seq_test_ids_path, in_vocab_path, tokenizer=naive_tokenizer)
265 | data_to_token_ids(test_path + ".seq.out", out_seq_test_ids_path, out_vocab_path, tokenizer=naive_tokenizer)
266 | data_to_token_ids(test_path + ".label", label_test_ids_path, label_path, normalize_digits=False, use_padding=False)
267 |
268 | return (in_seq_train_ids_path, out_seq_train_ids_path, label_train_ids_path,
269 | in_seq_dev_ids_path, out_seq_dev_ids_path, label_dev_ids_path,
270 | in_seq_test_ids_path, out_seq_test_ids_path, label_test_ids_path,
271 | in_vocab_path, out_vocab_path, label_path)
--------------------------------------------------------------------------------
/generate_encoder_output.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Mar 9 11:32:21 2016
4 |
5 | @author: Bing Liu (liubing@cmu.edu)
6 | """
7 |
8 | from __future__ import absolute_import
9 | from __future__ import division
10 | from __future__ import print_function
11 |
12 | from tensorflow.python.framework import dtypes
13 | from tensorflow.python.ops import array_ops
14 | from tensorflow.python.ops import embedding_ops
15 | from tensorflow.python.ops import math_ops
16 | from tensorflow.python.ops import rnn_cell
17 | from tensorflow.python.ops import rnn
18 | from tensorflow.python.ops import variable_scope
19 | from tensorflow.python.ops import init_ops
20 | from tensorflow.python.framework import tensor_shape
21 | from tensorflow.python.ops import control_flow_ops
22 | import tensorflow as tf
23 |
24 | def get_multilayer_perceptron_regularizers(output_projection):
25 | weight_hidden, bias_hidden, weight_out, bias_out = output_projection
26 | regularizers = tf.nn.l2_loss(weight_hidden) + tf.nn.l2_loss(bias_hidden) + tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out)
27 | return regularizers
28 |
29 | def multilayer_perceptron_with_initialized_W(_X, output_projection, forward_only=False):
30 | #with variable_scope.variable_scope("intent_MLP"):
31 | weight_hidden, bias_hidden, weight_out, bias_out = output_projection
32 | #Hidden layer with RELU activation
33 | layer_1 = tf.nn.relu(tf.add(tf.matmul(_X, weight_hidden), bias_hidden))
34 |
35 | if not forward_only:
36 | layer_1 = tf.nn.dropout(layer_1, 0.5)
37 |
38 | output = tf.matmul(layer_1, weight_out) + bias_out
39 | #regularizers = tf.nn.l2_loss(weight_hidden) + tf.nn.l2_loss(bias_hidden) + tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out)
40 | return output
41 |
42 | def get_linear_transformation_regularizers(output_projection):
43 | weight_out, bias_out = output_projection
44 | regularizers = tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out)
45 | return regularizers
46 |
47 | def linear_transformation_with_initialized_W(_X, output_projection, forward_only=False):
48 | #with variable_scope.variable_scope("intent_MLP"):
49 | weight_out, bias_out = output_projection
50 | output = tf.matmul(_X, weight_out) + bias_out
51 | return output
52 |
53 | def _extract_argmax_and_embed(embedding, DNN_at_output, output_projection, forward_only=False, update_embedding=True):
54 | def loop_function(prev, _):
55 | if DNN_at_output is True:
56 | prev = multilayer_perceptron_with_initialized_W(prev, output_projection, forward_only=forward_only)
57 | else:
58 | prev = linear_transformation_with_initialized_W(prev, output_projection, forward_only=forward_only)
59 | prev_symbol = math_ops.argmax(prev, 1)
60 | # Note that gradients will not propagate through the second parameter of
61 | # embedding_lookup.
62 | emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
63 | return prev, emb_prev
64 | return loop_function
65 |
66 | def rnn_with_output_feedback(cell, inputs,
67 | targets1, targets1_num_symbols, target1_emb_size, target1_output_projection,
68 | targets2, targets2_num_symbols, target2_emb_size, target2_output_projection,
69 | word_emb_size, DNN_at_output,
70 | zero_intent_thres=0,
71 | sequence_length=None, dtype=None,
72 | train_with_true_label=True, use_predicted_output=False):
73 | '''
74 |     zero_intent_thres: int, the intent contribution to the context remains zero before this threshold,
75 |          and increases linearly to 1 after that.
76 | '''
77 | if not isinstance(cell, rnn_cell.RNNCell):
78 | raise TypeError("cell must be an instance of RNNCell")
79 | if not isinstance(inputs, list):
80 | raise TypeError("inputs must be a list")
81 | if not isinstance(targets1, list):
82 | raise TypeError("targets1 must be a list")
83 | if not isinstance(targets2, list):
84 | raise TypeError("targets2 must be a list")
85 | if not inputs:
86 | raise ValueError("inputs must not be empty")
87 | if not dtype:
88 |     raise ValueError("dtype must be provided, which is used in defining the initial RNN state")
89 |
90 | encoder_outputs = []
91 | intent_embedding = variable_scope.get_variable("intent_embedding",[targets1_num_symbols, target1_emb_size])
92 | tag_embedding = variable_scope.get_variable("tag_embedding",[targets2_num_symbols, target2_emb_size])
93 | # use predicted label if use_predicted_output during inference, use true label during training
94 | # To choose to always use predicted label, disable the if condition
95 | intent_loop_function = _extract_argmax_and_embed(intent_embedding, DNN_at_output, target1_output_projection, forward_only=use_predicted_output) #if use_predicted_output else None
96 | tagging_loop_function = _extract_argmax_and_embed(tag_embedding, DNN_at_output, target2_output_projection, forward_only=use_predicted_output)
97 | intent_targets = [array_ops.reshape(math_ops.to_int64(x), [-1]) for x in targets1]
98 | intent_target_embeddings = list()
99 | intent_target_embeddings = [embedding_ops.embedding_lookup(intent_embedding, target) for target in intent_targets]
100 | tag_targets = [array_ops.reshape(math_ops.to_int64(x), [-1]) for x in targets2]
101 | tag_target_embeddings = list()
102 | tag_target_embeddings = [embedding_ops.embedding_lookup(tag_embedding, target) for target in tag_targets]
103 |
104 | if inputs[0].get_shape().ndims != 1:
105 | (fixed_batch_size, input_size) = inputs[0].get_shape().with_rank(2)
106 | if input_size.value is None:
107 | raise ValueError(
108 | "Input size (second dimension of inputs[0]) must be accessible via "
109 | "shape inference, but saw value None.")
110 | else:
111 | fixed_batch_size = inputs[0].get_shape().with_rank_at_least(1)[0]
112 |
113 | if fixed_batch_size.value:
114 | batch_size = fixed_batch_size.value
115 | else:
116 | batch_size = array_ops.shape(inputs[0])[0]
117 |
118 | state = cell.zero_state(batch_size, dtype)
119 | zero_output = array_ops.zeros(
120 | array_ops.pack([batch_size, cell.output_size]), inputs[0].dtype)
121 | zero_output.set_shape(
122 | tensor_shape.TensorShape([fixed_batch_size.value, cell.output_size]))
123 |
124 | if sequence_length is not None: # Prepare variables
125 | sequence_length = math_ops.to_int32(sequence_length)
126 | min_sequence_length = math_ops.reduce_min(sequence_length)
127 | max_sequence_length = math_ops.reduce_max(sequence_length)
128 |
129 | # prev_cell_output = zero_output
130 | zero_intent_embedding = array_ops.zeros(
131 | array_ops.pack([batch_size, target1_emb_size]), inputs[0].dtype)
132 | zero_intent_embedding.set_shape(
133 | tensor_shape.TensorShape([fixed_batch_size.value, target1_emb_size]))
134 | zero_tag_embedding = array_ops.zeros(
135 | array_ops.pack([batch_size, target2_emb_size]), inputs[0].dtype)
136 | zero_tag_embedding.set_shape(
137 | tensor_shape.TensorShape([fixed_batch_size.value, target2_emb_size]))
138 |
139 | encoder_outputs = list()
140 | intent_logits = list()
141 | tagging_logits = list()
142 | sampled_intent_embeddings = list()
143 | sampled_tag_embeddings = list()
144 |
145 | for time, input_ in enumerate(inputs):
146 |     # Bing: introduce output label embeddings as additional input
147 | # if feed_previous (during testing):
148 | # Use loop_function
149 | # if NOT feed_previous (during training):
150 | # Use true target embedding
151 | if time == 0:
152 | current_intent_embedding = zero_intent_embedding
153 | current_tag_embedding = zero_tag_embedding
154 |
155 |
156 | if time > 0: variable_scope.get_variable_scope().reuse_variables()
157 |
158 |     # intent context weight: 0 for time <= zero_intent_thres, then (time - thres) / sequence_length
159 | thres = zero_intent_thres
160 | if time <= thres:
161 | intent_contribution = math_ops.to_float(0)
162 | else:
163 | intent_contribution = tf.div(math_ops.to_float(time - thres), math_ops.to_float(sequence_length))
164 | # intent_contribution = math_ops.to_float(1)
165 |
166 | x = rnn_cell._linear([tf.transpose(tf.transpose(current_intent_embedding)*intent_contribution), current_tag_embedding, input_], word_emb_size, True)
167 | call_cell = lambda: cell(x, state)
168 |
169 |
170 | # pylint: enable=cell-var-from-loop
171 | if sequence_length is not None:
172 | (output_fw, state) = rnn._rnn_step(
173 | time, sequence_length, min_sequence_length, max_sequence_length,
174 | zero_output, state, call_cell, cell.state_size)
175 | else:
176 | (output_fw, state) = call_cell()
177 |
178 | encoder_outputs.append(output_fw)
179 |
180 | if use_predicted_output:
181 | intent_logit, current_intent_embedding = intent_loop_function(output_fw, time)
182 | tagging_logit, current_tag_embedding = tagging_loop_function(output_fw, time)
183 | else:
184 | if train_with_true_label is True:
185 | intent_logit = multilayer_perceptron_with_initialized_W(output_fw, target1_output_projection, forward_only=use_predicted_output)
186 | tagging_logit = multilayer_perceptron_with_initialized_W(output_fw, target2_output_projection, forward_only=use_predicted_output)
187 | current_intent_embedding = intent_target_embeddings[time]
188 | current_tag_embedding = tag_target_embeddings[time]
189 | else:
190 | intent_logit, current_intent_embedding = intent_loop_function(output_fw, time)
191 | tagging_logit, current_tag_embedding = tagging_loop_function(output_fw, time)
192 | # prev_symbols.append(prev_symbol)
193 | if time == 0:
194 | current_intent_embedding = zero_intent_embedding
195 | current_tag_embedding = zero_tag_embedding
196 | sampled_intent_embeddings.append(current_intent_embedding)
197 | sampled_tag_embeddings.append(current_tag_embedding)
198 |
199 | intent_logits.append(intent_logit)
200 | tagging_logits.append(tagging_logit)
201 |
202 | return encoder_outputs, state, intent_logits, tagging_logits, sampled_intent_embeddings, sampled_tag_embeddings
203 |
204 | def generate_embedding_RNN_output_joint(encoder_inputs, cell,
205 | num_encoder_symbols,
206 | intent_targets,
207 | num_intent_target_symbols,
208 | tagging_targets,
209 | num_tagging_target_symbols,
210 | word_emb_size,
211 | dtype=dtypes.float32,
212 | scope=None, initial_state_attention=False,
213 | sequence_length=None,
214 | bidirectional_rnn=False,
215 | DNN_at_output=False,
216 | dnn_hidden_layer_size=200,
217 | output_emb_size=50,
218 | trainable=True,
219 | train_with_true_label=True,
220 | zero_intent_thres=0,
221 | forward_only=False):
222 | """
223 | Generate RNN state outputs with word embeddings as inputs
224 | """
225 | with variable_scope.variable_scope(scope or "generate_embedding_RNN_output_joint"):
226 | encoder_cell = cell
227 | embedding = variable_scope.get_variable("word_embedding", [num_encoder_symbols, word_emb_size], trainable=trainable)
228 | encoder_embedded_inputs = list()
229 | encoder_embedded_inputs = [embedding_ops.embedding_lookup(embedding, encoder_input) for encoder_input in encoder_inputs]
230 |
231 | if DNN_at_output is True:
232 | with variable_scope.variable_scope("intent_DNN"):
233 | bias_start = 0.0
234 | n_hidden = dnn_hidden_layer_size
235 | intent_target_weight_hidden = variable_scope.get_variable("Weight_Hidden", [encoder_cell.output_size, n_hidden])
236 | intent_target_bias_hidden = variable_scope.get_variable("Bias_Hidden", [n_hidden],
237 | initializer=init_ops.constant_initializer(bias_start))
238 | intent_target_weight_out = variable_scope.get_variable("Weight_Out", [n_hidden, num_intent_target_symbols])
239 | intent_target_1bias_out = variable_scope.get_variable("Bias_Out", [num_intent_target_symbols],
240 | initializer=init_ops.constant_initializer(bias_start))
241 | intent_target_output_projection = (intent_target_weight_hidden, intent_target_bias_hidden, intent_target_weight_out, intent_target_1bias_out)
242 | intent_target_regularizers = get_multilayer_perceptron_regularizers(intent_target_output_projection)
243 | with variable_scope.variable_scope("tag_DNN"):
244 | bias_start = 0.0
245 | n_hidden = dnn_hidden_layer_size
246 | tagging_target_weight_hidden = variable_scope.get_variable("Weight_Hidden", [encoder_cell.output_size, n_hidden])
247 | tagging_target_bias_hidden = variable_scope.get_variable("Bias_Hidden", [n_hidden],
248 | initializer=init_ops.constant_initializer(bias_start))
249 | tagging_target_weight_out = variable_scope.get_variable("Weight_Out", [n_hidden, num_tagging_target_symbols])
250 | tagging_target_1bias_out = variable_scope.get_variable("Bias_Out", [num_tagging_target_symbols],
251 | initializer=init_ops.constant_initializer(bias_start))
252 | tagging_target_output_projection = (tagging_target_weight_hidden, tagging_target_bias_hidden, tagging_target_weight_out, tagging_target_1bias_out)
253 | tagging_target_regularizers = get_multilayer_perceptron_regularizers(tagging_target_output_projection)
254 | else:
255 | with variable_scope.variable_scope("intent_linear"):
256 | bias_start = 0.0
257 | intent_target_weight_out = variable_scope.get_variable("Weight_Out", [encoder_cell.output_size, num_intent_target_symbols])
258 | intent_target_bias_out = variable_scope.get_variable("Bias_Out", [num_intent_target_symbols],
259 | initializer=init_ops.constant_initializer(bias_start))
260 | intent_target_output_projection = (intent_target_weight_out, intent_target_bias_out)
261 | intent_target_regularizers = get_linear_transformation_regularizers(intent_target_output_projection)
262 | with variable_scope.variable_scope("tag_linear"):
263 | bias_start = 0.0
264 | tagging_target_weight_out = variable_scope.get_variable("Weight_Out", [encoder_cell.output_size, num_tagging_target_symbols])
265 | tagging_target_bias_out = variable_scope.get_variable("Bias_Out", [num_tagging_target_symbols],
266 | initializer=init_ops.constant_initializer(bias_start))
267 | tagging_target_output_projection = (tagging_target_weight_out, tagging_target_bias_out)
268 | tagging_target_regularizers = get_linear_transformation_regularizers(tagging_target_output_projection)
269 |
270 | intent_target_emb_size = output_emb_size
271 | tagging_target_emb_size = output_emb_size
272 |
273 | outputs = rnn_with_output_feedback( encoder_cell, encoder_embedded_inputs,
274 | intent_targets, num_intent_target_symbols, intent_target_emb_size, intent_target_output_projection,
275 | tagging_targets, num_tagging_target_symbols, tagging_target_emb_size, tagging_target_output_projection,
276 | word_emb_size, DNN_at_output, zero_intent_thres=zero_intent_thres,
277 | sequence_length=sequence_length, dtype=dtype,
278 | train_with_true_label=train_with_true_label,
279 | use_predicted_output=forward_only)
280 |
281 |
282 | encoder_outputs, encoder_state, intent_logits, tagging_logits, sampled_intent_embeddings, sampled_tag_embeddings = outputs
283 |
284 | intent_results = (intent_logits, intent_target_regularizers, sampled_intent_embeddings)
285 | tagging_results = (tagging_logits, tagging_target_regularizers, sampled_tag_embeddings)
286 |
287 | return encoder_outputs, encoder_state, encoder_embedded_inputs, intent_results, tagging_results
--------------------------------------------------------------------------------
/multi_task_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Mar 10 17:28:22 2016
4 |
5 | @author: Bing Liu (liubing@cmu.edu)
6 |
7 | Multi-task model: intent, tagging, LM
8 | """
9 |
10 | from __future__ import absolute_import
11 | from __future__ import division
12 | from __future__ import print_function
13 |
14 | import random
15 |
16 | import numpy as np
17 | from six.moves import xrange # pylint: disable=redefined-builtin
18 | import tensorflow as tf
19 | from tensorflow.python.framework import dtypes
20 |
21 | import data_utils
22 | import seq_labeling
23 | from tensorflow.python.ops import array_ops
24 | from generate_encoder_output import generate_embedding_RNN_output_joint
25 |
26 |
27 | class MultiTaskModel(object):
28 | def __init__(self, source_vocab_size, lm_vocab_size, tag_vocab_size,
29 | label_vocab_size, buckets,
30 | word_embedding_size, size, num_layers, max_gradient_norm, batch_size,
31 | dropout_keep_prob=1.0, use_lstm=True, lm_cost_weight=1.0,
32 | DNN_at_output=False, dnn_hidden_layer_size=200, output_emb_size=50,
33 | use_local_context=False, zero_intent_thres=0,
34 | forward_only=False):
35 | """Create the model.
36 |
37 | Args:
38 | source_vocab_size: int, size of the source vocabulary.
39 | lm_vocab_size: int, size of the LM vocabulary.
40 |       tag_vocab_size: int, size of the slot tag vocabulary.
41 |       label_vocab_size: int, size of the intent label vocabulary.
42 | buckets: dummy buckets here, modified from tensorflow translation example.
43 | word_embedding_size: int, word embedding size
44 | size: int, number of units in each layer of the model.
45 | num_layers: int, number of layers in the model.
46 | max_gradient_norm: gradients will be clipped to maximally this norm.
47 | batch_size: the size of the batches used during training;
48 | the model construction is independent of batch_size, so it can be
49 | changed after initialization if this is convenient, e.g., for decoding.
50 | dropout_keep_prob: prob to keep during dropout
51 | use_lstm: use lstm cell
52 |       lm_cost_weight: LM error weight in the linear interpolation of multi-task model errors
53 | DNN_at_output: use DNN at the output layer of each task
54 | dnn_hidden_layer_size: DNN hidden layer size
55 | output_emb_size: size of output label/tag embedding
56 | use_local_context: boolean, if set, apply sampled intent/tag as context to lm.
57 |       zero_intent_thres: int, the intent contribution to the context remains zero before this threshold,
58 |          and increases linearly to 1 after that.
59 | forward_only: if set, we do not construct the backward pass in the model.
60 | """
61 | self.source_vocab_size = source_vocab_size
62 | self.tag_vocab_size = tag_vocab_size
63 | self.label_vocab_size = label_vocab_size
64 | self.lm_vocab_size = lm_vocab_size
65 | self.buckets = buckets
66 | self.batch_size = batch_size
67 | self.global_step = tf.Variable(0, trainable=False)
68 |
69 | softmax_loss_function = None
70 |
71 | # Create the internal multi-layer cell for our RNN.
72 | single_cell = tf.nn.rnn_cell.GRUCell(size)
73 | if use_lstm:
74 | # use the customized rnn_cell, --> added trainable option in RNN
75 | single_cell = tf.nn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True)
76 | cell = single_cell
77 | if num_layers > 1:
78 | cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)
79 |
80 | if not forward_only and dropout_keep_prob < 1.0:
81 | cell = tf.nn.rnn_cell.DropoutWrapper(cell,
82 | input_keep_prob=dropout_keep_prob,
83 | output_keep_prob=dropout_keep_prob)
84 |
85 |
86 | # Feeds for inputs.
87 | self.encoder_inputs = []
88 | self.encoder_inputs_shiftByOne = []
89 | self.tag_weights = []
90 | self.intent_weights = []
91 | self.lm_weights = []
92 | self.tags = []
93 | self.labels = []
94 | self.sequence_length = tf.placeholder(tf.int32, [None], name="sequence_length")
95 |
96 | self.lm_cost_weight = lm_cost_weight
97 |
98 | self.train_with_true_label = tf.placeholder(tf.bool, name="train_with_true_label")
99 |
100 | for i in xrange(buckets[-1][0]): # Last bucket is the biggest one.
101 | self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
102 | name="encoder{0}".format(i)))
103 | self.encoder_inputs_shiftByOne.append(tf.placeholder(tf.int32, shape=[None],
104 | name="encoder_shiftByOne{0}".format(i)))
105 | self.lm_weights.append(tf.placeholder(tf.float32, shape=[None],
106 | name="lm_weight{0}".format(i)))
107 | for i in xrange(buckets[-1][1]): # output length is 1
108 | self.tags.append(tf.placeholder(tf.float32, shape=[None], name="tag{0}".format(i)))
109 | self.tag_weights.append(tf.placeholder(tf.float32, shape=[None],
110 | name="tag_weight{0}".format(i)))
111 | self.intent_weights.append(tf.placeholder(tf.float32, shape=[None],
112 | name="intent_weight{0}".format(i)))
113 | self.labels.append(tf.placeholder(tf.float32, shape=[None], name="label"))
114 | self.label_duplicated_sequence = self.labels * buckets[-1][1]
115 |
116 | rnn_outputs = generate_embedding_RNN_output_joint(self.encoder_inputs,
117 | cell,
118 | self.source_vocab_size,
119 | self.label_duplicated_sequence,
120 | self.label_vocab_size,
121 | self.tags,
122 | self.tag_vocab_size,
123 | word_embedding_size,
124 | dtype=dtypes.float32,
125 | scope=None,
126 | sequence_length=self.sequence_length,
127 | DNN_at_output=DNN_at_output,
128 | dnn_hidden_layer_size=dnn_hidden_layer_size,
129 | output_emb_size=output_emb_size,
130 | train_with_true_label=self.train_with_true_label,
131 | zero_intent_thres=zero_intent_thres,
132 | forward_only=forward_only)
133 |
134 | encoder_outputs, encoder_state, encoder_embedded_inputs, intent_results, tagging_results = rnn_outputs
135 |
136 | # get tagging loss
137 | self.tagging_output, sampled_tags, _, self.tagging_loss, _ = seq_labeling.generate_task_output(
138 | encoder_outputs, None, encoder_state, self.tags, self.sequence_length, self.tag_vocab_size, self.tag_weights,
139 | buckets, softmax_loss_function=softmax_loss_function, scope='tagging', DNN_at_output=DNN_at_output, tagging_results=tagging_results, forward_only=forward_only)
140 |
141 |
142 | # get intent classification loss
143 | # transform intent classification to a sequence labeling task
144 | # output intent class at each step
145 | self.intent_outputs, _, sampled_intents, self.classification_loss, _ = seq_labeling.generate_task_output(
146 | encoder_outputs, None, encoder_state, self.label_duplicated_sequence, self.sequence_length, self.label_vocab_size, self.intent_weights,
147 | buckets, softmax_loss_function=softmax_loss_function, use_attention=False, scope='intent', DNN_at_output=DNN_at_output, train_with_true_label=self.train_with_true_label, intent_results=intent_results, forward_only=forward_only)
148 | classification_output_rev = seq_labeling._reverse_seq(self.intent_outputs, self.sequence_length)
149 | self.classification_output = [classification_output_rev[0]]
150 |
151 |
152 | # get lm loss, use additional_inputs = sampled_tags_intents
153 | sampled_tags_intents = [array_ops.concat(1, [sampled_tag, sampled_intent]) for sampled_tag, sampled_intent in zip(sampled_tags, sampled_intents)]
154 | self.lm_output, _, _, self.lm_loss_withRegularizers, self.lm_loss = seq_labeling.generate_task_output(
155 | encoder_outputs, sampled_tags_intents, encoder_state, self.encoder_inputs_shiftByOne, self.sequence_length, self.lm_vocab_size, self.lm_weights,
156 | buckets, softmax_loss_function=softmax_loss_function, use_attention=False, scope='lm', DNN_at_output=DNN_at_output, use_local_context=use_local_context,
157 | forward_only=forward_only)
158 |
159 | # Gradients and SGD update operation for training the model.
160 | params = tf.trainable_variables()
161 | if not forward_only:
162 | for i in range(len(params)):
163 | print (params[i].name)
164 | opt = tf.train.AdamOptimizer()
165 | gradients = tf.gradients([self.tagging_loss, self.classification_loss, self.lm_loss_withRegularizers*self.lm_cost_weight], params)
166 |
167 | clipped_gradients, norm = tf.clip_by_global_norm(gradients,
168 | max_gradient_norm)
169 | self.gradient_norm = norm
170 | self.update = opt.apply_gradients(
171 | zip(clipped_gradients, params), global_step=self.global_step)
172 |
173 | self.saver = tf.train.Saver(tf.all_variables())
174 |
175 |
176 | def joint_step(self, session, encoder_inputs, encoder_inputs_shiftByOne,
177 | lm_weights, tags, tag_weights, labels, intent_weights,
178 | batch_sequence_length, bucket_id, forward_only,
179 | train_with_true_label=True):
180 | """Run a joint step of the model feeding the given inputs.
181 |
182 | Args:
183 | session: tensorflow session to use.
184 | encoder_inputs: list of numpy int vectors to feed as encoder inputs.
185 |       encoder_inputs_shiftByOne: encoder inputs shifted left by one position, used as LM targets.
186 | lm_weights: list of numpy float vectors to feed as lm target weights.
187 | tags: list of numpy int vectors to feed as output tags.
188 | tag_weights: list of numpy float vectors to feed as tag weights.
189 | labels: list of numpy int vectors to feed as intent labels.
190 | intent_weights: list of numpy float vectors to feed as intent weights.
191 | batch_sequence_length: list of numpy int to feed as sequence length.
192 | bucket_id: which bucket of the model to use. # dummy, always 0.
193 | forward_only: whether to do the backward step or only forward.
194 | train_with_true_label: whether to use true label during model training.
195 |
196 | Returns:
197 |       A tuple consisting of gradient norm (or None if we did not do backward),
198 |       the LM loss, the tagging outputs, and the intent classification output.
199 |
200 | Raises:
201 | ValueError: if length of encoder_inputs, decoder_inputs, or
202 | target_weights disagrees with bucket size for the specified bucket_id.
203 | """
204 | # Check if the sizes match.
205 | encoder_size, tag_size = self.buckets[bucket_id]
206 | if len(encoder_inputs) != encoder_size:
207 | raise ValueError("Encoder length must be equal to the one in bucket,"
208 | " %d != %d." % (len(encoder_inputs), encoder_size))
209 | if len(tags) != tag_size:
210 | raise ValueError("Decoder length must be equal to the one in bucket,"
211 | " %d != %d." % (len(tags), tag_size))
212 | if len(labels) != 1:
213 |       raise ValueError("Length of labels must be 1,"
214 |                        " %d != %d." % (len(labels), 1))
215 |
216 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided.
217 | input_feed = {}
218 | input_feed[self.sequence_length.name] = batch_sequence_length
219 | for l in xrange(encoder_size):
220 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
221 | input_feed[self.tags[l].name] = tags[l]
222 | input_feed[self.tag_weights[l].name] = tag_weights[l]
223 | input_feed[self.intent_weights[l].name] = intent_weights[l]
224 | input_feed[self.encoder_inputs_shiftByOne[l].name] = encoder_inputs_shiftByOne[l]
225 | input_feed[self.lm_weights[l].name] = lm_weights[l]
226 | input_feed[self.labels[0].name] = labels[0]
227 | input_feed[self.train_with_true_label.name] = train_with_true_label
228 |
229 | # Output feed: depends on whether we do a backward step or not.
230 | if not forward_only:
231 |       output_feed = [self.update, # Update op that applies the Adam gradient step.
232 | self.gradient_norm, # Gradient norm.
233 | self.lm_loss] # Loss for this batch, report the LM loss here.
234 | for i in range(tag_size):
235 | output_feed.append(self.tagging_output[i])
236 | output_feed.append(self.classification_output[0])
237 | else:
238 | output_feed = [self.lm_loss]
239 | for i in range(tag_size):
240 | output_feed.append(self.tagging_output[i])
241 | output_feed.append(self.classification_output[0])
242 |
243 | outputs = session.run(output_feed, input_feed)
244 | if not forward_only:
245 |       return outputs[1], outputs[2], outputs[3:3+tag_size], outputs[-1]
246 | else:
247 | return None, outputs[0], outputs[1:1+tag_size], outputs[-1]
248 |
249 |
250 | def tagging_step(self, session, encoder_inputs, tags, tag_weights, batch_sequence_length,
251 | bucket_id, forward_only, train_with_true_label=True):
252 | """Run a tagging step of the model feeding the given inputs.
253 |
254 | Args:
255 | session: tensorflow session to use.
256 | encoder_inputs: list of numpy int vectors to feed as encoder inputs.
257 | tags: list of numpy int vectors to feed as output tags.
258 | tag_weights: list of numpy float vectors to feed as tag weights.
259 | batch_sequence_length: list of numpy int to feed as sequence length.
260 | bucket_id: which bucket of the model to use. # dummy, always 0.
261 | forward_only: whether to do the backward step or only forward.
262 | train_with_true_label: whether to use true label during model training.
263 |
264 | Returns:
265 | A triple consisting of gradient norm (or None if we did not do backward),
266 |       the tagging loss, and the tagging outputs.
267 |
268 | Raises:
269 | ValueError: if length of encoder_inputs, decoder_inputs, or
270 | target_weights disagrees with bucket size for the specified bucket_id.
271 | """
272 | # Check if the sizes match.
273 | encoder_size, tag_size = self.buckets[bucket_id]
274 | if len(encoder_inputs) != encoder_size:
275 | raise ValueError("Encoder length must be equal to the one in bucket,"
276 | " %d != %d." % (len(encoder_inputs), encoder_size))
277 | if len(tags) != tag_size:
278 | raise ValueError("Decoder length must be equal to the one in bucket,"
279 | " %d != %d." % (len(tags), tag_size))
280 |
281 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided.
282 | input_feed = {}
283 | input_feed[self.sequence_length.name] = batch_sequence_length
284 | for l in xrange(encoder_size):
285 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
286 | input_feed[self.tags[l].name] = tags[l]
287 | input_feed[self.tag_weights[l].name] = tag_weights[l]
288 | input_feed[self.train_with_true_label.name] = train_with_true_label
289 |
290 | # Output feed: depends on whether we do a backward step or not.
291 | if not forward_only:
292 |       output_feed = [self.update, # Update op that applies the Adam gradient step.
293 | self.gradient_norm, # Gradient norm.
294 | self.tagging_loss] # Loss for this batch.
295 | for i in range(tag_size):
296 | output_feed.append(self.tagging_output[i])
297 | else:
298 | output_feed = [self.tagging_loss]
299 | for i in range(tag_size):
300 | output_feed.append(self.tagging_output[i])
301 | outputs = session.run(output_feed, input_feed)
302 | if not forward_only:
303 | return outputs[1], outputs[2], outputs[3:3+tag_size]
304 | else:
305 | return None, outputs[0], outputs[1:1+tag_size]
306 |
307 | def classification_step(self, session, encoder_inputs, labels, intent_weights, batch_sequence_length,
308 | bucket_id, forward_only):
309 |     """Run an intent classification step of the model feeding the given inputs.
310 |
311 | Args:
312 | session: tensorflow session to use.
313 | encoder_inputs: list of numpy int vectors to feed as encoder inputs.
314 | labels: list of numpy int vectors to feed as intent labels.
315 | intent_weights: list of numpy float vectors to feed as intent weights.
316 | batch_sequence_length: list of numpy int to feed as sequence length.
317 | bucket_id: which bucket of the model to use. # dummy, always 0.
318 | forward_only: whether to do the backward step or only forward.
320 |
321 | Returns:
322 | A triple consisting of gradient norm (or None if we did not do backward),
323 |       the classification loss, and the intent classification output.
324 |
325 | Raises:
326 | ValueError: if length of encoder_inputs, decoder_inputs, or
327 | target_weights disagrees with bucket size for the specified bucket_id.
328 | """
329 | # Check if the sizes match.
330 | encoder_size, target_size = self.buckets[bucket_id]
331 | if len(encoder_inputs) != encoder_size:
332 | raise ValueError("Encoder length must be equal to the one in bucket,"
333 | " %d != %d." % (len(encoder_inputs), encoder_size))
334 |
335 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided.
336 | input_feed = {}
337 | input_feed[self.sequence_length.name] = batch_sequence_length
338 | for l in xrange(encoder_size):
339 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
340 | input_feed[self.intent_weights[l].name] = intent_weights[l]
341 | input_feed[self.labels[0].name] = labels[0]
342 |
343 | # Output feed: depends on whether we do a backward step or not.
344 | if not forward_only:
345 |       output_feed = [self.update, # Update op that applies the Adam gradient step.
346 | self.gradient_norm, # Gradient norm.
347 | self.classification_loss, # Loss for this batch.
348 | self.classification_output[0],
349 | ]
350 | else:
351 | output_feed = [self.classification_loss,
352 | self.classification_output[0],
353 | ]
354 |
355 | outputs = session.run(output_feed, input_feed)
356 | if not forward_only:
357 | return outputs[1], outputs[2], outputs[3] # Gradient norm, loss, outputs.
358 | else:
359 | return None, outputs[0], outputs[1] # No gradient norm, loss, outputs.
360 |
361 |
362 | def lm_step(self, session, encoder_inputs, encoder_inputs_shiftByOne, lm_weights, batch_sequence_length, bucket_id, forward_only):
363 |     """Run a language model (LM) step of the model feeding the given inputs.
364 |
365 | Args:
366 | session: tensorflow session to use.
367 | encoder_inputs: list of numpy int vectors to feed as encoder inputs.
368 |       encoder_inputs_shiftByOne: encoder inputs shifted left by one position, used as LM targets.
369 | lm_weights: list of numpy float vectors to feed as lm target weights.
370 | batch_sequence_length: list of numpy int to feed as sequence length.
371 | bucket_id: which bucket of the model to use. # dummy, always 0.
372 | forward_only: whether to do the backward step or only forward.
374 |
375 | Returns:
376 |       A pair consisting of gradient norm (or None if we did not do backward)
377 |       and the LM loss.
378 |
379 | Raises:
380 | ValueError: if length of encoder_inputs, decoder_inputs, or
381 | target_weights disagrees with bucket size for the specified bucket_id.
382 | """
383 | encoder_size, tag_size = self.buckets[bucket_id]
384 | if len(encoder_inputs) != encoder_size:
385 | raise ValueError("Encoder length must be equal to the one in bucket,"
386 | " %d != %d." % (len(encoder_inputs), encoder_size))
387 | if len(encoder_inputs_shiftByOne) != tag_size:
388 | raise ValueError("Decoder length must be equal to the one in bucket,"
389 | " %d != %d." % (len(encoder_inputs_shiftByOne), tag_size))
390 |
391 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided.
392 | input_feed = {}
393 | input_feed[self.sequence_length.name] = batch_sequence_length
394 | for l in xrange(encoder_size):
395 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
396 | input_feed[self.encoder_inputs_shiftByOne[l].name] = encoder_inputs_shiftByOne[l]
397 | input_feed[self.lm_weights[l].name] = lm_weights[l]
398 |
399 | if not forward_only:
400 |       output_feed = [self.update, # Update op that applies the Adam gradient step.
401 | self.gradient_norm, # Gradient norm.
402 | self.lm_loss] # Loss for this batch.
403 | else:
404 | output_feed = [self.lm_loss]
405 |
406 | outputs = session.run(output_feed, input_feed)
407 | if not forward_only:
408 | return outputs[1], outputs[2]
409 | else:
410 | return None, outputs[0]
411 |
412 | def get_batch(self, data, bucket_id):
413 | """Get a random batch of data from the specified bucket, prepare for step.
414 |
415 | To feed data in step(..) it must be a list of batch-major vectors, while
416 | data here contains single length-major cases. So the main logic of this
417 | function is to re-index data cases to be in the proper format for feeding.
418 |
419 | Args:
420 | data: a tuple of size len(self.buckets) in which each element contains
421 | lists of pairs of input and output data that we use to create a batch.
422 | bucket_id: integer, which bucket to get the batch for.
423 |
424 | Returns:
425 |       The batch-major tuple of encoder inputs, LM targets, tags, per-task weights,
426 |       sequence lengths and intent labels, in the proper format to call joint_step(...) later.
427 | """
428 | encoder_size, decoder_size = self.buckets[bucket_id]
429 | encoder_inputs, encoder_inputs_shiftByOne, decoder_inputs, labels = [], [], [], []
430 |
431 |     # Get a random batch of encoder inputs, tag sequences and intent labels from data,
432 |     # prepend BOS to the encoder inputs and pad everything to the bucket size.
433 | batch_sequence_length_list= list()
434 | for _ in xrange(self.batch_size):
435 | encoder_input, decoder_input, label = random.choice(data[bucket_id])
436 | batch_sequence_length_list.append(len(encoder_input) + 1) # +1 for BOS_ID
437 |
438 |       # Encoder inputs get a leading BOS and are then padded.
439 | encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input) - 1) # -1 for BOS_ID
440 | #encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))
441 | encoder_inputs.append(list([data_utils.BOS_ID] + encoder_input + encoder_pad))
442 | encoder_inputs_shiftByOne.append(list(encoder_input + [data_utils.BOS_ID] + encoder_pad)) #BOS_ID and EOS_ID shared the same ID in in_vocab and lm_vocab
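      # LM targets are the input words shifted left by one: the model reads
      # [BOS, w1, ..., wn] and predicts [w1, ..., wn, EOS] at the corresponding steps.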
443 |
444 |       # Tag (decoder) inputs get a leading PAD (aligned with BOS on the encoder side) and are then padded.
445 | decoder_pad_size = decoder_size - len(decoder_input) - 1 # -1 for BOS_ID
446 | decoder_inputs.append([data_utils.PAD_ID] + decoder_input +
447 | [data_utils.PAD_ID] * decoder_pad_size)
448 | labels.append(label)
449 |
450 | # Now we create batch-major vectors from the data selected above.
451 | batch_encoder_inputs, batch_encoder_inputs_shiftByOne, batch_decoder_inputs, tagging_batch_weights, intent_batch_weights, lm_batch_weights, batch_labels = [], [], [], [], [], [], []
452 |
453 | # Batch encoder inputs are just re-indexed encoder_inputs.
454 | for length_idx in xrange(encoder_size):
455 | batch_encoder_inputs.append(
456 | np.array([encoder_inputs[batch_idx][length_idx]
457 | for batch_idx in xrange(self.batch_size)], dtype=np.int32))
458 |
459 |     # Batch LM targets are just re-indexed encoder_inputs_shiftByOne.
460 | for length_idx in xrange(encoder_size):
461 | batch_encoder_inputs_shiftByOne.append(
462 | np.array([encoder_inputs_shiftByOne[batch_idx][length_idx]
463 | for batch_idx in xrange(self.batch_size)], dtype=np.int32))
464 |
465 | # Batch decoder inputs are re-indexed decoder_inputs, we create weights.
466 | for length_idx in xrange(decoder_size):
467 | batch_decoder_inputs.append(
468 | np.array([decoder_inputs[batch_idx][length_idx]
469 | for batch_idx in xrange(self.batch_size)], dtype=np.int32))
470 | # Create target_weights to be 0 for targets that are padding.
471 | tagging_batch_weight = np.ones(self.batch_size, dtype=np.float32)
472 | intent_batch_weight = np.ones(self.batch_size, dtype=np.float32)
473 | lm_batch_weight = np.ones(self.batch_size, dtype=np.float32)
474 | for batch_idx in xrange(self.batch_size):
475 | # We set weight to 0 if the corresponding target is a PAD symbol.
476 | # The corresponding target is decoder_input shifted by 1 forward.
477 |
478 | # set tagging weight in padding position to 0, leave others to 1
479 | if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID:
480 | tagging_batch_weight[batch_idx] = 0.0
481 |           # set intent weight in padding position to 0, leave others to a linearly increasing weight
482 | if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID:
483 | intent_batch_weight[batch_idx] = 0.0
484 | else:
485 | if length_idx <= 4:
486 | intent_batch_weight[batch_idx] = 0.0
487 | else:
488 | intent_batch_weight[batch_idx] = (length_idx - 4) / batch_sequence_length_list[batch_idx]
489 | if encoder_inputs_shiftByOne[batch_idx][length_idx] == data_utils.PAD_ID:
490 | lm_batch_weight[batch_idx] = 0.0
491 | tagging_batch_weights.append(tagging_batch_weight)
492 | intent_batch_weights.append(intent_batch_weight)
493 | lm_batch_weights.append(lm_batch_weight)
494 |
495 | batch_labels.append(
496 | np.array([labels[batch_idx][0]
497 | for batch_idx in xrange(self.batch_size)], dtype=np.int32))
498 |
499 | batch_sequence_length = np.array(batch_sequence_length_list, dtype=np.int32)
500 | return batch_encoder_inputs, batch_encoder_inputs_shiftByOne, batch_decoder_inputs, tagging_batch_weights, intent_batch_weights, lm_batch_weights, batch_sequence_length, batch_labels
501 |
502 |
503 | def get_one(self, data, bucket_id, sample_id):
504 |     """Get a single sample from the specified bucket, prepare for step.
505 |
506 | To feed data in step(..) it must be a list of batch-major vectors, while
507 | data here contains single length-major cases. So the main logic of this
508 | function is to re-index data cases to be in the proper format for feeding.
509 |
510 | Args:
511 | data: a tuple of size len(self.buckets) in which each element contains
512 | lists of pairs of input and output data that we use to create a batch.
513 | bucket_id: integer, which bucket to get the batch for.
514 | sample_id: integer, sample id within the bucket specified.
515 |
516 | Returns:
517 |       The batch-major (batch size 1) tuple of encoder inputs, LM targets, tags, per-task weights,
518 |       sequence lengths and intent labels, in the proper format to call joint_step(...) later.
519 | """
520 | encoder_size, decoder_size = self.buckets[bucket_id]
521 | encoder_inputs, encoder_inputs_shiftByOne, decoder_inputs, labels = [], [], [], []
522 |
523 |     # Get the sample specified by sample_id (encoder input, tag sequence and intent label),
524 |     # prepend BOS to the encoder input and pad everything to the bucket size.
525 | batch_sequence_length_list= list()
526 | #for _ in xrange(self.batch_size):
527 | encoder_input, decoder_input, label = data[bucket_id][sample_id]
528 | batch_sequence_length_list.append(len(encoder_input) + 1) # +1 for BOS_ID
529 |
530 |     # Encoder inputs get a leading BOS and are then padded.
531 | encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input) - 1) # -1 for BOS_ID
532 | encoder_inputs.append(list([data_utils.BOS_ID] + encoder_input + encoder_pad))
533 | encoder_inputs_shiftByOne.append(list(encoder_input + [data_utils.BOS_ID] + encoder_pad)) #BOS_ID and EOS_ID shared the same ID in two vocabs
534 |
535 |     # Tag (decoder) inputs get a leading PAD (aligned with BOS on the encoder side) and are then padded.
536 | decoder_pad_size = decoder_size - len(decoder_input) - 1 # -1 for BOS_ID
537 | decoder_inputs.append([data_utils.PAD_ID] + decoder_input +
538 | [data_utils.PAD_ID] * decoder_pad_size)
539 | labels.append(label)
540 |
541 | # Now we create batch-major vectors from the data selected above.
542 | batch_encoder_inputs, batch_encoder_inputs_shiftByOne, batch_decoder_inputs, tagging_batch_weights, intent_batch_weights, lm_batch_weights, batch_labels = [], [], [], [], [], [], []
543 |
544 | # Batch encoder inputs are just re-indexed encoder_inputs.
545 | for length_idx in xrange(encoder_size):
546 | batch_encoder_inputs.append(
547 | np.array([encoder_inputs[batch_idx][length_idx]
548 | for batch_idx in xrange(1)], dtype=np.int32))
549 |
550 | for length_idx in xrange(encoder_size):
551 | batch_encoder_inputs_shiftByOne.append(
552 | np.array([encoder_inputs_shiftByOne[batch_idx][length_idx]
553 | for batch_idx in xrange(1)], dtype=np.int32))
554 |
555 | # Batch decoder inputs are re-indexed decoder_inputs, we create weights.
556 | for length_idx in xrange(decoder_size):
557 | batch_decoder_inputs.append(
558 | np.array([decoder_inputs[batch_idx][length_idx]
559 | for batch_idx in xrange(1)], dtype=np.int32))
560 |
561 | # Create target_weights to be 0 for targets that are padding.
562 | tagging_batch_weight = np.ones(1, dtype=np.float32)
563 | intent_batch_weight = np.ones(1, dtype=np.float32)
564 | lm_batch_weight = np.ones(1, dtype=np.float32)
565 | for batch_idx in xrange(1):
566 | # We set weight to 0 if the corresponding target is a PAD symbol.
567 | # The corresponding target is decoder_input shifted by 1 forward.
568 | if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID:
569 | tagging_batch_weight[batch_idx] = 0.0
570 | if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID:
571 | intent_batch_weight[batch_idx] = 0.0
572 | else:
573 | if length_idx <= 4:
574 | intent_batch_weight[batch_idx] = 0.0
575 | else:
576 | intent_batch_weight[batch_idx] = (length_idx - 4) / batch_sequence_length_list[batch_idx]
577 | if encoder_inputs_shiftByOne[batch_idx][length_idx] == data_utils.PAD_ID:
578 | lm_batch_weight[batch_idx] = 0.0
579 |       tagging_batch_weights.append(tagging_batch_weight)
580 |       intent_batch_weights.append(intent_batch_weight)
581 |       lm_batch_weights.append(lm_batch_weight)
582 |
583 |     batch_labels.append(
584 |       np.array([labels[batch_idx][0]
585 |                 for batch_idx in xrange(1)], dtype=np.int32))
586 |
587 |     batch_sequence_length = np.array(batch_sequence_length_list, dtype=np.int32)
588 |     return batch_encoder_inputs, batch_encoder_inputs_shiftByOne, batch_decoder_inputs, tagging_batch_weights, intent_batch_weights, lm_batch_weights, batch_sequence_length, batch_labels
--------------------------------------------------------------------------------
/run_rnn_joint.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Mar 05 16:37:17 2016
4 |
5 | @author: Bing Liu (liubing@cmu.edu)
6 | """
7 |
8 | from __future__ import absolute_import
9 | from __future__ import division
10 | from __future__ import print_function
11 |
12 | import math
13 | import os
14 | import sys
15 | import time
16 |
17 | import numpy as np
18 | from six.moves import xrange # pylint: disable=redefined-builtin
19 | import tensorflow as tf
20 |
21 | import data_utils
22 | import multi_task_model
23 |
24 | import subprocess
25 | import stat
26 |
27 |
28 | tf.app.flags.DEFINE_float("learning_rate", 0.1, "Learning rate.")
29 | tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.9,
30 | "Learning rate decays by this much.")
31 | tf.app.flags.DEFINE_float("max_gradient_norm", 5.0,
32 | "Clip gradients to this norm.")
33 | tf.app.flags.DEFINE_integer("batch_size", 16,
34 | "Batch size to use during training.")
35 | tf.app.flags.DEFINE_integer("size", 128, "Size of each model layer.")
36 | tf.app.flags.DEFINE_integer("word_embedding_size", 300, "Size of the word embedding")
37 | tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.")
38 | tf.app.flags.DEFINE_integer("in_vocab_size", 10000, "max word vocab size.")
39 | tf.app.flags.DEFINE_integer("out_vocab_size", 123, "max tag vocab size.")
40 | tf.app.flags.DEFINE_string("data_dir", "/tmp", "Data directory")
41 | tf.app.flags.DEFINE_string("train_dir", "/tmp", "Training directory.")
42 | tf.app.flags.DEFINE_string("test_file", "/tmp", "Test file.")
43 | tf.app.flags.DEFINE_string("test_output_file", "/tmp", "Test output file.")
44 | tf.app.flags.DEFINE_integer("max_train_data_size", 0,
45 | "Limit on the size of training data (0: no limit).")
46 | tf.app.flags.DEFINE_integer("steps_per_checkpoint", 300,
47 | "How many training steps to do per checkpoint.")
48 | tf.app.flags.DEFINE_integer("max_training_steps", 20000,
49 | "Max training steps.")
50 | tf.app.flags.DEFINE_boolean("decode", False,
51 | "Set to True for interactive decoding.")
52 | tf.app.flags.DEFINE_boolean("decode_file", False,
53 | "Set to True for file decoding.")
54 | tf.app.flags.DEFINE_boolean("self_test", False,
55 | "Run a self-test if this is set to True.")
56 | tf.app.flags.DEFINE_integer("max_test_data_size", 0,
57 | "Max size of test set.")
58 | tf.app.flags.DEFINE_integer("max_sequence_length", 0,
59 | "Max sequence length.")
60 | tf.app.flags.DEFINE_float("dropout_keep_prob", 0.5,
61 |                           "keep probability for dropout on cell inputs and outputs.")
62 | tf.app.flags.DEFINE_integer("dnn_hidden_layer_size", 200,
63 | "num of nodes in the DNN hidden layer on top of RNN")
64 | tf.app.flags.DEFINE_integer("output_emb_size", 50,
65 | "size of output label/tag embedding")
66 | tf.app.flags.DEFINE_float("lm_cost_weight", 1.0,
67 |                           "Weight of the LM cost relative to the NLU tasks")
68 | tf.app.flags.DEFINE_boolean("DNN_at_output", False,
69 | "add DNN at the output layer of sequence labeling")
70 | tf.app.flags.DEFINE_boolean("use_local_context", True,
71 |                             "Whether to feed the sampled tag/intent (local context) to the LM task")
72 | tf.app.flags.DEFINE_integer("zero_intent_thres", 0,
73 | "threshold to linearly increase intent contribution to context")
74 | tf.app.flags.DEFINE_string("label_in_training", "true_label", "Options: true_label; predicted_label; scheduled_sampling")
75 | FLAGS = tf.app.flags.FLAGS
76 |
77 |
78 | # We use a number of buckets and pad to the closest one for efficiency.
79 | # See seq2seq_model.Seq2SeqModel for details of how they work.
80 | if FLAGS.max_sequence_length == 0:
81 | print ('Please indicate max sequence length. Exit')
82 | exit()
83 |
84 | # _buckets: modified from tf translation example, just use one dummy bucket here.
85 | _buckets = [(FLAGS.max_sequence_length, FLAGS.max_sequence_length)]
86 |
87 | # metrics function using conlleval.pl
88 | def conlleval(p, g, w, filename):
89 | '''
90 | INPUT:
91 | p :: predictions
92 | g :: groundtruth
93 | w :: corresponding words
94 |     filename :: name of the file where the predictions are written;
95 |                 it is then used as the input of the conlleval.pl script
96 |
97 |     OUTPUT:
98 |     dict with precision, recall and f1 score as computed by
99 |     conlleval.pl
100 | '''
101 | out = ''
102 | for sl, sp, sw in zip(g, p, w):
103 | out += 'BOS O O\n'
104 |         for wl, wp, ww in zip(sl, sp, sw):
105 |             if wl != '_PAD':
106 |                 out += ww + ' ' + wl + ' ' + wp + '\n'
107 | out += 'EOS O O\n\n'
108 |
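    # conlleval.pl expects one "word gold-tag predicted-tag" line per token, with
    # sentences separated by blank lines; the BOS/EOS rows act as sentence sentinels.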
109 | f = open(filename, 'w')
110 |     f.write(out[:-1]) # remove the ending \n on last line
111 | f.close()
112 |
113 | return get_perf(filename)
114 |
115 | def get_perf(filename):
116 | ''' run conlleval.pl perl script to obtain
117 | precision/recall and F1 score '''
118 | _conlleval = os.path.dirname(os.path.realpath(__file__)) + '/conlleval.pl'
119 | os.chmod(_conlleval, stat.S_IRWXU) # give the execute permissions
120 |
121 | proc = subprocess.Popen(["perl",
122 | _conlleval],
123 | stdin=subprocess.PIPE,
124 | stdout=subprocess.PIPE)
125 |
126 | stdout, _ = proc.communicate(''.join(open(filename).readlines()))
127 | for line in stdout.split('\n'):
128 | if 'accuracy' in line:
129 | out = line.split()
130 | break
131 |
132 | precision = float(out[6][:-2])
133 | recall = float(out[8][:-2])
134 | f1score = float(out[10])
135 |
136 | return {'p': precision, 'r': recall, 'f1': f1score}
137 |
138 |
139 | def read_data(source_path, target_path, label_path, max_size=None):
140 | """Read data from source and target files and put into buckets.
141 |
142 | Args:
143 | source_path: path to the files with token-ids for the source input - word sequence.
144 | target_path: path to the file with token-ids for the target output - tag sequence;
145 | it must be aligned with the source file: n-th line contains the desired
146 | output for n-th line from the source_path.
147 | label_path: path to the file with token-ids for the sequence classification label
148 | max_size: maximum number of lines to read, all other will be ignored;
149 | if 0 or None, data files will be read completely (no limit).
150 |
151 | Returns:
152 | data_set: a list of length len(_buckets); data_set[n] contains a list of
153 | (source, target) pairs read from the provided data files that fit
154 | into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
155 | len(target) < _buckets[n][1]; source and target are lists of token-ids.
156 | """
157 | data_set = [[] for _ in _buckets]
158 | with tf.gfile.GFile(source_path, mode="r") as source_file:
159 | with tf.gfile.GFile(target_path, mode="r") as target_file:
160 | with tf.gfile.GFile(label_path, mode="r") as label_file:
161 | source, target, label = source_file.readline(), target_file.readline(), label_file.readline()
162 | counter = 0
163 | while source and target and label and (not max_size or counter < max_size):
164 | counter += 1
165 | if counter % 100000 == 0:
166 | print(" reading data line %d" % counter)
167 | sys.stdout.flush()
168 | source_ids = [int(x) for x in source.split()]
169 | target_ids = [int(x) for x in target.split()]
170 | label_ids = [int(x) for x in label.split()]
171 | for bucket_id, (source_size, target_size) in enumerate(_buckets):
172 | if len(source_ids) < source_size and len(target_ids) < target_size:
173 | data_set[bucket_id].append([source_ids, target_ids, label_ids])
174 | break
175 | source, target, label = source_file.readline(), target_file.readline(), label_file.readline()
176 | return data_set # 3 outputs in each unit: source_ids, target_ids, label_ids
177 |
178 | def read_test_data(source_path):
179 | data_set = [[] for _ in _buckets]
180 | with tf.gfile.GFile(source_path, mode="r") as source_file:
181 | source = source_file.readline()
182 | counter = 0
183 | while source:
184 | counter += 1
185 | if counter % 100000 == 0:
186 | print(" reading data line %d" % counter)
187 | sys.stdout.flush()
188 | source_ids = [int(x) for x in source.split()]
189 | for bucket_id, (source_size, target_size) in enumerate(_buckets):
190 | if len(source_ids) < source_size:
191 | data_set[bucket_id].append([source_ids, [0]*len(source_ids), [0]])
192 | break
193 | source = source_file.readline()
194 | return data_set # 3 outputs in each unit: source_ids, target_ids, label_ids
195 |
196 | def create_model(session, source_vocab_size, target_vocab_size, label_vocab_size, lm_vocab_size):
197 | """Create model and initialize or load parameters in session."""
198 | with tf.variable_scope("model", reuse=None):
199 | model_train = multi_task_model.MultiTaskModel(
200 | source_vocab_size, lm_vocab_size, target_vocab_size, label_vocab_size, _buckets,
201 | FLAGS.word_embedding_size, FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size,
202 | dropout_keep_prob=FLAGS.dropout_keep_prob, use_lstm=True, lm_cost_weight=FLAGS.lm_cost_weight,
203 | forward_only=False,
204 | DNN_at_output=FLAGS.DNN_at_output,
205 | dnn_hidden_layer_size=FLAGS.dnn_hidden_layer_size,
206 | output_emb_size=FLAGS.output_emb_size,
207 | zero_intent_thres=FLAGS.zero_intent_thres)
208 | with tf.variable_scope("model", reuse=True):
209 | model_test = multi_task_model.MultiTaskModel(
210 | source_vocab_size, lm_vocab_size, target_vocab_size, label_vocab_size, _buckets,
211 | FLAGS.word_embedding_size, FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size,
212 | dropout_keep_prob=FLAGS.dropout_keep_prob, use_lstm=True, lm_cost_weight=FLAGS.lm_cost_weight,
213 | forward_only=True,
214 | DNN_at_output=FLAGS.DNN_at_output,
215 | dnn_hidden_layer_size=FLAGS.dnn_hidden_layer_size,
216 | output_emb_size=FLAGS.output_emb_size,
217 | zero_intent_thres=FLAGS.zero_intent_thres)
218 | ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
219 | if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
220 | print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
221 | model_train.saver.restore(session, ckpt.model_checkpoint_path)
222 | else:
223 | print("Created model with fresh parameters.")
224 | session.run(tf.initialize_all_variables())
225 | return model_train, model_test
226 |
227 | def write_results_to_file(result_dict, idx2tag_vocab, result_out_file):
228 | with open(result_out_file, 'wb') as f:
229 | for key in sorted(result_dict):
230 | f.write('%d\t%s\n' % (key, idx2tag_vocab[result_dict[key]]))
231 |
232 | def train():
233 | print ('Applying Parameters:')
234 | for k,v in FLAGS.__dict__['__flags'].iteritems():
235 | print ('%s: %s' % (k, str(v)))
236 |   print("Preparing data in %s" % FLAGS.data_dir)
237 | vocab_path = ''
238 | tag_vocab_path = ''
239 | label_vocab_path = ''
240 |
241 | in_seq_train, out_seq_train, label_train, in_seq_dev, out_seq_dev, label_dev, in_seq_test, out_seq_test, label_test, vocab_path, tag_vocab_path, label_vocab_path = data_utils.prepare_multi_task_data(
242 | FLAGS.data_dir, FLAGS.in_vocab_size, FLAGS.out_vocab_size)
243 |
244 | result_dir = FLAGS.train_dir + '/test_results'
245 | if not os.path.isdir(result_dir):
246 | os.makedirs(result_dir)
247 |
248 |   current_taging_valid_out_file = result_dir + '/tagging.valid.hyp.txt'
249 |   current_taging_test_out_file = result_dir + '/tagging.test.hyp.txt'
250 |   label_valid_out_file = result_dir + '/label.valid.hyp.txt'
251 |   label_test_out_file = result_dir + '/label.test.hyp.txt'
252 |
253 | vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
254 | tag_vocab, rev_tag_vocab = data_utils.initialize_vocabulary(tag_vocab_path)
255 | label_vocab, rev_label_vocab = data_utils.initialize_vocabulary(label_vocab_path)
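  # The LM output vocabulary is the word vocabulary; the index reserved for BOS is
  # reported as _EOS in rev_LM_vocab, since BOS and EOS share the same ID.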
256 | LM_vocab = vocab.copy()
257 | assert LM_vocab[data_utils._BOS] == data_utils.BOS_ID
258 | del LM_vocab[data_utils._BOS]
259 | LM_vocab[data_utils._BOS] = data_utils.BOS_ID
260 | rev_LM_vocab = [x for x in rev_vocab]
261 | rev_LM_vocab[data_utils.BOS_ID] = data_utils._EOS
262 |
263 | config = tf.ConfigProto(
264 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.23),
265 | )
266 |
267 | with tf.Session(config=config) as sess:
268 | # Create model.
269 | print("Max sequence length: %d ." % _buckets[0][0])
270 | print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
271 |
272 | model, model_test = create_model(sess, len(vocab), len(tag_vocab), len(label_vocab), len(LM_vocab))
273 |     print ("Creating model with source_vocab_size=%d, target_vocab_size=%d, label_vocab_size=%d, and lm_vocab_size=%d." % (len(vocab), len(tag_vocab), len(label_vocab), len(LM_vocab)))
274 |
275 | # Read data into buckets and compute their sizes.
276 | print ("Reading development and training data (limit: %d)."
277 | % FLAGS.max_train_data_size)
278 | dev_set = read_data(in_seq_dev, out_seq_dev, label_dev)
279 | test_set = read_data(in_seq_test, out_seq_test, label_test)
280 | train_set = read_data(in_seq_train, out_seq_train, label_train)
281 | train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
282 |
283 | train_total_size = float(sum(train_bucket_sizes))
284 |
285 | train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
286 | for i in xrange(len(train_bucket_sizes))]
287 |
288 | # This is the training loop.
289 | step_time, loss = 0.0, 0.0
290 | current_step = 0
291 | best_valid_score = 0
292 | best_test_score = 0
293 |
294 |
295 | if FLAGS.label_in_training == 'true_label':
296 | print ("Use TRUE label during model training")
297 | train_with_true_label = True
298 | elif FLAGS.label_in_training == 'predicted_label':
299 | print ("Use PREDICTED label during model training")
300 | train_with_true_label = False
301 | elif FLAGS.label_in_training == 'scheduled_sampling':
302 | print ("Use Scheduled Sampling label during model training")
303 |
304 | while model.global_step.eval() < FLAGS.max_training_steps:
305 | random_number_01 = np.random.random_sample()
306 | bucket_id = min([i for i in xrange(len(train_buckets_scale))
307 | if train_buckets_scale[i] > random_number_01])
308 |
309 | # Get a batch and make a step.
310 | start_time = time.time()
311 | encoder_inputs, encoder_inputs_shiftByOne, tags, tag_weights, intent_weights, lm_weights, batch_sequence_length, labels = model.get_batch(train_set, bucket_id)
312 |
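        # Scheduled sampling: as global_step grows, the probability of training on the
        # model's own predicted labels increases linearly toward 1.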
313 | if FLAGS.label_in_training == 'scheduled_sampling':
314 | random_number_02 = np.random.random_sample()
315 | final_training_step = FLAGS.max_training_steps
316 | if random_number_02 < float(model.global_step.eval())/final_training_step: # use predicted label in training
317 | train_with_true_label = False
318 | else:
319 | train_with_true_label = True
320 |
321 | _, step_loss, tagging_logits, classification_logits = model.joint_step(sess, encoder_inputs, encoder_inputs_shiftByOne, lm_weights, tags, tag_weights, labels, intent_weights,
322 | batch_sequence_length, bucket_id, False, train_with_true_label=train_with_true_label)
323 | step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
324 | loss += step_loss / FLAGS.steps_per_checkpoint
325 |
326 | current_step += 1
327 |
328 | # Once in a while, we save checkpoint, print statistics, and run evals.
329 | if current_step % FLAGS.steps_per_checkpoint == 0:
330 | # Print statistics for the previous epoch.
331 | perplexity = math.exp(loss) if loss < 300 else float('inf')
332 | print ("global step %d step-time %.2f. Training perplexity %.2f"
333 | % (model.global_step.eval(), step_time, perplexity))
334 | sys.stdout.flush()
335 |
336 | # Save checkpoint and zero timer and loss.
337 | checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
338 | model.saver.save(sess, checkpoint_path, global_step=model.global_step)
339 | step_time, loss = 0.0, 0.0
340 |
341 | def run_eval(data_set, mode): # mode = "Valid", "Test"
342 |         # Run evals on development/test set and print their accuracy.
343 | word_list = list()
344 | ref_tag_list = list()
345 | hyp_tag_list = list()
346 | ref_label_list = list()
347 | hyp_label_list = list()
348 | correct_count = 0
349 | accuracy = 0.0
350 | for bucket_id in xrange(len(_buckets)): # len(_buckets) = 1 here
351 | eval_loss = 0.0
352 | count = 0
353 | total_word_count = 0
354 | for i in xrange(len(data_set[bucket_id])):
355 | count += 1
356 | eval_encoder_inputs, eval_encoder_inputs_shiftByOne, eval_tags, eval_tag_weights, eval_intent_weights, eval_lm_weights, eval_sequence_length, eval_labels = model_test.get_one(
357 | data_set, bucket_id, i)
358 | eval_intent_weights = eval_tag_weights
359 | tagging_logits = []
360 | classification_logits = []
361 | _, step_loss, tagging_logits, classification_logits = model_test.joint_step(sess, eval_encoder_inputs, eval_encoder_inputs_shiftByOne, eval_lm_weights, eval_tags, eval_tag_weights, eval_labels, eval_intent_weights,
362 | eval_sequence_length, bucket_id, True)
363 | eval_loss += step_loss*(eval_sequence_length[0])
364 | total_word_count += eval_sequence_length[0]
365 | hyp_label = None
366 |
367 | # intent results
368 | ref_label_list.append(rev_label_vocab[eval_labels[0][0]])
369 | hyp_label = np.argmax(classification_logits[0],0)
370 | hyp_label_list.append(rev_label_vocab[hyp_label])
371 | if eval_labels[0] == hyp_label:
372 | correct_count += 1
373 |
374 | # tagging results
375 | word_list.append([rev_vocab[x[0]] for x in eval_encoder_inputs[:eval_sequence_length[0]]])
376 | ref_tag_list.append([rev_tag_vocab[x[0]] for x in eval_tags[:eval_sequence_length[0]]])
377 | hyp_tag_list.append([rev_tag_vocab[np.argmax(x)] for x in tagging_logits[:eval_sequence_length[0]]])
378 |
379 | eval_perplexity = math.exp(float(eval_loss)/total_word_count)
380 | print(" %s perplexity: %.2f" % (mode, eval_perplexity))
381 | accuracy = float(correct_count)*100/count
382 | print(" %s accuracy: %.2f %d/%d" % (mode, accuracy, correct_count, count))
383 |
384 | tagging_eval_result = dict()
385 | if mode == 'Valid':
386 | output_file = current_taging_valid_out_file
387 | elif mode == 'Test':
388 | output_file = current_taging_test_out_file
389 | tagging_eval_result = conlleval(hyp_tag_list, ref_tag_list, word_list, output_file)
390 | print(" %s f1-score: %.2f" % (mode, tagging_eval_result['f1']))
391 | sys.stdout.flush()
392 | return eval_perplexity, tagging_eval_result, hyp_label_list
393 |
394 | # run valid
395 | valid_perplexity, valid_tagging_result, valid_hyp_label_list = run_eval(dev_set, 'Valid')
396 | # record best results
397 |
398 | if valid_tagging_result['f1'] > best_valid_score:
399 | best_valid_score = valid_tagging_result['f1']
400 | subprocess.call(['mv', current_taging_valid_out_file, current_taging_valid_out_file + '.best_f1_%.2f' % best_valid_score])
401 | with open('%s.best_f1_%.2f' % (label_valid_out_file, best_valid_score), 'wb') as f:
402 | for i in range(len(valid_hyp_label_list)):
403 | f.write(valid_hyp_label_list[i] + '\n')
404 |
405 |           # run test after each validation for development purposes.
406 | test_perplexity, test_tagging_result, test_hyp_label_list = run_eval(test_set, 'Test')
407 | # record best results
408 | if test_tagging_result['f1'] > best_test_score:
409 | best_test_score = test_tagging_result['f1']
410 | subprocess.call(['mv', current_taging_test_out_file, current_taging_test_out_file + '.best_f1_%.2f' % best_test_score])
411 | with open('%s.best_f1_%.2f' % (label_test_out_file, best_test_score), 'wb') as f:
412 | for i in range(len(test_hyp_label_list)):
413 | f.write(test_hyp_label_list[i] + '\n')
414 |
415 | def decode_file(test_file):
416 | print ('Applying Parameters:')
417 | for k,v in FLAGS.__dict__['__flags'].iteritems():
418 | print ('%s: %s' % (k, str(v)))
419 |
420 | vocab_path = FLAGS.data_dir + '/in_vocab_1000.txt'
421 | tag_vocab_path = FLAGS.data_dir + '/out_vocab_1000.txt'
422 | label_vocab_path = FLAGS.data_dir + '/label.txt'
423 |
424 | vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
425 | tag_vocab, rev_tag_vocab = data_utils.initialize_vocabulary(tag_vocab_path)
426 | label_vocab, rev_label_vocab = data_utils.initialize_vocabulary(label_vocab_path)
427 | LM_vocab = vocab.copy()
428 | assert LM_vocab[data_utils._BOS] == data_utils.BOS_ID
429 | del LM_vocab[data_utils._BOS]
430 | LM_vocab[data_utils._BOS] = data_utils.BOS_ID
431 | rev_LM_vocab = [x for x in rev_vocab]
432 | rev_LM_vocab[data_utils.BOS_ID] = data_utils._EOS
433 |
434 | data_utils.data_to_token_ids(test_file, test_file + '.ids', vocab_path, tokenizer=data_utils.naive_tokenizer)
435 |
436 | test_set = read_test_data(test_file + '.ids')
437 | lm_test_output_file = FLAGS.test_output_file + '.ppl'
438 | intent_test_output_file = FLAGS.test_output_file + '.intent.hyp'
439 | tagging_test_output_file = FLAGS.test_output_file + '.tag.hyp'
440 |
441 | config = tf.ConfigProto(
442 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.23),
443 | )
444 |
445 | with tf.Session(config=config) as sess:
446 | print("Loading model...")
447 | _, model_test = create_model(sess, len(vocab), len(tag_vocab), len(label_vocab), len(LM_vocab))
448 |     print ("Loaded model with source_vocab_size=%d, target_vocab_size=%d, label_vocab_size=%d, and lm_vocab_size=%d." % (len(vocab), len(tag_vocab), len(label_vocab), len(LM_vocab)))
449 |
450 | def run_eval(data_set, decode_output_file):
451 | with open(lm_test_output_file, 'wb') as f_lm:
452 | with open(intent_test_output_file, 'wb') as f_intent:
453 | with open(tagging_test_output_file, 'wb') as f_tagging:
454 | eval_loss = 0.0
455 | bucket_id = 0
456 | count = 0
457 | total_word_count = 0
458 | for i in xrange(len(data_set[bucket_id])):
459 | count += 1
460 | if count % 1000 == 0:
461 | print ("Decoding utterance No. %d..." % count)
462 | eval_encoder_inputs, eval_encoder_inputs_shiftByOne, eval_tags, eval_tag_weights, eval_intent_weights, eval_lm_weights, eval_sequence_length, eval_labels = model_test.get_one(
463 | data_set, bucket_id, i)
464 | eval_intent_weights = eval_tag_weights
465 |
466 | tagging_logits = []
467 | classification_logits = []
468 |
469 | _, step_loss, tagging_logits, classification_logits = model_test.joint_step(sess, eval_encoder_inputs, eval_encoder_inputs_shiftByOne, eval_lm_weights, eval_tags, eval_tag_weights, eval_labels, eval_intent_weights,
470 |                                     eval_sequence_length, bucket_id, True)
471 |
472 | f_lm.write('%.2f\n' % (-step_loss*(eval_sequence_length[0]-1)))
473 | f_lm.flush()
474 | eval_loss += step_loss*(eval_sequence_length[0])
475 | total_word_count += eval_sequence_length[0]
476 |
477 | hyp_label = None
478 | # intent results
479 | hyp_label = np.argmax(classification_logits[0],0)
480 | f_intent.write('%s\n' % rev_label_vocab[hyp_label])
481 | f_intent.flush()
482 | # tagging results
483 |                       f_tagging.write('%s\n' % ' '.join([rev_tag_vocab[np.argmax(x)] for x in tagging_logits[1:eval_sequence_length[0]]]))
484 | f_tagging.flush()
485 |
486 | eval_perplexity = math.exp(float(eval_loss)/total_word_count)
487 | return eval_perplexity
488 |
489 | valid_perplexity = run_eval(test_set, FLAGS.test_output_file)
490 | print(" Eval perplexity: %.2f" % valid_perplexity)
491 | sys.stdout.flush()
492 |
493 |
494 | def main(_):
495 | if FLAGS.decode_file:
496 | decode_file(FLAGS.test_file)
497 | else:
498 | train()
499 |
500 | if __name__ == "__main__":
501 | tf.app.run()
502 |
503 |
504 |
--------------------------------------------------------------------------------
/seq_labeling.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Mar 9 11:32:21 2016
4 |
5 | @author: Bing Liu (liubing@cmu.edu)
6 | """
7 |
8 | from __future__ import absolute_import
9 | from __future__ import division
10 | from __future__ import print_function
11 |
12 | # We disable pylint because we need python3 compatibility.
13 | from six.moves import zip # pylint: disable=redefined-builtin
14 |
15 | from tensorflow.python.framework import dtypes
16 | from tensorflow.python.framework import ops
17 | from tensorflow.python.ops import array_ops
18 | from tensorflow.python.ops import control_flow_ops
19 | from tensorflow.python.ops import math_ops
20 | from tensorflow.python.ops import nn_ops
21 | from tensorflow.python.ops import variable_scope
22 | from tensorflow.python.ops import init_ops
23 | from tensorflow.python.framework import tensor_shape
24 | import tensorflow as tf
25 |
26 | def _step(time, sequence_length, min_sequence_length, max_sequence_length, zero_logit, generate_logit):
27 | # Step 1: determine whether we need to call_cell or not
28 | empty_update = lambda: zero_logit
29 | logit = control_flow_ops.cond(
30 | time < max_sequence_length, generate_logit, empty_update)
31 |
32 | # Step 2: determine whether we need to copy through state and/or outputs
33 | existing_logit = lambda: logit
34 |
35 | def copy_through():
36 | # Use broadcasting select to determine which values should get
37 | # the previous state & zero output, and which values should get
38 | # a calculated state & output.
39 | copy_cond = (time >= sequence_length)
40 | return math_ops.select(copy_cond, zero_logit, logit)
41 |
42 | logit = control_flow_ops.cond(
43 | time < min_sequence_length, existing_logit, copy_through)
44 | logit.set_shape(logit.get_shape())
45 | return logit
46 |
47 | def _reverse_seq(input_seq, lengths):
48 | """Reverse a list of Tensors up to specified lengths.
49 |
50 | Args:
51 | input_seq: Sequence of seq_len tensors of dimension (batch_size, depth)
52 | lengths: A tensor of dimension batch_size, containing lengths for each
53 | sequence in the batch. If "None" is specified, simply reverses
54 | the list.
55 |
56 | Returns:
57 | time-reversed sequence
58 | """
59 | if lengths is None:
60 | return list(reversed(input_seq))
61 |
62 | input_shape = tensor_shape.matrix(None, None)
63 | for input_ in input_seq:
64 | input_shape.merge_with(input_.get_shape())
65 | input_.set_shape(input_shape)
66 |
67 | # Join into (time, batch_size, depth)
68 | s_joined = array_ops.pack(input_seq)
69 |
70 | # TODO(schuster, ebrevdo): Remove cast when reverse_sequence takes int32
71 | if lengths is not None:
72 | lengths = math_ops.to_int64(lengths)
73 |
74 | # Reverse along dimension 0
75 | s_reversed = array_ops.reverse_sequence(s_joined, lengths, 0, 1)
76 | # Split again into list
77 | result = array_ops.unpack(s_reversed)
78 | for r in result:
79 | r.set_shape(input_shape)
80 | return result
81 |
82 |
83 | def linear_transformation(_X, input_size, n_class):
84 | with variable_scope.variable_scope("linear"):
85 | bias_start = 0.0
86 | weight_out = variable_scope.get_variable("Weight_out", [input_size, n_class])
87 | bias_out = variable_scope.get_variable("Bias_out", [n_class],
88 | initializer=init_ops.constant_initializer(bias_start))
89 |
90 | output = tf.matmul(_X, weight_out) + bias_out
91 | #regularizers = tf.nn.l2_loss(weight_hidden) + tf.nn.l2_loss(bias_hidden) + tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out)
92 | return output
93 |
94 | def get_linear_transformation_regularizers():
95 | with variable_scope.variable_scope("linear"):
96 | weight_out = variable_scope.get_variable("Weight_out")
97 | bias_out = variable_scope.get_variable("Bias_out")
98 | regularizers = tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out)
99 | return regularizers
100 |
101 |
102 | def multilayer_perceptron(_X, input_size, n_hidden, n_class, forward_only=False):
103 | with variable_scope.variable_scope("DNN"):
104 | bias_start = 0.0
105 | weight_hidden = variable_scope.get_variable("Weight_Hidden", [input_size, n_hidden])
106 | bias_hidden = variable_scope.get_variable("Bias_Hidden", [n_hidden],
107 | initializer=init_ops.constant_initializer(bias_start))
108 | #Hidden layer with RELU activation
109 | layer_1 = tf.nn.relu(tf.add(tf.matmul(_X, weight_hidden), bias_hidden))
110 |
111 | if not forward_only:
112 | layer_1 = tf.nn.dropout(layer_1, 0.5)
113 |
114 | weight_out = variable_scope.get_variable("Weight_Out", [n_hidden, n_class])
115 | bias_out = variable_scope.get_variable("Bias_Out", [n_class],
116 | initializer=init_ops.constant_initializer(bias_start))
117 | output = tf.matmul(layer_1, weight_out) + bias_out
118 | #regularizers = tf.nn.l2_loss(weight_hidden) + tf.nn.l2_loss(bias_hidden) + tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out)
119 | return output
120 |
121 | def get_multilayer_perceptron_regularizers():
122 | with variable_scope.variable_scope("DNN"):
123 | weight_hidden = variable_scope.get_variable("Weight_Hidden")
124 | bias_hidden = variable_scope.get_variable("Bias_Hidden")
125 |
126 | weight_out = variable_scope.get_variable("Weight_Out")
127 | bias_out = variable_scope.get_variable("Bias_Out")
128 | regularizers = tf.nn.l2_loss(weight_hidden) + tf.nn.l2_loss(bias_hidden) + tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out)
129 | return regularizers
130 |
131 | def generate_sequence_output(encoder_outputs,
132 | encoder_state,
133 | num_decoder_symbols,
134 | sequence_length,
135 | num_heads=1,
136 | dtype=dtypes.float32,
137 | use_attention=True,
138 | loop_function=None,
139 | scope=None,
140 | DNN_at_output=False,
141 | forward_only=False):
142 | with variable_scope.variable_scope(scope or "non-attention_RNN"):
143 | attention_encoder_outputs = list()
144 | sequence_attention_weights = list()
145 |
146 | # copy over logits once out of sequence_length
147 | if encoder_outputs[0].get_shape().ndims != 1:
148 | (fixed_batch_size, output_size) = encoder_outputs[0].get_shape().with_rank(2)
149 | else:
150 | fixed_batch_size = encoder_outputs[0].get_shape().with_rank_at_least(1)[0]
151 |
152 | if fixed_batch_size.value:
153 | batch_size = fixed_batch_size.value
154 | else:
155 | batch_size = array_ops.shape(encoder_outputs[0])[0]
156 | if sequence_length is not None:
157 | sequence_length = math_ops.to_int32(sequence_length)
158 | if sequence_length is not None: # Prepare variables
159 | zero_logit = array_ops.zeros(
160 | array_ops.pack([batch_size, num_decoder_symbols]), encoder_outputs[0].dtype)
161 | zero_logit.set_shape(
162 | tensor_shape.TensorShape([fixed_batch_size.value, num_decoder_symbols]))
163 | min_sequence_length = math_ops.reduce_min(sequence_length)
164 | max_sequence_length = math_ops.reduce_max(sequence_length)
165 |
166 | for time, input_ in enumerate(encoder_outputs):
167 | if time > 0: variable_scope.get_variable_scope().reuse_variables()
168 |
169 | if not DNN_at_output:
170 | generate_logit = lambda: linear_transformation(encoder_outputs[time], output_size, num_decoder_symbols)
171 | else:
172 | generate_logit = lambda: multilayer_perceptron(encoder_outputs[time], output_size, 200, num_decoder_symbols, forward_only=forward_only)
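      # The same output projection (a linear layer, or a one-hidden-layer DNN with 200
      # ReLU units when DNN_at_output is set) is applied at every time step;
      # reuse_variables() above shares its weights across steps.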
173 | # pylint: enable=cell-var-from-loop
174 | if sequence_length is not None:
175 | logit = _step(
176 | time, sequence_length, min_sequence_length, max_sequence_length, zero_logit, generate_logit)
177 | else:
178 |           logit = generate_logit()
179 | attention_encoder_outputs.append(logit)
180 | if DNN_at_output:
181 | regularizers = get_multilayer_perceptron_regularizers()
182 | else:
183 | regularizers = get_linear_transformation_regularizers()
184 | return attention_encoder_outputs, sequence_attention_weights, regularizers
185 |
186 |
187 | def sequence_loss_by_example(logits, targets, weights,
188 | average_across_timesteps=True,
189 | softmax_loss_function=None, name=None):
190 | """Weighted cross-entropy loss for a sequence of logits (per example).
191 |
192 | Args:
193 | logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].
194 | targets: List of 1D batch-sized int32 Tensors of the same length as logits.
195 | weights: List of 1D batch-sized float-Tensors of the same length as logits.
196 | average_across_timesteps: If set, divide the returned cost by the total
197 | label weight.
198 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch
199 | to be used instead of the standard softmax (the default if this is None).
200 | name: Optional name for this operation, default: "sequence_loss_by_example".
201 |
202 | Returns:
203 | 1D batch-sized float Tensor: The log-perplexity for each sequence.
204 |
205 | Raises:
206 | ValueError: If len(logits) is different from len(targets) or len(weights).
207 | """
208 | if len(targets) != len(logits) or len(weights) != len(logits):
209 | raise ValueError("Lengths of logits, weights, and targets must be the same "
210 | "%d, %d, %d." % (len(logits), len(weights), len(targets)))
211 | with ops.op_scope(logits + targets + weights, name,
212 | "sequence_loss_by_example"):
213 | log_perp_list = []
214 | for logit, target, weight in zip(logits, targets, weights):
215 | if softmax_loss_function is None:
216 | # TODO(irving,ebrevdo): This reshape is needed because
217 | # sequence_loss_by_example is called with scalars sometimes, which
218 | # violates our general scalar strictness policy.
219 | target = array_ops.reshape(target, [-1])
220 | crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
221 | logit, target)
222 | else:
223 | crossent = softmax_loss_function(logit, target)
224 | log_perp_list.append(crossent * weight)
225 | log_perps = math_ops.add_n(log_perp_list)
226 | if average_across_timesteps:
227 | total_size = math_ops.add_n(weights)
228 | total_size += 1e-12 # Just to avoid division by 0 for all-0 weights.
229 | log_perps /= total_size
230 | return log_perps
231 |
232 |
233 | def sequence_loss(logits, targets, weights,
234 | average_across_timesteps=True, average_across_batch=True,
235 | softmax_loss_function=None, name=None):
236 | """Weighted cross-entropy loss for a sequence of logits, batch-collapsed.
237 |
238 | Args:
239 | logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].
240 | targets: List of 1D batch-sized int32 Tensors of the same length as logits.
241 | weights: List of 1D batch-sized float-Tensors of the same length as logits.
242 | average_across_timesteps: If set, divide the returned cost by the total
243 | label weight.
244 | average_across_batch: If set, divide the returned cost by the batch size.
245 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch
246 | to be used instead of the standard softmax (the default if this is None).
247 | name: Optional name for this operation, defaults to "sequence_loss".
248 |
249 | Returns:
250 | A scalar float Tensor: The average log-perplexity per symbol (weighted).
251 |
252 | Raises:
253 | ValueError: If len(logits) is different from len(targets) or len(weights).
254 | """
255 | with ops.op_scope(logits + targets + weights, name, "sequence_loss"):
256 | cost = math_ops.reduce_sum(sequence_loss_by_example(
257 | logits, targets, weights,
258 | average_across_timesteps=average_across_timesteps,
259 | softmax_loss_function=softmax_loss_function))
260 | if average_across_batch:
261 | batch_size = array_ops.shape(targets[0])[0]
262 | return cost / math_ops.cast(batch_size, dtypes.float32)
263 | else:
264 | return cost
265 |
266 |
267 | def generate_task_output(encoder_outputs, additional_inputs, encoder_state, targets, sequence_length, num_decoder_symbols, weights,
268 | buckets, softmax_loss_function=None,
269 | per_example_loss=False, name=None, use_attention=False, scope=None, DNN_at_output=False,
270 | intent_results=None,
271 | tagging_results=None,
272 | train_with_true_label=True,
273 | use_local_context=False,
274 | forward_only=False):
275 | if len(targets) < buckets[-1][1]:
276 |     raise ValueError("Length of targets (%d) must be at least that of last "
277 | "bucket (%d)." % (len(targets), buckets[-1][1]))
278 |
279 | all_inputs = encoder_outputs + targets + weights
280 | with ops.op_scope(all_inputs, name, "model_with_buckets"):
281 | if scope == 'intent':
282 | logits, regularizers, sampled_intents = intent_results
283 | sampled_tags = list()
284 | elif scope == 'tagging':
285 | logits, regularizers, sampled_tags = tagging_results
286 | sampled_intents = list()
287 | elif scope == 'lm':
288 | with variable_scope.variable_scope(scope + "_generate_sequence_output", reuse=None):
289 | task_inputs = []
290 | if use_local_context:
291 | print ('lm task: use sampled_tag_intent_emb as local context')
292 | task_inputs = [array_ops.concat(1, [additional_input, encoder_output]) for additional_input, encoder_output in zip(additional_inputs, encoder_outputs)]
293 | else:
294 | task_inputs = encoder_outputs
295 |
296 | logits, _, regularizers = generate_sequence_output(task_inputs,
297 | encoder_state,
298 | num_decoder_symbols,
299 | sequence_length,
300 | use_attention=use_attention,
301 | DNN_at_output=DNN_at_output,
302 | forward_only=forward_only)
303 |
304 | sampled_tags = list()
305 | sampled_intents = list()
306 |
307 |     if per_example_loss:
308 |       assert len(logits) == len(targets)
309 |       # We need to make each target an int64 tensor and set its shape.
310 | bucket_target = [array_ops.reshape(math_ops.to_int64(x), [-1]) for x in targets]
311 | crossent = sequence_loss_by_example(
312 | logits, bucket_target, weights,
313 | softmax_loss_function=softmax_loss_function)
314 | else:
315 | assert len(logits) == len(targets)
316 | bucket_target = [array_ops.reshape(math_ops.to_int64(x), [-1]) for x in targets]
317 | crossent = sequence_loss(
318 | logits, bucket_target, weights,
319 | softmax_loss_function=softmax_loss_function)
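    # Add a small (1e-4) L2 penalty on the output-layer weights; both the regularized
    # and the plain cross-entropy losses are returned (the latter is what callers
    # report, e.g. as LM perplexity).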
320 | crossent_with_regularizers = crossent + 1e-4 * regularizers
321 |
322 | return logits, sampled_tags, sampled_intents, crossent_with_regularizers, crossent
323 |
--------------------------------------------------------------------------------