├── README.md
├── conlleval.pl
├── data
│   └── ATIS_samples
│       ├── test
│       │   ├── test.label
│       │   ├── test.seq.in
│       │   └── test.seq.out
│       ├── train
│       │   ├── train.label
│       │   ├── train.seq.in
│       │   └── train.seq.out
│       └── valid
│           ├── valid.label
│           ├── valid.seq.in
│           └── valid.seq.out
├── data_utils.py
├── generate_encoder_output.py
├── multi_task_model.py
├── run_rnn_joint.py
└── seq_labeling.py

/README.md:
--------------------------------------------------------------------------------
1 | Joint Online Spoken Language Understanding and Language Modeling with Recurrent Neural Networks
2 | ==================
3 | 
4 | TensorFlow implementation of joint modeling of SLU (intent & slot filling) and LM with RNNs.
5 | 
6 | This package implements joint models 14 and 15 in Table 1 of the reference paper [1] below, i.e. the joint models with *recurrent* (& *local*) intent + slot label context. The other joint models in the paper can be obtained with simple modifications to generate_encoder_output.py and seq_labeling.py.
7 | 
8 | **Setup**
9 | 
10 | * TensorFlow, version >= r0.9 (https://www.tensorflow.org/versions/r0.9/get_started/index.html)
11 | 
12 | **Usage**:
13 | ```bash
14 | data_dir=data/ATIS_samples
15 | model_dir=model_tmp
16 | max_sequence_length=50  # max length for train/valid/test sequences
17 | use_local_context=False  # boolean, whether to use local context
18 | DNN_at_output=True  # boolean, set to True to use a one-hidden-layer DNN at the task outputs
19 | 
20 | python run_rnn_joint.py --data_dir $data_dir \
21 |     --train_dir $model_dir \
22 |     --max_sequence_length $max_sequence_length \
23 |     --use_local_context $use_local_context \
24 |     --DNN_at_output $DNN_at_output
25 | ```
26 | 
27 | **Reference**
28 | 
29 | [1] Bing Liu, Ian Lane, "Joint Online Spoken Language Understanding and Language Modeling with Recurrent Neural Networks", SIGdial, 2016 (PDF)
30 | 
31 | ```
32 | @InProceedings{liu-lane:2016:SIGDIAL,
33 |   author    = {Liu, Bing and Lane, Ian},
34 |   title     = {Joint Online Spoken Language Understanding and Language Modeling With Recurrent Neural Networks},
35 |   booktitle = {Proceedings of the 17th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
36 |   month     = {September},
37 |   year      = {2016},
38 |   address   = {Los Angeles},
39 |   publisher = {Association for Computational Linguistics},
40 |   pages     = {22--30},
41 |   url       = {http://www.aclweb.org/anthology/W16-3603}
42 | }
43 | ```
44 | 
45 | **Contact**
46 | 
47 | Feel free to email liubing@cmu.edu with any questions or bug reports regarding the code.
48 | 
--------------------------------------------------------------------------------
/conlleval.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | # conlleval: evaluate result of processing CoNLL-2000 shared task
3 | # usage:     conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file
4 | # README:    http://cnts.uia.ac.be/conll2000/chunking/output.html
5 | # options:   l: generate LaTeX output for tables like in
6 | #               http://cnts.uia.ac.be/conll2003/ner/example.tex
7 | #            r: accept raw result tags (without B- and I- prefix;
8 | #               assumes one word per chunk)
9 | #            d: alternative delimiter tag (default is single space)
10 | #            o: alternative outside tag (default is O)
11 | # note:      the file should contain lines with items separated
12 | #            by $delimiter characters (default space). The final
13 | #            two items should contain the correct tag and the
14 | #            guessed tag in that order.
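#            (An illustrative input line for this repo's ATIS slot filling
#             output, with the true tag followed by the predicted tag as the
#             last two items:  "boston B-fromloc.city_name B-fromloc.city_name")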
Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while () { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | 86 | #printf $line; 87 | #printf STDERR $#features; 88 | #printf "\n"; 89 | 90 | #printf $nbrOfFeatures; 91 | #printf "\n"; 92 | #printf $#features; 93 | #printf "\n"; 94 | 95 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 96 | elsif ($nbrOfFeatures != $#features and @features != 0) { 97 | printf STDERR "unexpected number of features: %d (%d)\n", 98 | $#features+1,$nbrOfFeatures+1; 99 | exit(1); 100 | } 101 | if (@features == 0 or 102 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 103 | if (@features < 2) { 104 | die "conlleval: unexpected number of features in line 
$line\n"; 105 | } 106 | if ($raw) { 107 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 108 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 109 | if ($features[$#features] ne "O") { 110 | $features[$#features] = "B-$features[$#features]"; 111 | } 112 | if ($features[$#features-1] ne "O") { 113 | $features[$#features-1] = "B-$features[$#features-1]"; 114 | } 115 | } 116 | # 20040126 ET code which allows hyphens in the types 117 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 118 | $guessed = $1; 119 | $guessedType = $2; 120 | } else { 121 | $guessed = $features[$#features]; 122 | $guessedType = ""; 123 | } 124 | pop(@features); 125 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 126 | $correct = $1; 127 | $correctType = $2; 128 | } else { 129 | $correct = $features[$#features]; 130 | $correctType = ""; 131 | } 132 | pop(@features); 133 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 134 | # ($correct,$correctType) = split(/-/,pop(@features)); 135 | $guessedType = $guessedType ? $guessedType : ""; 136 | $correctType = $correctType ? $correctType : ""; 137 | $firstItem = shift(@features); 138 | 139 | # 1999-06-26 sentence breaks should always be counted as out of chunk 140 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 141 | 142 | if ($inCorrect) { 143 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 144 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 145 | $lastGuessedType eq $lastCorrectType) { 146 | $inCorrect=$false; 147 | $correctChunk++; 148 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 149 | $correctChunk{$lastCorrectType}+1 : 1; 150 | } elsif ( 151 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 152 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 153 | $guessedType ne $correctType ) { 154 | $inCorrect=$false; 155 | } 156 | } 157 | 158 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 159 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 160 | $guessedType eq $correctType) { $inCorrect = $true; } 161 | 162 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 163 | $foundCorrect++; 164 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 165 | $foundCorrect{$correctType}+1 : 1; 166 | } 167 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 168 | $foundGuessed++; 169 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 170 | $foundGuessed{$guessedType}+1 : 1; 171 | } 172 | if ( $firstItem ne $boundary ) { 173 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 174 | $correctTags++; 175 | } 176 | $tokenCounter++; 177 | } 178 | 179 | $lastGuessed = $guessed; 180 | $lastCorrect = $correct; 181 | $lastGuessedType = $guessedType; 182 | $lastCorrectType = $correctType; 183 | } 184 | if ($inCorrect) { 185 | $correctChunk++; 186 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 
187 | $correctChunk{$lastCorrectType}+1 : 1; 188 | } 189 | 190 | if (not $latex) { 191 | # compute overall precision, recall and FB1 (default values are 0.0) 192 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 193 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 194 | $FB1 = 2*$precision*$recall/($precision+$recall) 195 | if ($precision+$recall > 0); 196 | 197 | # print overall performance 198 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 199 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 200 | if ($tokenCounter>0) { 201 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 202 | print "$correctChunk $foundCorrect $foundGuessed "; 203 | printf "precision: %6.2f%%; ",$precision; 204 | printf "recall: %6.2f%%; ",$recall; 205 | printf "FB1: %6.2f\n",$FB1; 206 | } 207 | } 208 | 209 | # sort chunk type names 210 | undef($lastType); 211 | @sortedTypes = (); 212 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 213 | if (not($lastType) or $lastType ne $i) { 214 | push(@sortedTypes,($i)); 215 | } 216 | $lastType = $i; 217 | } 218 | # print performance per chunk type 219 | if (not $latex) { 220 | for $i (@sortedTypes) { 221 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 222 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 223 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 224 | if (not($foundCorrect{$i})) { $recall = 0.0; } 225 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 226 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 227 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 228 | printf "%17s: ",$i; 229 | printf "% 4d % 4d % 4d ", $correctChunk{$i}, $foundCorrect{$i}, $foundGuessed{$i}; 230 | printf "precision: %6.2f%%; ",$precision; 231 | printf "recall: %6.2f%%; ",$recall; 232 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 233 | } 234 | } else { 235 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 236 | for $i (@sortedTypes) { 237 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 238 | if (not($foundGuessed{$i})) { $precision = 0.0; } 239 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 240 | if (not($foundCorrect{$i})) { $recall = 0.0; } 241 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 242 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 243 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 244 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 245 | $i,$precision,$recall,$FB1; 246 | } 247 | print "\\hline\n"; 248 | $precision = 0.0; 249 | $recall = 0; 250 | $FB1 = 0.0; 251 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 252 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 253 | $FB1 = 2*$precision*$recall/($precision+$recall) 254 | if ($precision+$recall > 0); 255 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 256 | $precision,$recall,$FB1; 257 | } 258 | 259 | exit 0; 260 | 261 | # endOfChunk: checks if a chunk ended between the previous and current word 262 | # arguments: previous and current chunk tags, previous and current types 263 | # note: this code is capable of handling other chunk representations 264 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 265 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 266 | 267 | sub endOfChunk { 268 | my $prevTag = shift(@_); 269 | my $tag = shift(@_); 270 | my $prevType = shift(@_); 271 | my $type = shift(@_); 272 | my $chunkEnd = $false; 273 | 274 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 275 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 276 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 277 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 278 | 279 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 280 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 281 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 282 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 283 | 284 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 285 | $chunkEnd = $true; 286 | } 287 | 288 | # corrected 1998-12-22: these chunks are assumed to have length 1 289 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 290 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 291 | 292 | return($chunkEnd); 293 | } 294 | 295 | # startOfChunk: checks if a chunk started between the previous and current word 296 | # arguments: previous and current chunk tags, previous and current types 297 | # note: this code is capable of handling other chunk representations 298 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 299 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 300 | 301 | sub startOfChunk { 302 | my $prevTag = shift(@_); 303 | my $tag = shift(@_); 304 | my $prevType = shift(@_); 305 | my $type = shift(@_); 306 | my $chunkStart = $false; 307 | 308 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 309 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 310 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 311 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 312 | 313 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 314 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 315 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 316 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 317 | 318 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 319 | $chunkStart = $true; 320 | } 321 | 322 | # corrected 1998-12-22: these chunks are assumed to have length 1 323 | if ( $tag eq "[" ) { $chunkStart = $true; } 324 | if ( $tag eq "]" ) { $chunkStart = $true; } 325 | 326 | return($chunkStart); 327 | } 328 | -------------------------------------------------------------------------------- /data/ATIS_samples/test/test.label: -------------------------------------------------------------------------------- 1 | flight 2 | airfare 3 | flight 4 | flight 5 | flight 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/test/test.seq.in: -------------------------------------------------------------------------------- 1 | i would like to find a flight from charlotte to las vegas that makes a stop in st. 
louis 2 | on april first i need a ticket from tacoma to san jose departing before DIGIT am 3 | on april first i need a flight going from phoenix to san diego 4 | i would like a flight traveling one way from phoenix to san diego on april first 5 | i would like a flight from orlando to salt lake city for april first on delta airlines 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/test/test.seq.out: -------------------------------------------------------------------------------- 1 | O O O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O O O O O B-stoploc.city_name I-stoploc.city_name 2 | O B-depart_date.month_name B-depart_date.day_number O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O B-depart_time.time_relative B-depart_time.time I-depart_time.time 3 | O B-depart_date.month_name B-depart_date.day_number O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name 4 | O O O O O O B-round_trip I-round_trip O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O B-depart_date.month_name B-depart_date.day_number 5 | O O O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name I-toloc.city_name O B-depart_date.month_name B-depart_date.day_number O B-airline_name I-airline_name 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/train/train.label: -------------------------------------------------------------------------------- 1 | airfare 2 | flight 3 | flight 4 | airfare 5 | flight 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/train/train.seq.in: -------------------------------------------------------------------------------- 1 | what's the lowest round trip fare from dallas to atlanta 2 | find me the earliest flight from boston to atlanta on any day of the week 3 | display all flights from boston to baltimore on july thirty first 4 | economy fares new york to miami round trip 5 | i need to fly from boston to denver on to san francisco and back 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/train/train.seq.out: -------------------------------------------------------------------------------- 1 | O O B-cost_relative B-round_trip I-round_trip O O B-fromloc.city_name O B-toloc.city_name 2 | O O O B-flight_mod O O B-fromloc.city_name O B-toloc.city_name O O O O O O 3 | O O O O B-fromloc.city_name O B-toloc.city_name O B-depart_date.month_name B-depart_date.day_number I-depart_date.day_number 4 | B-economy O B-fromloc.city_name I-fromloc.city_name O B-toloc.city_name B-round_trip I-round_trip 5 | O O O O O B-fromloc.city_name O B-toloc.city_name O O B-toloc.city_name I-toloc.city_name O O 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/valid/valid.label: -------------------------------------------------------------------------------- 1 | flight 2 | flight 3 | flight_time 4 | airfare 5 | airfare 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/valid/valid.seq.in: -------------------------------------------------------------------------------- 1 | i want to fly from boston at DIGITDIGITDIGIT am and arrive in denver at DIGITDIGITDIGITDIGIT in the morning 2 | what flights are available from pittsburgh to baltimore on thursday morning 3 | what is the arrival time in san francisco for the DIGITDIGITDIGIT am 
flight leaving washington 4 | cheapest airfare from tacoma to orlando 5 | round trip fares from pittsburgh to philadelphia under DIGITDIGITDIGITDIGIT dollars 6 | -------------------------------------------------------------------------------- /data/ATIS_samples/valid/valid.seq.out: -------------------------------------------------------------------------------- 1 | O O O O O B-fromloc.city_name O B-depart_time.time I-depart_time.time O O O B-toloc.city_name O B-arrive_time.time O O B-arrive_time.period_of_day 2 | O O O O O B-fromloc.city_name O B-toloc.city_name O B-depart_date.day_name B-depart_time.period_of_day 3 | O O O B-flight_time I-flight_time O B-fromloc.city_name I-fromloc.city_name O O B-depart_time.time I-depart_time.time O O B-fromloc.city_name 4 | B-cost_relative O O B-fromloc.city_name O B-toloc.city_name 5 | B-round_trip I-round_trip O O B-fromloc.city_name O B-toloc.city_name B-cost_relative B-fare_amount I-fare_amount 6 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Utilities for downloading data from WMT, tokenizing, vocabularies.""" 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import gzip 22 | import os 23 | import re 24 | import tarfile 25 | 26 | from six.moves import urllib 27 | 28 | from tensorflow.python.platform import gfile 29 | 30 | # Special vocabulary symbols - we always put them at the start. 31 | _PAD = "_PAD" 32 | _UNK = "_UNK" 33 | _BOS = "_BOS" 34 | _EOS = "_EOS" 35 | _START_VOCAB = [_PAD, _UNK] 36 | 37 | START_VOCAB_dict = dict() 38 | START_VOCAB_dict['with_padding'] = [_PAD, _UNK] 39 | START_VOCAB_dict['no_padding'] = [_UNK] 40 | START_VOCAB_dict['lm'] = [_PAD, _UNK, _BOS] 41 | 42 | 43 | PAD_ID = 0 44 | BOS_ID = 2 45 | #EOS_ID = 3 46 | 47 | UNK_ID_dict = dict() 48 | UNK_ID_dict['with_padding'] = 1 49 | UNK_ID_dict['no_padding'] = 0 50 | # UNK_ID = 1 51 | 52 | # Regular expressions used to tokenize. 53 | _WORD_SPLIT = re.compile("([.,!?\"':;)(])") 54 | _DIGIT_RE = re.compile(r"\d") 55 | 56 | # URLs for WMT data. 
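# (These URLs are carried over from the TensorFlow translation tutorial this file
#  is adapted from; the ATIS pipeline in this repo does not appear to download them.)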
57 | _WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/training-giga-fren.tar" 58 | _WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/dev-v2.tgz" 59 | 60 | def basic_tokenizer(sentence): 61 | """Very basic tokenizer: split the sentence into a list of tokens.""" 62 | words = [] 63 | for space_separated_fragment in sentence.strip().split(): 64 | words.extend(re.split(_WORD_SPLIT, space_separated_fragment)) 65 | return [w for w in words if w] 66 | 67 | def naive_tokenizer(sentence): 68 | """Naive tokenizer: split the sentence by space into a list of tokens.""" 69 | return sentence.split() 70 | 71 | 72 | def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, 73 | tokenizer=None, normalize_digits=True): 74 | """Create vocabulary file (if it does not exist yet) from data file. 75 | 76 | Data file is assumed to contain one sentence per line. Each sentence is 77 | tokenized and digits are normalized (if normalize_digits is set). 78 | Vocabulary contains the most-frequent tokens up to max_vocabulary_size. 79 | We write it to vocabulary_path in a one-token-per-line format, so that later 80 | token in the first line gets id=0, second line gets id=1, and so on. 81 | 82 | Args: 83 | vocabulary_path: path where the vocabulary will be created. 84 | data_path: data file that will be used to create vocabulary. 85 | max_vocabulary_size: limit on the size of the created vocabulary. 86 | tokenizer: a function to use to tokenize each data sentence; 87 | if None, basic_tokenizer will be used. 88 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 89 | """ 90 | if not gfile.Exists(vocabulary_path): 91 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) 92 | vocab = {} 93 | with gfile.GFile(data_path, mode="r") as f: 94 | counter = 0 95 | for line in f: 96 | counter += 1 97 | if counter % 100000 == 0: 98 | print(" processing line %d" % counter) 99 | tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) 100 | for w in tokens: 101 | word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w 102 | if word in vocab: 103 | vocab[word] += 1 104 | else: 105 | vocab[word] = 1 106 | vocab_list = START_VOCAB_dict['lm'] + sorted(vocab, key=vocab.get, reverse=True) 107 | if len(vocab_list) > max_vocabulary_size: 108 | vocab_list = vocab_list[:max_vocabulary_size] 109 | with gfile.GFile(vocabulary_path, mode="w") as vocab_file: 110 | for w in vocab_list: 111 | vocab_file.write(w + "\n") 112 | 113 | 114 | def initialize_vocabulary(vocabulary_path): 115 | """Initialize vocabulary from file. 116 | 117 | We assume the vocabulary is stored one-item-per-line, so a file: 118 | dog 119 | cat 120 | will result in a vocabulary {"dog": 0, "cat": 1}, and this function will 121 | also return the reversed-vocabulary ["dog", "cat"]. 122 | 123 | Args: 124 | vocabulary_path: path to the file containing the vocabulary. 125 | 126 | Returns: 127 | a pair: the vocabulary (a dictionary mapping string to integers), and 128 | the reversed vocabulary (a list, which reverses the vocabulary mapping). 129 | 130 | Raises: 131 | ValueError: if the provided vocabulary_path does not exist. 
132 | """ 133 | if gfile.Exists(vocabulary_path): 134 | rev_vocab = [] 135 | with gfile.GFile(vocabulary_path, mode="r") as f: 136 | rev_vocab.extend(f.readlines()) 137 | rev_vocab = [line.strip() for line in rev_vocab] 138 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 139 | return vocab, rev_vocab 140 | else: 141 | raise ValueError("Vocabulary file %s not found.", vocabulary_path) 142 | 143 | def sentence_to_token_ids(sentence, vocabulary, UNK_ID, 144 | tokenizer=None, normalize_digits=True): 145 | """Convert a string to list of integers representing token-ids. 146 | 147 | For example, a sentence "I have a dog" may become tokenized into 148 | ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2, 149 | "a": 4, "dog": 7"} this function will return [1, 2, 4, 7]. 150 | 151 | Args: 152 | sentence: a string, the sentence to convert to token-ids. 153 | vocabulary: a dictionary mapping tokens to integers. 154 | tokenizer: a function to use to tokenize each sentence; 155 | if None, basic_tokenizer will be used. 156 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 157 | 158 | Returns: 159 | a list of integers, the token-ids for the sentence. 160 | """ 161 | if tokenizer: 162 | words = tokenizer(sentence) 163 | else: 164 | words = basic_tokenizer(sentence) 165 | if not normalize_digits: 166 | return [vocabulary.get(w, UNK_ID) for w in words] 167 | # Normalize digits by 0 before looking words up in the vocabulary. 168 | return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words] 169 | 170 | 171 | def data_to_token_ids(data_path, target_path, vocabulary_path, 172 | tokenizer=None, normalize_digits=True, use_padding=True): 173 | """Tokenize data file and turn into token-ids using given vocabulary file. 174 | 175 | This function loads data line-by-line from data_path, calls the above 176 | sentence_to_token_ids, and saves the result to target_path. See comment 177 | for sentence_to_token_ids on the details of token-ids format. 178 | 179 | Args: 180 | data_path: path to the data file in one-sentence-per-line format. 181 | target_path: path where the file with token-ids will be created. 182 | vocabulary_path: path to the vocabulary file. 183 | tokenizer: a function to use to tokenize each sentence; 184 | if None, basic_tokenizer will be used. 185 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 
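    use_padding: Boolean; if true, unknown tokens are mapped to
      UNK_ID_dict['with_padding'] (id 1), otherwise to UNK_ID_dict['no_padding']
      (id 0); the intent label files are converted with use_padding=False.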
186 | """ 187 | if not gfile.Exists(target_path): 188 | print("Tokenizing data in %s" % data_path) 189 | vocab, _ = initialize_vocabulary(vocabulary_path) 190 | with gfile.GFile(data_path, mode="r") as data_file: 191 | with gfile.GFile(target_path, mode="w") as tokens_file: 192 | counter = 0 193 | for line in data_file: 194 | counter += 1 195 | if counter % 100000 == 0: 196 | print(" tokenizing line %d" % counter) 197 | if use_padding: 198 | UNK_ID = UNK_ID_dict['with_padding'] 199 | else: 200 | UNK_ID = UNK_ID_dict['no_padding'] 201 | token_ids = sentence_to_token_ids(line, vocab, UNK_ID, tokenizer, 202 | normalize_digits) 203 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n") 204 | 205 | 206 | def create_label_vocab(vocabulary_path, data_path): 207 | if not gfile.Exists(vocabulary_path): 208 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) 209 | vocab = {} 210 | with gfile.GFile(data_path, mode="r") as f: 211 | counter = 0 212 | for line in f: 213 | counter += 1 214 | if counter % 100000 == 0: 215 | print(" processing line %d" % counter) 216 | # tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) 217 | label = line.strip() 218 | vocab[label] = 1 219 | label_list = START_VOCAB_dict['no_padding'] + sorted(vocab) 220 | # label_list = sorted(vocab) 221 | with gfile.GFile(vocabulary_path, mode="w") as vocab_file: 222 | for k in label_list: 223 | vocab_file.write(k + "\n") 224 | 225 | def prepare_multi_task_data(data_dir, in_vocab_size, out_vocab_size): 226 | #current_dir = os.path.realpath(__file__) 227 | # data_dir = 'trec_qa_data' 228 | train_path = data_dir + '/train/train' 229 | dev_path = data_dir + '/valid/valid' 230 | test_path = data_dir + '/test/test' 231 | 232 | # Create vocabularies of the appropriate sizes. 233 | in_vocab_path = os.path.join(data_dir, "in_vocab_%d.txt" % in_vocab_size) 234 | out_vocab_path = os.path.join(data_dir, "out_vocab_%d.txt" % out_vocab_size) 235 | label_path = os.path.join(data_dir, "label.txt") 236 | 237 | create_vocabulary(in_vocab_path, train_path + ".seq.in", in_vocab_size, tokenizer=naive_tokenizer) 238 | create_vocabulary(out_vocab_path, train_path + ".seq.out", out_vocab_size, tokenizer=naive_tokenizer) 239 | create_label_vocab(label_path, train_path + ".label") 240 | 241 | # Create token ids for the training data. 242 | in_seq_train_ids_path = train_path + (".ids%d.seq.in" % in_vocab_size) 243 | out_seq_train_ids_path = train_path + (".ids%d.seq.out" % out_vocab_size) 244 | label_train_ids_path = train_path + (".ids.label") 245 | 246 | data_to_token_ids(train_path + ".seq.in", in_seq_train_ids_path, in_vocab_path, tokenizer=naive_tokenizer) 247 | data_to_token_ids(train_path + ".seq.out", out_seq_train_ids_path, out_vocab_path, tokenizer=naive_tokenizer) 248 | data_to_token_ids(train_path + ".label", label_train_ids_path, label_path, normalize_digits=False, use_padding=False) 249 | 250 | # Create token ids for the development data. 
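  # (with the sample data this yields valid.ids<in_vocab_size>.seq.in,
  #  valid.ids<out_vocab_size>.seq.out and valid.ids.label alongside the originals)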
251 | in_seq_dev_ids_path = dev_path + (".ids%d.seq.in" % in_vocab_size) 252 | out_seq_dev_ids_path = dev_path + (".ids%d.seq.out" % out_vocab_size) 253 | label_dev_ids_path = dev_path + (".ids.label") 254 | 255 | data_to_token_ids(dev_path + ".seq.in", in_seq_dev_ids_path, in_vocab_path, tokenizer=naive_tokenizer) 256 | data_to_token_ids(dev_path + ".seq.out", out_seq_dev_ids_path, out_vocab_path, tokenizer=naive_tokenizer) 257 | data_to_token_ids(dev_path + ".label", label_dev_ids_path, label_path, normalize_digits=False, use_padding=False) 258 | 259 | # Create token ids for the test data. 260 | in_seq_test_ids_path = test_path + (".ids%d.seq.in" % in_vocab_size) 261 | out_seq_test_ids_path = test_path + (".ids%d.seq.out" % out_vocab_size) 262 | label_test_ids_path = test_path + (".ids.label") 263 | 264 | data_to_token_ids(test_path + ".seq.in", in_seq_test_ids_path, in_vocab_path, tokenizer=naive_tokenizer) 265 | data_to_token_ids(test_path + ".seq.out", out_seq_test_ids_path, out_vocab_path, tokenizer=naive_tokenizer) 266 | data_to_token_ids(test_path + ".label", label_test_ids_path, label_path, normalize_digits=False, use_padding=False) 267 | 268 | return (in_seq_train_ids_path, out_seq_train_ids_path, label_train_ids_path, 269 | in_seq_dev_ids_path, out_seq_dev_ids_path, label_dev_ids_path, 270 | in_seq_test_ids_path, out_seq_test_ids_path, label_test_ids_path, 271 | in_vocab_path, out_vocab_path, label_path) -------------------------------------------------------------------------------- /generate_encoder_output.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Mar 9 11:32:21 2016 4 | 5 | @author: Bing Liu (liubing@cmu.edu) 6 | """ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | from tensorflow.python.framework import dtypes 13 | from tensorflow.python.ops import array_ops 14 | from tensorflow.python.ops import embedding_ops 15 | from tensorflow.python.ops import math_ops 16 | from tensorflow.python.ops import rnn_cell 17 | from tensorflow.python.ops import rnn 18 | from tensorflow.python.ops import variable_scope 19 | from tensorflow.python.ops import init_ops 20 | from tensorflow.python.framework import tensor_shape 21 | from tensorflow.python.ops import control_flow_ops 22 | import tensorflow as tf 23 | 24 | def get_multilayer_perceptron_regularizers(output_projection): 25 | weight_hidden, bias_hidden, weight_out, bias_out = output_projection 26 | regularizers = tf.nn.l2_loss(weight_hidden) + tf.nn.l2_loss(bias_hidden) + tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out) 27 | return regularizers 28 | 29 | def multilayer_perceptron_with_initialized_W(_X, output_projection, forward_only=False): 30 | #with variable_scope.variable_scope("intent_MLP"): 31 | weight_hidden, bias_hidden, weight_out, bias_out = output_projection 32 | #Hidden layer with RELU activation 33 | layer_1 = tf.nn.relu(tf.add(tf.matmul(_X, weight_hidden), bias_hidden)) 34 | 35 | if not forward_only: 36 | layer_1 = tf.nn.dropout(layer_1, 0.5) 37 | 38 | output = tf.matmul(layer_1, weight_out) + bias_out 39 | #regularizers = tf.nn.l2_loss(weight_hidden) + tf.nn.l2_loss(bias_hidden) + tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out) 40 | return output 41 | 42 | def get_linear_transformation_regularizers(output_projection): 43 | weight_out, bias_out = output_projection 44 | regularizers = tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out) 
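  # L2 penalty on the linear output projection; the caller adds it to the task loss
  # (cf. lm_loss_withRegularizers in multi_task_model.py)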
45 | return regularizers 46 | 47 | def linear_transformation_with_initialized_W(_X, output_projection, forward_only=False): 48 | #with variable_scope.variable_scope("intent_MLP"): 49 | weight_out, bias_out = output_projection 50 | output = tf.matmul(_X, weight_out) + bias_out 51 | return output 52 | 53 | def _extract_argmax_and_embed(embedding, DNN_at_output, output_projection, forward_only=False, update_embedding=True): 54 | def loop_function(prev, _): 55 | if DNN_at_output is True: 56 | prev = multilayer_perceptron_with_initialized_W(prev, output_projection, forward_only=forward_only) 57 | else: 58 | prev = linear_transformation_with_initialized_W(prev, output_projection, forward_only=forward_only) 59 | prev_symbol = math_ops.argmax(prev, 1) 60 | # Note that gradients will not propagate through the second parameter of 61 | # embedding_lookup. 62 | emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) 63 | return prev, emb_prev 64 | return loop_function 65 | 66 | def rnn_with_output_feedback(cell, inputs, 67 | targets1, targets1_num_symbols, target1_emb_size, target1_output_projection, 68 | targets2, targets2_num_symbols, target2_emb_size, target2_output_projection, 69 | word_emb_size, DNN_at_output, 70 | zero_intent_thres=0, 71 | sequence_length=None, dtype=None, 72 | train_with_true_label=True, use_predicted_output=False): 73 | ''' 74 | zero_intent_thres: int, the intent contribution to context remain zero before this thres, 75 | and linear increase to 1 after that. 76 | ''' 77 | if not isinstance(cell, rnn_cell.RNNCell): 78 | raise TypeError("cell must be an instance of RNNCell") 79 | if not isinstance(inputs, list): 80 | raise TypeError("inputs must be a list") 81 | if not isinstance(targets1, list): 82 | raise TypeError("targets1 must be a list") 83 | if not isinstance(targets2, list): 84 | raise TypeError("targets2 must be a list") 85 | if not inputs: 86 | raise ValueError("inputs must not be empty") 87 | if not dtype: 88 | raise ValueError("dtype must be provided, which is to used in defining intial RNN state") 89 | 90 | encoder_outputs = [] 91 | intent_embedding = variable_scope.get_variable("intent_embedding",[targets1_num_symbols, target1_emb_size]) 92 | tag_embedding = variable_scope.get_variable("tag_embedding",[targets2_num_symbols, target2_emb_size]) 93 | # use predicted label if use_predicted_output during inference, use true label during training 94 | # To choose to always use predicted label, disable the if condition 95 | intent_loop_function = _extract_argmax_and_embed(intent_embedding, DNN_at_output, target1_output_projection, forward_only=use_predicted_output) #if use_predicted_output else None 96 | tagging_loop_function = _extract_argmax_and_embed(tag_embedding, DNN_at_output, target2_output_projection, forward_only=use_predicted_output) 97 | intent_targets = [array_ops.reshape(math_ops.to_int64(x), [-1]) for x in targets1] 98 | intent_target_embeddings = list() 99 | intent_target_embeddings = [embedding_ops.embedding_lookup(intent_embedding, target) for target in intent_targets] 100 | tag_targets = [array_ops.reshape(math_ops.to_int64(x), [-1]) for x in targets2] 101 | tag_target_embeddings = list() 102 | tag_target_embeddings = [embedding_ops.embedding_lookup(tag_embedding, target) for target in tag_targets] 103 | 104 | if inputs[0].get_shape().ndims != 1: 105 | (fixed_batch_size, input_size) = inputs[0].get_shape().with_rank(2) 106 | if input_size.value is None: 107 | raise ValueError( 108 | "Input size (second dimension of inputs[0]) must be accessible 
via " 109 | "shape inference, but saw value None.") 110 | else: 111 | fixed_batch_size = inputs[0].get_shape().with_rank_at_least(1)[0] 112 | 113 | if fixed_batch_size.value: 114 | batch_size = fixed_batch_size.value 115 | else: 116 | batch_size = array_ops.shape(inputs[0])[0] 117 | 118 | state = cell.zero_state(batch_size, dtype) 119 | zero_output = array_ops.zeros( 120 | array_ops.pack([batch_size, cell.output_size]), inputs[0].dtype) 121 | zero_output.set_shape( 122 | tensor_shape.TensorShape([fixed_batch_size.value, cell.output_size])) 123 | 124 | if sequence_length is not None: # Prepare variables 125 | sequence_length = math_ops.to_int32(sequence_length) 126 | min_sequence_length = math_ops.reduce_min(sequence_length) 127 | max_sequence_length = math_ops.reduce_max(sequence_length) 128 | 129 | # prev_cell_output = zero_output 130 | zero_intent_embedding = array_ops.zeros( 131 | array_ops.pack([batch_size, target1_emb_size]), inputs[0].dtype) 132 | zero_intent_embedding.set_shape( 133 | tensor_shape.TensorShape([fixed_batch_size.value, target1_emb_size])) 134 | zero_tag_embedding = array_ops.zeros( 135 | array_ops.pack([batch_size, target2_emb_size]), inputs[0].dtype) 136 | zero_tag_embedding.set_shape( 137 | tensor_shape.TensorShape([fixed_batch_size.value, target2_emb_size])) 138 | 139 | encoder_outputs = list() 140 | intent_logits = list() 141 | tagging_logits = list() 142 | sampled_intent_embeddings = list() 143 | sampled_tag_embeddings = list() 144 | 145 | for time, input_ in enumerate(inputs): 146 | # Bing: introduce output label embeddings as addtional input 147 | # if feed_previous (during testing): 148 | # Use loop_function 149 | # if NOT feed_previous (during training): 150 | # Use true target embedding 151 | if time == 0: 152 | current_intent_embedding = zero_intent_embedding 153 | current_tag_embedding = zero_tag_embedding 154 | 155 | 156 | if time > 0: variable_scope.get_variable_scope().reuse_variables() 157 | 158 | # here we introduce a max(0, t-4)/sequence_length intent weight 159 | thres = zero_intent_thres 160 | if time <= thres: 161 | intent_contribution = math_ops.to_float(0) 162 | else: 163 | intent_contribution = tf.div(math_ops.to_float(time - thres), math_ops.to_float(sequence_length)) 164 | # intent_contribution = math_ops.to_float(1) 165 | 166 | x = rnn_cell._linear([tf.transpose(tf.transpose(current_intent_embedding)*intent_contribution), current_tag_embedding, input_], word_emb_size, True) 167 | call_cell = lambda: cell(x, state) 168 | 169 | 170 | # pylint: enable=cell-var-from-loop 171 | if sequence_length is not None: 172 | (output_fw, state) = rnn._rnn_step( 173 | time, sequence_length, min_sequence_length, max_sequence_length, 174 | zero_output, state, call_cell, cell.state_size) 175 | else: 176 | (output_fw, state) = call_cell() 177 | 178 | encoder_outputs.append(output_fw) 179 | 180 | if use_predicted_output: 181 | intent_logit, current_intent_embedding = intent_loop_function(output_fw, time) 182 | tagging_logit, current_tag_embedding = tagging_loop_function(output_fw, time) 183 | else: 184 | if train_with_true_label is True: 185 | intent_logit = multilayer_perceptron_with_initialized_W(output_fw, target1_output_projection, forward_only=use_predicted_output) 186 | tagging_logit = multilayer_perceptron_with_initialized_W(output_fw, target2_output_projection, forward_only=use_predicted_output) 187 | current_intent_embedding = intent_target_embeddings[time] 188 | current_tag_embedding = tag_target_embeddings[time] 189 | else: 190 | intent_logit, 
current_intent_embedding = intent_loop_function(output_fw, time) 191 | tagging_logit, current_tag_embedding = tagging_loop_function(output_fw, time) 192 | # prev_symbols.append(prev_symbol) 193 | if time == 0: 194 | current_intent_embedding = zero_intent_embedding 195 | current_tag_embedding = zero_tag_embedding 196 | sampled_intent_embeddings.append(current_intent_embedding) 197 | sampled_tag_embeddings.append(current_tag_embedding) 198 | 199 | intent_logits.append(intent_logit) 200 | tagging_logits.append(tagging_logit) 201 | 202 | return encoder_outputs, state, intent_logits, tagging_logits, sampled_intent_embeddings, sampled_tag_embeddings 203 | 204 | def generate_embedding_RNN_output_joint(encoder_inputs, cell, 205 | num_encoder_symbols, 206 | intent_targets, 207 | num_intent_target_symbols, 208 | tagging_targets, 209 | num_tagging_target_symbols, 210 | word_emb_size, 211 | dtype=dtypes.float32, 212 | scope=None, initial_state_attention=False, 213 | sequence_length=None, 214 | bidirectional_rnn=False, 215 | DNN_at_output=False, 216 | dnn_hidden_layer_size=200, 217 | output_emb_size=50, 218 | trainable=True, 219 | train_with_true_label=True, 220 | zero_intent_thres=0, 221 | forward_only=False): 222 | """ 223 | Generate RNN state outputs with word embeddings as inputs 224 | """ 225 | with variable_scope.variable_scope(scope or "generate_embedding_RNN_output_joint"): 226 | encoder_cell = cell 227 | embedding = variable_scope.get_variable("word_embedding", [num_encoder_symbols, word_emb_size], trainable=trainable) 228 | encoder_embedded_inputs = list() 229 | encoder_embedded_inputs = [embedding_ops.embedding_lookup(embedding, encoder_input) for encoder_input in encoder_inputs] 230 | 231 | if DNN_at_output is True: 232 | with variable_scope.variable_scope("intent_DNN"): 233 | bias_start = 0.0 234 | n_hidden = dnn_hidden_layer_size 235 | intent_target_weight_hidden = variable_scope.get_variable("Weight_Hidden", [encoder_cell.output_size, n_hidden]) 236 | intent_target_bias_hidden = variable_scope.get_variable("Bias_Hidden", [n_hidden], 237 | initializer=init_ops.constant_initializer(bias_start)) 238 | intent_target_weight_out = variable_scope.get_variable("Weight_Out", [n_hidden, num_intent_target_symbols]) 239 | intent_target_1bias_out = variable_scope.get_variable("Bias_Out", [num_intent_target_symbols], 240 | initializer=init_ops.constant_initializer(bias_start)) 241 | intent_target_output_projection = (intent_target_weight_hidden, intent_target_bias_hidden, intent_target_weight_out, intent_target_1bias_out) 242 | intent_target_regularizers = get_multilayer_perceptron_regularizers(intent_target_output_projection) 243 | with variable_scope.variable_scope("tag_DNN"): 244 | bias_start = 0.0 245 | n_hidden = dnn_hidden_layer_size 246 | tagging_target_weight_hidden = variable_scope.get_variable("Weight_Hidden", [encoder_cell.output_size, n_hidden]) 247 | tagging_target_bias_hidden = variable_scope.get_variable("Bias_Hidden", [n_hidden], 248 | initializer=init_ops.constant_initializer(bias_start)) 249 | tagging_target_weight_out = variable_scope.get_variable("Weight_Out", [n_hidden, num_tagging_target_symbols]) 250 | tagging_target_1bias_out = variable_scope.get_variable("Bias_Out", [num_tagging_target_symbols], 251 | initializer=init_ops.constant_initializer(bias_start)) 252 | tagging_target_output_projection = (tagging_target_weight_hidden, tagging_target_bias_hidden, tagging_target_weight_out, tagging_target_1bias_out) 253 | tagging_target_regularizers = 
get_multilayer_perceptron_regularizers(tagging_target_output_projection) 254 | else: 255 | with variable_scope.variable_scope("intent_linear"): 256 | bias_start = 0.0 257 | intent_target_weight_out = variable_scope.get_variable("Weight_Out", [encoder_cell.output_size, num_intent_target_symbols]) 258 | intent_target_bias_out = variable_scope.get_variable("Bias_Out", [num_intent_target_symbols], 259 | initializer=init_ops.constant_initializer(bias_start)) 260 | intent_target_output_projection = (intent_target_weight_out, intent_target_bias_out) 261 | intent_target_regularizers = get_linear_transformation_regularizers(intent_target_output_projection) 262 | with variable_scope.variable_scope("tag_linear"): 263 | bias_start = 0.0 264 | tagging_target_weight_out = variable_scope.get_variable("Weight_Out", [encoder_cell.output_size, num_tagging_target_symbols]) 265 | tagging_target_bias_out = variable_scope.get_variable("Bias_Out", [num_tagging_target_symbols], 266 | initializer=init_ops.constant_initializer(bias_start)) 267 | tagging_target_output_projection = (tagging_target_weight_out, tagging_target_bias_out) 268 | tagging_target_regularizers = get_linear_transformation_regularizers(tagging_target_output_projection) 269 | 270 | intent_target_emb_size = output_emb_size 271 | tagging_target_emb_size = output_emb_size 272 | 273 | outputs = rnn_with_output_feedback( encoder_cell, encoder_embedded_inputs, 274 | intent_targets, num_intent_target_symbols, intent_target_emb_size, intent_target_output_projection, 275 | tagging_targets, num_tagging_target_symbols, tagging_target_emb_size, tagging_target_output_projection, 276 | word_emb_size, DNN_at_output, zero_intent_thres=zero_intent_thres, 277 | sequence_length=sequence_length, dtype=dtype, 278 | train_with_true_label=train_with_true_label, 279 | use_predicted_output=forward_only) 280 | 281 | 282 | encoder_outputs, encoder_state, intent_logits, tagging_logits, sampled_intent_embeddings, sampled_tag_embeddings = outputs 283 | 284 | intent_results = (intent_logits, intent_target_regularizers, sampled_intent_embeddings) 285 | tagging_results = (tagging_logits, tagging_target_regularizers, sampled_tag_embeddings) 286 | 287 | return encoder_outputs, encoder_state, encoder_embedded_inputs, intent_results, tagging_results -------------------------------------------------------------------------------- /multi_task_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 10 17:28:22 2016 4 | 5 | @author: Bing Liu (liubing@cmu.edu) 6 | 7 | Multi-task model: intent, tagging, LM 8 | """ 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import random 15 | 16 | import numpy as np 17 | from six.moves import xrange # pylint: disable=redefined-builtin 18 | import tensorflow as tf 19 | from tensorflow.python.framework import dtypes 20 | 21 | import data_utils 22 | import seq_labeling 23 | from tensorflow.python.ops import array_ops 24 | from generate_encoder_output import generate_embedding_RNN_output_joint 25 | 26 | 27 | class MultiTaskModel(object): 28 | def __init__(self, source_vocab_size, lm_vocab_size, tag_vocab_size, 29 | label_vocab_size, buckets, 30 | word_embedding_size, size, num_layers, max_gradient_norm, batch_size, 31 | dropout_keep_prob=1.0, use_lstm=True, lm_cost_weight=1.0, 32 | DNN_at_output=False, dnn_hidden_layer_size=200, output_emb_size=50, 33 | 
use_local_context=False, zero_intent_thres=0, 34 | forward_only=False): 35 | """Create the model. 36 | 37 | Args: 38 | source_vocab_size: int, size of the source vocabulary. 39 | lm_vocab_size: int, size of the LM vocabulary. 40 | tag_vocab_size: int, size of the intent tags. 41 | label_vocab_size: int, size of the slot labels. 42 | buckets: dummy buckets here, modified from tensorflow translation example. 43 | word_embedding_size: int, word embedding size 44 | size: int, number of units in each layer of the model. 45 | num_layers: int, number of layers in the model. 46 | max_gradient_norm: gradients will be clipped to maximally this norm. 47 | batch_size: the size of the batches used during training; 48 | the model construction is independent of batch_size, so it can be 49 | changed after initialization if this is convenient, e.g., for decoding. 50 | dropout_keep_prob: prob to keep during dropout 51 | use_lstm: use lstm cell 52 | lm_cost_weight: lm error weight in linear interpolation os multi-task model errors 53 | DNN_at_output: use DNN at the output layer of each task 54 | dnn_hidden_layer_size: DNN hidden layer size 55 | output_emb_size: size of output label/tag embedding 56 | use_local_context: boolean, if set, apply sampled intent/tag as context to lm. 57 | zero_intent_thres: int, the intent contribution to context remain zero before this thres, 58 | and linear increase to 1 after that. 59 | forward_only: if set, we do not construct the backward pass in the model. 60 | """ 61 | self.source_vocab_size = source_vocab_size 62 | self.tag_vocab_size = tag_vocab_size 63 | self.label_vocab_size = label_vocab_size 64 | self.lm_vocab_size = lm_vocab_size 65 | self.buckets = buckets 66 | self.batch_size = batch_size 67 | self.global_step = tf.Variable(0, trainable=False) 68 | 69 | softmax_loss_function = None 70 | 71 | # Create the internal multi-layer cell for our RNN. 72 | single_cell = tf.nn.rnn_cell.GRUCell(size) 73 | if use_lstm: 74 | # use the customized rnn_cell, --> added trainable option in RNN 75 | single_cell = tf.nn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True) 76 | cell = single_cell 77 | if num_layers > 1: 78 | cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers) 79 | 80 | if not forward_only and dropout_keep_prob < 1.0: 81 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, 82 | input_keep_prob=dropout_keep_prob, 83 | output_keep_prob=dropout_keep_prob) 84 | 85 | 86 | # Feeds for inputs. 87 | self.encoder_inputs = [] 88 | self.encoder_inputs_shiftByOne = [] 89 | self.tag_weights = [] 90 | self.intent_weights = [] 91 | self.lm_weights = [] 92 | self.tags = [] 93 | self.labels = [] 94 | self.sequence_length = tf.placeholder(tf.int32, [None], name="sequence_length") 95 | 96 | self.lm_cost_weight = lm_cost_weight 97 | 98 | self.train_with_true_label = tf.placeholder(tf.bool, name="train_with_true_label") 99 | 100 | for i in xrange(buckets[-1][0]): # Last bucket is the biggest one. 
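      # one placeholder per time step: the current word (encoder input), the next
      # word (LM target, i.e. the input shifted by one) and the per-step LM weight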
101 | self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], 102 | name="encoder{0}".format(i))) 103 | self.encoder_inputs_shiftByOne.append(tf.placeholder(tf.int32, shape=[None], 104 | name="encoder_shiftByOne{0}".format(i))) 105 | self.lm_weights.append(tf.placeholder(tf.float32, shape=[None], 106 | name="lm_weight{0}".format(i))) 107 | for i in xrange(buckets[-1][1]): # output length is 1 108 | self.tags.append(tf.placeholder(tf.float32, shape=[None], name="tag{0}".format(i))) 109 | self.tag_weights.append(tf.placeholder(tf.float32, shape=[None], 110 | name="tag_weight{0}".format(i))) 111 | self.intent_weights.append(tf.placeholder(tf.float32, shape=[None], 112 | name="intent_weight{0}".format(i))) 113 | self.labels.append(tf.placeholder(tf.float32, shape=[None], name="label")) 114 | self.label_duplicated_sequence = self.labels * buckets[-1][1] 115 | 116 | rnn_outputs = generate_embedding_RNN_output_joint(self.encoder_inputs, 117 | cell, 118 | self.source_vocab_size, 119 | self.label_duplicated_sequence, 120 | self.label_vocab_size, 121 | self.tags, 122 | self.tag_vocab_size, 123 | word_embedding_size, 124 | dtype=dtypes.float32, 125 | scope=None, 126 | sequence_length=self.sequence_length, 127 | DNN_at_output=DNN_at_output, 128 | dnn_hidden_layer_size=dnn_hidden_layer_size, 129 | output_emb_size=output_emb_size, 130 | train_with_true_label=self.train_with_true_label, 131 | zero_intent_thres=zero_intent_thres, 132 | forward_only=forward_only) 133 | 134 | encoder_outputs, encoder_state, encoder_embedded_inputs, intent_results, tagging_results = rnn_outputs 135 | 136 | # get tagging loss 137 | self.tagging_output, sampled_tags, _, self.tagging_loss, _ = seq_labeling.generate_task_output( 138 | encoder_outputs, None, encoder_state, self.tags, self.sequence_length, self.tag_vocab_size, self.tag_weights, 139 | buckets, softmax_loss_function=softmax_loss_function, scope='tagging', DNN_at_output=DNN_at_output, tagging_results=tagging_results, forward_only=forward_only) 140 | 141 | 142 | # get intent classification loss 143 | # transform intent classification to a sequence labeling task 144 | # output intent class at each step 145 | self.intent_outputs, _, sampled_intents, self.classification_loss, _ = seq_labeling.generate_task_output( 146 | encoder_outputs, None, encoder_state, self.label_duplicated_sequence, self.sequence_length, self.label_vocab_size, self.intent_weights, 147 | buckets, softmax_loss_function=softmax_loss_function, use_attention=False, scope='intent', DNN_at_output=DNN_at_output, train_with_true_label=self.train_with_true_label, intent_results=intent_results, forward_only=forward_only) 148 | classification_output_rev = seq_labeling._reverse_seq(self.intent_outputs, self.sequence_length) 149 | self.classification_output = [classification_output_rev[0]] 150 | 151 | 152 | # get lm loss, use additional_inputs = sampled_tags_intents 153 | sampled_tags_intents = [array_ops.concat(1, [sampled_tag, sampled_intent]) for sampled_tag, sampled_intent in zip(sampled_tags, sampled_intents)] 154 | self.lm_output, _, _, self.lm_loss_withRegularizers, self.lm_loss = seq_labeling.generate_task_output( 155 | encoder_outputs, sampled_tags_intents, encoder_state, self.encoder_inputs_shiftByOne, self.sequence_length, self.lm_vocab_size, self.lm_weights, 156 | buckets, softmax_loss_function=softmax_loss_function, use_attention=False, scope='lm', DNN_at_output=DNN_at_output, use_local_context=use_local_context, 157 | forward_only=forward_only) 158 | 159 | # Gradients and SGD update 
operation for training the model. 160 | params = tf.trainable_variables() 161 | if not forward_only: 162 | for i in range(len(params)): 163 | print (params[i].name) 164 | opt = tf.train.AdamOptimizer() 165 | gradients = tf.gradients([self.tagging_loss, self.classification_loss, self.lm_loss_withRegularizers*self.lm_cost_weight], params) 166 | 167 | clipped_gradients, norm = tf.clip_by_global_norm(gradients, 168 | max_gradient_norm) 169 | self.gradient_norm = norm 170 | self.update = opt.apply_gradients( 171 | zip(clipped_gradients, params), global_step=self.global_step) 172 | 173 | self.saver = tf.train.Saver(tf.all_variables()) 174 | 175 | 176 | def joint_step(self, session, encoder_inputs, encoder_inputs_shiftByOne, 177 | lm_weights, tags, tag_weights, labels, intent_weights, 178 | batch_sequence_length, bucket_id, forward_only, 179 | train_with_true_label=True): 180 | """Run a joint step of the model feeding the given inputs. 181 | 182 | Args: 183 | session: tensorflow session to use. 184 | encoder_inputs: list of numpy int vectors to feed as encoder inputs. 185 | encoder_inputs_shiftByOne: lm output 186 | lm_weights: list of numpy float vectors to feed as lm target weights. 187 | tags: list of numpy int vectors to feed as output tags. 188 | tag_weights: list of numpy float vectors to feed as tag weights. 189 | labels: list of numpy int vectors to feed as intent labels. 190 | intent_weights: list of numpy float vectors to feed as intent weights. 191 | batch_sequence_length: list of numpy int to feed as sequence length. 192 | bucket_id: which bucket of the model to use. # dummy, always 0. 193 | forward_only: whether to do the backward step or only forward. 194 | train_with_true_label: whether to use true label during model training. 195 | 196 | Returns: 197 | A triple consisting of gradient norm (or None if we did not do backward), 198 | average perplexity, and the outputs. 199 | 200 | Raises: 201 | ValueError: if length of encoder_inputs, decoder_inputs, or 202 | target_weights disagrees with bucket size for the specified bucket_id. 203 | """ 204 | # Check if the sizes match. 205 | encoder_size, tag_size = self.buckets[bucket_id] 206 | if len(encoder_inputs) != encoder_size: 207 | raise ValueError("Encoder length must be equal to the one in bucket," 208 | " %d != %d." % (len(encoder_inputs), encoder_size)) 209 | if len(tags) != tag_size: 210 | raise ValueError("Decoder length must be equal to the one in bucket," 211 | " %d != %d." % (len(tags), tag_size)) 212 | if len(labels) != 1: 213 | raise ValueError("Decoder length must be equal to the one in bucket," 214 | " %d != %d." % (len(labels), 1)) 215 | 216 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided. 217 | input_feed = {} 218 | input_feed[self.sequence_length.name] = batch_sequence_length 219 | for l in xrange(encoder_size): 220 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] 221 | input_feed[self.tags[l].name] = tags[l] 222 | input_feed[self.tag_weights[l].name] = tag_weights[l] 223 | input_feed[self.intent_weights[l].name] = intent_weights[l] 224 | input_feed[self.encoder_inputs_shiftByOne[l].name] = encoder_inputs_shiftByOne[l] 225 | input_feed[self.lm_weights[l].name] = lm_weights[l] 226 | input_feed[self.labels[0].name] = labels[0] 227 | input_feed[self.train_with_true_label.name] = train_with_true_label 228 | 229 | # Output feed: depends on whether we do a backward step or not. 230 | if not forward_only: 231 | output_feed = [self.update, # Update Op that does SGD. 
232 | self.gradient_norm, # Gradient norm. 233 | self.lm_loss] # Loss for this batch, report the LM loss here. 234 | for i in range(tag_size): 235 | output_feed.append(self.tagging_output[i]) 236 | output_feed.append(self.classification_output[0]) 237 | else: 238 | output_feed = [self.lm_loss] 239 | for i in range(tag_size): 240 | output_feed.append(self.tagging_output[i]) 241 | output_feed.append(self.classification_output[0]) 242 | 243 | outputs = session.run(output_feed, input_feed) 244 | if not forward_only: 245 | return outputs[1], outputs[2], outputs[3:3+tag_size],outputs[-1] 246 | else: 247 | return None, outputs[0], outputs[1:1+tag_size], outputs[-1] 248 | 249 | 250 | def tagging_step(self, session, encoder_inputs, tags, tag_weights, batch_sequence_length, 251 | bucket_id, forward_only, train_with_true_label=True): 252 | """Run a tagging step of the model feeding the given inputs. 253 | 254 | Args: 255 | session: tensorflow session to use. 256 | encoder_inputs: list of numpy int vectors to feed as encoder inputs. 257 | tags: list of numpy int vectors to feed as output tags. 258 | tag_weights: list of numpy float vectors to feed as tag weights. 259 | batch_sequence_length: list of numpy int to feed as sequence length. 260 | bucket_id: which bucket of the model to use. # dummy, always 0. 261 | forward_only: whether to do the backward step or only forward. 262 | train_with_true_label: whether to use true label during model training. 263 | 264 | Returns: 265 | A triple consisting of gradient norm (or None if we did not do backward), 266 | average perplexity, and the outputs. 267 | 268 | Raises: 269 | ValueError: if length of encoder_inputs, decoder_inputs, or 270 | target_weights disagrees with bucket size for the specified bucket_id. 271 | """ 272 | # Check if the sizes match. 273 | encoder_size, tag_size = self.buckets[bucket_id] 274 | if len(encoder_inputs) != encoder_size: 275 | raise ValueError("Encoder length must be equal to the one in bucket," 276 | " %d != %d." % (len(encoder_inputs), encoder_size)) 277 | if len(tags) != tag_size: 278 | raise ValueError("Decoder length must be equal to the one in bucket," 279 | " %d != %d." % (len(tags), tag_size)) 280 | 281 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided. 282 | input_feed = {} 283 | input_feed[self.sequence_length.name] = batch_sequence_length 284 | for l in xrange(encoder_size): 285 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] 286 | input_feed[self.tags[l].name] = tags[l] 287 | input_feed[self.tag_weights[l].name] = tag_weights[l] 288 | input_feed[self.train_with_true_label.name] = train_with_true_label 289 | 290 | # Output feed: depends on whether we do a backward step or not. 291 | if not forward_only: 292 | output_feed = [self.update, # Update Op that does SGD. 293 | self.gradient_norm, # Gradient norm. 294 | self.tagging_loss] # Loss for this batch. 
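      # The per-step tagging logits appended below follow the update op,
      # gradient norm and loss entries, so the return statement can slice
      # them back out as outputs[3:3+tag_size].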
295 | for i in range(tag_size): 296 | output_feed.append(self.tagging_output[i]) 297 | else: 298 | output_feed = [self.tagging_loss] 299 | for i in range(tag_size): 300 | output_feed.append(self.tagging_output[i]) 301 | outputs = session.run(output_feed, input_feed) 302 | if not forward_only: 303 | return outputs[1], outputs[2], outputs[3:3+tag_size] 304 | else: 305 | return None, outputs[0], outputs[1:1+tag_size] 306 | 307 | def classification_step(self, session, encoder_inputs, labels, intent_weights, batch_sequence_length, 308 | bucket_id, forward_only): 309 | """Run a intent classification step of the model feeding the given inputs. 310 | 311 | Args: 312 | session: tensorflow session to use. 313 | encoder_inputs: list of numpy int vectors to feed as encoder inputs. 314 | labels: list of numpy int vectors to feed as intent labels. 315 | intent_weights: list of numpy float vectors to feed as intent weights. 316 | batch_sequence_length: list of numpy int to feed as sequence length. 317 | bucket_id: which bucket of the model to use. # dummy, always 0. 318 | forward_only: whether to do the backward step or only forward. 319 | train_with_true_label: whether to use true label during model training. 320 | 321 | Returns: 322 | A triple consisting of gradient norm (or None if we did not do backward), 323 | average perplexity, and the outputs. 324 | 325 | Raises: 326 | ValueError: if length of encoder_inputs, decoder_inputs, or 327 | target_weights disagrees with bucket size for the specified bucket_id. 328 | """ 329 | # Check if the sizes match. 330 | encoder_size, target_size = self.buckets[bucket_id] 331 | if len(encoder_inputs) != encoder_size: 332 | raise ValueError("Encoder length must be equal to the one in bucket," 333 | " %d != %d." % (len(encoder_inputs), encoder_size)) 334 | 335 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided. 336 | input_feed = {} 337 | input_feed[self.sequence_length.name] = batch_sequence_length 338 | for l in xrange(encoder_size): 339 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] 340 | input_feed[self.intent_weights[l].name] = intent_weights[l] 341 | input_feed[self.labels[0].name] = labels[0] 342 | 343 | # Output feed: depends on whether we do a backward step or not. 344 | if not forward_only: 345 | output_feed = [self.update, # Update Op that does SGD. 346 | self.gradient_norm, # Gradient norm. 347 | self.classification_loss, # Loss for this batch. 348 | self.classification_output[0], 349 | ] 350 | else: 351 | output_feed = [self.classification_loss, 352 | self.classification_output[0], 353 | ] 354 | 355 | outputs = session.run(output_feed, input_feed) 356 | if not forward_only: 357 | return outputs[1], outputs[2], outputs[3] # Gradient norm, loss, outputs. 358 | else: 359 | return None, outputs[0], outputs[1] # No gradient norm, loss, outputs. 360 | 361 | 362 | def lm_step(self, session, encoder_inputs, encoder_inputs_shiftByOne, lm_weights, batch_sequence_length, bucket_id, forward_only): 363 | """Run a joint step of the model feeding the given inputs. 364 | 365 | Args: 366 | session: tensorflow session to use. 367 | encoder_inputs: list of numpy int vectors to feed as encoder inputs. 368 | encoder_inputs_shiftByOne: lm output 369 | lm_weights: list of numpy float vectors to feed as lm target weights. 370 | batch_sequence_length: list of numpy int to feed as sequence length. 371 | bucket_id: which bucket of the model to use. # dummy, always 0. 372 | forward_only: whether to do the backward step or only forward. 
373 | train_with_true_label: whether to use true label during model training. 374 | 375 | Returns: 376 | A triple consisting of gradient norm (or None if we did not do backward), 377 | average perplexity, and the outputs. 378 | 379 | Raises: 380 | ValueError: if length of encoder_inputs, decoder_inputs, or 381 | target_weights disagrees with bucket size for the specified bucket_id. 382 | """ 383 | encoder_size, tag_size = self.buckets[bucket_id] 384 | if len(encoder_inputs) != encoder_size: 385 | raise ValueError("Encoder length must be equal to the one in bucket," 386 | " %d != %d." % (len(encoder_inputs), encoder_size)) 387 | if len(encoder_inputs_shiftByOne) != tag_size: 388 | raise ValueError("Decoder length must be equal to the one in bucket," 389 | " %d != %d." % (len(encoder_inputs_shiftByOne), tag_size)) 390 | 391 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided. 392 | input_feed = {} 393 | input_feed[self.sequence_length.name] = batch_sequence_length 394 | for l in xrange(encoder_size): 395 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] 396 | input_feed[self.encoder_inputs_shiftByOne[l].name] = encoder_inputs_shiftByOne[l] 397 | input_feed[self.lm_weights[l].name] = lm_weights[l] 398 | 399 | if not forward_only: 400 | output_feed = [self.update, # Update Op that does SGD. 401 | self.gradient_norm, # Gradient norm. 402 | self.lm_loss] # Loss for this batch. 403 | else: 404 | output_feed = [self.lm_loss] 405 | 406 | outputs = session.run(output_feed, input_feed) 407 | if not forward_only: 408 | return outputs[1], outputs[2] 409 | else: 410 | return None, outputs[0] 411 | 412 | def get_batch(self, data, bucket_id): 413 | """Get a random batch of data from the specified bucket, prepare for step. 414 | 415 | To feed data in step(..) it must be a list of batch-major vectors, while 416 | data here contains single length-major cases. So the main logic of this 417 | function is to re-index data cases to be in the proper format for feeding. 418 | 419 | Args: 420 | data: a tuple of size len(self.buckets) in which each element contains 421 | lists of pairs of input and output data that we use to create a batch. 422 | bucket_id: integer, which bucket to get the batch for. 423 | 424 | Returns: 425 | The triple (encoder_inputs, decoder_inputs, target_weights) for 426 | the constructed batch that has the proper format to call step(...) later. 427 | """ 428 | encoder_size, decoder_size = self.buckets[bucket_id] 429 | encoder_inputs, encoder_inputs_shiftByOne, decoder_inputs, labels = [], [], [], [] 430 | 431 | # Get a random batch of encoder and decoder inputs from data, 432 | # pad them if needed, reverse encoder inputs and add GO to decoder. 433 | batch_sequence_length_list= list() 434 | for _ in xrange(self.batch_size): 435 | encoder_input, decoder_input, label = random.choice(data[bucket_id]) 436 | batch_sequence_length_list.append(len(encoder_input) + 1) # +1 for BOS_ID 437 | 438 | # Encoder inputs are padded and then reversed. 439 | encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input) - 1) # -1 for BOS_ID 440 | #encoder_inputs.append(list(reversed(encoder_input + encoder_pad))) 441 | encoder_inputs.append(list([data_utils.BOS_ID] + encoder_input + encoder_pad)) 442 | encoder_inputs_shiftByOne.append(list(encoder_input + [data_utils.BOS_ID] + encoder_pad)) #BOS_ID and EOS_ID shared the same ID in in_vocab and lm_vocab 443 | 444 | # Decoder inputs get an extra "GO" symbol, and are padded then. 
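      # (No GO symbol is actually used here: the tag sequence below is prefixed
      #  with PAD_ID so that it stays aligned with the BOS_ID prepended to the
      #  encoder inputs above.)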
445 | decoder_pad_size = decoder_size - len(decoder_input) - 1 # -1 for BOS_ID 446 | decoder_inputs.append([data_utils.PAD_ID] + decoder_input + 447 | [data_utils.PAD_ID] * decoder_pad_size) 448 | labels.append(label) 449 | 450 | # Now we create batch-major vectors from the data selected above. 451 | batch_encoder_inputs, batch_encoder_inputs_shiftByOne, batch_decoder_inputs, tagging_batch_weights, intent_batch_weights, lm_batch_weights, batch_labels = [], [], [], [], [], [], [] 452 | 453 | # Batch encoder inputs are just re-indexed encoder_inputs. 454 | for length_idx in xrange(encoder_size): 455 | batch_encoder_inputs.append( 456 | np.array([encoder_inputs[batch_idx][length_idx] 457 | for batch_idx in xrange(self.batch_size)], dtype=np.int32)) 458 | 459 | # Batch encoder inputs are just re-indexed encoder_inputs. 460 | for length_idx in xrange(encoder_size): 461 | batch_encoder_inputs_shiftByOne.append( 462 | np.array([encoder_inputs_shiftByOne[batch_idx][length_idx] 463 | for batch_idx in xrange(self.batch_size)], dtype=np.int32)) 464 | 465 | # Batch decoder inputs are re-indexed decoder_inputs, we create weights. 466 | for length_idx in xrange(decoder_size): 467 | batch_decoder_inputs.append( 468 | np.array([decoder_inputs[batch_idx][length_idx] 469 | for batch_idx in xrange(self.batch_size)], dtype=np.int32)) 470 | # Create target_weights to be 0 for targets that are padding. 471 | tagging_batch_weight = np.ones(self.batch_size, dtype=np.float32) 472 | intent_batch_weight = np.ones(self.batch_size, dtype=np.float32) 473 | lm_batch_weight = np.ones(self.batch_size, dtype=np.float32) 474 | for batch_idx in xrange(self.batch_size): 475 | # We set weight to 0 if the corresponding target is a PAD symbol. 476 | # The corresponding target is decoder_input shifted by 1 forward. 477 | 478 | # set tagging weight in padding position to 0, leave others to 1 479 | if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID: 480 | tagging_batch_weight[batch_idx] = 0.0 481 | # set tagging weight in padding position to 0, leave others to linearly increasing weight 482 | if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID: 483 | intent_batch_weight[batch_idx] = 0.0 484 | else: 485 | if length_idx <= 4: 486 | intent_batch_weight[batch_idx] = 0.0 487 | else: 488 | intent_batch_weight[batch_idx] = (length_idx - 4) / batch_sequence_length_list[batch_idx] 489 | if encoder_inputs_shiftByOne[batch_idx][length_idx] == data_utils.PAD_ID: 490 | lm_batch_weight[batch_idx] = 0.0 491 | tagging_batch_weights.append(tagging_batch_weight) 492 | intent_batch_weights.append(intent_batch_weight) 493 | lm_batch_weights.append(lm_batch_weight) 494 | 495 | batch_labels.append( 496 | np.array([labels[batch_idx][0] 497 | for batch_idx in xrange(self.batch_size)], dtype=np.int32)) 498 | 499 | batch_sequence_length = np.array(batch_sequence_length_list, dtype=np.int32) 500 | return batch_encoder_inputs, batch_encoder_inputs_shiftByOne, batch_decoder_inputs, tagging_batch_weights, intent_batch_weights, lm_batch_weights, batch_sequence_length, batch_labels 501 | 502 | 503 | def get_one(self, data, bucket_id, sample_id): 504 | """Get a sample data from the specified bucket, prepare for step. 505 | 506 | To feed data in step(..) it must be a list of batch-major vectors, while 507 | data here contains single length-major cases. So the main logic of this 508 | function is to re-index data cases to be in the proper format for feeding. 
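    Unlike get_batch, this builds a "batch" of size one holding only the sample
    at sample_id, which is how the evaluation and decoding loops feed one
    utterance at a time.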
509 | 510 | Args: 511 | data: a tuple of size len(self.buckets) in which each element contains 512 | lists of pairs of input and output data that we use to create a batch. 513 | bucket_id: integer, which bucket to get the batch for. 514 | sample_id: integer, sample id within the bucket specified. 515 | 516 | Returns: 517 | The triple (encoder_inputs, decoder_inputs, target_weights) for 518 | the constructed batch that has the proper format to call step(...) later. 519 | """ 520 | encoder_size, decoder_size = self.buckets[bucket_id] 521 | encoder_inputs, encoder_inputs_shiftByOne, decoder_inputs, labels = [], [], [], [] 522 | 523 | # Get a random batch of encoder and decoder inputs from data, 524 | # pad them if needed, reverse encoder inputs and add GO to decoder. 525 | batch_sequence_length_list= list() 526 | #for _ in xrange(self.batch_size): 527 | encoder_input, decoder_input, label = data[bucket_id][sample_id] 528 | batch_sequence_length_list.append(len(encoder_input) + 1) # +1 for BOS_ID 529 | 530 | # Encoder inputs are padded and then reversed. 531 | encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input) - 1) # -1 for BOS_ID 532 | encoder_inputs.append(list([data_utils.BOS_ID] + encoder_input + encoder_pad)) 533 | encoder_inputs_shiftByOne.append(list(encoder_input + [data_utils.BOS_ID] + encoder_pad)) #BOS_ID and EOS_ID shared the same ID in two vocabs 534 | 535 | # Decoder inputs get an extra "GO" symbol, and are padded then. 536 | decoder_pad_size = decoder_size - len(decoder_input) - 1 # -1 for BOS_ID 537 | decoder_inputs.append([data_utils.PAD_ID] + decoder_input + 538 | [data_utils.PAD_ID] * decoder_pad_size) 539 | labels.append(label) 540 | 541 | # Now we create batch-major vectors from the data selected above. 542 | batch_encoder_inputs, batch_encoder_inputs_shiftByOne, batch_decoder_inputs, tagging_batch_weights, intent_batch_weights, lm_batch_weights, batch_labels = [], [], [], [], [], [], [] 543 | 544 | # Batch encoder inputs are just re-indexed encoder_inputs. 545 | for length_idx in xrange(encoder_size): 546 | batch_encoder_inputs.append( 547 | np.array([encoder_inputs[batch_idx][length_idx] 548 | for batch_idx in xrange(1)], dtype=np.int32)) 549 | 550 | for length_idx in xrange(encoder_size): 551 | batch_encoder_inputs_shiftByOne.append( 552 | np.array([encoder_inputs_shiftByOne[batch_idx][length_idx] 553 | for batch_idx in xrange(1)], dtype=np.int32)) 554 | 555 | # Batch decoder inputs are re-indexed decoder_inputs, we create weights. 556 | for length_idx in xrange(decoder_size): 557 | batch_decoder_inputs.append( 558 | np.array([decoder_inputs[batch_idx][length_idx] 559 | for batch_idx in xrange(1)], dtype=np.int32)) 560 | 561 | # Create target_weights to be 0 for targets that are padding. 562 | tagging_batch_weight = np.ones(1, dtype=np.float32) 563 | intent_batch_weight = np.ones(1, dtype=np.float32) 564 | lm_batch_weight = np.ones(1, dtype=np.float32) 565 | for batch_idx in xrange(1): 566 | # We set weight to 0 if the corresponding target is a PAD symbol. 567 | # The corresponding target is decoder_input shifted by 1 forward. 
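        # Same scheme as get_batch: tag weights are zeroed at padded positions,
        # intent weights are zero for the first few steps and then grow linearly
        # with position, and LM weights are zeroed where the next-word target is
        # padding. Note that intent_batch_weights is never filled in here; the
        # callers in run_rnn_joint.py substitute the tag weights instead.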
568 | if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID: 569 | tagging_batch_weight[batch_idx] = 0.0 570 | if decoder_inputs[batch_idx][length_idx] == data_utils.PAD_ID: 571 | intent_batch_weight[batch_idx] = 0.0 572 | else: 573 | if length_idx <= 4: 574 | intent_batch_weight[batch_idx] = 0.0 575 | else: 576 | intent_batch_weight[batch_idx] = (length_idx - 4) / batch_sequence_length_list[batch_idx] 577 | if encoder_inputs_shiftByOne[batch_idx][length_idx] == data_utils.PAD_ID: 578 | lm_batch_weight[batch_idx] = 0.0 579 | tagging_batch_weights.append(tagging_batch_weight) 580 | lm_batch_weights.append(lm_batch_weight) 581 | 582 | batch_labels.append( 583 | np.array([labels[batch_idx][0] 584 | for batch_idx in xrange(1)], dtype=np.int32)) 585 | 586 | batch_sequence_length = np.array(batch_sequence_length_list, dtype=np.int32) 587 | return batch_encoder_inputs, batch_encoder_inputs_shiftByOne, batch_decoder_inputs, tagging_batch_weights, intent_batch_weights, lm_batch_weights, batch_sequence_length, batch_labels -------------------------------------------------------------------------------- /run_rnn_joint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Mar 05 16:37:17 2016 4 | 5 | @author: Bing Liu (liubing@cmu.edu) 6 | """ 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import math 13 | import os 14 | import sys 15 | import time 16 | 17 | import numpy as np 18 | from six.moves import xrange # pylint: disable=redefined-builtin 19 | import tensorflow as tf 20 | 21 | import data_utils 22 | import multi_task_model 23 | 24 | import subprocess 25 | import stat 26 | 27 | 28 | tf.app.flags.DEFINE_float("learning_rate", 0.1, "Learning rate.") 29 | tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.9, 30 | "Learning rate decays by this much.") 31 | tf.app.flags.DEFINE_float("max_gradient_norm", 5.0, 32 | "Clip gradients to this norm.") 33 | tf.app.flags.DEFINE_integer("batch_size", 16, 34 | "Batch size to use during training.") 35 | tf.app.flags.DEFINE_integer("size", 128, "Size of each model layer.") 36 | tf.app.flags.DEFINE_integer("word_embedding_size", 300, "Size of the word embedding") 37 | tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.") 38 | tf.app.flags.DEFINE_integer("in_vocab_size", 10000, "max word vocab Size.") 39 | tf.app.flags.DEFINE_integer("out_vocab_size", 123, "max tag vocab Size.") 40 | tf.app.flags.DEFINE_string("data_dir", "/tmp", "Data directory") 41 | tf.app.flags.DEFINE_string("train_dir", "/tmp", "Training directory.") 42 | tf.app.flags.DEFINE_string("test_file", "/tmp", "Test file.") 43 | tf.app.flags.DEFINE_string("test_output_file", "/tmp", "Test output file.") 44 | tf.app.flags.DEFINE_integer("max_train_data_size", 0, 45 | "Limit on the size of training data (0: no limit).") 46 | tf.app.flags.DEFINE_integer("steps_per_checkpoint", 300, 47 | "How many training steps to do per checkpoint.") 48 | tf.app.flags.DEFINE_integer("max_training_steps", 20000, 49 | "Max training steps.") 50 | tf.app.flags.DEFINE_boolean("decode", False, 51 | "Set to True for interactive decoding.") 52 | tf.app.flags.DEFINE_boolean("decode_file", False, 53 | "Set to True for file decoding.") 54 | tf.app.flags.DEFINE_boolean("self_test", False, 55 | "Run a self-test if this is set to True.") 56 | tf.app.flags.DEFINE_integer("max_test_data_size", 0, 57 | "Max size of test 
set.") 58 | tf.app.flags.DEFINE_integer("max_sequence_length", 0, 59 | "Max sequence length.") 60 | tf.app.flags.DEFINE_float("dropout_keep_prob", 0.5, 61 | "dropout keep cell input and output prob.") 62 | tf.app.flags.DEFINE_integer("dnn_hidden_layer_size", 200, 63 | "num of nodes in the DNN hidden layer on top of RNN") 64 | tf.app.flags.DEFINE_integer("output_emb_size", 50, 65 | "size of output label/tag embedding") 66 | tf.app.flags.DEFINE_float("lm_cost_weight", 1.0, 67 | "Weight of LM cost comparing to NLU tasks") 68 | tf.app.flags.DEFINE_boolean("DNN_at_output", False, 69 | "add DNN at the output layer of sequence labeling") 70 | tf.app.flags.DEFINE_boolean("use_local_context", True, 71 | "If false, rnn parameters and emb are locked") 72 | tf.app.flags.DEFINE_integer("zero_intent_thres", 0, 73 | "threshold to linearly increase intent contribution to context") 74 | tf.app.flags.DEFINE_string("label_in_training", "true_label", "Options: true_label; predicted_label; scheduled_sampling") 75 | FLAGS = tf.app.flags.FLAGS 76 | 77 | 78 | # We use a number of buckets and pad to the closest one for efficiency. 79 | # See seq2seq_model.Seq2SeqModel for details of how they work. 80 | if FLAGS.max_sequence_length == 0: 81 | print ('Please indicate max sequence length. Exit') 82 | exit() 83 | 84 | # _buckets: modified from tf translation example, just use one dummy bucket here. 85 | _buckets = [(FLAGS.max_sequence_length, FLAGS.max_sequence_length)] 86 | 87 | # metrics function using conlleval.pl 88 | def conlleval(p, g, w, filename): 89 | ''' 90 | INPUT: 91 | p :: predictions 92 | g :: groundtruth 93 | w :: corresponding words 94 | 95 | OUTPUT: 96 | filename :: name of the file where the predictions 97 | are written. it will be the input of conlleval.pl script 98 | for computing the performance in terms of precision 99 | recall and f1 score 100 | ''' 101 | out = '' 102 | for sl, sp, sw in zip(g, p, w): 103 | out += 'BOS O O\n' 104 | for wl, wp, w in zip(sl, sp, sw): 105 | if wl != '_PAD': 106 | out += w + ' ' + wl + ' ' + wp + '\n' 107 | out += 'EOS O O\n\n' 108 | 109 | f = open(filename, 'w') 110 | f.writelines(out[:-1]) # remove the ending \n on last line 111 | f.close() 112 | 113 | return get_perf(filename) 114 | 115 | def get_perf(filename): 116 | ''' run conlleval.pl perl script to obtain 117 | precision/recall and F1 score ''' 118 | _conlleval = os.path.dirname(os.path.realpath(__file__)) + '/conlleval.pl' 119 | os.chmod(_conlleval, stat.S_IRWXU) # give the execute permissions 120 | 121 | proc = subprocess.Popen(["perl", 122 | _conlleval], 123 | stdin=subprocess.PIPE, 124 | stdout=subprocess.PIPE) 125 | 126 | stdout, _ = proc.communicate(''.join(open(filename).readlines())) 127 | for line in stdout.split('\n'): 128 | if 'accuracy' in line: 129 | out = line.split() 130 | break 131 | 132 | precision = float(out[6][:-2]) 133 | recall = float(out[8][:-2]) 134 | f1score = float(out[10]) 135 | 136 | return {'p': precision, 'r': recall, 'f1': f1score} 137 | 138 | 139 | def read_data(source_path, target_path, label_path, max_size=None): 140 | """Read data from source and target files and put into buckets. 141 | 142 | Args: 143 | source_path: path to the files with token-ids for the source input - word sequence. 144 | target_path: path to the file with token-ids for the target output - tag sequence; 145 | it must be aligned with the source file: n-th line contains the desired 146 | output for n-th line from the source_path. 
147 | label_path: path to the file with token-ids for the sequence classification label 148 | max_size: maximum number of lines to read, all other will be ignored; 149 | if 0 or None, data files will be read completely (no limit). 150 | 151 | Returns: 152 | data_set: a list of length len(_buckets); data_set[n] contains a list of 153 | (source, target) pairs read from the provided data files that fit 154 | into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and 155 | len(target) < _buckets[n][1]; source and target are lists of token-ids. 156 | """ 157 | data_set = [[] for _ in _buckets] 158 | with tf.gfile.GFile(source_path, mode="r") as source_file: 159 | with tf.gfile.GFile(target_path, mode="r") as target_file: 160 | with tf.gfile.GFile(label_path, mode="r") as label_file: 161 | source, target, label = source_file.readline(), target_file.readline(), label_file.readline() 162 | counter = 0 163 | while source and target and label and (not max_size or counter < max_size): 164 | counter += 1 165 | if counter % 100000 == 0: 166 | print(" reading data line %d" % counter) 167 | sys.stdout.flush() 168 | source_ids = [int(x) for x in source.split()] 169 | target_ids = [int(x) for x in target.split()] 170 | label_ids = [int(x) for x in label.split()] 171 | for bucket_id, (source_size, target_size) in enumerate(_buckets): 172 | if len(source_ids) < source_size and len(target_ids) < target_size: 173 | data_set[bucket_id].append([source_ids, target_ids, label_ids]) 174 | break 175 | source, target, label = source_file.readline(), target_file.readline(), label_file.readline() 176 | return data_set # 3 outputs in each unit: source_ids, target_ids, label_ids 177 | 178 | def read_test_data(source_path): 179 | data_set = [[] for _ in _buckets] 180 | with tf.gfile.GFile(source_path, mode="r") as source_file: 181 | source = source_file.readline() 182 | counter = 0 183 | while source: 184 | counter += 1 185 | if counter % 100000 == 0: 186 | print(" reading data line %d" % counter) 187 | sys.stdout.flush() 188 | source_ids = [int(x) for x in source.split()] 189 | for bucket_id, (source_size, target_size) in enumerate(_buckets): 190 | if len(source_ids) < source_size: 191 | data_set[bucket_id].append([source_ids, [0]*len(source_ids), [0]]) 192 | break 193 | source = source_file.readline() 194 | return data_set # 3 outputs in each unit: source_ids, target_ids, label_ids 195 | 196 | def create_model(session, source_vocab_size, target_vocab_size, label_vocab_size, lm_vocab_size): 197 | """Create model and initialize or load parameters in session.""" 198 | with tf.variable_scope("model", reuse=None): 199 | model_train = multi_task_model.MultiTaskModel( 200 | source_vocab_size, lm_vocab_size, target_vocab_size, label_vocab_size, _buckets, 201 | FLAGS.word_embedding_size, FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size, 202 | dropout_keep_prob=FLAGS.dropout_keep_prob, use_lstm=True, lm_cost_weight=FLAGS.lm_cost_weight, 203 | forward_only=False, 204 | DNN_at_output=FLAGS.DNN_at_output, 205 | dnn_hidden_layer_size=FLAGS.dnn_hidden_layer_size, 206 | output_emb_size=FLAGS.output_emb_size, 207 | zero_intent_thres=FLAGS.zero_intent_thres) 208 | with tf.variable_scope("model", reuse=True): 209 | model_test = multi_task_model.MultiTaskModel( 210 | source_vocab_size, lm_vocab_size, target_vocab_size, label_vocab_size, _buckets, 211 | FLAGS.word_embedding_size, FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size, 212 | dropout_keep_prob=FLAGS.dropout_keep_prob, 
use_lstm=True, lm_cost_weight=FLAGS.lm_cost_weight,
213 |         forward_only=True,
214 |         DNN_at_output=FLAGS.DNN_at_output,
215 |         dnn_hidden_layer_size=FLAGS.dnn_hidden_layer_size,
216 |         output_emb_size=FLAGS.output_emb_size,
217 |         zero_intent_thres=FLAGS.zero_intent_thres)
218 |   ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
219 |   if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
220 |     print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
221 |     model_train.saver.restore(session, ckpt.model_checkpoint_path)
222 |   else:
223 |     print("Created model with fresh parameters.")
224 |     session.run(tf.initialize_all_variables())
225 |   return model_train, model_test
226 | 
227 | def write_results_to_file(result_dict, idx2tag_vocab, result_out_file):
228 |   with open(result_out_file, 'wb') as f:
229 |     for key in sorted(result_dict):
230 |       f.write('%d\t%s\n' % (key, idx2tag_vocab[result_dict[key]]))
231 | 
232 | def train():
233 |   print ('Applying Parameters:')
234 |   for k,v in FLAGS.__dict__['__flags'].iteritems():
235 |     print ('%s: %s' % (k, str(v)))
236 |   print("Preparing data in %s" % FLAGS.data_dir)
237 |   vocab_path = ''
238 |   tag_vocab_path = ''
239 |   label_vocab_path = ''
240 | 
241 |   in_seq_train, out_seq_train, label_train, in_seq_dev, out_seq_dev, label_dev, in_seq_test, out_seq_test, label_test, vocab_path, tag_vocab_path, label_vocab_path = data_utils.prepare_multi_task_data(
242 |     FLAGS.data_dir, FLAGS.in_vocab_size, FLAGS.out_vocab_size)
243 | 
244 |   result_dir = FLAGS.train_dir + '/test_results'
245 |   if not os.path.isdir(result_dir):
246 |     os.makedirs(result_dir)
247 | 
248 |   current_taging_valid_out_file = result_dir + '/taging.valid.hyp.txt'
249 |   current_taging_test_out_file = result_dir + '/taging.test.hyp.txt'
250 |   label_valid_out_file = result_dir + '/label.valid.hyp.txt'
251 |   label_test_out_file = result_dir + '/label.test.hyp.txt'
252 | 
253 |   vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
254 |   tag_vocab, rev_tag_vocab = data_utils.initialize_vocabulary(tag_vocab_path)
255 |   label_vocab, rev_label_vocab = data_utils.initialize_vocabulary(label_vocab_path)
256 |   LM_vocab = vocab.copy()
257 |   assert LM_vocab[data_utils._BOS] == data_utils.BOS_ID
258 |   del LM_vocab[data_utils._BOS]
259 |   LM_vocab[data_utils._EOS] = data_utils.BOS_ID
260 |   rev_LM_vocab = [x for x in rev_vocab]
261 |   rev_LM_vocab[data_utils.BOS_ID] = data_utils._EOS
262 | 
263 |   config = tf.ConfigProto(
264 |       gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.23),
265 |   )
266 | 
267 |   with tf.Session(config=config) as sess:
268 |     # Create model.
269 |     print("Max sequence length: %d ." % _buckets[0][0])
270 |     print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
271 | 
272 |     model, model_test = create_model(sess, len(vocab), len(tag_vocab), len(label_vocab), len(LM_vocab))
273 |     print ("Creating model with source_vocab_size=%d, target_vocab_size=%d, and label_vocab_size=%d, and lm_vocab_size=%d." % (len(vocab), len(tag_vocab), len(label_vocab), len(LM_vocab)))
274 | 
275 |     # Read data into buckets and compute their sizes.
276 |     print ("Reading development and training data (limit: %d)."
277 | % FLAGS.max_train_data_size) 278 | dev_set = read_data(in_seq_dev, out_seq_dev, label_dev) 279 | test_set = read_data(in_seq_test, out_seq_test, label_test) 280 | train_set = read_data(in_seq_train, out_seq_train, label_train) 281 | train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))] 282 | 283 | train_total_size = float(sum(train_bucket_sizes)) 284 | 285 | train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size 286 | for i in xrange(len(train_bucket_sizes))] 287 | 288 | # This is the training loop. 289 | step_time, loss = 0.0, 0.0 290 | current_step = 0 291 | best_valid_score = 0 292 | best_test_score = 0 293 | 294 | 295 | if FLAGS.label_in_training == 'true_label': 296 | print ("Use TRUE label during model training") 297 | train_with_true_label = True 298 | elif FLAGS.label_in_training == 'predicted_label': 299 | print ("Use PREDICTED label during model training") 300 | train_with_true_label = False 301 | elif FLAGS.label_in_training == 'scheduled_sampling': 302 | print ("Use Scheduled Sampling label during model training") 303 | 304 | while model.global_step.eval() < FLAGS.max_training_steps: 305 | random_number_01 = np.random.random_sample() 306 | bucket_id = min([i for i in xrange(len(train_buckets_scale)) 307 | if train_buckets_scale[i] > random_number_01]) 308 | 309 | # Get a batch and make a step. 310 | start_time = time.time() 311 | encoder_inputs, encoder_inputs_shiftByOne, tags, tag_weights, intent_weights, lm_weights, batch_sequence_length, labels = model.get_batch(train_set, bucket_id) 312 | 313 | if FLAGS.label_in_training == 'scheduled_sampling': 314 | random_number_02 = np.random.random_sample() 315 | final_training_step = FLAGS.max_training_steps 316 | if random_number_02 < float(model.global_step.eval())/final_training_step: # use predicted label in training 317 | train_with_true_label = False 318 | else: 319 | train_with_true_label = True 320 | 321 | _, step_loss, tagging_logits, classification_logits = model.joint_step(sess, encoder_inputs, encoder_inputs_shiftByOne, lm_weights, tags, tag_weights, labels, intent_weights, 322 | batch_sequence_length, bucket_id, False, train_with_true_label=train_with_true_label) 323 | step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint 324 | loss += step_loss / FLAGS.steps_per_checkpoint 325 | 326 | current_step += 1 327 | 328 | # Once in a while, we save checkpoint, print statistics, and run evals. 329 | if current_step % FLAGS.steps_per_checkpoint == 0: 330 | # Print statistics for the previous epoch. 331 | perplexity = math.exp(loss) if loss < 300 else float('inf') 332 | print ("global step %d step-time %.2f. Training perplexity %.2f" 333 | % (model.global_step.eval(), step_time, perplexity)) 334 | sys.stdout.flush() 335 | 336 | # Save checkpoint and zero timer and loss. 337 | checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt") 338 | model.saver.save(sess, checkpoint_path, global_step=model.global_step) 339 | step_time, loss = 0.0, 0.0 340 | 341 | def run_eval(data_set, mode): # mode = "Valid", "Test" 342 | # Run evals on development/test set and print their accyracy. 
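          # Reports three numbers per pass: LM perplexity (loss normalized by
          # word count), intent classification accuracy, and slot tagging F1
          # computed by conlleval on the hypothesized tag sequences.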
343 | word_list = list() 344 | ref_tag_list = list() 345 | hyp_tag_list = list() 346 | ref_label_list = list() 347 | hyp_label_list = list() 348 | correct_count = 0 349 | accuracy = 0.0 350 | for bucket_id in xrange(len(_buckets)): # len(_buckets) = 1 here 351 | eval_loss = 0.0 352 | count = 0 353 | total_word_count = 0 354 | for i in xrange(len(data_set[bucket_id])): 355 | count += 1 356 | eval_encoder_inputs, eval_encoder_inputs_shiftByOne, eval_tags, eval_tag_weights, eval_intent_weights, eval_lm_weights, eval_sequence_length, eval_labels = model_test.get_one( 357 | data_set, bucket_id, i) 358 | eval_intent_weights = eval_tag_weights 359 | tagging_logits = [] 360 | classification_logits = [] 361 | _, step_loss, tagging_logits, classification_logits = model_test.joint_step(sess, eval_encoder_inputs, eval_encoder_inputs_shiftByOne, eval_lm_weights, eval_tags, eval_tag_weights, eval_labels, eval_intent_weights, 362 | eval_sequence_length, bucket_id, True) 363 | eval_loss += step_loss*(eval_sequence_length[0]) 364 | total_word_count += eval_sequence_length[0] 365 | hyp_label = None 366 | 367 | # intent results 368 | ref_label_list.append(rev_label_vocab[eval_labels[0][0]]) 369 | hyp_label = np.argmax(classification_logits[0],0) 370 | hyp_label_list.append(rev_label_vocab[hyp_label]) 371 | if eval_labels[0] == hyp_label: 372 | correct_count += 1 373 | 374 | # tagging results 375 | word_list.append([rev_vocab[x[0]] for x in eval_encoder_inputs[:eval_sequence_length[0]]]) 376 | ref_tag_list.append([rev_tag_vocab[x[0]] for x in eval_tags[:eval_sequence_length[0]]]) 377 | hyp_tag_list.append([rev_tag_vocab[np.argmax(x)] for x in tagging_logits[:eval_sequence_length[0]]]) 378 | 379 | eval_perplexity = math.exp(float(eval_loss)/total_word_count) 380 | print(" %s perplexity: %.2f" % (mode, eval_perplexity)) 381 | accuracy = float(correct_count)*100/count 382 | print(" %s accuracy: %.2f %d/%d" % (mode, accuracy, correct_count, count)) 383 | 384 | tagging_eval_result = dict() 385 | if mode == 'Valid': 386 | output_file = current_taging_valid_out_file 387 | elif mode == 'Test': 388 | output_file = current_taging_test_out_file 389 | tagging_eval_result = conlleval(hyp_tag_list, ref_tag_list, word_list, output_file) 390 | print(" %s f1-score: %.2f" % (mode, tagging_eval_result['f1'])) 391 | sys.stdout.flush() 392 | return eval_perplexity, tagging_eval_result, hyp_label_list 393 | 394 | # run valid 395 | valid_perplexity, valid_tagging_result, valid_hyp_label_list = run_eval(dev_set, 'Valid') 396 | # record best results 397 | 398 | if valid_tagging_result['f1'] > best_valid_score: 399 | best_valid_score = valid_tagging_result['f1'] 400 | subprocess.call(['mv', current_taging_valid_out_file, current_taging_valid_out_file + '.best_f1_%.2f' % best_valid_score]) 401 | with open('%s.best_f1_%.2f' % (label_valid_out_file, best_valid_score), 'wb') as f: 402 | for i in range(len(valid_hyp_label_list)): 403 | f.write(valid_hyp_label_list[i] + '\n') 404 | 405 | # run test after each validation for development purpose. 
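        # As with the validation set, the test hypothesis files are renamed
        # with the new best tagging F1 whenever it improves.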
406 | test_perplexity, test_tagging_result, test_hyp_label_list = run_eval(test_set, 'Test') 407 | # record best results 408 | if test_tagging_result['f1'] > best_test_score: 409 | best_test_score = test_tagging_result['f1'] 410 | subprocess.call(['mv', current_taging_test_out_file, current_taging_test_out_file + '.best_f1_%.2f' % best_test_score]) 411 | with open('%s.best_f1_%.2f' % (label_test_out_file, best_test_score), 'wb') as f: 412 | for i in range(len(test_hyp_label_list)): 413 | f.write(test_hyp_label_list[i] + '\n') 414 | 415 | def decode_file(test_file): 416 | print ('Applying Parameters:') 417 | for k,v in FLAGS.__dict__['__flags'].iteritems(): 418 | print ('%s: %s' % (k, str(v))) 419 | 420 | vocab_path = FLAGS.data_dir + '/in_vocab_1000.txt' 421 | tag_vocab_path = FLAGS.data_dir + '/out_vocab_1000.txt' 422 | label_vocab_path = FLAGS.data_dir + '/label.txt' 423 | 424 | vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) 425 | tag_vocab, rev_tag_vocab = data_utils.initialize_vocabulary(tag_vocab_path) 426 | label_vocab, rev_label_vocab = data_utils.initialize_vocabulary(label_vocab_path) 427 | LM_vocab = vocab.copy() 428 | assert LM_vocab[data_utils._BOS] == data_utils.BOS_ID 429 | del LM_vocab[data_utils._BOS] 430 | LM_vocab[data_utils._BOS] = data_utils.BOS_ID 431 | rev_LM_vocab = [x for x in rev_vocab] 432 | rev_LM_vocab[data_utils.BOS_ID] = data_utils._EOS 433 | 434 | data_utils.data_to_token_ids(test_file, test_file + '.ids', vocab_path, tokenizer=data_utils.naive_tokenizer) 435 | 436 | test_set = read_test_data(test_file + '.ids') 437 | lm_test_output_file = FLAGS.test_output_file + '.ppl' 438 | intent_test_output_file = FLAGS.test_output_file + '.intent.hyp' 439 | tagging_test_output_file = FLAGS.test_output_file + '.tag.hyp' 440 | 441 | config = tf.ConfigProto( 442 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.23), 443 | ) 444 | 445 | with tf.Session(config=config) as sess: 446 | print("Loading model...") 447 | _, model_test = create_model(sess, len(vocab), len(tag_vocab), len(label_vocab), len(LM_vocab)) 448 | print ("Loaded model with source_vocab_size=%d, target_vocab_size=%d, and label_vocab_size=%d, and lm_vocab_size=%d." % (len(vocab), len(tag_vocab), len(label_vocab), len(LM_vocab))) 449 | 450 | def run_eval(data_set, decode_output_file): 451 | with open(lm_test_output_file, 'wb') as f_lm: 452 | with open(intent_test_output_file, 'wb') as f_intent: 453 | with open(tagging_test_output_file, 'wb') as f_tagging: 454 | eval_loss = 0.0 455 | bucket_id = 0 456 | count = 0 457 | total_word_count = 0 458 | for i in xrange(len(data_set[bucket_id])): 459 | count += 1 460 | if count % 1000 == 0: 461 | print ("Decoding utterance No. %d..." 
% count)
462 |               eval_encoder_inputs, eval_encoder_inputs_shiftByOne, eval_tags, eval_tag_weights, eval_intent_weights, eval_lm_weights, eval_sequence_length, eval_labels = model_test.get_one(
463 |                 data_set, bucket_id, i)
464 |               eval_intent_weights = eval_tag_weights
465 | 
466 |               tagging_logits = []
467 |               classification_logits = []
468 | 
469 |               _, step_loss, tagging_logits, classification_logits = model_test.joint_step(sess, eval_encoder_inputs, eval_encoder_inputs_shiftByOne, eval_lm_weights, eval_tags, eval_tag_weights, eval_labels, eval_intent_weights,
470 |                                                                                           eval_sequence_length, bucket_id, True)
471 | 
472 |               f_lm.write('%.2f\n' % (-step_loss*(eval_sequence_length[0]-1)))
473 |               f_lm.flush()
474 |               eval_loss += step_loss*(eval_sequence_length[0])
475 |               total_word_count += eval_sequence_length[0]
476 | 
477 |               hyp_label = None
478 |               # intent results
479 |               hyp_label = np.argmax(classification_logits[0],0)
480 |               f_intent.write('%s\n' % rev_label_vocab[hyp_label])
481 |               f_intent.flush()
482 |               # tagging results
483 |               f_tagging.write('%s\n' % ' '.join([rev_tag_vocab[np.argmax(x)] for x in tagging_logits[1:eval_sequence_length[0]]]))
484 |               f_tagging.flush()
485 | 
486 |             eval_perplexity = math.exp(float(eval_loss)/total_word_count)
487 |             return eval_perplexity
488 | 
489 |     valid_perplexity = run_eval(test_set, FLAGS.test_output_file)
490 |     print(" Eval perplexity: %.2f" % valid_perplexity)
491 |     sys.stdout.flush()
492 | 
493 | 
494 | def main(_):
495 |   if FLAGS.decode_file:
496 |     decode_file(FLAGS.test_file)
497 |   else:
498 |     train()
499 | 
500 | if __name__ == "__main__":
501 |   tf.app.run()
502 | 
503 | 
504 | 
--------------------------------------------------------------------------------
/seq_labeling.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Mar 9 11:32:21 2016
4 | 
5 | @author: Bing Liu (liubing@cmu.edu)
6 | """
7 | 
8 | from __future__ import absolute_import
9 | from __future__ import division
10 | from __future__ import print_function
11 | 
12 | # We disable pylint because we need python3 compatibility.
13 | from six.moves import zip  # pylint: disable=redefined-builtin
14 | 
15 | from tensorflow.python.framework import dtypes
16 | from tensorflow.python.framework import ops
17 | from tensorflow.python.ops import array_ops
18 | from tensorflow.python.ops import control_flow_ops
19 | from tensorflow.python.ops import math_ops
20 | from tensorflow.python.ops import nn_ops
21 | from tensorflow.python.ops import variable_scope
22 | from tensorflow.python.ops import init_ops
23 | from tensorflow.python.framework import tensor_shape
24 | import tensorflow as tf
25 | 
26 | def _step(time, sequence_length, min_sequence_length, max_sequence_length, zero_logit, generate_logit):
27 |   # Step 1: determine whether we need to call_cell or not
28 |   empty_update = lambda: zero_logit
29 |   logit = control_flow_ops.cond(
30 |       time < max_sequence_length, generate_logit, empty_update)
31 | 
32 |   # Step 2: determine whether we need to copy through state and/or outputs
33 |   existing_logit = lambda: logit
34 | 
35 |   def copy_through():
36 |     # Use broadcasting select to determine which values should get
37 |     # the previous state & zero output, and which values should get
38 |     # a calculated state & output.
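    # Concretely: batch entries whose sequence has already ended at this time
    # step keep the zero logit; the remaining entries keep the freshly
    # generated logit.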
39 | copy_cond = (time >= sequence_length) 40 | return math_ops.select(copy_cond, zero_logit, logit) 41 | 42 | logit = control_flow_ops.cond( 43 | time < min_sequence_length, existing_logit, copy_through) 44 | logit.set_shape(logit.get_shape()) 45 | return logit 46 | 47 | def _reverse_seq(input_seq, lengths): 48 | """Reverse a list of Tensors up to specified lengths. 49 | 50 | Args: 51 | input_seq: Sequence of seq_len tensors of dimension (batch_size, depth) 52 | lengths: A tensor of dimension batch_size, containing lengths for each 53 | sequence in the batch. If "None" is specified, simply reverses 54 | the list. 55 | 56 | Returns: 57 | time-reversed sequence 58 | """ 59 | if lengths is None: 60 | return list(reversed(input_seq)) 61 | 62 | input_shape = tensor_shape.matrix(None, None) 63 | for input_ in input_seq: 64 | input_shape.merge_with(input_.get_shape()) 65 | input_.set_shape(input_shape) 66 | 67 | # Join into (time, batch_size, depth) 68 | s_joined = array_ops.pack(input_seq) 69 | 70 | # TODO(schuster, ebrevdo): Remove cast when reverse_sequence takes int32 71 | if lengths is not None: 72 | lengths = math_ops.to_int64(lengths) 73 | 74 | # Reverse along dimension 0 75 | s_reversed = array_ops.reverse_sequence(s_joined, lengths, 0, 1) 76 | # Split again into list 77 | result = array_ops.unpack(s_reversed) 78 | for r in result: 79 | r.set_shape(input_shape) 80 | return result 81 | 82 | 83 | def linear_transformation(_X, input_size, n_class): 84 | with variable_scope.variable_scope("linear"): 85 | bias_start = 0.0 86 | weight_out = variable_scope.get_variable("Weight_out", [input_size, n_class]) 87 | bias_out = variable_scope.get_variable("Bias_out", [n_class], 88 | initializer=init_ops.constant_initializer(bias_start)) 89 | 90 | output = tf.matmul(_X, weight_out) + bias_out 91 | #regularizers = tf.nn.l2_loss(weight_hidden) + tf.nn.l2_loss(bias_hidden) + tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out) 92 | return output 93 | 94 | def get_linear_transformation_regularizers(): 95 | with variable_scope.variable_scope("linear"): 96 | weight_out = variable_scope.get_variable("Weight_out") 97 | bias_out = variable_scope.get_variable("Bias_out") 98 | regularizers = tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out) 99 | return regularizers 100 | 101 | 102 | def multilayer_perceptron(_X, input_size, n_hidden, n_class, forward_only=False): 103 | with variable_scope.variable_scope("DNN"): 104 | bias_start = 0.0 105 | weight_hidden = variable_scope.get_variable("Weight_Hidden", [input_size, n_hidden]) 106 | bias_hidden = variable_scope.get_variable("Bias_Hidden", [n_hidden], 107 | initializer=init_ops.constant_initializer(bias_start)) 108 | #Hidden layer with RELU activation 109 | layer_1 = tf.nn.relu(tf.add(tf.matmul(_X, weight_hidden), bias_hidden)) 110 | 111 | if not forward_only: 112 | layer_1 = tf.nn.dropout(layer_1, 0.5) 113 | 114 | weight_out = variable_scope.get_variable("Weight_Out", [n_hidden, n_class]) 115 | bias_out = variable_scope.get_variable("Bias_Out", [n_class], 116 | initializer=init_ops.constant_initializer(bias_start)) 117 | output = tf.matmul(layer_1, weight_out) + bias_out 118 | #regularizers = tf.nn.l2_loss(weight_hidden) + tf.nn.l2_loss(bias_hidden) + tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out) 119 | return output 120 | 121 | def get_multilayer_perceptron_regularizers(): 122 | with variable_scope.variable_scope("DNN"): 123 | weight_hidden = variable_scope.get_variable("Weight_Hidden") 124 | bias_hidden = variable_scope.get_variable("Bias_Hidden") 125 | 126 | 
weight_out = variable_scope.get_variable("Weight_Out") 127 | bias_out = variable_scope.get_variable("Bias_Out") 128 | regularizers = tf.nn.l2_loss(weight_hidden) + tf.nn.l2_loss(bias_hidden) + tf.nn.l2_loss(weight_out) + tf.nn.l2_loss(bias_out) 129 | return regularizers 130 | 131 | def generate_sequence_output(encoder_outputs, 132 | encoder_state, 133 | num_decoder_symbols, 134 | sequence_length, 135 | num_heads=1, 136 | dtype=dtypes.float32, 137 | use_attention=True, 138 | loop_function=None, 139 | scope=None, 140 | DNN_at_output=False, 141 | forward_only=False): 142 | with variable_scope.variable_scope(scope or "non-attention_RNN"): 143 | attention_encoder_outputs = list() 144 | sequence_attention_weights = list() 145 | 146 | # copy over logits once out of sequence_length 147 | if encoder_outputs[0].get_shape().ndims != 1: 148 | (fixed_batch_size, output_size) = encoder_outputs[0].get_shape().with_rank(2) 149 | else: 150 | fixed_batch_size = encoder_outputs[0].get_shape().with_rank_at_least(1)[0] 151 | 152 | if fixed_batch_size.value: 153 | batch_size = fixed_batch_size.value 154 | else: 155 | batch_size = array_ops.shape(encoder_outputs[0])[0] 156 | if sequence_length is not None: 157 | sequence_length = math_ops.to_int32(sequence_length) 158 | if sequence_length is not None: # Prepare variables 159 | zero_logit = array_ops.zeros( 160 | array_ops.pack([batch_size, num_decoder_symbols]), encoder_outputs[0].dtype) 161 | zero_logit.set_shape( 162 | tensor_shape.TensorShape([fixed_batch_size.value, num_decoder_symbols])) 163 | min_sequence_length = math_ops.reduce_min(sequence_length) 164 | max_sequence_length = math_ops.reduce_max(sequence_length) 165 | 166 | for time, input_ in enumerate(encoder_outputs): 167 | if time > 0: variable_scope.get_variable_scope().reuse_variables() 168 | 169 | if not DNN_at_output: 170 | generate_logit = lambda: linear_transformation(encoder_outputs[time], output_size, num_decoder_symbols) 171 | else: 172 | generate_logit = lambda: multilayer_perceptron(encoder_outputs[time], output_size, 200, num_decoder_symbols, forward_only=forward_only) 173 | # pylint: enable=cell-var-from-loop 174 | if sequence_length is not None: 175 | logit = _step( 176 | time, sequence_length, min_sequence_length, max_sequence_length, zero_logit, generate_logit) 177 | else: 178 | logit = generate_logit 179 | attention_encoder_outputs.append(logit) 180 | if DNN_at_output: 181 | regularizers = get_multilayer_perceptron_regularizers() 182 | else: 183 | regularizers = get_linear_transformation_regularizers() 184 | return attention_encoder_outputs, sequence_attention_weights, regularizers 185 | 186 | 187 | def sequence_loss_by_example(logits, targets, weights, 188 | average_across_timesteps=True, 189 | softmax_loss_function=None, name=None): 190 | """Weighted cross-entropy loss for a sequence of logits (per example). 191 | 192 | Args: 193 | logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. 194 | targets: List of 1D batch-sized int32 Tensors of the same length as logits. 195 | weights: List of 1D batch-sized float-Tensors of the same length as logits. 196 | average_across_timesteps: If set, divide the returned cost by the total 197 | label weight. 198 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch 199 | to be used instead of the standard softmax (the default if this is None). 200 | name: Optional name for this operation, default: "sequence_loss_by_example". 
201 | 202 | Returns: 203 | 1D batch-sized float Tensor: The log-perplexity for each sequence. 204 | 205 | Raises: 206 | ValueError: If len(logits) is different from len(targets) or len(weights). 207 | """ 208 | if len(targets) != len(logits) or len(weights) != len(logits): 209 | raise ValueError("Lengths of logits, weights, and targets must be the same " 210 | "%d, %d, %d." % (len(logits), len(weights), len(targets))) 211 | with ops.op_scope(logits + targets + weights, name, 212 | "sequence_loss_by_example"): 213 | log_perp_list = [] 214 | for logit, target, weight in zip(logits, targets, weights): 215 | if softmax_loss_function is None: 216 | # TODO(irving,ebrevdo): This reshape is needed because 217 | # sequence_loss_by_example is called with scalars sometimes, which 218 | # violates our general scalar strictness policy. 219 | target = array_ops.reshape(target, [-1]) 220 | crossent = nn_ops.sparse_softmax_cross_entropy_with_logits( 221 | logit, target) 222 | else: 223 | crossent = softmax_loss_function(logit, target) 224 | log_perp_list.append(crossent * weight) 225 | log_perps = math_ops.add_n(log_perp_list) 226 | if average_across_timesteps: 227 | total_size = math_ops.add_n(weights) 228 | total_size += 1e-12 # Just to avoid division by 0 for all-0 weights. 229 | log_perps /= total_size 230 | return log_perps 231 | 232 | 233 | def sequence_loss(logits, targets, weights, 234 | average_across_timesteps=True, average_across_batch=True, 235 | softmax_loss_function=None, name=None): 236 | """Weighted cross-entropy loss for a sequence of logits, batch-collapsed. 237 | 238 | Args: 239 | logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols]. 240 | targets: List of 1D batch-sized int32 Tensors of the same length as logits. 241 | weights: List of 1D batch-sized float-Tensors of the same length as logits. 242 | average_across_timesteps: If set, divide the returned cost by the total 243 | label weight. 244 | average_across_batch: If set, divide the returned cost by the batch size. 245 | softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch 246 | to be used instead of the standard softmax (the default if this is None). 247 | name: Optional name for this operation, defaults to "sequence_loss". 248 | 249 | Returns: 250 | A scalar float Tensor: The average log-perplexity per symbol (weighted). 251 | 252 | Raises: 253 | ValueError: If len(logits) is different from len(targets) or len(weights). 254 | """ 255 | with ops.op_scope(logits + targets + weights, name, "sequence_loss"): 256 | cost = math_ops.reduce_sum(sequence_loss_by_example( 257 | logits, targets, weights, 258 | average_across_timesteps=average_across_timesteps, 259 | softmax_loss_function=softmax_loss_function)) 260 | if average_across_batch: 261 | batch_size = array_ops.shape(targets[0])[0] 262 | return cost / math_ops.cast(batch_size, dtypes.float32) 263 | else: 264 | return cost 265 | 266 | 267 | def generate_task_output(encoder_outputs, additional_inputs, encoder_state, targets,sequence_length, num_decoder_symbols, weights, 268 | buckets, softmax_loss_function=None, 269 | per_example_loss=False, name=None, use_attention=False, scope=None, DNN_at_output=False, 270 | intent_results=None, 271 | tagging_results=None, 272 | train_with_true_label=True, 273 | use_local_context=False, 274 | forward_only=False): 275 | if len(targets) < buckets[-1][1]: 276 | raise ValueError("Length of targets (%d) must be at least that of last" 277 | "bucket (%d)." 
% (len(targets), buckets[-1][1]))
278 | 
279 |   all_inputs = encoder_outputs + targets + weights
280 |   with ops.op_scope(all_inputs, name, "model_with_buckets"):
281 |     if scope == 'intent':
282 |       logits, regularizers, sampled_intents = intent_results
283 |       sampled_tags = list()
284 |     elif scope == 'tagging':
285 |       logits, regularizers, sampled_tags = tagging_results
286 |       sampled_intents = list()
287 |     elif scope == 'lm':
288 |       with variable_scope.variable_scope(scope + "_generate_sequence_output", reuse=None):
289 |         task_inputs = []
290 |         if use_local_context:
291 |           print ('lm task: use sampled_tag_intent_emb as local context')
292 |           task_inputs = [array_ops.concat(1, [additional_input, encoder_output]) for additional_input, encoder_output in zip(additional_inputs, encoder_outputs)]
293 |         else:
294 |           task_inputs = encoder_outputs
295 | 
296 |         logits, _, regularizers = generate_sequence_output(task_inputs,
297 |                                                            encoder_state,
298 |                                                            num_decoder_symbols,
299 |                                                            sequence_length,
300 |                                                            use_attention=use_attention,
301 |                                                            DNN_at_output=DNN_at_output,
302 |                                                            forward_only=forward_only)
303 | 
304 |       sampled_tags = list()
305 |       sampled_intents = list()
306 | 
307 |     if per_example_loss:
308 |       assert len(logits) == len(targets)
309 |       # We need to make target an int64 tensor and set its shape.
310 |       bucket_target = [array_ops.reshape(math_ops.to_int64(x), [-1]) for x in targets]
311 |       crossent = sequence_loss_by_example(
312 |           logits, bucket_target, weights,
313 |           softmax_loss_function=softmax_loss_function)
314 |     else:
315 |       assert len(logits) == len(targets)
316 |       bucket_target = [array_ops.reshape(math_ops.to_int64(x), [-1]) for x in targets]
317 |       crossent = sequence_loss(
318 |           logits, bucket_target, weights,
319 |           softmax_loss_function=softmax_loss_function)
320 |     crossent_with_regularizers = crossent + 1e-4 * regularizers
321 | 
322 |   return logits, sampled_tags, sampled_intents, crossent_with_regularizers, crossent
323 | 
--------------------------------------------------------------------------------
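For reference, the weighting performed by `sequence_loss_by_example` and `sequence_loss` above can be reproduced outside TensorFlow. The short numpy sketch below is illustrative only (it is not part of this package, and the function names are made up for the example): per-step cross-entropy is multiplied by its weight, summed over time, normalized by the total weight per example, and finally averaged over the batch.

```python
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def weighted_sequence_loss_by_example(logits, targets, weights):
    """Per-example weighted cross-entropy, mirroring sequence_loss_by_example.

    logits:  list over time of [batch, num_symbols] arrays
    targets: list over time of [batch] int arrays
    weights: list over time of [batch] float arrays (0.0 at padded positions)
    """
    batch_size = logits[0].shape[0]
    log_perp = np.zeros(batch_size)
    total_weight = np.zeros(batch_size)
    for logit, target, weight in zip(logits, targets, weights):
        probs = softmax(logit)
        crossent = -np.log(probs[np.arange(batch_size), target])
        log_perp += crossent * weight      # weighted sum over time steps
        total_weight += weight
    return log_perp / (total_weight + 1e-12)   # average across time steps

def weighted_sequence_loss(logits, targets, weights):
    """Batch-averaged scalar, mirroring sequence_loss (the tagging/intent/LM costs)."""
    return weighted_sequence_loss_by_example(logits, targets, weights).mean()

if __name__ == "__main__":
    T, B, V = 3, 2, 5
    rng = np.random.RandomState(0)
    logits = [rng.randn(B, V) for _ in range(T)]
    targets = [rng.randint(0, V, size=B) for _ in range(T)]
    weights = [np.ones(B) for _ in range(T)]   # zero entries here drop padded steps
    print(weighted_sequence_loss(logits, targets, weights))
```

Setting a weight to zero at a padded position removes that step from both the numerator and the denominator, which is how the tag, intent, and LM weights built in `get_batch` decide which positions contribute to each task's loss.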